encoding.csv: add a sequential reader too (suitable for very large .csv files, it does not read everything at once) (#20140)

2025-09-10 07:47:20 -04:00 · 2023-12-10 23:57:08 +01:00 · 2023-12-10 23:57:08 +01:00 · cfcbcb416a
commit cfcbcb416a
parent 99d9473643
4 changed files with 439 additions and 10 deletions
--- a/vlib/encoding/csv/README_csv_reader.md
+++ b/vlib/encoding/csv/README_csv_reader.md
@ -1,6 +1,77 @@
 # CSV Reader
-This module is a Random Access CSV file reader, it indexes the file before reading the data.
+There are two CSV readers in this module:
 * Random Access reader
 * Sequential reader
 # Sequential CSV reader
 The sequential reader reads the file row by row, using only the memory needed for reading.
 Here is a very simple example of usage:
 ```v
 import encoding.csv
 fn main() {
 	mut csvr := csv.csv_sequential_reader(file_path: 'big2.csv', end_line_len: csv.endline_crlf_len)!
 	for csvr.has_data() > 1 {
 		println(csvr.get_next_row()!)
 	}
 	csvr.dispose_csv_reader()
 }
 ```
 This is the simplest way to use it to read csv files in sequential mode,
 with default configuration every cell is read as `string`.
 The function `get_next_row()` is used to read a single row, and it returns an array of `string`.
 ## Reading from different sources `csv_sequential_reader`
 The CSV Sequential Reader can read from files, and memory buffers.
 ### read from a file
 ```v ignore
 csv.csv_sequential_reader(file_path:file_path)
 ```
 ### read from a memory buffer
 ```v ignore
 csv.csv_sequential_reader(scr_buf:voidptr(buffer_ptr),  scr_buf_len: buffer_len)
 ```
 When you call `csv.csv_sequential_reader` a `SequentialReader` struct is initialized passing 
 a `SequentialReaderConfig` struct as a parameter.
 Using these structs, it is possible to change the behavior of the CSV Reader.
 ## The `SequentialReaderConfig` struct
 The config struct is as follows:
 ```v ignore
 pub struct SequentialReaderConfig {
 	scr_buf      voidptr // pointer to the buffer of data
 	scr_buf_len  i64     // if > 0 use the RAM pointed by scr_buf as source of data
 	file_path    string
 	start_index  i64
 	end_index    i64    = -1
 	mem_buf_size int    = 1024 * 64 // default buffer size 64KByte
 	separator    u8     = `,`
 	comment      u8     = `#` // every line that start with the comment char is ignored
 	default_cell string = '*' // return this string if out of the csv boundaries
 	empty_cell   string // return this string if empty cell
 	end_line_len int = endline_cr_len // size of the endline rune
 	quote        u8  = `"` // double quote is the standard quote char
 }
 ```
 |Field|Description|
 |------------|--------------|
 |`scr_buf`, `scr_buf_len`|If `scr_buf_len > 0` the reader will use the  `scr_buf` pointer as the base address of the data to parse and  `scr_buf_len` as the length of the buffer itself|
 |`file_path`| if `scr_buf_len == 0` the reader will try to open the `file_path`  file|
 |`start_index`,`end_index`| **Internal usage for now**|
 |`mem_buf_size`|memory allocated for the reading operations on the file, more memory more speed|
 |`separator`|char used as cell separator in the CSV file, default is comma|
 |`comment`|every line that start with the comment char is ignored|
 |`default_cell`|return this string if the query coordinates are out of the csv boundaries|
 |`empty_cell`|return this string if the query coordinates are on an empty cell|
 |`end_line_len`|size of the endline, `endline_cr_len=1`,`endline_crlf_len=2`|
 |`quote`|quote char for the cells|
 # Random Access CSV Reader
 The Random Access CSV file reader indexes the file before reading the data.
 This indexing operation permits access to every cell of the CSV file in random order.
 Here is a very simple example of usage:
@ -30,12 +101,12 @@ will give the following output:
 ['0', '1', '2']
 ['3', '4', '5']
 ```
-This is the simplest way to use it to read csv files, with default configuration 
+This is the simplest way to use it to read csv files in a random access mode, 
-every cell is read as `string`.
+with default configuration every cell is read as `string`.
 The function `get_row()` is used to read a single row, and it returns an array of `string`.
 ## Reading from different sources `csv_reader`
-The CSV Reader can read from files, strings, memory buffers.
+The CSV Random access Reader can read from files, strings, memory buffers.
 ### read from a file
 ```v ignore
 csv.csv_reader(file_path:file_path)
@ -64,7 +135,7 @@ pub struct RandomAccessReaderConfig {
 	end_index    i64    = -1
 	mem_buf_size int    = 1024 * 64 // default buffer size 64KByte
 	separator    u8     = `,`
-	comment      u8     = `#` // every line that start with the quote char is ignored
+	comment      u8     = `#` // every line that start with the comment char is ignored
 	default_cell string = '*' // return this string if out of the csv boundaries
 	empty_cell   string // return this string if empty cell
 	end_line_len int = csv.endline_cr_len // size of the endline rune
@ -79,6 +150,7 @@ pub struct RandomAccessReaderConfig {
 |`start_index`,`end_index`| **Internal usage for now**|
 |`mem_buf_size`|memory allocated for the reading operations on the file, more memory more speed|
 |`separator`|char used as cell separator in the CSV file, default is comma|
 |`comment`|every line that start with the comment char is ignored|
 |`default_cell`|return this string if the query coordinates are out of the csv boundaries|
 |`empty_cell`|return this string if the query coordinates are on an empty cell|
 |`end_line_len`|size of the endline, `endline_cr_len=1`,`endline_crlf_len=2`|
--- a/vlib/encoding/csv/csv_reader_random_access.v
+++ b/vlib/encoding/csv/csv_reader_random_access.v
@ -1,11 +1,10 @@
 /*
-csv reader 1.0 alpha
+csv random access reader 1.0 alpha
 Copyright (c) 2023 Dario Deledda. All rights reserved.
 Use of this source code is governed by an MIT license
 that can be found in the LICENSE file.
 Known limitations:
 - no stream reading
 */
@ -404,7 +403,7 @@ pub fn (mut cr RandomAccessReader) get_cell(cfg GetCellConfig) !string {
 			}
 			// create the string from the buffer
-			mut tmp_mem := malloc(isize(len + 1))
+			mut tmp_mem := malloc_noscan(isize(len + 1))
 			/*
 			defer {
 				free(tmp_mem)
--- a/vlib/encoding/csv/csv_reader_sequential.v
+++ b/vlib/encoding/csv/csv_reader_sequential.v
@ -0,0 +1,297 @@
 /*
 csv sequential reader 1.0 alpha
 Copyright (c) 2023 Dario Deledda. All rights reserved.
 Use of this source code is governed by an MIT license
 that can be found in the LICENSE file.
 Known limitations:
 */
 module csv
 import os
// SequentialReaderConfig groups the options accepted by `csv_sequential_reader`
@[params]
 pub struct SequentialReaderConfig {
 	scr_buf      voidptr // pointer to the buffer of data
 	scr_buf_len  i64     // if > 0 use the RAM pointed by scr_buf as source of data
 	file_path    string // path of the csv file, used when no RAM buffer is given
 	start_index  i64 // index of the first byte of the source to parse
 	end_index    i64    = -1 // -1 means: parse up to the end of the source
 	mem_buf_size int    = 1024 * 64 // default buffer size 64KByte
 	separator    u8     = `,`
 	comment      u8     = `#` // every line that start with the comment char is ignored
 	default_cell string = '*' // return this string if out of the csv boundaries
 	empty_cell   string // return this string if empty cell
 	end_line_len int = endline_cr_len // size of the endline rune
 	quote        u8  = `"` // double quote is the standard quote char
 }
 // SequentialReader holds the state of a sequential csv parsing session,
 // it is created by `csv_sequential_reader`
 pub struct SequentialReader {
 pub mut:
 	index i64 // index in the source, set when the reader is created
 	f              os.File // source file, used only when reading from a file
 	f_len          i64 // length in bytes of the source file
 	is_bom_present bool // true if an UTF-8 BOM was found at the start of the source
 	start_index i64 // index of the next byte to parse, updated by `get_next_row`
 	end_index   i64 = -1 // index where the parsing stops
 	end_line      u8  = `\n`
 	end_line_len  int = endline_cr_len // size of the endline rune \n = 1, \r\n = 2
 	separator     u8  = `,` // comma is the default separator
 	separator_len int = 1 // size of the separator rune
 	quote         u8  = `"` // double quote is the standard quote char
 	comment u8 = `#` // every line that start with the comment char is ignored
 	default_cell string = '*' // return this string if out of the csv boundaries
 	empty_cell   string = '#' // return this if empty cell
 	// ram buffer
 	mem_buf_type  u32 // buffer type 0=File,1=RAM
 	mem_buf       voidptr // buffer used to load chars from file
 	mem_buf_size  i64     // size of the buffer
 	mem_buf_start i64 = -1 // start index in the file of the read buffer
 	mem_buf_end   i64 = -1 // end index in the file of the read buffer
 	ch_buf []u8 = []u8{cap: 1024} // chars of the cell that is being parsed
 	// error management
 	row_count i64 // rows counter, reported in the error messages
 	col_count i64 // columns counter, reported in the error messages
 }
 // csv_sequential_reader creates a sequential csv reader.
 // The source of the data is chosen from `cfg`:
 // - if `scr_buf` is not nil and `scr_buf_len > 0` the memory buffer is parsed in place
 // - otherwise, if `file_path` is not empty, the file is opened and read in chunks
 //   of `mem_buf_size` bytes
 // - if no source is given, the returned reader has no buffer attached
 // An UTF-8 BOM (EF BB BF) at the start of the source is detected and skipped.
 // It returns an error if `file_path` does not exist or if a file operation fails.
 pub fn csv_sequential_reader(cfg SequentialReaderConfig) !&SequentialReader {
 	mut cr := &SequentialReader{}
 	cr.start_index = cfg.start_index
 	cr.end_index = cfg.end_index
 	// reading from a RAM buffer
 	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
 		cr.mem_buf_type = ram_csv // RAM buffer
 		cr.mem_buf = cfg.scr_buf
 		cr.mem_buf_size = cfg.scr_buf_len
 		if cfg.end_index == -1 {
 			cr.end_index = cfg.scr_buf_len
 		}
 		// check if BOM header is in the memory buffer,
 		// a buffer shorter than 3 bytes can not contain it and must not be read past its end
 		if cr.mem_buf_size >= 3 {
 			unsafe {
 				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
 					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
 					cr.is_bom_present = true
 					cr.index += 3 // skip the BOM
 					cr.start_index += 3 // skip the BOM
 				}
 			}
 		}
 		cr.mem_buf_start = 0
 		cr.mem_buf_end = cr.mem_buf_size
 		// check if is a file source
 	} else if cfg.file_path.len > 0 {
 		if !os.exists(cfg.file_path) {
 			return error('ERROR: file ${cfg.file_path} not found!')
 		}
 		// open the file before allocating, a failed open must not leak the buffer
 		cr.f = os.open_file(cfg.file_path, 'rb')!
 		cr.mem_buf_type = file_csv // File buffer
 		// allocate the memory
 		unsafe {
 			cr.mem_buf = malloc(cfg.mem_buf_size)
 			cr.mem_buf_size = cfg.mem_buf_size
 		}
 		cr.f.seek(0, .end)!
 		cr.f_len = cr.f.tell()!
 		cr.f.seek(cfg.start_index, .start)!
 		cr.index = cr.f.tell()!
 		if cfg.end_index == -1 {
 			cr.end_index = cr.f_len
 		}
 		// check if BOM header is in the file,
 		// the probe reads 4 bytes so the buffer must be able to hold them
 		if cr.index == 0 && cr.mem_buf_size >= 4 {
 			if cr.f.read_into_ptr(cr.mem_buf, 4)! == 4 {
 				unsafe {
 					if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
 						&& *(&u8(cr.mem_buf) + 2) == 0xBF {
 						cr.is_bom_present = true
 						cr.index += 3 // skip the BOM
 						cr.start_index += 3 // skip the BOM
 					}
 				}
 			}
 			// go back to the start position after the probe
 			cr.f.seek(cfg.start_index, .start)!
 		}
 	}
 	cr.default_cell = cfg.default_cell
 	cr.empty_cell = cfg.empty_cell
 	cr.end_line_len = cfg.end_line_len
 	cr.separator = cfg.separator
 	cr.comment = cfg.comment
 	cr.quote = cfg.quote
 	return cr
 }
 // dispose_csv_reader releases the resources used by the reader:
 // for a file source it closes the file and frees the read buffer,
 // for a RAM source nothing is released here
 pub fn (mut cr SequentialReader) dispose_csv_reader() {
 	// only a file source owns resources that must be released
 	if cr.mem_buf_type == ram_csv || cr.mem_buf_type != file_csv {
 		return
 	}
 	// file close
 	if cr.f.is_opened {
 		cr.f.close()
 	}
 	// no buffer was allocated, nothing to free
 	if cr.mem_buf_size <= 0 {
 		return
 	}
 	// free the allocated memory and forget the dangling pointer
 	unsafe {
 		free(cr.mem_buf)
 		cr.mem_buf = nil
 	}
 	cr.mem_buf_size = 0
 }
 // has_data returns the number of bytes between the current
 // reading position and the end index of the source
 pub fn (mut cr SequentialReader) has_data() i64 {
 	remaining := cr.end_index - cr.start_index
 	return remaining
 }
 // fill_buffer loads in the memory buffer the chunk of the file that starts at `index`
 // and records which range of the file the buffer now covers
 fn (mut cr SequentialReader) fill_buffer(index i64) ! {
 	// for now do nothing if ram buffer
 	if cr.mem_buf_type == ram_csv {
 		return
 	}
 	cr.f.seek(index, .start)!
 	// IMPORTANT: add 64 bit support in vlib!!
 	chunk_len := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
 	cr.mem_buf_start = index
 	cr.mem_buf_end = index + chunk_len
 }
 // SequentialReadingState lists the states of the parser used by `get_next_row`
 enum SequentialReadingState as u16 {
 	comment // inside a comment line, it lasts until the end of the line
 	quote // inside a quoted cell, it lasts until the closing quote
 	after_quote // after a closing quote, waiting for a separator or the end of the line
 	cell // inside a normal cell, this is the starting state of every row
 	newline // NOTE(review): not referenced by `get_next_row`
 }
 // get_next_row gets the next row from the CSV source as a string array.
 // The parsing starts at `cr.start_index`, and `cr.start_index` is updated at the end
 // of the call, so every call continues from where the previous one stopped.
 // Empty rows and rows that start with the comment char are skipped.
 // An empty cell is returned as `cr.empty_cell`.
 // If the source ends without a final end of line, the cell that was being read
 // is still returned as the last cell of the row.
 // It returns an error if a quoted cell is not closed before the end of its line,
 // or if the loading of the file buffer fails.
 pub fn (mut cr SequentialReader) get_next_row() ![]string {
 	mut row_res := []string{}
 	// clear the cell buffer
 	cr.ch_buf.clear()
 	mut i := cr.start_index
 	mut state := SequentialReadingState.cell
 	// true when the row was closed by an end of line
 	mut row_closed := false
 	p := &u8(cr.mem_buf)
 	for i < cr.end_index {
 		// load the chunk of the source that contains the index i, if needed
 		if i < cr.mem_buf_start || i >= cr.mem_buf_end {
 			cr.fill_buffer(i)!
 		}
 		unsafe {
 			ch := *(p + i - cr.mem_buf_start)
 			if state == .cell {
 				if ch == cr.separator {
 					// must be optimized
 					cr.ch_buf << 0
 					row_res << if (cr.ch_buf.len - 1) == 0 {
 						cr.empty_cell
 					} else {
 						(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
 					}
 					cr.ch_buf.clear()
 				} else if cr.ch_buf.len == 0 && ch == cr.comment && row_res.len == 0 {
 					// the comment char is valid only as first char of the row
 					state = .comment
 				} else if ch == cr.quote {
 					state = .quote
 					cr.ch_buf.clear()
 					cr.col_count++
 					i++
 					continue
 				} else if ch == cr.end_line {
 					cr.row_count++
 					cr.col_count = 0
 					// skip empty rows
 					if !(row_res.len == 0 && cr.ch_buf.len < 1) {
 						cr.ch_buf << 0
 						row_res << if (cr.ch_buf.len - 1) == 0 {
 							cr.empty_cell
 						} else {
 							(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
 						}
 						i += cr.end_line_len - 1
 						row_closed = true
 						break
 					}
 				} else if ch == `\r` && cr.end_line_len == 2 {
 					// skip CR
 				} else { // normal char inside a cell
 					cr.ch_buf << ch
 				}
 			}
 			if state == .comment {
 				if cr.ch_buf.len > 0 {
 					// must be optimized
 					cr.ch_buf << 0
 					row_res << if (cr.ch_buf.len - 1) == 0 {
 						cr.empty_cell
 					} else {
 						(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
 					}
 					cr.ch_buf.clear()
 				} else if ch == cr.end_line {
 					// the comment ends with its line
 					state = .cell
 				}
 			}
 			if state == .quote {
 				if ch == cr.quote {
 					// must be optimized
 					cr.ch_buf << 0
 					row_res << if (cr.ch_buf.len - 1) == 0 {
 						cr.empty_cell
 					} else {
 						(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
 					}
 					cr.ch_buf.clear()
 					state = .after_quote
 					cr.col_count++
 					i++
 					continue
 				} else if ch == cr.end_line {
 					return error('ERROR: quote not closed at row ${cr.row_count} after column ${cr.col_count}!')
 				} else { // normal char inside a quote inside a cell
 					cr.ch_buf << ch
 				}
 			}
 			if state == .after_quote {
 				if ch == cr.separator {
 					state = .cell
 				} else if ch == cr.end_line {
 					cr.row_count++
 					cr.col_count = 0
 					cr.ch_buf.clear()
 					i += cr.end_line_len - 1
 					row_closed = true
 					break
 				}
 			}
 		}
 		cr.col_count++
 		i++
 	}
 	// the source ended without a final end of line:
 	// add the cell that was still being read, otherwise it would be lost
 	if !row_closed && state == .cell && (row_res.len > 0 || cr.ch_buf.len > 0) {
 		row_res << if cr.ch_buf.len == 0 {
 			cr.empty_cell
 		} else {
 			cr.ch_buf.bytestr()
 		}
 		cr.ch_buf.clear()
 	}
 	cr.start_index = i
 	return row_res
 }
--- a/vlib/encoding/csv/csv_reader_test.v
+++ b/vlib/encoding/csv/csv_reader_test.v
@ -105,7 +105,68 @@ const txt3 = 'a,b,c,d\r\n0,1,2,3\r\n4,5,6,7\r\n'
 const txt4 = 'a,b,c,d\n0,1,2,3\n4,5,6,7\n'
 /******************************************************************************
 *
-* Test Functions
+* Test Sequential Functions
 *
 ******************************************************************************/
 // test_csv_sequential checks the sequential reader on two memory buffers
 // and on a temporary file written with Windows line endings
 fn test_csv_sequential() {
 	// first source: txt1 parsed directly from memory
 	mut reader := csv.csv_sequential_reader(scr_buf: txt1.str, scr_buf_len: txt1.len)!
 	mut rows := [][]string{}
 	for reader.has_data() > 1 {
 		rows << reader.get_next_row()!
 	}
 	reader.dispose_csv_reader()
 	assert rows[0][0] == 'a', 'test_csv_sequential1 reading failed!'
 	// there is a final empty row in txt1
 	assert rows[rows.len - 2][0] == 'a', 'test_csv_sequential2 reading failed!'
 	assert rows[rows.len - 2][1] == 'b,c,d', 'test_csv_sequential3 reading failed!'

 	// second source: txt2 parsed from memory with a custom empty cell marker
 	reader = csv.csv_sequential_reader(scr_buf: txt2.str, scr_buf_len: txt2.len)!
 	reader.empty_cell = '####'
 	rows = [][]string{}
 	for reader.has_data() > 1 {
 		rows << reader.get_next_row()!
 	}
 	reader.dispose_csv_reader()
 	assert rows[rows.len - 2][2] == '####', 'test_csv_sequential4 reading failed!'
 	assert rows[rows.len - 2][5] == 'pippo', 'test_csv_sequential5 reading failed!'

 	// third source: a temp file, to test the csv parsing from file
 	tmp_csv_path := os.join_path(os.temp_dir(), 'test_csv.csv')
 	// test Windows configuration: every line ends with CRLF
 	crlf_txt := txt1.replace('\n', '\r\n')
 	mut out_file := os.open_file(tmp_csv_path, 'wb')!
 	unsafe {
 		out_file.write_ptr(crlf_txt.str, crlf_txt.len)
 	}
 	out_file.close()
 	// a small buffer forces the reader to reload it while parsing
 	reader = csv.csv_sequential_reader(
 		file_path: tmp_csv_path
 		mem_buf_size: 64
 		end_line_len: csv.endline_crlf_len
 	)!
 	rows = [][]string{}
 	for reader.has_data() > 1 {
 		rows << reader.get_next_row()!
 	}
 	reader.dispose_csv_reader()
 	assert rows[0][0] == 'a', 'test_csv_sequential1 reading failed!'
 	// there is a final empty row in txt1
 	assert rows[rows.len - 2][0] == 'a', 'test_csv_sequential2 reading failed!'
 	assert rows[rows.len - 2][1] == 'b,c,d', 'test_csv_sequential3 reading failed!'
 	// remove the temp file
 	os.rm(tmp_csv_path)!
 }
 /******************************************************************************
 *
 * Test Random Access Functions
 *
 ******************************************************************************/
 fn perform_test(mut csvr csv.RandomAccessReader) ! {