encoding.csv: add a sequential reader too (suitable for very large .csv files, it does not read everything at once) (#20140)

2025-09-08 06:41:58 -04:00 · 2023-12-10 23:57:08 +01:00 · 2023-12-10 23:57:08 +01:00 · cfcbcb416a
commit cfcbcb416a
parent 99d9473643
4 changed files with 439 additions and 10 deletions
--- a/vlib/encoding/csv/README_csv_reader.md
+++ b/vlib/encoding/csv/README_csv_reader.md
@ -1,6 +1,77 @@
-
 # CSV Reader
-This module is a Random Access CSV file reader, it indexes the file before reading the data.
+There are two CSV readers in this module:
+
+* Random Access reader
+* Sequential reader
+ 
+# Sequential CSV reader
+The sequential reader read the file row by row using only the memory needed for readings.
+Here is a very simple example of usage:
+
+```v
+import encoding.csv
+
+fn main() {
+	mut csvr := csv.csv_sequential_reader(file_path: 'big2.csv', end_line_len: csv.endline_crlf_len)!
+	for csvr.has_data() > 1 {
+		println(csvr.get_next_row()!)
+	}
+	csvr.dispose_csv_reader()
+}
+```
+This is the simplest way to use it to read csv files in sequential mode,
+with default configuration every cell is read as `string`.
+The function `get_row()` is used to read a single row, and it returns an array of `string`.
+
+## Reading from different sources `csv_sequential_reader`
+The CSV Sequential Reader can read from files, and memory buffers.
+
+### read from a file
+```v ignore
+csv.csv_sequential_reader(file_path:file_path)
+```
+### read from a memory buffer
+```v ignore
+csv.csv_sequential_reader(scr_buf:voidptr(buffer_ptr),  scr_buf_len: buffer_len)
+```
+When you call `csv.csv_sequential_reader` a `SequentialReader` struct is initialized passing 
+a `SequentialReaderConfig` struct as a parameter.
+Using these structs, it is possible to change the behavior of the CSV Reader.
+
+## The `SequentialReaderConfig` struct
+The config struct is as follows:
+```v ignore
+pub struct SequentialReaderConfig {
+	scr_buf      voidptr // pointer to the buffer of data
+	scr_buf_len  i64     // if > 0 use the RAM pointed by scr_buf as source of data
+	file_path    string
+	start_index  i64
+	end_index    i64    = -1
+	mem_buf_size int    = 1024 * 64 // default buffer size 64KByte
+	separator    u8     = `,`
+	comment      u8     = `#` // every line that start with the comment char is ignored
+	default_cell string = '*' // return this string if out of the csv boundaries
+	empty_cell   string // return this string if empty cell
+	end_line_len int = endline_cr_len // size of the endline rune
+	quote        u8  = `"` // double quote is the standard quote char
+}
+```
+|Field|Description|
+|------------|--------------|
+|`scr_buf`, `scr_buf_len`|If `scr_buf_len > 0` the reader will use the  `scr_buf` pointer as the base address of the data to parse and  `scr_buf_len` as the length of the buffer itself|
+|`file_path`| if `scr_buf_len == 0` the reader will try to open the `file_path`  file|
+|`start_index`,`end_index`| **Internal usage for now**|
+|`mem_buf_size`|memory allocated for the reading operations on the file, more memory more speed|
+|`separator`|char used as cell separator in the CSV file, default is comma|
+|`comment`|every line that start with the comment char is ignored|
+|`default_cell`|return this string if the query coordinates are out of the csv boundaries|
+|`empty_cell`|return this string if the query coordinates are on an empty cell|
+|`end_line_len`|size of the endline, `endline_cr_len=1`,`endline_crlf_len=2`|
+|`quote`|quote char for the cells|
+
+
+# Random Access CSV Reader
+The Random Access CSV file reader indexes the file before reading the data.
 This indexing operation permits access to every cell of the CSV file in random order.
 Here is a very simple example of usage:

@ -30,12 +101,12 @@ will give the following output:
 ['0', '1', '2']
 ['3', '4', '5']
 ```
-This is the simplest way to use it to read csv files, with default configuration 
-every cell is read as `string`.
+This is the simplest way to use it to read csv files in a random access mode, 
+with default configuration every cell is read as `string`.
 The function `get_row()` is used to read a single row, and it returns an array of `string`.

 ## Reading from different sources `csv_reader`
-The CSV Reader can read from files, strings, memory buffers.
+The CSV Random access Reader can read from files, strings, memory buffers.
 ### read from a file
 ```v ignore
 csv.csv_reader(file_path:file_path)
@ -64,7 +135,7 @@ pub struct RandomAccessReaderConfig {
 	end_index    i64    = -1
 	mem_buf_size int    = 1024 * 64 // default buffer size 64KByte
 	separator    u8     = `,`
-	comment      u8     = `#` // every line that start with the quote char is ignored
+	comment      u8     = `#` // every line that start with the comment char is ignored
 	default_cell string = '*' // return this string if out of the csv boundaries
 	empty_cell   string // return this string if empty cell
 	end_line_len int = csv.endline_cr_len // size of the endline rune
@ -79,6 +150,7 @@ pub struct RandomAccessReaderConfig {
 |`start_index`,`end_index`| **Internal usage for now**|
 |`mem_buf_size`|memory allocated for the reading operations on the file, more memory more speed|
 |`separator`|char used as cell separator in the CSV file, default is comma|
+|`comment`|every line that start with the comment char is ignored
 |`default_cell`|return this string if the query coordinates are out of the csv boundaries|
 |`empty_cell`|return this string if the query coordinates are on an empty cell|
 |`end_line_len`|size of the endline, `endline_cr_len=1`,`endline_crlf_len=2`|
--- a/vlib/encoding/csv/csv_reader_random_access.v
+++ b/vlib/encoding/csv/csv_reader_random_access.v
@ -1,11 +1,10 @@
 /*
-csv reader 1.0 alpha
+csv random access reader 1.0 alpha

 Copyright (c) 2023 Dario Deledda. All rights reserved.
 Use of this source code is governed by an MIT license
 that can be found in the LICENSE file.

-
 Known limitations:
 - no stream reading
 */
@ -404,7 +403,7 @@ pub fn (mut cr RandomAccessReader) get_cell(cfg GetCellConfig) !string {
 			}

 			// create the string from the buffer
-			mut tmp_mem := malloc(isize(len + 1))
+			mut tmp_mem := malloc_noscan(isize(len + 1))
 			/*
 			defer {
 				free(tmp_mem)
--- a/vlib/encoding/csv/csv_reader_sequential.v
+++ b/vlib/encoding/csv/csv_reader_sequential.v
@ -0,0 +1,297 @@
+/*
+csv serial reader 1.0 alpha
+
+Copyright (c) 2023 Dario Deledda. All rights reserved.
+Use of this source code is governed by an MIT license
+that can be found in the LICENSE file.
+
+Known limitations:
+*/
+module csv
+
+import os
+
+@[params]
+pub struct SequentialReaderConfig {
+	scr_buf      voidptr // pointer to the buffer of data
+	scr_buf_len  i64     // if > 0 use the RAM pointed by scr_buf as source of data
+	file_path    string
+	start_index  i64
+	end_index    i64    = -1
+	mem_buf_size int    = 1024 * 64 // default buffer size 64KByte
+	separator    u8     = `,`
+	comment      u8     = `#` // every line that start with the comment char is ignored
+	default_cell string = '*' // return this string if out of the csv boundaries
+	empty_cell   string // return this string if empty cell
+	end_line_len int = endline_cr_len // size of the endline rune
+	quote        u8  = `"` // double quote is the standard quote char
+}
+
+pub struct SequentialReader {
+pub mut:
+	index i64
+
+	f              os.File
+	f_len          i64
+	is_bom_present bool
+
+	start_index i64
+	end_index   i64 = -1
+
+	end_line      u8  = `\n`
+	end_line_len  int = endline_cr_len // size of the endline rune \n = 1, \r\n = 2
+	separator     u8  = `,` // comma is the default separator
+	separator_len int = 1 // size of the separator rune
+	quote         u8  = `"` // double quote is the standard quote char
+
+	comment u8 = `#` // every line that start with the quote char is ignored
+
+	default_cell string = '*' // return this string if out of the csv boundaries
+	empty_cell   string = '#' // retunrn this if empty cell
+	// ram buffer
+	mem_buf_type  u32 // buffer type 0=File,1=RAM
+	mem_buf       voidptr // buffer used to load chars from file
+	mem_buf_size  i64     // size of the buffer
+	mem_buf_start i64 = -1 // start index in the file of the read buffer
+	mem_buf_end   i64 = -1 // end index in the file of the read buffer
+
+	ch_buf []u8 = []u8{cap: 1024}
+	// error management
+	row_count i64
+	col_count i64
+}
+
+// csv_sequential_reader creates a sequential csv reader
+pub fn csv_sequential_reader(cfg SequentialReaderConfig) !&SequentialReader {
+	mut cr := &SequentialReader{}
+
+	cr.start_index = cfg.start_index
+	cr.end_index = cfg.end_index
+
+	// reading from a RAM buffer
+	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
+		cr.mem_buf_type = ram_csv // RAM buffer
+		cr.mem_buf = cfg.scr_buf
+		cr.mem_buf_size = cfg.scr_buf_len
+		if cfg.end_index == -1 {
+			cr.end_index = cfg.scr_buf_len
+		}
+
+		// check if BOM header is in the memory buffer
+		unsafe {
+			if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
+				&& *(&u8(cr.mem_buf) + 2) == 0xBF {
+				cr.is_bom_present = true
+				cr.index += 3 // skip the BOM
+				cr.start_index += 3 // skip the BOM
+			}
+		}
+		cr.mem_buf_start = 0
+		cr.mem_buf_end = cr.mem_buf_size
+
+		// check if is a file source
+	} else if cfg.file_path.len > 0 {
+		if !os.exists(cfg.file_path) {
+			return error('ERROR: file ${cfg.file_path} not found!')
+		}
+		cr.mem_buf_type = file_csv // File buffer
+		// allocate the memory
+		unsafe {
+			cr.mem_buf = malloc(cfg.mem_buf_size)
+			cr.mem_buf_size = cfg.mem_buf_size
+		}
+		cr.f = os.open_file(cfg.file_path, 'rb')!
+
+		cr.f.seek(0, .end)!
+		cr.f_len = cr.f.tell()!
+
+		cr.f.seek(cfg.start_index, .start)!
+		cr.index = cr.f.tell()!
+
+		if cfg.end_index == -1 {
+			cr.end_index = cr.f_len
+		}
+
+		// check if BOM header is in the file
+		if cr.index == 0 {
+			if cr.f.read_into_ptr(cr.mem_buf, 4)! == 4 {
+				unsafe {
+					if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
+						&& *(&u8(cr.mem_buf) + 2) == 0xBF {
+						cr.is_bom_present = true
+						cr.index += 3 // skip the BOM
+						cr.start_index += 3 // skip the BOM
+					}
+				}
+			}
+			cr.f.seek(cfg.start_index, .start)!
+		}
+	}
+
+	cr.default_cell = cfg.default_cell
+	cr.empty_cell = cfg.empty_cell
+	cr.end_line_len = cfg.end_line_len
+	cr.separator = cfg.separator
+	cr.comment = cfg.comment
+	cr.quote = cfg.quote
+
+	return cr
+}
+
+// dispose_csv_reader release the resources used by the csv_reader
+pub fn (mut cr SequentialReader) dispose_csv_reader() {
+	if cr.mem_buf_type == ram_csv {
+		// do nothing, ram buffer is static
+	} else if cr.mem_buf_type == file_csv {
+		// file close
+		if cr.f.is_opened {
+			cr.f.close()
+		}
+
+		// free the allocated memory
+		if cr.mem_buf_size > 0 {
+			unsafe {
+				free(cr.mem_buf)
+			}
+			cr.mem_buf = unsafe { nil }
+			cr.mem_buf_size = 0
+		}
+	}
+}
+
+// has_data return the bytes available for future readings
+pub fn (mut cr SequentialReader) has_data() i64 {
+	return cr.end_index - cr.start_index
+}
+
+fn (mut cr SequentialReader) fill_buffer(index i64) ! {
+	if cr.mem_buf_type == ram_csv {
+		// for now do nothing if ram buffer
+	} else {
+		cr.f.seek(index, .start)!
+		// IMPORTANT: add 64 bit support in vlib!!
+		read_bytes_count := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
+		cr.mem_buf_start = index
+		cr.mem_buf_end = index + read_bytes_count
+	}
+}
+
+enum SequentialReadingState as u16 {
+	comment
+	quote
+	after_quote
+	cell
+	newline
+}
+
+// get_next_row get the next row from the CSV file as a string array
+pub fn (mut cr SequentialReader) get_next_row() ![]string {
+	mut row_res := []string{}
+	// clear the cell buffer
+	cr.ch_buf.clear()
+	mut i := cr.start_index
+	mut state := SequentialReadingState.cell
+
+	p := &u8(cr.mem_buf)
+	for i < cr.end_index {
+		if i < cr.mem_buf_start || i >= cr.mem_buf_end {
+			cr.fill_buffer(i)!
+		}
+		unsafe {
+			ch := *(p + i - cr.mem_buf_start)
+
+			if state == .cell {
+				if ch == cr.separator {
+					// must be optimized
+					cr.ch_buf << 0
+					row_res << if (cr.ch_buf.len - 1) == 0 {
+						cr.empty_cell
+					} else {
+						(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
+					}
+					cr.ch_buf.clear()
+				} else if cr.ch_buf.len == 0 && ch == cr.comment && row_res.len == 0 {
+					state = .comment
+				} else if ch == cr.quote {
+					state = .quote
+					cr.ch_buf.clear()
+					cr.col_count++
+					i++
+					continue
+				} else if ch == cr.end_line {
+					cr.row_count++
+					cr.col_count = 0
+
+					// skip empty rows
+					if !(row_res.len == 0 && cr.ch_buf.len < 1) {
+						cr.ch_buf << 0
+						row_res << if (cr.ch_buf.len - 1) == 0 {
+							cr.empty_cell
+						} else {
+							(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
+						}
+						i += cr.end_line_len - 1
+						break
+					}
+				} else if ch == `\r` && cr.end_line_len == 2 {
+					// skip CR
+				} else { // normal char inside a cell
+					cr.ch_buf << ch
+				}
+			}
+
+			if state == .comment {
+				if cr.ch_buf.len > 0 {
+					// must be optimized
+					cr.ch_buf << 0
+					row_res << if (cr.ch_buf.len - 1) == 0 {
+						cr.empty_cell
+					} else {
+						(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
+					}
+					cr.ch_buf.clear()
+				} else if ch == cr.end_line {
+					state = .cell
+				}
+			}
+
+			if state == .quote {
+				if ch == cr.quote {
+					// must be optimized
+					cr.ch_buf << 0
+					row_res << if (cr.ch_buf.len - 1) == 0 {
+						cr.empty_cell
+					} else {
+						(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
+					}
+					cr.ch_buf.clear()
+
+					state = .after_quote
+					cr.col_count++
+					i++
+					continue
+				} else if ch == cr.end_line {
+					return error('ERROR: quote not closed at row ${cr.row_count} after column ${cr.col_count}!')
+				} else { // normal char inside a quote inside a cell
+					cr.ch_buf << ch
+				}
+			}
+
+			if state == .after_quote {
+				if ch == cr.separator {
+					state = .cell
+				} else if ch == cr.end_line {
+					cr.row_count++
+					cr.col_count = 0
+					cr.ch_buf.clear()
+					i += cr.end_line_len - 1
+					break
+				}
+			}
+		}
+		cr.col_count++
+		i++
+	}
+	cr.start_index = i
+	return row_res
+}
--- a/vlib/encoding/csv/csv_reader_test.v
+++ b/vlib/encoding/csv/csv_reader_test.v
@ -105,7 +105,68 @@ const txt3 = 'a,b,c,d\r\n0,1,2,3\r\n4,5,6,7\r\n'
 const txt4 = 'a,b,c,d\n0,1,2,3\n4,5,6,7\n'
 /******************************************************************************
 *
-* Test Functions
+* Test Sequential Functions
+*
+******************************************************************************/
+fn test_csv_sequential() {
+	mut csvr := csv.csv_sequential_reader(scr_buf: txt1.str, scr_buf_len: txt1.len)!
+	mut data := [][]string{}
+	for csvr.has_data() > 1 {
+		data << csvr.get_next_row()!
+	}
+	csvr.dispose_csv_reader()
+	assert data[0][0] == 'a', 'test_csv_sequential1 reading failed!'
+	// there is a final empty row in txt1
+	assert data[data.len - 2][0] == 'a', 'test_csv_sequential2 reading failed!'
+	assert data[data.len - 2][1] == 'b,c,d', 'test_csv_sequential3 reading failed!'
+
+	csvr = csv.csv_sequential_reader(scr_buf: txt2.str, scr_buf_len: txt2.len)!
+	csvr.empty_cell = '####'
+	data = [][]string{}
+	for csvr.has_data() > 1 {
+		data << csvr.get_next_row()!
+	}
+	csvr.dispose_csv_reader()
+	assert data[data.len - 2][2] == '####', 'test_csv_sequential4 reading failed!'
+	assert data[data.len - 2][5] == 'pippo', 'test_csv_sequential5 reading failed!'
+
+	// create a temp file to test csv parsing from file
+	file_path_str := os.join_path(os.temp_dir(), 'test_csv.csv')
+	// println("file_path_str: ${file_path_str}")
+
+	// test Windows confguration
+	mut tmp_txt1 := txt1.replace('\n', '\r\n')
+
+	mut f := os.open_file(file_path_str, 'wb')!
+	unsafe {
+		f.write_ptr(tmp_txt1.str, tmp_txt1.len)
+	}
+	// f.write_string(tmp_txt1)!
+	f.close()
+
+	csvr = csv.csv_sequential_reader(
+		file_path: file_path_str
+		mem_buf_size: 64
+		end_line_len: csv.endline_crlf_len
+	)!
+	data = [][]string{}
+	for csvr.has_data() > 1 {
+		data << csvr.get_next_row()!
+	}
+	csvr.dispose_csv_reader()
+
+	assert data[0][0] == 'a', 'test_csv_sequential1 reading failed!'
+	// there is a final empty row in txt1
+	assert data[data.len - 2][0] == 'a', 'test_csv_sequential2 reading failed!'
+	assert data[data.len - 2][1] == 'b,c,d', 'test_csv_sequential3 reading failed!'
+
+	// remove the temp file
+	os.rm(file_path_str)!
+}
+
+/******************************************************************************
+*
+* Test Random Access Functions
 *
 ******************************************************************************/
 fn perform_test(mut csvr csv.RandomAccessReader) ! {