encoding.csv: add a sequential reader too (suitable for very large .csv files, it does not read everything at once) (#20140)

This commit is contained in:
penguindark 2023-12-10 23:57:08 +01:00 committed by GitHub
parent 99d9473643
commit cfcbcb416a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 439 additions and 10 deletions

View File

@ -1,6 +1,77 @@
# CSV Reader
This module is a Random Access CSV file reader, it indexes the file before reading the data.
There are two CSV readers in this module:
* Random Access reader
* Sequential reader
# Sequential CSV reader
The sequential reader reads the file row by row, using only the memory needed for the read buffer.
Here is a very simple example of usage:
```v
import encoding.csv
fn main() {
mut csvr := csv.csv_sequential_reader(file_path: 'big2.csv', end_line_len: csv.endline_crlf_len)!
for csvr.has_data() > 1 {
println(csvr.get_next_row()!)
}
csvr.dispose_csv_reader()
}
```
This is the simplest way to use it to read csv files in sequential mode,
with default configuration every cell is read as `string`.
The function `get_next_row()` is used to read the next row, and it returns an array of `string`.
## Reading from different sources `csv_sequential_reader`
The CSV Sequential Reader can read from files and memory buffers.
### read from a file
```v ignore
csv.csv_sequential_reader(file_path:file_path)
```
### read from a memory buffer
```v ignore
csv.csv_sequential_reader(scr_buf:voidptr(buffer_ptr), scr_buf_len: buffer_len)
```
When you call `csv.csv_sequential_reader` a `SequentialReader` struct is initialized passing
a `SequentialReaderConfig` struct as a parameter.
Using these structs, it is possible to change the behavior of the CSV Reader.
## The `SequentialReaderConfig` struct
The config struct is as follows:
```v ignore
pub struct SequentialReaderConfig {
scr_buf voidptr // pointer to the buffer of data
scr_buf_len i64 // if > 0 use the RAM pointed by scr_buf as source of data
file_path string
start_index i64
end_index i64 = -1
mem_buf_size int = 1024 * 64 // default buffer size 64KByte
separator u8 = `,`
comment u8 = `#` // every line that start with the comment char is ignored
default_cell string = '*' // return this string if out of the csv boundaries
empty_cell string // return this string if empty cell
end_line_len int = endline_cr_len // size of the endline rune
quote u8 = `"` // double quote is the standard quote char
}
```
|Field|Description|
|------------|--------------|
|`scr_buf`, `scr_buf_len`|If `scr_buf_len > 0` the reader will use the `scr_buf` pointer as the base address of the data to parse and `scr_buf_len` as the length of the buffer itself|
|`file_path`| if `scr_buf_len == 0` the reader will try to open the `file_path` file|
|`start_index`,`end_index`| **Internal usage for now**|
|`mem_buf_size`|memory allocated for the reading operations on the file, more memory more speed|
|`separator`|char used as cell separator in the CSV file, default is comma|
|`comment`|every line that starts with the comment char is ignored|
|`default_cell`|return this string if the query coordinates are out of the csv boundaries|
|`empty_cell`|return this string if the query coordinates are on an empty cell|
|`end_line_len`|size of the endline, `endline_cr_len=1`,`endline_crlf_len=2`|
|`quote`|quote char for the cells|
# Random Access CSV Reader
The Random Access CSV file reader indexes the file before reading the data.
This indexing operation permits access to every cell of the CSV file in random order.
Here is a very simple example of usage:
@ -30,12 +101,12 @@ will give the following output:
['0', '1', '2']
['3', '4', '5']
```
This is the simplest way to use it to read csv files, with default configuration
every cell is read as `string`.
This is the simplest way to use it to read csv files in a random access mode,
with default configuration every cell is read as `string`.
The function `get_row()` is used to read a single row, and it returns an array of `string`.
## Reading from different sources `csv_reader`
The CSV Reader can read from files, strings, memory buffers.
The CSV Random access Reader can read from files, strings, memory buffers.
### read from a file
```v ignore
csv.csv_reader(file_path:file_path)
@ -64,7 +135,7 @@ pub struct RandomAccessReaderConfig {
end_index i64 = -1
mem_buf_size int = 1024 * 64 // default buffer size 64KByte
separator u8 = `,`
comment u8 = `#` // every line that start with the quote char is ignored
comment u8 = `#` // every line that start with the comment char is ignored
default_cell string = '*' // return this string if out of the csv boundaries
empty_cell string // return this string if empty cell
end_line_len int = csv.endline_cr_len // size of the endline rune
@ -79,6 +150,7 @@ pub struct RandomAccessReaderConfig {
|`start_index`,`end_index`| **Internal usage for now**|
|`mem_buf_size`|memory allocated for the reading operations on the file, more memory more speed|
|`separator`|char used as cell separator in the CSV file, default is comma|
|`comment`|every line that starts with the comment char is ignored|
|`default_cell`|return this string if the query coordinates are out of the csv boundaries|
|`empty_cell`|return this string if the query coordinates are on an empty cell|
|`end_line_len`|size of the endline, `endline_cr_len=1`,`endline_crlf_len=2`|

View File

@ -1,11 +1,10 @@
/*
csv reader 1.0 alpha
csv random access reader 1.0 alpha
Copyright (c) 2023 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license
that can be found in the LICENSE file.
Known limitations:
- no stream reading
*/
@ -404,7 +403,7 @@ pub fn (mut cr RandomAccessReader) get_cell(cfg GetCellConfig) !string {
}
// create the string from the buffer
mut tmp_mem := malloc(isize(len + 1))
mut tmp_mem := malloc_noscan(isize(len + 1))
/*
defer {
free(tmp_mem)

View File

@ -0,0 +1,297 @@
/*
csv serial reader 1.0 alpha
Copyright (c) 2023 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license
that can be found in the LICENSE file.
Known limitations:
*/
module csv
import os
// SequentialReaderConfig holds the options accepted by `csv_sequential_reader`.
// The data source is the memory buffer when `scr_buf` is set and `scr_buf_len > 0`,
// otherwise the file named by `file_path`.
@[params]
pub struct SequentialReaderConfig {
scr_buf voidptr // pointer to the buffer of data
scr_buf_len i64 // if > 0 use the RAM pointed by scr_buf as source of data
file_path string // path of the csv file, used when no memory buffer is supplied
start_index i64 // offset in the source where the reading starts
end_index i64 = -1 // offset where the reading stops, -1 means the end of the source
mem_buf_size int = 1024 * 64 // size of the read buffer allocated for a file source, default 64KByte
separator u8 = `,` // char that separates the cells of a row
comment u8 = `#` // every line that starts with the comment char is ignored
default_cell string = '*' // return this string if out of the csv boundaries
empty_cell string // return this string if empty cell
end_line_len int = endline_cr_len // size of the endline rune
quote u8 = `"` // double quote is the standard quote char
}
// SequentialReader is the state of a sequential csv reader.
// It is created by `csv_sequential_reader` and consumed one row at a time
// with `get_next_row`.
pub struct SequentialReader {
pub mut:
index i64 // offset computed when the reader is created (start offset plus the skipped BOM)
f os.File // file handle, used only with a file source
f_len i64 // length in bytes of the file source
is_bom_present bool // true if an UTF-8 BOM was found at the beginning of the source
start_index i64 // offset of the next byte to parse, updated by every `get_next_row` call
end_index i64 = -1 // offset where the parsing stops
end_line u8 = `\n` // char that closes a row
end_line_len int = endline_cr_len // size of the endline rune \n = 1, \r\n = 2
separator u8 = `,` // comma is the default separator
separator_len int = 1 // size of the separator rune
quote u8 = `"` // double quote is the standard quote char
comment u8 = `#` // every line that starts with the comment char is ignored
default_cell string = '*' // return this string if out of the csv boundaries
empty_cell string = '#' // return this if empty cell
// ram buffer
mem_buf_type u32 // buffer type, `file_csv` or `ram_csv`
mem_buf voidptr // buffer used to load chars from file
mem_buf_size i64 // size of the buffer
mem_buf_start i64 = -1 // start index in the file of the read buffer
mem_buf_end i64 = -1 // end index in the file of the read buffer
ch_buf []u8 = []u8{cap: 1024} // bytes of the cell that is being parsed
// error management
row_count i64 // rows parsed so far, reported in the error messages
col_count i64 // position inside the current row, reported in the error messages
}
// csv_sequential_reader creates a sequential csv reader.
// The source is the memory buffer `cfg.scr_buf` when it is set and `cfg.scr_buf_len > 0`,
// otherwise the file `cfg.file_path`. If neither is configured, the returned reader
// is not bound to any data.
// An UTF-8 BOM (EF BB BF) is skipped when the reading starts at offset 0.
// It returns an error if the file does not exist, if `cfg.mem_buf_size` is not
// positive for a file source, or if the file can not be opened or inspected.
// The caller must call `dispose_csv_reader` when the reader is no longer needed.
pub fn csv_sequential_reader(cfg SequentialReaderConfig) !&SequentialReader {
	mut cr := &SequentialReader{}

	cr.start_index = cfg.start_index
	cr.end_index = cfg.end_index

	// reading from a RAM buffer
	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
		cr.mem_buf_type = ram_csv // RAM buffer
		cr.mem_buf = cfg.scr_buf
		cr.mem_buf_size = cfg.scr_buf_len
		if cfg.end_index == -1 {
			cr.end_index = cfg.scr_buf_len
		}

		// check if BOM header is in the memory buffer:
		// probe it only when the reading starts at the beginning of the buffer
		// and the buffer is long enough, to never read outside of it
		if cr.start_index == 0 && cr.mem_buf_size >= 3 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
					cr.start_index += 3 // skip the BOM
				}
			}
		}
		cr.mem_buf_start = 0
		cr.mem_buf_end = cr.mem_buf_size
		// check if is a file source
	} else if cfg.file_path.len > 0 {
		if !os.exists(cfg.file_path) {
			return error('ERROR: file ${cfg.file_path} not found!')
		}
		if cfg.mem_buf_size <= 0 {
			return error('ERROR: mem_buf_size must be greater than 0!')
		}
		cr.mem_buf_type = file_csv // File buffer

		// open the file before allocating, a failed open must not leak the buffer
		cr.f = os.open_file(cfg.file_path, 'rb')!

		// allocate the memory
		unsafe {
			cr.mem_buf = malloc(cfg.mem_buf_size)
		}
		cr.mem_buf_size = cfg.mem_buf_size

		cr.f.seek(0, .end)!
		cr.f_len = cr.f.tell()!

		cr.f.seek(cfg.start_index, .start)!
		cr.index = cr.f.tell()!

		if cfg.end_index == -1 {
			cr.end_index = cr.f_len
		}

		// check if BOM header is in the file.
		// The probe uses its own 3 byte buffer, so it is independent from
		// mem_buf_size, and it is skipped for files too short to hold a BOM
		// (an empty file is a valid source with no rows).
		if cr.index == 0 && cr.f_len >= 3 {
			mut bom := [3]u8{}
			if cr.f.read_into_ptr(unsafe { &bom[0] }, 3)! == 3 {
				if bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
					cr.start_index += 3 // skip the BOM
				}
			}
			cr.f.seek(cfg.start_index, .start)!
		}
	}

	cr.default_cell = cfg.default_cell
	cr.empty_cell = cfg.empty_cell
	cr.end_line_len = cfg.end_line_len
	cr.separator = cfg.separator
	cr.comment = cfg.comment
	cr.quote = cfg.quote

	return cr
}
// dispose_csv_reader releases the resources used by the reader.
// With a file source it closes the file and frees the read buffer.
// With a RAM source nothing is released: the buffer was supplied by the caller.
pub fn (mut cr SequentialReader) dispose_csv_reader() {
	// only a file backed reader has something to release
	if cr.mem_buf_type != file_csv {
		return
	}

	if cr.f.is_opened {
		cr.f.close()
	}

	// nothing allocated, or already released by a previous call
	if cr.mem_buf_size <= 0 {
		return
	}
	unsafe {
		free(cr.mem_buf)
		cr.mem_buf = nil
	}
	cr.mem_buf_size = 0
}
// has_data returns the number of bytes between the current reading position
// and the end of the data, i.e. the bytes still available for future readings.
// The usage examples of this module keep reading while the value is `> 1`.
pub fn (mut cr SequentialReader) has_data() i64 {
	remaining := cr.end_index - cr.start_index
	return remaining
}
// fill_buffer loads in the read buffer the chunk of the file that starts at `index`,
// and records which range of the file the buffer now holds.
// With a RAM source it does nothing.
fn (mut cr SequentialReader) fill_buffer(index i64) ! {
	if cr.mem_buf_type == ram_csv {
		// for now do nothing if ram buffer
		return
	}

	cr.f.seek(index, .start)!
	// IMPORTANT: add 64 bit support in vlib!!
	chunk_len := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
	cr.mem_buf_start = index
	cr.mem_buf_end = index + chunk_len
}
// SequentialReadingState lists the states of the parser used by `get_next_row`.
enum SequentialReadingState as u16 {
comment // inside a comment line, everything is skipped until the end of the line
quote // inside a quoted cell, waiting for the closing quote
after_quote // after a closing quote, waiting for a separator or the end of the line
cell // inside a not quoted cell, this is the initial state of every row
newline // NOTE(review): not used by the code visible in this file
}
// take_cell returns the cell accumulated in `ch_buf`, or `empty_cell` when no
// byte was accumulated, and clears the buffer for the next cell.
fn (mut cr SequentialReader) take_cell() string {
	cell := if cr.ch_buf.len == 0 { cr.empty_cell } else { cr.ch_buf.bytestr() }
	cr.ch_buf.clear()
	return cell
}

// get_next_row gets the next row from the CSV source as a string array.
// Empty rows and rows that start with the comment char are skipped.
// A row is closed by the end of line char or by the end of the data, so the
// last row is returned complete even if the data does not end with a newline.
// The returned array is empty when no more rows are available.
// It returns an error if a quoted cell is not closed before the end of the
// line or the end of the data, or if the source can not be read.
pub fn (mut cr SequentialReader) get_next_row() ![]string {
	mut row_res := []string{}
	// clear the cell buffer
	cr.ch_buf.clear()
	mut i := cr.start_index
	mut state := SequentialReadingState.cell
	// true when the row was closed by an end of line
	mut row_closed := false

	p := &u8(cr.mem_buf)
	for i < cr.end_index {
		// load the chunk that contains the byte `i` if it is not in the buffer
		if i < cr.mem_buf_start || i >= cr.mem_buf_end {
			cr.fill_buffer(i)!
		}
		ch := unsafe { *(p + i - cr.mem_buf_start) }

		match state {
			.cell {
				if ch == cr.separator {
					row_res << cr.take_cell()
				} else if cr.ch_buf.len == 0 && ch == cr.comment && row_res.len == 0 {
					// the comment char is honored only at the beginning of a row
					state = .comment
				} else if ch == cr.quote {
					state = .quote
					cr.ch_buf.clear()
					cr.col_count++
					i++
					continue
				} else if ch == cr.end_line {
					cr.row_count++
					cr.col_count = 0
					// skip empty rows
					if row_res.len > 0 || cr.ch_buf.len > 0 {
						row_res << cr.take_cell()
						i += cr.end_line_len - 1
						row_closed = true
						break
					}
				} else if ch == `\r` && cr.end_line_len == 2 {
					// skip CR
				} else {
					// normal char inside a cell
					cr.ch_buf << ch
				}
			}
			.comment {
				// everything is ignored until the end of the line
				if ch == cr.end_line {
					state = .cell
				}
			}
			.quote {
				if ch == cr.quote {
					row_res << cr.take_cell()
					state = .after_quote
					cr.col_count++
					i++
					continue
				} else if ch == cr.end_line {
					return error('ERROR: quote not closed at row ${cr.row_count} after column ${cr.col_count}!')
				} else {
					// normal char inside a quote inside a cell
					cr.ch_buf << ch
				}
			}
			.after_quote {
				// everything between the closing quote and the separator is ignored
				if ch == cr.separator {
					state = .cell
				} else if ch == cr.end_line {
					cr.row_count++
					cr.col_count = 0
					cr.ch_buf.clear()
					i += cr.end_line_len - 1
					row_closed = true
					break
				}
			}
			.newline {
				// not used by the sequential reader
			}
		}
		cr.col_count++
		i++
	}

	// the data ended before an end of line
	if !row_closed {
		if state == .quote {
			return error('ERROR: quote not closed at row ${cr.row_count} after column ${cr.col_count}!')
		}
		// do not lose the last cell of a row that is not closed by a newline
		if state == .cell && (row_res.len > 0 || cr.ch_buf.len > 0) {
			row_res << cr.take_cell()
		}
	}

	cr.start_index = i
	return row_res
}

View File

@ -105,7 +105,68 @@ const txt3 = 'a,b,c,d\r\n0,1,2,3\r\n4,5,6,7\r\n'
const txt4 = 'a,b,c,d\n0,1,2,3\n4,5,6,7\n'
/******************************************************************************
*
* Test Functions
* Test Sequential Functions
*
******************************************************************************/
// test_csv_sequential checks the sequential reader against a memory buffer
// (txt1 and txt2) and against a temporary file with Windows line endings.
fn test_csv_sequential() {
	// 1) txt1 parsed straight from memory
	mut mem_reader := csv.csv_sequential_reader(scr_buf: txt1.str, scr_buf_len: txt1.len)!
	mut rows := [][]string{}
	for mem_reader.has_data() > 1 {
		rows << mem_reader.get_next_row()!
	}
	mem_reader.dispose_csv_reader()

	assert rows[0][0] == 'a', 'test_csv_sequential1 reading failed!'
	// there is a final empty row in txt1
	assert rows[rows.len - 2][0] == 'a', 'test_csv_sequential2 reading failed!'
	assert rows[rows.len - 2][1] == 'b,c,d', 'test_csv_sequential3 reading failed!'

	// 2) txt2 parsed from memory with a custom marker for the empty cells
	mem_reader = csv.csv_sequential_reader(scr_buf: txt2.str, scr_buf_len: txt2.len)!
	mem_reader.empty_cell = '####'
	rows = [][]string{}
	for mem_reader.has_data() > 1 {
		rows << mem_reader.get_next_row()!
	}
	mem_reader.dispose_csv_reader()

	assert rows[rows.len - 2][2] == '####', 'test_csv_sequential4 reading failed!'
	assert rows[rows.len - 2][5] == 'pippo', 'test_csv_sequential5 reading failed!'

	// 3) txt1 written to a temp file, to test csv parsing from file
	file_path_str := os.join_path(os.temp_dir(), 'test_csv.csv')

	// test Windows configuration: CRLF line endings
	crlf_txt := txt1.replace('\n', '\r\n')
	mut out_file := os.open_file(file_path_str, 'wb')!
	unsafe {
		out_file.write_ptr(crlf_txt.str, crlf_txt.len)
	}
	out_file.close()

	// a tiny read buffer forces several buffer refills
	mut file_reader := csv.csv_sequential_reader(
		file_path: file_path_str
		mem_buf_size: 64
		end_line_len: csv.endline_crlf_len
	)!
	rows = [][]string{}
	for file_reader.has_data() > 1 {
		rows << file_reader.get_next_row()!
	}
	file_reader.dispose_csv_reader()

	assert rows[0][0] == 'a', 'test_csv_sequential1 reading failed!'
	// there is a final empty row in txt1
	assert rows[rows.len - 2][0] == 'a', 'test_csv_sequential2 reading failed!'
	assert rows[rows.len - 2][1] == 'b,c,d', 'test_csv_sequential3 reading failed!'

	// remove the temp file
	os.rm(file_path_str)!
}
/******************************************************************************
*
* Test Random Access Functions
*
******************************************************************************/
fn perform_test(mut csvr csv.RandomAccessReader) ! {