v/vlib/encoding/csv/csv_reader_test.v

473 lines
11 KiB
V

/*
csv reader 1.0 alpha
Copyright (c) 2023 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license
that can be found in the LICENSE file.
This file contains tests
Known limitations:
*/
import encoding.csv
import strings
import os
import rand
/******************************************************************************
*
* Test Data
*
******************************************************************************/
// dataset 1
// In-memory CSV fixture used by the sequential and random access tests.
// It starts with an empty line, mixes '#' comment lines with data rows,
// has a header row (a..g), quoted cells, and a last data row with only
// 6 cells instead of 7.
const txt1 = '
#
# pippo
#
a,b,c,d,e,f,g
0,dario,.2,3.2e-2,4,"pero5",6
# first comment, test @# again
1,2,3,4,5,6,7
2,3,4,5,6,7,8
3,4,5,6,7,8,9
a,"b,c,d",0,#,3,"pippo"
# last comment
'
// Expected content of `header_list` after `build_header_dict` has been run
// on dataset 1 (compared in perform_test): one item per column a..g with
// its column index and expected type.
const target_header_list = [
csv.HeaderItem{
label: 'a'
column: 0
htype: .int
},
csv.HeaderItem{
label: 'b'
column: 1
htype: .string
},
csv.HeaderItem{
label: 'c'
column: 2
htype: .f32
},
csv.HeaderItem{
label: 'd'
column: 3
htype: .f32
},
csv.HeaderItem{
label: 'e'
column: 4
htype: .int
},
csv.HeaderItem{
label: 'f'
column: 5
htype: .string
},
csv.HeaderItem{
label: 'g'
column: 6
htype: .int
},
]
// Expected rows of dataset 1 as compared against `get_row` in perform_test.
// Row 0 is the header row; quoted cells are expected with their quotes
// (e.g. '"pero5"'), and '#' comment lines do not appear.
const target_data = [
['a', 'b', 'c', 'd', 'e', 'f', 'g'],
['0', 'dario', '.2', '3.2e-2', '4', '"pero5"', '6'],
['1', '2', '3', '4', '5', '6', '7'],
['2', '3', '4', '5', '6', '7', '8'],
['3', '4', '5', '6', '7', '8', '9'],
['a', '"b,c,d"', '0', '#', '3', '"pippo"'], // 6 columns for test purpose
]
// dataset 2 crlf string from windows
// Same layout as dataset 1, except that the third cell of the last data row
// is empty (`a,"b,c,d",,#,...`); it is used for the empty cell tests.
// NOTE(review): the original comment describes this as a CRLF string; the
// actual line endings of this literal cannot be confirmed from here.
const txt2 = '
#
# pippo
#
a,b,c,d,e,f,g
0,dario,.2,3.2e-2,4,"pero5",6
# first comment, test @# again
1,2,3,4,5,6,7
2,3,4,5,6,7,8
3,4,5,6,7,8,9
a,"b,c,d",,#,3,"pippo"
# last comment
'
// dataset 3/4
// The same 3-row, 4-column table written with explicit line endings:
// txt3 terminates every line with '\r\n', txt4 with '\n'.
const txt3 = 'a,b,c,d\r\n0,1,2,3\r\n4,5,6,7\r\n'
const txt4 = 'a,b,c,d\n0,1,2,3\n4,5,6,7\n'
/******************************************************************************
*
* Test Sequential Functions
*
******************************************************************************/
// test_csv_sequential exercises the sequential reader in three phases:
// 1. txt1 parsed from a memory buffer,
// 2. txt2 parsed from a memory buffer with a custom `empty_cell` marker,
// 3. txt1 rewritten with '\r\n' line endings into a temporary file and
//    parsed from disk with a small (64 byte) read buffer.
fn test_csv_sequential() {
	// phase 1: txt1 from memory
	mut csvr := csv.csv_sequential_reader(scr_buf: txt1.str, scr_buf_len: txt1.len)!
	mut data := [][]string{}
	for csvr.has_data() > 1 {
		data << csvr.get_next_row()!
	}
	csvr.dispose_csv_reader()
	assert data[0][0] == 'a', 'test_csv_sequential1 reading failed!'
	// there is a final empty row in txt1, so the last data row is at len - 2
	assert data[data.len - 2][0] == 'a', 'test_csv_sequential2 reading failed!'
	assert data[data.len - 2][1] == 'b,c,d', 'test_csv_sequential3 reading failed!'

	// phase 2: txt2 from memory, empty cells must be reported as '####'
	csvr = csv.csv_sequential_reader(scr_buf: txt2.str, scr_buf_len: txt2.len)!
	csvr.empty_cell = '####'
	data = [][]string{}
	for csvr.has_data() > 1 {
		data << csvr.get_next_row()!
	}
	csvr.dispose_csv_reader()
	assert data[data.len - 2][2] == '####', 'test_csv_sequential4 reading failed!'
	assert data[data.len - 2][5] == 'pippo', 'test_csv_sequential5 reading failed!'

	// phase 3: create a temp file to test csv parsing from file
	file_path_str := os.join_path(os.temp_dir(), 'test_csv.csv')
	// println("file_path_str: ${file_path_str}")
	// test the Windows configuration: every '\n' becomes '\r\n'
	tmp_txt1 := txt1.replace('\n', '\r\n')
	// os.write_file propagates write errors instead of silently ignoring them
	os.write_file(file_path_str, tmp_txt1)!
	csvr = csv.csv_sequential_reader(
		file_path: file_path_str
		mem_buf_size: 64
		end_line_len: csv.endline_crlf_len
	)!
	data = [][]string{}
	for csvr.has_data() > 1 {
		data << csvr.get_next_row()!
	}
	csvr.dispose_csv_reader()
	// distinct labels so a failure here is not confused with phase 1
	assert data[0][0] == 'a', 'test_csv_sequential6 reading failed!'
	// there is a final empty row in txt1
	assert data[data.len - 2][0] == 'a', 'test_csv_sequential7 reading failed!'
	assert data[data.len - 2][1] == 'b,c,d', 'test_csv_sequential8 reading failed!'
	// remove the temp file
	os.rm(file_path_str)!
}
/******************************************************************************
*
* Test Random Access Functions
*
******************************************************************************/
// perform_test validates a random access reader loaded with dataset 1:
// the detected header, every row against `target_data`, and the
// get_cell / get_cellt accessors (including the quote removal flag).
fn perform_test(mut csvr csv.RandomAccessReader) ! {
	csvr.build_header_dict(csv.GetHeaderConf{})!

	// the detected header must match the expected one
	assert csvr.header_list == target_header_list, 'header_list not matched!'

	// collect every mapped row, in order
	mut rows := [][]string{cap: csvr.csv_map.len}
	for idx in 0 .. csvr.csv_map.len {
		rows << csvr.get_row(idx)!
	}

	// same amount of rows as the reader mapped
	assert rows.len == csvr.csv_map.len, 'data len not equal'

	// every expected row must be returned unchanged
	for idx, expected in target_data {
		assert rows[idx] == expected, ''
	}

	// last cell of a full row
	assert csvr.get_cell(x: 6, y: 4)! == '9'

	// cell access through the header map
	col_b := csvr.header_map['b']
	assert csvr.get_cell(x: col_b, y: 1)! == 'dario', 'get_cell failed 1'
	// row 5 has only 6 cells: column 'g' falls back to the default cell
	col_g := csvr.header_map['g']
	assert csvr.get_cell(x: col_g, y: 5)! == csvr.default_cell, 'get_cell out of data failed 2'

	// typed cell access
	assert csvr.get_cellt(x: 0, y: 1)! == csv.CellValue(0), 'get_cellt [int] failed'
	assert csvr.get_cellt(x: 1, y: 1)! == csv.CellValue('dario'), 'get_cellt [string] failed'
	assert csvr.get_cellt(x: 2, y: 1)! == csv.CellValue(f32(.2)), 'get_cell [f32] failed'

	// with quote removal enabled the quoted cell loses its quotes
	csvr.quote_remove = true
	assert csvr.get_cell(x: 1, y: 5)! == 'b,c,d', 'get_cell filer quote flag failed'
}
// perform_test2 checks that an empty cell (column 'c' of row 5 in dataset 2)
// is returned as the reader's `empty_cell` value.
fn perform_test2(mut csvr csv.RandomAccessReader) ! {
	csvr.build_header_dict(csv.GetHeaderConf{})!
	col_c := csvr.header_map['c']
	cell := csvr.get_cell(x: col_c, y: 5)!
	assert cell == csvr.empty_cell, 'get_cell empty_cell failed 2'
}
// perform_test3 checks the last cell of the 4-column datasets (txt3/txt4):
// column 'd' of row 2 must be '7' whatever line ending the source used.
fn perform_test3(mut csvr csv.RandomAccessReader) ! {
	csvr.build_header_dict(csv.GetHeaderConf{})!
	col_d := csvr.header_map['d']
	last_cell := csvr.get_cell(x: col_d, y: 2)!
	assert last_cell == '7', 'test \n \r\n failed'
}
// test_csv_string runs the random access reader checks on:
// - txt1 parsed from a string,
// - txt1 rewritten with '\r\n' line endings and parsed from a temporary
//   file with a small (32 byte) read buffer,
// - txt2 parsed from a string (empty cell handling),
// - txt3 ('\r\n') and txt4 ('\n') parsed from raw memory buffers.
fn test_csv_string() {
	// test the csv parsing from RAM
	mut csvr := csv.csv_reader_from_string(txt1)!
	perform_test(mut csvr)!
	csvr.dispose_csv_reader()

	// create a temp file to test csv parsing from file
	file_path_str := os.join_path(os.temp_dir(), 'test_csv.csv')
	// println("file_path_str: ${file_path_str}")
	// test the Windows configuration: every '\n' becomes '\r\n'
	tmp_txt1 := txt1.replace('\n', '\r\n')
	// os.write_file propagates write errors instead of silently ignoring them
	os.write_file(file_path_str, tmp_txt1)!

	// parse the temp file
	csvr = csv.csv_reader(
		file_path: file_path_str
		mem_buf_size: 32
		end_line_len: csv.endline_crlf_len
	)!
	perform_test(mut csvr)!
	csvr.dispose_csv_reader()
	// remove the temp file
	os.rm(file_path_str)!

	// empty cell handling
	csvr = csv.csv_reader_from_string(txt2)!
	perform_test2(mut csvr)!
	csvr.dispose_csv_reader()

	// test '\r\n' line endings from a memory buffer
	csvr = csv.csv_reader(
		scr_buf: txt3.str
		scr_buf_len: txt3.len
		end_line_len: csv.endline_crlf_len
	)!
	perform_test3(mut csvr)!
	csvr.dispose_csv_reader()

	// test '\n' line endings from a memory buffer
	// (the library constant for this case is named `endline_cr_len`)
	csvr = csv.csv_reader(scr_buf: txt4.str, scr_buf_len: txt4.len, end_line_len: csv.endline_cr_len)!
	perform_test3(mut csvr)!
	csvr.dispose_csv_reader()
}
// test_coherence writes a 1000 x 1000 table of consecutive integers
// (0, 1, 2, ...) to a temporary csv file, reads every row back through the
// random access reader with a 32 byte buffer, and checks that the sum of
// all parsed cells equals the sum of all written values.
fn test_coherence() {
	file_path_str := os.join_path(os.temp_dir(), 'test_csv.csv')
	mut f := os.open_file(file_path_str, 'w')!
	mut sb := strings.new_builder(64536)
	mut value := u64(0)
	mut expected_sum := u64(0)
	for _ in 0 .. 1000 {
		for col in 0 .. 1000 {
			// separator between cells, not before the first one
			if col > 0 {
				sb.write_u8(`,`)
			}
			sb.write_string(value.str())
			// accumulate exactly the value that was just written
			expected_sum += value
			value++
		}
		sb.write_string('\n')
	}
	f.write_string(sb.str())!
	f.close()
	// println('sum: ${expected_sum}')

	// parse the temp file
	mut csvr := csv.csv_reader(
		file_path: file_path_str
		mem_buf_size: 32
		end_line_len: csv.endline_cr_len
	)!
	mut parsed_sum := u64(0)
	for row_index in 0 .. csvr.csv_map.len {
		cells := csvr.get_row(row_index)!
		for cell in cells {
			parsed_sum += u64(cell.int())
		}
	}
	// println('sum: ${parsed_sum}')
	csvr.dispose_csv_reader()

	// remove the temp file
	os.rm(file_path_str)!

	assert expected_sum == parsed_sum, 'csv coherence test failed'
}
// Debug code
// Entry point kept for manual debugging: it runs only test_csv_string.
fn main() {
test_csv_string()
}
// Multithreaded tests
// create_csv writes a csv file at `file_path` made of one header line
// followed by `size` data rows. Each row holds a random int, the row index,
// the constant 3, two quoted strings built from the row index and a random
// float. It returns the sum of all row indexes (0 + 1 + ... + size - 1),
// i.e. the expected sum of the `count` column.
// An error is returned if the file cannot be opened or written.
fn create_csv(file_path string, size int) !i64 {
	// create csv file for the test
	csv_txt := 'pippo,count,count1,pera,sempronio,float'
	mut f := os.open_file(file_path, 'w')!
	// close the file on every exit path, including a failed write
	defer {
		f.close()
	}
	f.write_string(csv_txt + '\n')!
	mut count := i64(0)
	for i in 0 .. size {
		tmp := "${rand.int()}, ${i}, 3, \"txt1${i}\", \"txt2${i}\", ${f32(rand.u32()) / 1000.0}\n"
		f.write_string(tmp)!
		count += i
	}
	return count
}
// read_lines copies the cells of rows `start_row` (inclusive) to `end_row`
// (exclusive) from `csvr` into `data`, one column at a time.
// `data` is indexed as data[column][row - 1]; the value stored depends on
// the column type found in the reader's header list: the raw string for
// string columns, the trimmed cell converted to int or f32 otherwise.
// A failing `get_cell` call panics. `id` is only used by the commented-out
// debug prints.
fn read_lines(id int, csvr csv.RandomAccessReader, mut data [][]csv.CellValue, start_row int, end_row int) {
	// println(" func ${data.len},${data[1].len}")
	unsafe {
		for col, header in csvr.header_list {
			// println("Check: ${header}")
			match header.htype {
				.string {
					// println('id:${id} String here')
					for row in start_row .. end_row {
						text := csvr.get_cell(x: col, y: row) or { panic('Str get_cell failed') }
						data[col][row - 1] = text
					}
				}
				.int {
					// println('id:${id} Int here')
					for row in start_row .. end_row {
						text := csvr.get_cell(x: col, y: row) or { panic('Int get_cell failed') }
						data[col][row - 1] = text.trim_space().int()
					}
				}
				.f32 {
					// println('id:${id} f32 here')
					for row in start_row .. end_row {
						text := csvr.get_cell(x: col, y: row) or { panic('F32 get_cell failed') }
						data[col][row - 1] = text.trim_space().f32()
					}
				}
			}
		}
	} // unsafe
}
// test_multithreading generates a csv file with create_csv, opens one
// random access reader per slice on it, lets every reader fill its own row
// range of a shared table in a separate thread (read_lines), and finally
// checks that the sum of the integer `count` column matches the value
// returned by create_csv.
fn test_multithreading() {
	file_path_str := os.join_path(os.temp_dir(), 'test_csv.csv')
	size := 10_000
	// create the test file
	res_count := create_csv(file_path_str, size)!
	slices := 2 // number of slice of the csv
	mem_buf_size := 1024 * 1024 * 1
	mut csvr := []csv.RandomAccessReader{}
	// init first csv reader
	csvr << csv.csv_reader(file_path: file_path_str, mem_buf_size: mem_buf_size)!
	csvr[0].build_header_dict(csv.GetHeaderConf{})!
	// init other csv readers using the first reader configuration
	for _ in 1 .. slices {
		mut tmp_csvr := csv.csv_reader(
			file_path: file_path_str
			mem_buf_size: mem_buf_size
			create_map_csv: false
		)!
		tmp_csvr.copy_configuration(csvr[0])
		csvr << tmp_csvr
	}
	// read the data from the csv file: data[column][row - 1]
	mut data := [][]csv.CellValue{}
	n_rows := csvr[0].csv_map.len
	unsafe {
		data = [][]csv.CellValue{len: csvr[0].header_list.len, init: []csv.CellValue{len: n_rows}}
	}
	// row 0 is the header, so the slices start at row 1
	step := n_rows / slices
	mut start := 1
	mut threads := []thread{}
	for task_index in 0 .. slices {
		// the last slice always runs up to n_rows, so no trailing row is
		// dropped when the rows do not divide evenly between the slices
		end := if task_index == slices - 1 || start + step > n_rows {
			n_rows
		} else {
			start + step
		}
		threads << spawn read_lines(task_index, csvr[task_index], mut &data, start, end)
		start = end
	}
	threads.wait()
	// release the csv readers
	for mut item in csvr {
		item.dispose_csv_reader()
	}
	// check for the integer column sum; use the row count captured before
	// the readers were disposed instead of touching a disposed reader
	mut ck_count := i64(0)
	for i in 0 .. n_rows - 1 {
		ck_count += data[1][i] as int
	}
	assert ck_count == res_count, 'check on csv file failed!'
	// remove the temp file
	os.rm(file_path_str)!
}