mirror of
https://github.com/vlang/v.git
synced 2025-08-04 02:07:28 -04:00
581 lines
16 KiB
V
581 lines
16 KiB
V
/*
|
|
csv random access reader 1.0 alpha
|
|
|
|
Copyright (c) 2023 Dario Deledda. All rights reserved.
|
|
Use of this source code is governed by an MIT license
|
|
that can be found in the LICENSE file.
|
|
|
|
Known limitations:
|
|
- no stream reading
|
|
*/
|
|
module csv
|
|
|
|
import os
|
|
|
|
/******************************************************************************
|
|
*
|
|
* Consts
|
|
*
|
|
******************************************************************************/
|
|
// Length in bytes of the end of line sequence.
// endline_cr_len is used for a one byte terminator (`\n`),
// endline_crlf_len for the two bytes terminator (`\r\n`).
pub const endline_cr_len = 1
pub const endline_crlf_len = 2

// Kind of source that feeds the read buffer (see RandomAccessReader.mem_buf_type):
// ram_csv for a buffer in RAM, file_csv for a file on disk.
pub const ram_csv = 1
pub const file_csv = 0
|
|
|
|
/******************************************************************************
|
|
*
|
|
* Structs
|
|
*
|
|
******************************************************************************/
|
|
// ColumType is the type associated to a csv column by build_header_dict,
// it drives the conversion done by get_cellt.
pub enum ColumType {
	string = 0 // the cell is returned as it is
	int    = 1 // the cell is converted to an int
	f32    = 2 // the cell is converted to a f32
}
|
|
|
|
// HeaderItem describes one column of the csv header.
pub struct HeaderItem {
pub mut:
	label  string // text of the header cell
	column int    // index of the column in the row
	htype  ColumType = .string // type of the column, string when nothing else is known
}
|
|
|
|
// RandomAccessReader is the state of a random access csv reader:
// the data source (a file or a RAM buffer), the parsing options,
// the index of the cells built by map_csv and the header information
// built by build_header_dict.
@[heap]
pub struct RandomAccessReader {
pub mut:
	index i64 // position in the source, csv_reader moves it after the BOM when present

	f              os.File // source file, used when mem_buf_type is file_csv
	f_len          i64     // length in bytes of the source file
	is_bom_present bool    // true if the source starts with the UTF-8 BOM (EF BB BF)

	start_index i64 // offset used as base of the read buffer, fill_buffer updates it in file mode
	end_index   i64 = -1 // offset where the scan stops, -1 means the end of the source

	end_line      u8  = `\n` // char that closes a row
	end_line_len  int = endline_cr_len // size of the end of line sequence: \n = 1, \r\n = 2
	separator     u8  = `,` // comma is the default separator
	separator_len int = 1 // size of the separator
	quote         u8  = `"` // double quote is the standard quote char
	quote_remove  bool // if true the quotes are removed from the returned cells
	comment       u8 = `#` // rows starting with this char are ignored

	default_cell string = '*' // returned when the requested cell is outside of the csv boundaries
	empty_cell   string = '#' // returned when the requested cell is empty
	// ram buffer
	mem_buf_type  u32     // buffer type 0=File,1=RAM (see file_csv and ram_csv)
	mem_buf       voidptr // buffer with the chars of the source
	mem_buf_size  i64     // size of the buffer
	mem_buf_start i64 = -1 // offset in the source of the first byte in the buffer
	mem_buf_end   i64 = -1 // offset in the source of the end of the data in the buffer
	// csv map for quick access
	create_map_csv bool = true // flag to enable the csv map creation
	csv_map        [][]i64 // for each row the offsets that delimit its cells
	// header
	header_row  int = -1 // row index of the header in the csv_map, -1 if no header was built
	header_list []HeaderItem   // list of the header items
	header_map  map[string]int // map from header label to column index
}
|
|
|
|
// RandomAccessReaderConfig collects the options accepted by csv_reader.
// The source is the RAM buffer when scr_buf is set and scr_buf_len > 0,
// otherwise it is the file at file_path.
@[params]
pub struct RandomAccessReaderConfig {
pub:
	scr_buf        voidptr // pointer to the buffer of data
	scr_buf_len    i64     // if > 0 use the RAM pointed by scr_buf as source of data
	file_path      string  // path of the csv file, used when no RAM buffer is given
	start_index    i64     // offset in the source where the reader starts
	end_index      i64 = -1 // offset where the reader stops, -1 means the end of the source
	mem_buf_size   int = 1024 * 64 // size of the read buffer for a file source, default 64KByte
	separator      u8  = `,` // char that separates the cells
	comment        u8  = `#` // rows starting with this char are ignored
	default_cell   string = '*' // returned when a cell is outside of the csv boundaries
	empty_cell     string // returned when a cell is empty
	end_line_len   int = endline_cr_len // size of the end of line sequence
	quote          u8  = `"` // double quote is the standard quote char
	quote_remove   bool // if true the quotes are removed from the returned cells
	create_map_csv bool = true // if true build the map of the csv at creation time
}
|
|
|
|
/******************************************************************************
|
|
*
|
|
* Init, dispose, fill buffer
|
|
*
|
|
******************************************************************************/
|
|
|
|
// csv_reader_from_string creates a random access csv reader that uses
// the bytes of `in_str` as RAM source, every other option keeps its default.
// The reader points to the data of the string and does not copy it.
pub fn csv_reader_from_string(in_str string) !&RandomAccessReader {
	return csv_reader(scr_buf: in_str.str, scr_buf_len: in_str.len)!
}
|
|
|
|
// csv_reader creates a random access csv reader from the passed configuration.
// When `cfg.scr_buf` is set and `cfg.scr_buf_len` > 0 the data is read from that RAM buffer,
// otherwise, if `cfg.file_path` is not empty, the data is read from the file through
// a buffer of `cfg.mem_buf_size` bytes.
// A leading UTF-8 BOM is detected and skipped.
// If `cfg.create_map_csv` is true the index of the cells is built before returning.
// It returns an error if the file does not exist, if a file operation fails
// or if the csv can not be mapped; in that case the resources already
// acquired (file handle and read buffer) are released.
pub fn csv_reader(cfg RandomAccessReaderConfig) !&RandomAccessReader {
	mut cr := &RandomAccessReader{}

	// on every failing exit release what was acquired so far,
	// the caller never receives the reader and can not dispose it
	mut init_done := false
	defer {
		if !init_done {
			cr.dispose_csv_reader()
		}
	}

	cr.start_index = cfg.start_index
	cr.end_index = cfg.end_index

	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
		cr.mem_buf_type = ram_csv // RAM buffer
		cr.mem_buf = cfg.scr_buf
		cr.mem_buf_size = cfg.scr_buf_len
		if cfg.end_index == -1 {
			cr.end_index = cfg.scr_buf_len
		}

		// check if BOM header is in the memory buffer,
		// a buffer shorter than the BOM must not be probed
		if cfg.scr_buf_len >= 3 {
			unsafe {
				bom := &u8(cr.mem_buf)
				if bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
				}
			}
		}
	} else if cfg.file_path.len > 0 {
		// the source is a file
		if !os.exists(cfg.file_path) {
			return error('ERROR: file ${cfg.file_path} not found!')
		}
		cr.mem_buf_type = file_csv // File buffer
		// allocate the memory
		unsafe {
			cr.mem_buf = malloc(cfg.mem_buf_size)
			cr.mem_buf_size = cfg.mem_buf_size
		}
		cr.f = os.open_file(cfg.file_path, 'rb')!

		// get the length of the file
		cr.f.seek(0, .end)!
		cr.f_len = cr.f.tell()!

		cr.f.seek(cfg.start_index, .start)!
		cr.index = cr.f.tell()!

		if cfg.end_index == -1 {
			cr.end_index = cr.f_len
		}

		// check if BOM header is in the file, it can only be at the start of it
		if cr.index == 0 {
			// the BOM is 3 bytes long, do not read more than needed
			if cfg.mem_buf_size >= 3 && cr.f.read_into_ptr(cr.mem_buf, 3)! == 3 {
				unsafe {
					bom := &u8(cr.mem_buf)
					if bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF {
						cr.is_bom_present = true
						cr.index += 3 // skip the BOM
					}
				}
			}
			cr.f.seek(cfg.start_index, .start)!
		}
	}

	cr.default_cell = cfg.default_cell
	cr.empty_cell = cfg.empty_cell
	cr.end_line_len = cfg.end_line_len
	cr.separator = cfg.separator
	cr.comment = cfg.comment
	cr.quote_remove = cfg.quote_remove
	cr.quote = cfg.quote

	cr.create_map_csv = cfg.create_map_csv
	if cr.create_map_csv {
		cr.map_csv()!
	}

	init_done = true
	return cr
}
|
|
|
|
// dispose_csv_reader releases the resources used by the csv reader.
// For a file source it closes the file, if open, and frees the read buffer.
// For a RAM source there is nothing to release.
pub fn (mut cr RandomAccessReader) dispose_csv_reader() {
	if cr.mem_buf_type != file_csv {
		// RAM buffer (or unknown source), nothing is owned by the reader
		return
	}

	// file close
	if cr.f.is_opened {
		cr.f.close()
	}

	// free the allocated memory, a size of 0 marks a buffer already released
	if cr.mem_buf_size <= 0 {
		return
	}
	unsafe {
		free(cr.mem_buf)
		cr.mem_buf = nil
	}
	cr.mem_buf_size = 0
}
|
|
|
|
// fill_buffer makes the data that starts at the offset `i` of the source available
// in the read buffer and returns the count of the bytes available from `i`.
// For a RAM source only the bounds of the buffer are updated.
// For a file source the file is read from `i` into the buffer and
// `start_index` is set to `i`.
// It returns -1 when the type of the source is unknown.
fn (mut cr RandomAccessReader) fill_buffer(i i64) !i64 {
	if cr.mem_buf_type == ram_csv {
		// the whole source is already in memory, no data is moved
		cr.mem_buf_start = i
		cr.mem_buf_end = cr.mem_buf_size
		return cr.mem_buf_end - cr.mem_buf_start
	}

	if cr.mem_buf_type == file_csv {
		cr.start_index = i
		cr.f.seek(i, .start)!
		// IMPORTANT: add 64 bit support in vlib!!
		bytes_read := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
		cr.mem_buf_start = i
		cr.mem_buf_end = i + bytes_read
		return i64(bytes_read)
	}

	return i64(-1)
}
|
|
|
|
// copy_configuration takes the csv map and the header information
// (header row, header list, header map) from the reader `src_cr`.
// It is a helper for the use of the RandomAccessReader in multi threaded applications:
// a reader can reuse the index already built by another one.
// Pay attention to the free process of the readers involved.
pub fn (mut cr RandomAccessReader) copy_configuration(src_cr RandomAccessReader) {
	unsafe {
		cr.csv_map = &src_cr.csv_map
		cr.header_map = &src_cr.header_map
		cr.header_list = &src_cr.header_list
	}
	cr.header_row = src_cr.header_row
}
|
|
|
|
/******************************************************************************
|
|
*
|
|
* Csv mapper, mapped reader
|
|
*
|
|
******************************************************************************/
|
|
// map_csv creates the index of the whole csv source to allow random access to every cell.
// For each row `csv_map` stores the offset of the start of the row, the offset
// of every separator and the offset of the end of the row.
// Rows that start with the comment char (blanks before it are allowed) and
// empty rows are not indexed.
// A previous index is discarded, the function can be called more than once.
// It returns an error if a quote is still open at the end of a row
// or if the source can not be read.
pub fn (mut cr RandomAccessReader) map_csv() ! {
	mut count := 0
	mut i := i64(0)
	mut drop_row := false
	mut quote_flag := false // true if we are parsing inside a quote
	mut row_has_data := false // true once the current row contains something that is not a blank

	// if File return to the start of the file
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}

	// the bytes of the BOM are not part of the first row
	bom_len := if cr.is_bom_present { i64(3) } else { i64(0) }

	// start from an empty index, a new array is used in order
	// to leave untouched an index shared through copy_configuration
	cr.csv_map = [][]i64{}

	unsafe {
		p := &u8(cr.mem_buf)
		cr.csv_map << []i64{}
		cr.csv_map[0] << bom_len // skip the BOM data

		for i < cr.end_index {
			read_bytes_count := cr.fill_buffer(i)!
			if read_bytes_count <= 0 {
				// no more data available, without this check the loop never ends
				break
			}

			mut p1 := p
			mut i1 := i64(0)
			for i1 < read_bytes_count {
				c := *p1
				if c == cr.quote {
					// manage quote char
					quote_flag = !quote_flag
					row_has_data = true
					p1++
					i1++
				} else if !quote_flag && c == cr.comment && !row_has_data {
					// manage comment line: the comment char is valid only at the start
					// of the row, a comment char inside a cell is normal data
					drop_row = true
					p1++
					i1++
				} else if !quote_flag && c == cr.separator && !drop_row {
					// capture separator
					cr.csv_map[cr.csv_map.len - 1] << (i + i1)
					row_has_data = true

					p1 += cr.separator_len
					i1 += cr.separator_len
				} else if c == cr.end_line {
					// capture end line
					if quote_flag {
						error_col := cr.csv_map[cr.csv_map.len - 1].last() - cr.csv_map[cr.csv_map.len - 1].first()
						return error('ERROR: quote not closed at row ${count} after column ${error_col}!')
					}
					count++

					// close the row, for \r\n the row ends on the \r
					cr.csv_map[cr.csv_map.len - 1] << (i + i1) - (cr.end_line_len - 1)
					p1 += cr.end_line_len
					i1 += cr.end_line_len

					if drop_row == true {
						// commented row, recycle it
						cr.csv_map[cr.csv_map.len - 1].clear()
						drop_row = false
					} else {
						// skip empty rows
						if cr.csv_map[cr.csv_map.len - 1].len == 2
							&& cr.csv_map[cr.csv_map.len - 1][0] == cr.csv_map[cr.csv_map.len - 1][1] {
							// recycle the row
							cr.csv_map[cr.csv_map.len - 1].clear()
						} else {
							// it all ok, insert a new row
							cr.csv_map << []i64{cap: cr.csv_map[cr.csv_map.len - 1].len}
						}
					}

					// the first item of a row is the offset of its first char
					cr.csv_map[cr.csv_map.len - 1] << (i + i1) - (cr.end_line_len - 1)
					row_has_data = false

					// restart the scan from the first char after the \n
					p1 -= (cr.end_line_len - 1)
					i1 -= (cr.end_line_len - 1)
				} else {
					// blanks and the bytes of the BOM do not make a row a data row
					if c != ` ` && c != `\t` && c != `\r` && i + i1 >= bom_len {
						row_has_data = true
					}
					p1++
					i1++
				}
			}
			i += read_bytes_count
		}
	}
	// remove last row if it is not a valid one
	if cr.csv_map[cr.csv_map.len - 1].len < 2 {
		cr.csv_map.delete(cr.csv_map.len - 1)
	}

	// if File return to the start of the file
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}
}
|
|
|
|
// get_row returns the row `y` of the csv as an array of strings, one item for each cell.
// It returns an empty array if `y` is outside of the mapped rows.
// It returns an error if a cell can not be read.
pub fn (mut cr RandomAccessReader) get_row(y int) ![]string {
	mut h := []string{}
	// check the row index instead of the count of the rows:
	// a csv with a single row is valid and an index out of range must not panic
	if y < 0 || y >= cr.csv_map.len {
		return h
	}
	// a row with n cells is delimited by n + 1 offsets
	for x in 0 .. (cr.csv_map[y].len - 1) {
		h << cr.get_cell(x: x, y: y)!
	}
	return h
}
|
|
|
|
// GetCellConfig holds the coordinates of a cell for get_cell and get_cellt.
@[params]
pub struct GetCellConfig {
pub:
	x int // column index of the cell
	y int // row index of the cell
}
|
|
|
|
// get_cell reads the single cell at column `cfg.x` of the row `cfg.y` and returns it as a string.
// It returns `default_cell` if the coordinates are outside of the csv boundaries
// and `empty_cell` if the cell is empty.
// When `quote_remove` is true a quoted cell is returned without its quotes
// (and without what is outside of them), an unquoted cell is returned as it is.
// It returns an error if the source can not be read or if the cell
// does not fit in the read buffer.
pub fn (mut cr RandomAccessReader) get_cell(cfg GetCellConfig) !string {
	// negative indexes are outside of the csv boundaries like the too big ones
	if cfg.x < 0 || cfg.y < 0 || cfg.y >= cr.csv_map.len
		|| cfg.x >= (cr.csv_map[cfg.y].len - 1) {
		return cr.default_cell
	}

	mut start := cr.csv_map[cfg.y][cfg.x]
	mut end := cr.csv_map[cfg.y][cfg.x + 1]

	// for all the cells but the first the start offset is the one of the separator, skip it
	if cfg.x > 0 {
		start++
	}

	mut len := end - start
	if len <= 0 {
		return cr.empty_cell
	}

	// fill the buffer if needed
	if !(start >= cr.mem_buf_start && end < cr.mem_buf_end) {
		cr.fill_buffer(start)!
		// a cell bigger than the buffer can not be copied without reading unloaded data
		if end > cr.mem_buf_end {
			return error('ERROR: cell at row ${cfg.y} column ${cfg.x} does not fit in the read buffer!')
		}
	}

	unsafe {
		// the offset in the buffer of a source offset is relative to start_index
		buf := &u8(cr.mem_buf)

		// execute this section only if we need to remove the quotes
		if cr.quote_remove {
			// look for the front quote
			mut quote_pos := start
			mut tmp_p := buf + start - cr.start_index
			for quote_pos < end && *tmp_p != cr.quote {
				quote_pos++
				tmp_p++
			}

			// a cell without quotes is left untouched
			if quote_pos < end {
				// remove front quote and what is before it
				start = quote_pos + 1

				// remove back quote and what is after it
				tmp_p = buf + end - cr.start_index
				for end > start {
					if *tmp_p == cr.quote {
						break
					}
					tmp_p--
					end--
				}

				len = end - start
				if len <= 0 {
					return cr.empty_cell
				}
			}
		}

		// create the string from the buffer, the string takes the ownership of the memory
		mut tmp_mem := malloc_noscan(isize(len + 1))
		mem_start := buf + start - cr.start_index
		vmemcpy(tmp_mem, mem_start, isize(len))
		tmp_mem[len] = 0 // 0 for C string compatibility
		ret_str := tos(tmp_mem, int(len))
		return ret_str
	}
}
|
|
|
|
// CellValue is the value of a cell returned by get_cellt:
// a f32, an int or a string according to the type of the column.
pub type CellValue = f32 | int | string
|
|
|
|
// get_cellt reads a single cell and returns it as the sum type CellValue.
// When a header was built and the column `cfg.x` is described by it,
// the text of the cell is converted to the type of the column,
// otherwise the cell is returned as a string.
pub fn (mut cr RandomAccessReader) get_cellt(cfg GetCellConfig) !CellValue {
	has_column_type := cr.header_row >= 0 && cfg.x < cr.header_list.len
	if has_column_type {
		column_type := cr.header_list[cfg.x].htype
		text := cr.get_cell(cfg)!
		match column_type {
			.string {
				return text
			}
			.f32 {
				return text.trim_space().f32()
			}
			.int {
				return text.trim_space().int()
			}
		}
	}
	// no type information for this column
	return cr.get_cell(cfg)!
}
|
|
|
|
/******************************************************************************
|
|
*
|
|
* Header management
|
|
*
|
|
******************************************************************************/
|
|
// GetHeaderConf holds the options of build_header_dict.
@[params]
pub struct GetHeaderConf {
pub:
	header_row int // index of the row that contains the header
}
|
|
|
|
// build_header_dict builds the header list and the header map from the row `cfg.header_row`,
// the first row is used if no row number is passed.
// It tries to infer the type of each column from the row that follows the header:
// a cell that is a whole number makes the column an int column, a cell that is
// a number with a decimal point or an exponent makes it a f32 column.
// By default all the columns are of the string type.
// Nothing is done if `cfg.header_row` is not a mapped row.
// A header built by a previous call is replaced.
pub fn (mut cr RandomAccessReader) build_header_dict(cfg GetHeaderConf) ! {
	if cfg.header_row < 0 || cfg.header_row >= cr.csv_map.len {
		return
	}
	cr.header_row = cfg.header_row

	// start from scratch, otherwise a second call duplicates the items;
	// new containers are used in order to leave untouched the ones
	// shared through copy_configuration
	cr.header_list = []HeaderItem{}
	cr.header_map = map[string]int{}

	for col in 0 .. (cr.csv_map[cfg.header_row].len - 1) {
		// fill the base struct
		label := cr.get_cell(x: col, y: cfg.header_row)!
		mut h := HeaderItem{
			label:  label
			column: col
			htype:  .string
		}

		// try to infer the type if we have at least one more row
		if cfg.header_row + 1 < cr.csv_map.len {
			x := cr.get_cell(x: col, y: cfg.header_row + 1)!.trim_space().to_lower()
			mut digit_c := 0
			mut dot_c := 0
			mut exp_c := 0
			mut is_number := true
			mut prev := u8(0)
			// raw estimation of the type
			for idx, c in x {
				if c >= `0` && c <= `9` {
					digit_c++
				} else if c == `.` && dot_c == 0 && exp_c == 0 {
					// only one decimal point and only in the mantissa
					dot_c++
				} else if c == `e` && exp_c == 0 && digit_c > 0 {
					// the exponent needs a mantissa before it
					exp_c++
				} else if (c == `+` || c == `-`) && (idx == 0 || prev == `e`) {
					// a sign is valid only at the start of the number or of the exponent,
					// this keeps out values like the dates (2023-01-05)
				} else {
					is_number = false
					break
				}
				prev = c
			}

			// a number has at least one digit and ends with a digit or with the decimal point,
			// an empty cell gives no hint and leaves the column a string one
			ends_well := (prev >= `0` && prev <= `9`) || prev == `.`
			if is_number && digit_c > 0 && ends_well {
				if dot_c > 0 || exp_c > 0 {
					h.htype = .f32
				} else {
					h.htype = .int
				}
			}
		}

		cr.header_list << h
		cr.header_map[label] = col
	}
}
|
|
|
|
/******************************************************************************
|
|
*
|
|
* Utility function
|
|
*
|
|
******************************************************************************/
|
|
// rows_count counts the rows of the csv source, it counts every end of line char found
// from the start of the source up to `end_index`.
// Comment rows and empty rows are counted too.
// It returns an error if the source can not be read.
pub fn (mut cr RandomAccessReader) rows_count() !i64 {
	mut count := i64(0)
	mut i := i64(0)

	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}
	unsafe {
		p := &u8(cr.mem_buf)
		for i < cr.end_index {
			read_bytes_count := cr.fill_buffer(i)!
			if read_bytes_count <= 0 {
				// no more data available, without this check the loop never ends
				break
			}
			// 64 bit index like the byte count: a 32 bit one overflows on sources bigger than 2GByte
			for i1 := i64(0); i1 < read_bytes_count; i1++ {
				if p[i1] == cr.end_line {
					count++
				}
			}
			i += read_bytes_count
		}
	}
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}
	return count
}
|