diff --git a/vlib/encoding/xml/parser.v b/vlib/encoding/xml/parser.v index 51d0a0cf60..8f5a63f6e4 100644 --- a/vlib/encoding/xml/parser.v +++ b/vlib/encoding/xml/parser.v @@ -18,6 +18,9 @@ const ( double_dash = '--'.bytes() c_tag = '[C'.bytes() data_chars = 'DATA'.bytes() + + byte_order_marking_first = u8(0xEF) + byte_order_marking_bytes = [u8(0xBB), 0xBF] ) // Helper types to assist in parsing @@ -296,18 +299,30 @@ fn parse_doctype(mut reader io.Reader) !DocumentType { } fn parse_prolog(mut reader io.Reader) !(Prolog, u8) { - // Trim trailing whitespace + // Skip whitespace and any UTF-8 byte order mark that precede the first `<`; any other byte is an error mut local_buf := [u8(0)] mut ch := next_char(mut reader, mut local_buf)! for { match ch { - ` `, `\t`, `\n` { + ` `, `\t`, `\r`, `\n` { ch = next_char(mut reader, mut local_buf)! continue } `<` { break } + xml.byte_order_marking_first { + // UTF-8 BOM: the leading 0xEF must be followed by exactly 0xBB 0xBF + mut bom_buf := [u8(0), 0] + if reader.read(mut bom_buf)! != 2 { + return error('Invalid UTF-8 BOM.') + } + if bom_buf != xml.byte_order_marking_bytes { + return error('Invalid UTF-8 BOM.') + } + ch = next_char(mut reader, mut local_buf)! 
+ continue + } else { return error('Expecting a prolog or root node starting with "<".') } diff --git a/vlib/encoding/xml/test/local/18_single_letter_tag/shared.xml b/vlib/encoding/xml/test/local/19_single_letter_tag/shared.xml similarity index 100% rename from vlib/encoding/xml/test/local/18_single_letter_tag/shared.xml rename to vlib/encoding/xml/test/local/19_single_letter_tag/shared.xml diff --git a/vlib/encoding/xml/test/local/18_single_letter_tag/shared_test.v b/vlib/encoding/xml/test/local/19_single_letter_tag/shared_test.v similarity index 100% rename from vlib/encoding/xml/test/local/18_single_letter_tag/shared_test.v rename to vlib/encoding/xml/test/local/19_single_letter_tag/shared_test.v diff --git a/vlib/encoding/xml/test/local/20_bom_file/bom_test.v b/vlib/encoding/xml/test/local/20_bom_file/bom_test.v new file mode 100644 index 0000000000..23067c1ca2 --- /dev/null +++ b/vlib/encoding/xml/test/local/20_bom_file/bom_test.v @@ -0,0 +1,17 @@ +module main + +import os +import encoding.xml + +fn test_valid_parsing() { + // We use a .bin file to avoid stripping the BOM from the XML file + path := os.join_path(os.dir(@FILE), 'workbook.bin') + + doc := xml.XMLDocument.from_file(path) or { + assert false, 'Failed to parse workbook.bin' + exit(1) + } + + sheets := doc.get_elements_by_tag('sheet') + assert sheets.len == 1, 'Expected 1 sheet, got ${sheets.len}' +} diff --git a/vlib/encoding/xml/test/local/20_bom_file/workbook.bin b/vlib/encoding/xml/test/local/20_bom_file/workbook.bin new file mode 100644 index 0000000000..a2de93b304 --- /dev/null +++ b/vlib/encoding/xml/test/local/20_bom_file/workbook.bin @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file