encoding.xml: update parser logic to skip BOM before prolog (#19858)

This commit is contained in:
Subhomoy Haldar 2023-11-13 12:24:39 +00:00 committed by GitHub
parent e0207b6830
commit 5f08d45c7c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 51 additions and 2 deletions

View File

@ -18,6 +18,9 @@ const (
double_dash = '--'.bytes()
c_tag = '[C'.bytes()
data_chars = 'DATA'.bytes()
byte_order_marking_first = u8(0xEF)
byte_order_marking_bytes = [u8(0xBB), 0xBF]
)
// Helper types to assist in parsing
@ -296,18 +299,30 @@ fn parse_doctype(mut reader io.Reader) !DocumentType {
}
fn parse_prolog(mut reader io.Reader) !(Prolog, u8) {
// Trim trailing whitespace
// Skip leading whitespace and an optional UTF-8 BOM before the prolog
mut local_buf := [u8(0)]
mut ch := next_char(mut reader, mut local_buf)!
for {
match ch {
` `, `\t`, `\n` {
` `, `\t`, `\r`, `\n` {
ch = next_char(mut reader, mut local_buf)!
continue
}
`<` {
break
}
xml.byte_order_marking_first {
// UTF-8 BOM
mut bom_buf := [u8(0), 0]
if reader.read(mut bom_buf)! != 2 {
return error('Invalid UTF-8 BOM.')
}
if bom_buf != xml.byte_order_marking_bytes {
return error('Invalid UTF-8 BOM.')
}
ch = next_char(mut reader, mut local_buf)!
continue
}
else {
return error('Expecting a prolog or root node starting with "<".')
}

View File

@ -0,0 +1,17 @@
module main
import os
import encoding.xml
// test_valid_parsing parses the `workbook.bin` fixture that sits next to this
// file and checks that exactly one `sheet` element is found in the document.
fn test_valid_parsing() {
	// We use a .bin file to avoid stripping the BOM from the XML file
	path := os.join_path(os.dir(@FILE), 'workbook.bin')
	doc := xml.XMLDocument.from_file(path) or {
		// Include the underlying error so a failure shows why parsing failed
		// (e.g. missing file vs. rejected BOM) instead of a generic message.
		assert false, 'Failed to parse workbook.bin: ${err}'
		exit(1)
	}
	sheets := doc.get_elements_by_tag('sheet')
	assert sheets.len == 1, 'Expected 1 sheet, got ${sheets.len}'
}

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" mc:Ignorable="x15" xmlns:x15="http://schemas.microsoft.com/office/spreadsheetml/2010/11/main">
<fileVersion appName="xl" lastEdited="6" lowestEdited="6" rupBuild="14420"/>
<workbookPr defaultThemeVersion="164011"/>
<bookViews>
<workbookView xWindow="0" yWindow="0" windowWidth="22260" windowHeight="12645"/>
</bookViews>
<sheets>
<sheet name="Sheet1" sheetId="1" r:id="rId1"/>
</sheets>
<calcPr calcId="162913"/>
<extLst>
<ext uri="{140A7094-0E35-4892-8432-C4D2E57EDEB5}" xmlns:x15="http://schemas.microsoft.com/office/spreadsheetml/2010/11/main">
<x15:workbookPr chartTrackingRefBase="1"/>
</ext>
</extLst>
</workbook>