module xml import io import os import strings const default_prolog_attributes = { 'version': '1.0' 'encoding': 'UTF-8' } const default_string_builder_cap = 32 const element_len = '` { break } return error('XML Comment not closed. Expected ">".') } else { comment_buffer.write_u8(ch) comment_buffer.write_u8(after_ch) } } else { comment_buffer.write_u8(ch) } } } comment_contents := comment_buffer.str() return XMLComment{comment_contents} } enum CDATAParserState { normal single double } fn parse_cdata(mut reader io.Reader) !XMLCData { mut contents_buf := strings.new_builder(xml.default_string_builder_cap) mut state := CDATAParserState.normal mut local_buf := [u8(0)] for { ch := next_char(mut reader, mut local_buf)! contents_buf.write_u8(ch) match ch { `]` { match state { .double { // Another ] after the ]] for some reason. Keep the state } .single { state = .double } .normal { state = .single } } } `>` { match state { .double { break } else { state = .normal } } } else { state = .normal } } } contents := contents_buf.str().trim_space() if !contents.ends_with(']]>') { return error('CDATA section not closed.') } return XMLCData{contents[1..contents.len - 3]} } fn parse_entity(contents string) !(DTDEntity, string) { // We find the nearest '>' to the start of the ENTITY entity_end := contents.index('>') or { return error('Entity declaration not closed.') } entity_contents := contents[xml.entity_len..entity_end] name := entity_contents.trim_left(' \t\n').all_before(' ') if name == '' { return error('Entity is missing name.') } value := entity_contents.all_after_first(name).trim_space().trim('"\'') if value.len == 0 { return error('Entity is missing value.') } // TODO: Add support for SYSTEM and PUBLIC entities return DTDEntity{name, value}, contents[entity_end + 1..] 
} fn parse_element(contents string) !(DTDElement, string) { // We find the nearest '>' to the start of the ELEMENT element_end := contents.index('>') or { return error('Element declaration not closed.') } element_contents := contents[xml.element_len..element_end].trim_left(' \t\n') mut name_span := TextSpan{} for ch in element_contents { match ch { ` `, `\t`, `\n` { break } // Valid characters in an entity name are: // 1. Lowercase alphabet - a-z // 2. Uppercase alphabet - A-Z // 3. Numbers - 0-9 // 4. Underscore - _ // 5. Colon - : // 6. Period - . `a`...`z`, `A`...`Z`, `0`...`9`, `_`, `:`, `.` { name_span.end++ } else { return error('Invalid character in element name: "${ch}"') } } } name := element_contents[name_span.start..name_span.end].trim_left(' \t\n') if name == '' { return error('Element is missing name.') } definition_string := element_contents.all_after_first(name).trim_space().trim('"\'') definition := if definition_string.starts_with('(') { // We have a list of possible children // Ensure that both ( and ) are present if !definition_string.ends_with(')') { return error('Element declaration not closed.') } definition_string.trim('()').split(',') } else { // Invalid definition return error('Invalid element definition: ${definition_string}') } // TODO: Add support for SYSTEM and PUBLIC entities return DTDElement{name, definition}, contents[element_end + 1..] } fn parse_doctype(mut reader io.Reader) !DocumentType { // We may have more < in the doctype so keep count mut depth := 1 mut doctype_buffer := strings.new_builder(xml.default_string_builder_cap) mut local_buf := [u8(0)] for { ch := next_char(mut reader, mut local_buf)! 
doctype_buffer.write_u8(ch) match ch { `<` { depth++ } `>` { depth-- if depth == 0 { break } } else {} } } doctype_contents := doctype_buffer.str().trim_space() name := doctype_contents.all_before('[').trim_space() mut list_contents := doctype_contents.all_after('[').all_before(']').trim_space() mut items := []DTDListItem{} for list_contents.len > 0 { if list_contents.starts_with('` { if found_question_mark { break } return error('Invalid prolog: Found ">" before "?".') } else { if found_question_mark { found_question_mark = false prolog_buffer.write_u8(`?`) } prolog_buffer.write_u8(ch) } } } prolog_attributes := prolog_buffer.str().trim_space() attributes := if prolog_attributes.len == 0 { xml.default_prolog_attributes } else { parse_attributes(prolog_attributes)! } version := attributes['version'] or { return error('XML declaration missing version.') } encoding := attributes['encoding'] or { 'UTF-8' } mut comments := []XMLComment{} mut doctype := DocumentType{ name: '' dtd: '' } mut found_doctype := false for { ch = next_char(mut reader, mut local_buf)! match ch { ` `, `\t`, `\n` { continue } `<` { // We have a comment, DOCTYPE, or root node ch = next_char(mut reader, mut local_buf)! match ch { `!` { // A comment or DOCTYPE match next_char(mut reader, mut local_buf)! { `-` { // A comment if next_char(mut reader, mut local_buf)! != `-` { return error('Invalid comment.') } comments << parse_comment(mut reader)! } `D` { if found_doctype { return error('Duplicate DOCTYPE declaration.') } // OCTYPE mut doc_buf := []u8{len: 6} if reader.read(mut doc_buf)! != 6 { return error('Invalid DOCTYPE.') } if doc_buf != xml.doctype_chars { return error('Invalid DOCTYPE.') } found_doctype = true doctype = parse_doctype(mut reader)! 
} else { return error('Unsupported control sequence found in prolog.') } } } else { // We have found the start of the root node break } } } else {} } } return Prolog{ version: version encoding: encoding doctype: doctype comments: comments }, ch } fn parse_children(name string, attributes map[string]string, mut reader io.Reader) !XMLNode { mut inner_contents := strings.new_builder(xml.default_string_builder_cap) mut children := []XMLNodeContents{} mut local_buf := [u8(0)] for { ch := next_char(mut reader, mut local_buf)! match ch { `<` { second_char := next_char(mut reader, mut local_buf)! match second_char { `!` { // Comment, CDATA mut next_two := [u8(0), 0] if reader.read(mut next_two)! != 2 { return error('Invalid XML. Incomplete comment or CDATA declaration.') } if next_two == xml.double_dash { // Comment comment := parse_comment(mut reader)! children << comment } else if next_two == xml.c_tag { // DATA mut cdata_buf := []u8{len: 4} if reader.read(mut cdata_buf)! != 4 { return error('Invalid XML. Incomplete CDATA declaration.') } if cdata_buf != xml.data_chars { return error('Invalid XML. Expected "CDATA" after "` if node_end_buffer != ending_chars { return error('XML node <${name}> not closed.') } collected_contents := inner_contents.str().trim_space() if collected_contents.len > 0 { // We have some inner text children << collected_contents.replace('\r\n', '\n') } return XMLNode{ name: name attributes: attributes children: children } } else { // Start of child node child := parse_single_node(second_char, mut reader) or { if err.msg() == 'XML node cannot start with " not closed.') } else { return err } } text := inner_contents.str().trim_space() if text.len > 0 { children << text.replace('\r\n', '\n') } children << child } } } else { inner_contents.write_u8(ch) } } } return error('XML node <${name}> not closed.') } // parse_single_node parses a single XML node from the reader. The first character of the tag is passed // in as the first_char parameter. 
// This function is meant to assist in parsing nested nodes one at a time. Using this function as
// opposed to the recommended static functions makes it easier to parse smaller nodes in extremely large
// XML documents without running out of memory.
// An error is returned if the tag has no name, if its attributes cannot be parsed, or if the
// reader fails before the opening tag or the node itself is complete.
pub fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
	mut contents := strings.new_builder(xml.default_string_builder_cap)
	contents.write_u8(first_char)

	// Collect everything up to, but not including, the `>` that ends the opening tag.
	mut local_buf := [u8(0)]
	for {
		ch := next_char(mut reader, mut local_buf)!
		if ch == `>` {
			break
		}
		contents.write_u8(ch)
	}

	tag_contents := contents.str().trim_space()
	// split_any yields an empty array for an empty string, so a blank tag such as `< >`
	// has to be rejected before the first part is indexed.
	if tag_contents.len == 0 {
		return error('XML node is missing name.')
	}

	parts := tag_contents.split_any(' \t\n')
	name := parts[0].trim_right('/')
	// A tag made up only of slashes (for example `</>`) leaves nothing once the
	// trailing slashes are removed; do not build a node with an empty name.
	if name == '' {
		return error('XML node is missing name.')
	}

	// Check if it is a self-closing tag
	if tag_contents.ends_with('/') {
		// We're not looking for children and inner text. The attributes are whatever
		// sits between the name and the trailing `/`.
		self_closing_attributes := tag_contents[name.len..tag_contents.len - 1].trim_space()
		return XMLNode{
			name: name
			attributes: parse_attributes(self_closing_attributes)!
		}
	}

	attribute_string := tag_contents[name.len..].trim_space()
	attributes := parse_attributes(attribute_string)!

	return parse_children(name, attributes, mut reader)
}

// XMLDocument.from_string parses an XML document from a string.
pub fn XMLDocument.from_string(raw_contents string) !XMLDocument {
	mut reader := FullBufferReader{
		contents: raw_contents.bytes()
	}
	return XMLDocument.from_reader(mut reader)!
}

// XMLDocument.from_file parses an XML document from a file. Note that the file is read in its entirety
// and then parsed. If the file is too large, try using the XMLDocument.from_reader function instead.
pub fn XMLDocument.from_file(path string) !XMLDocument {
	mut reader := FullBufferReader{
		contents: os.read_bytes(path)!
	}
	return XMLDocument.from_reader(mut reader)!
}

// XMLDocument.from_reader parses an XML document from a reader. This is the most generic way to parse
// an XML document from any arbitrary source that implements the io.Reader interface.
pub fn XMLDocument.from_reader(mut reader io.Reader) !XMLDocument {
	// Besides the prolog, parse_prolog hands back the character it read right after
	// the `<` of the root node, so the root parser can pick up from there.
	prolog, root_start := parse_prolog(mut reader) or {
		// Running out of input while still inside the prolog is reported as an empty document.
		hit_end_of_input := err is os.Eof || err is io.Eof
			|| err.msg() == 'Unexpected End Of File.'
		if hit_end_of_input {
			return error('XML document is empty.')
		}
		return err
	}

	root_node := parse_single_node(root_start, mut reader)!

	return XMLDocument{
		version: prolog.version
		encoding: prolog.encoding
		comments: prolog.comments
		doctype: prolog.doctype
		root: root_node
	}
}