diff --git a/vlib/net/html/dom.v b/vlib/net/html/dom.v index fee530e9a8..f131c0d48f 100644 --- a/vlib/net/html/dom.v +++ b/vlib/net/html/dom.v @@ -96,6 +96,12 @@ fn (mut dom DocumentObjectModel) add_tag_by_attribute(tag &Tag) { fn (mut dom DocumentObjectModel) construct(tag_list []&Tag) { dom.constructed = true + + // If there are no tags, accessing `tag_list[0]` below would panic. + if tag_list.len == 0 { + return + } + mut temp_map := map[string]int{} mut temp_int := null_element mut temp_string := '' @@ -106,6 +112,7 @@ fn (mut dom DocumentObjectModel) construct(tag_list []&Tag) { temp_map['0'] = dom.btree.add_children(tag_list[0]) stack.push(0) root_index := 0 + for index := 1; index < tag_list.len; index++ { mut tag := tag_list[index] dom.print_debug(tag.str()) diff --git a/vlib/net/html/parser.v b/vlib/net/html/parser.v index 283f3edd7d..6ebc30b2fd 100644 --- a/vlib/net/html/parser.v +++ b/vlib/net/html/parser.v @@ -13,6 +13,7 @@ mut: is_attribute bool opened_code_type string line_count int + outside_tag bool lexeme_builder strings.Builder = strings.new_builder(100) code_tags map[string]bool = { 'script': true @@ -90,6 +91,7 @@ fn (mut parser Parser) init() { parser.tags = []&Tag{} parser.dom.close_tags['/!document'] = true parser.lexical_attributes.current_tag = &Tag{} + parser.lexical_attributes.outside_tag = true parser.initialized = true } @@ -231,19 +233,40 @@ pub fn (mut parser Parser) split_parse(data string) { parser.lexical_attributes.lexeme_builder.go_back_to(0) parser.generate_tag() parser.lexical_attributes.open_tag = true + parser.lexical_attributes.outside_tag = false } else { parser.lexical_attributes.lexeme_builder.write_u8(chr) } } + + // If `data` contains no tags, only text.
+ if parser.lexical_attributes.outside_tag { + temp_string := parser.lexical_attributes.lexeme_builder.str() + + if parser.tags.len == 0 { + parser.tags << &Tag{ + name: 'text' + content: temp_string + } + } else if parser.tags.len == 1 { + mut tag := parser.tags.first() + + if tag.name == 'text' { + tag.content += temp_string + } + } + } } // parse_html parses the given HTML string pub fn (mut parser Parser) parse_html(data string) { parser.init() mut lines := data.split_into_lines() - for line in lines { + for index, line in lines { parser.lexical_attributes.line_count++ - parser.split_parse(line) + // The parser must not drop `\n`, because that can break JS code or make adjacent text run together. + // `split_into_lines()` removes the line endings, so `\n` is re-appended to every line except the last. + parser.split_parse(if index < lines.len - 1 { '${line}\n' } else { line }) } parser.generate_tag() parser.dom.debug_file = parser.debug_file diff --git a/vlib/net/html/parser_test.v b/vlib/net/html/parser_test.v index 6d0d1d5f2b..60ae631cd9 100644 --- a/vlib/net/html/parser_test.v +++ b/vlib/net/html/parser_test.v @@ -2,6 +2,34 @@ module html import strings +fn test_parse_empty_string() { + mut parser := Parser{} + + parser.parse_html('') + + assert parser.tags.len == 0 +} + +fn test_parse_text() { + mut parser := Parser{} + text_content := 'test\nparse\ntext' + + parser.parse_html(text_content) + + assert parser.tags.len == 1 + assert parser.tags.first().text() == text_content +} + +fn test_parse_one_tag_with_text() { + mut parser := Parser{} + text_content := 'tag\nwith\ntext' + p_tag := '<p>${text_content}</p>' + + parser.parse_html(p_tag) + + assert parser.tags.first().text() == text_content +} + fn test_split_parse() { mut parser := Parser{} parser.init() @@ -37,5 +65,5 @@ fn test_script_tag() { script_content := "\nvar googletag = googletag || {};\ngoogletag.cmd = googletag.cmd || [];if(3 > 5) {console.log('Birl');}\n" temp_html := '' parser.parse_html(temp_html) - assert parser.tags[2].content.len == script_content.replace('\n', '').len + assert parser.tags[2].content.len == script_content.len } diff --git a/vlib/net/html/tag.v b/vlib/net/html/tag.v index d300e39f06..920b72a6d6 100644 --- a/vlib/net/html/tag.v +++ b/vlib/net/html/tag.v @@ -40,7 +40,7 @@ pub fn (tag Tag) text() string { return '\n' } mut text_str := strings.new_builder(200) - text_str.write_string(tag.content.replace('\n', '')) + text_str.write_string(tag.content) for child in tag.children { text_str.write_string(child.text()) }