From 47c15ad949eb6589ee14d208444b4e759a611143 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 11 Feb 2014 06:45:27 +0000 Subject: Implement document fragment parsing. Introduce a notable behavior change in default parsing mode: documents without a document element node are now considered invalid. This is technically a breaking change; however, the number of documents it affects is very small, all parsed data is still preserved, and the lack of this check results in very confusing behavior in a number of cases. In order to be able to parse documents without an element node, a fragment parsing flag is introduced. Parsing a buffer in fragment mode treats the buffer as a fragment of a valid XML document. As a consequence, top-level PCDATA is added to the tree; additionally, there are no restrictions on the number of nodes -- so documents without a document element are considered valid. Due to the way parsing works internally, load_buffer_inplace occasionally cannot preserve the document contents if it is parsed in fragment mode. While unfortunate, this problem is fundamental; since the use case is relatively obscure, hopefully documenting this shortcoming will be enough. 
git-svn-id: https://pugixml.googlecode.com/svn/trunk@980 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 78 +++++++++++++---- src/pugixml.hpp | 8 +- tests/test_document.cpp | 135 ++++++++++++++++++++++-------- tests/test_dom_modify.cpp | 17 ++++ tests/test_memory.cpp | 2 +- tests/test_parse.cpp | 194 ++++++++++++++++++++++++++++++++++++------- tests/test_parse_doctype.cpp | 16 ++-- tests/test_write.cpp | 15 ++-- 8 files changed, 363 insertions(+), 102 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 926458e..1893125 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -2199,7 +2199,7 @@ PUGI__NS_BEGIN char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel) { - assert(s[0] == '<' && s[1] == '!'); + assert((s[0] == '<' || s[0] == 0) && s[1] == '!'); s++; while (*s) @@ -2331,6 +2331,9 @@ PUGI__NS_BEGIN s = parse_doctype_group(s, endch, true); if (!s) return s; + assert((*s == 0 && endch == '>') || *s == '>'); + if (*s) *s++ = 0; + if (PUGI__OPTSET(parse_doctype)) { while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark; @@ -2339,9 +2342,6 @@ PUGI__NS_BEGIN cursor->value = mark; - assert((*s == 0 && endch == '>') || *s == '>'); - if (*s) *s++ = 0; - PUGI__POPNODE(); } } @@ -2629,7 +2629,7 @@ PUGI__NS_BEGIN PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here. - if (*s == '<') + if (*s == '<' || !*s) { // We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one assert(mark != s); @@ -2640,13 +2640,13 @@ PUGI__NS_BEGIN } else if (PUGI__OPTSET(parse_ws_pcdata_single)) { - if (s[1] != '/' || cursor->first_child) continue; + if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue; } } s = mark; - if (cursor->parent) + if (cursor->parent || PUGI__OPTSET(parse_fragment)) { PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree. cursor->value = s; // Save the offset. 
@@ -2676,14 +2676,43 @@ PUGI__NS_BEGIN return s; } + #ifdef PUGIXML_WCHAR_MODE + static char_t* parse_skip_bom(char_t* s) + { + return (s[0] == 0xfeff) ? s + 1 : s; + } + #else + static char_t* parse_skip_bom(char_t* s) + { + return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? s + 3 : s; + } + #endif + + static bool has_element_node_siblings(xml_node_struct* node) + { + while (node) + { + xml_node_type type = static_cast((node->header & impl::xml_memory_page_type_mask) + 1); + if (type == node_element) return true; + + node = node->next_sibling; + } + + return false; + } + static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk) { // allocator object is a part of document object xml_allocator& alloc = *static_cast(xmldoc); // early-out for empty documents - if (length == 0) return make_parse_result(status_ok); + if (length == 0) + return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element); + // get last child of the root before parsing + xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0; + // create parser on stack xml_parser parser(alloc); @@ -2691,24 +2720,35 @@ PUGI__NS_BEGIN char_t endch = buffer[length - 1]; buffer[length - 1] = 0; + // skip BOM to make sure it does not end up as part of parse output + char_t* buffer_data = parse_skip_bom(buffer); + // perform actual parsing - parser.parse_tree(buffer, root, optmsk, endch); + parser.parse_tree(buffer_data, root, optmsk, endch); + + // update allocator state + alloc = parser.alloc; xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? 
parser.error_offset - buffer : 0); assert(result.offset >= 0 && static_cast(result.offset) <= length); - // roll back offset if it occurs on a null terminator in the source buffer - if (result.offset > 0 && static_cast(result.offset) == length - 1 && endch == 0) - result.offset--; + if (result) + { + // since we removed last character, we have to handle the only possible false positive (stray <) + if (endch == '<') + return make_parse_result(status_unrecognized_tag, length - 1); - // update allocator state - alloc = parser.alloc; + // check if there are any element nodes parsed + xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child; - // since we removed last character, we have to handle the only possible false positive - if (result && endch == '<') + if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed)) + return make_parse_result(status_no_document_element, length - 1); + } + else { - // there's no possible well-formed document with < at the end - return make_parse_result(status_unrecognized_tag, length - 1); + // roll back offset if it occurs on a null terminator in the source buffer + if (result.offset > 0 && static_cast(result.offset) == length - 1 && endch == 0) + result.offset--; } return result; @@ -5469,6 +5509,8 @@ namespace pugi case status_append_invalid_root: return "Unable to append nodes: root is not an element or document"; + case status_no_document_element: return "No document element found"; + default: return "Unknown error"; } } diff --git a/src/pugixml.hpp b/src/pugixml.hpp index e19a4a3..e5009fe 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -151,6 +151,10 @@ namespace pugi // This flag is off by default; turning it on may result in slower parsing and more memory consumption. 
const unsigned int parse_ws_pcdata_single = 0x0400; + // This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document + // is a valid document. This flag is off by default. + const unsigned int parse_fragment = 0x0800; + // The default parsing mode. // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. @@ -880,7 +884,9 @@ namespace pugi status_bad_end_element, // Parsing error occurred while parsing end element tag status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag) - status_append_invalid_root // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) + status_append_invalid_root, // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) + + status_no_document_element // Parsing resulted in a document without element nodes }; // Parsing result diff --git a/tests/test_document.cpp b/tests/test_document.cpp index 3ac8bf8..e6c7b00 100644 --- a/tests/test_document.cpp +++ b/tests/test_document.cpp @@ -249,7 +249,7 @@ TEST(document_load_file_empty) { pugi::xml_document doc; - CHECK(doc.load_file("tests/data/empty.xml")); + CHECK(doc.load_file("tests/data/empty.xml").status == status_no_document_element); CHECK(!doc.first_child()); } @@ -907,16 +907,52 @@ TEST(document_load_buffer_empty) xml_encoding encoding = encodings[i]; xml_document doc; - CHECK(doc.load_buffer(buffer, 0, parse_default, encoding) && !doc.first_child()); - CHECK(doc.load_buffer(0, 0, parse_default, encoding) && !doc.first_child()); + CHECK(doc.load_buffer(buffer, 0, parse_default, encoding).status == status_no_document_element && 
!doc.first_child()); + CHECK(doc.load_buffer(0, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); - CHECK(doc.load_buffer_inplace(buffer, 0, parse_default, encoding) && !doc.first_child()); - CHECK(doc.load_buffer_inplace(0, 0, parse_default, encoding) && !doc.first_child()); + CHECK(doc.load_buffer_inplace(buffer, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); + CHECK(doc.load_buffer_inplace(0, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); void* own_buffer = pugi::get_memory_allocation_function()(1); - CHECK(doc.load_buffer_inplace_own(own_buffer, 0, parse_default, encoding) && !doc.first_child()); - CHECK(doc.load_buffer_inplace_own(0, 0, parse_default, encoding) && !doc.first_child()); + CHECK(doc.load_buffer_inplace_own(own_buffer, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); + CHECK(doc.load_buffer_inplace_own(0, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); + } +} + +TEST(document_load_buffer_empty_fragment) +{ + xml_encoding encodings[] = + { + encoding_auto, + encoding_utf8, + encoding_utf16_le, + encoding_utf16_be, + encoding_utf16, + encoding_utf32_le, + encoding_utf32_be, + encoding_utf32, + encoding_wchar, + encoding_latin1 + }; + + char buffer[1]; + + for (unsigned int i = 0; i < sizeof(encodings) / sizeof(encodings[0]); ++i) + { + xml_encoding encoding = encodings[i]; + + xml_document doc; + CHECK(doc.load_buffer(buffer, 0, parse_fragment, encoding) && !doc.first_child()); + CHECK(doc.load_buffer(0, 0, parse_fragment, encoding) && !doc.first_child()); + + CHECK(doc.load_buffer_inplace(buffer, 0, parse_fragment, encoding) && !doc.first_child()); + CHECK(doc.load_buffer_inplace(0, 0, parse_fragment, encoding) && !doc.first_child()); + + void* own_buffer = pugi::get_memory_allocation_function()(1); + + CHECK(doc.load_buffer_inplace_own(own_buffer, 
0, parse_fragment, encoding) && !doc.first_child()); + CHECK(doc.load_buffer_inplace_own(0, 0, parse_fragment, encoding) && !doc.first_child()); } } @@ -933,13 +969,27 @@ TEST(document_progressive_truncation) { char* truncated_data = buffer + original_size - i; - memcpy(truncated_data, original_data, i); + // default flags + { + memcpy(truncated_data, original_data, i); - xml_document doc; - bool result = doc.load_buffer_inplace(truncated_data, i); + xml_document doc; + bool result = doc.load_buffer_inplace(truncated_data, i); + + // only eof is parseable + CHECK((i >= 3325) ? result : !result); + } + + // fragment mode + { + memcpy(truncated_data, original_data, i); + + xml_document doc; + bool result = doc.load_buffer_inplace(truncated_data, i, parse_default | parse_fragment); - // some truncate locations are parseable - those that come after declaration, declaration + doctype, declaration + doctype + comment and eof - CHECK(((i - 21) < 3 || (i - 66) < 3 || (i - 95) < 3 || i >= 3325) ? result : !result); + // some truncate locations are parseable - those that come after declaration, declaration + doctype, declaration + doctype + comment and eof + CHECK(((i - 21) < 3 || (i - 66) < 3 || (i - 95) < 3 || i >= 3325) ? 
result : !result); + } } delete[] buffer; @@ -953,12 +1003,29 @@ TEST(document_load_buffer_short) xml_document doc; - CHECK(doc.load_buffer(data, 4)); - CHECK(doc.load_buffer(data + 1, 3)); - CHECK(doc.load_buffer(data + 2, 2)); - CHECK(doc.load_buffer(data + 3, 1)); - CHECK(doc.load_buffer(data + 4, 0)); - CHECK(doc.load_buffer(0, 0)); + CHECK(doc.load_buffer(data, 4).status == status_no_document_element); + CHECK(doc.load_buffer(data + 1, 3).status == status_no_document_element); + CHECK(doc.load_buffer(data + 2, 2).status == status_no_document_element); + CHECK(doc.load_buffer(data + 3, 1).status == status_no_document_element); + CHECK(doc.load_buffer(data + 4, 0).status == status_no_document_element); + CHECK(doc.load_buffer(0, 0).status == status_no_document_element); + + delete[] data; +} + +TEST(document_load_buffer_short_fragment) +{ + char* data = new char[4]; + memcpy(data, "abcd", 4); + + xml_document doc; + + CHECK(doc.load_buffer(data, 4, parse_fragment) && test_string_equal(doc.text().get(), STR("abcd"))); + CHECK(doc.load_buffer(data + 1, 3, parse_fragment) && test_string_equal(doc.text().get(), STR("bcd"))); + CHECK(doc.load_buffer(data + 2, 2, parse_fragment) && test_string_equal(doc.text().get(), STR("cd"))); + CHECK(doc.load_buffer(data + 3, 1, parse_fragment) && test_string_equal(doc.text().get(), STR("d"))); + CHECK(doc.load_buffer(data + 4, 0, parse_fragment) && !doc.first_child()); + CHECK(doc.load_buffer(0, 0, parse_fragment) && !doc.first_child()); delete[] data; } @@ -970,12 +1037,12 @@ TEST(document_load_buffer_inplace_short) xml_document doc; - CHECK(doc.load_buffer_inplace(data, 4)); - CHECK(doc.load_buffer_inplace(data + 1, 3)); - CHECK(doc.load_buffer_inplace(data + 2, 2)); - CHECK(doc.load_buffer_inplace(data + 3, 1)); - CHECK(doc.load_buffer_inplace(data + 4, 0)); - CHECK(doc.load_buffer_inplace(0, 0)); + CHECK(doc.load_buffer_inplace(data, 4).status == status_no_document_element); + CHECK(doc.load_buffer_inplace(data + 1, 3).status 
== status_no_document_element); + CHECK(doc.load_buffer_inplace(data + 2, 2).status == status_no_document_element); + CHECK(doc.load_buffer_inplace(data + 3, 1).status == status_no_document_element); + CHECK(doc.load_buffer_inplace(data + 4, 0).status == status_no_document_element); + CHECK(doc.load_buffer_inplace(0, 0).status == status_no_document_element); delete[] data; } @@ -1006,7 +1073,7 @@ TEST_XML_FLAGS(document_element, "", parse_comments) +TEST_XML_FLAGS(document_element_absent, "", parse_comments | parse_fragment) { CHECK(doc.document_element() == xml_node()); } @@ -1070,16 +1137,6 @@ TEST_XML(document_reset_copy_self, "") CHECK_NODE(doc, STR("")); } -struct document_data_t -{ - xml_encoding encoding; - - const unsigned char* data; - size_t size; -}; - -#include - TEST(document_load_buffer_utf_truncated) { const unsigned char utf8[] = {'<', 0xe2, 0x82, 0xac, '/', '>'}; @@ -1088,6 +1145,14 @@ TEST(document_load_buffer_utf_truncated) const unsigned char utf32_be[] = {0, 0, 0, '<', 0, 0, 0x20, 0xac, 0, 0, 0, '/', 0, 0, 0, '>'}; const unsigned char utf32_le[] = {'<', 0, 0, 0, 0xac, 0x20, 0, 0, '/', 0, 0, 0, '>', 0, 0, 0}; + struct document_data_t + { + xml_encoding encoding; + + const unsigned char* data; + size_t size; + }; + const document_data_t data[] = { { encoding_utf8, utf8, sizeof(utf8) }, diff --git a/tests/test_dom_modify.cpp b/tests/test_dom_modify.cpp index c7a3989..c0f156b 100644 --- a/tests/test_dom_modify.cpp +++ b/tests/test_dom_modify.cpp @@ -1057,3 +1057,20 @@ TEST(dom_node_append_buffer_out_of_memory_buffer) CHECK(doc.append_buffer(data, sizeof(data)).status == status_out_of_memory); CHECK(!doc.first_child()); } + +TEST_XML(dom_node_append_buffer_fragment, "") +{ + xml_node node = doc.child(STR("node")); + + CHECK(node.append_buffer("1", 1).status == status_no_document_element); + CHECK_NODE(doc, STR("1")); + + CHECK(node.append_buffer("2", 1, parse_fragment)); + CHECK_NODE(doc, STR("12")); + + CHECK(node.append_buffer("3", 1).status == 
status_no_document_element); + CHECK_NODE(doc, STR("123")); + + CHECK(node.append_buffer("4", 1, parse_fragment)); + CHECK_NODE(doc, STR("1234")); +} diff --git a/tests/test_memory.cpp b/tests/test_memory.cpp index a37b91e..32d395b 100644 --- a/tests/test_memory.cpp +++ b/tests/test_memory.cpp @@ -119,7 +119,7 @@ TEST(memory_large_allocations) CHECK(allocate_count == deallocate_count + 1); // only one live page left (it waits for new allocations) char buffer; - CHECK(doc.load_buffer_inplace(&buffer, 0, parse_default, get_native_encoding())); + CHECK(doc.load_buffer_inplace(&buffer, 0, parse_fragment, get_native_encoding())); CHECK(allocate_count == deallocate_count); // no live pages left } diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index c165a65..6d9d4cc 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -1,10 +1,12 @@ #include "common.hpp" +#include "writer_string.hpp" + TEST(parse_pi_skip) { xml_document doc; - unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_declaration}; + unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_declaration}; for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i) { @@ -21,7 +23,7 @@ TEST(parse_pi_skip) TEST(parse_pi_parse) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal | parse_pi)); + CHECK(doc.load(STR(""), parse_fragment | parse_pi)); xml_node pi1 = doc.first_child(); xml_node pi2 = doc.last_child(); @@ -38,7 +40,7 @@ TEST(parse_pi_parse) TEST(parse_pi_parse_spaces) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal | parse_pi)); + CHECK(doc.load(STR(""), parse_fragment | parse_pi)); xml_node pi = doc.first_child(); @@ -51,7 +53,7 @@ TEST(parse_pi_error) { xml_document doc; - unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_pi}; + unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_pi}; for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i) { @@ -81,22 +83,22 @@ 
TEST(parse_pi_error) CHECK(doc.load(STR(""), parse_minimal | parse_pi).status == status_bad_pi); - CHECK(doc.load(STR(""), parse_minimal | parse_pi).status == status_bad_pi); - CHECK(doc.load(STR(""), parse_minimal | parse_pi).status == status_bad_pi); + CHECK(doc.load(STR(""), parse_fragment | parse_pi).status == status_bad_pi); + CHECK(doc.load(STR(""), parse_fragment | parse_pi).status == status_bad_pi); + CHECK(doc.load(STR(""), parse_fragment | parse_pi).status == status_bad_pi); } TEST(parse_comments_skip) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal)); + CHECK(doc.load(STR(""), parse_fragment)); CHECK(!doc.first_child()); } TEST(parse_comments_parse) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal | parse_comments)); + CHECK(doc.load(STR(""), parse_fragment | parse_comments)); xml_node c1 = doc.first_child(); xml_node c2 = doc.last_child(); @@ -113,7 +115,7 @@ TEST(parse_comments_parse) TEST(parse_comments_parse_no_eol) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal | parse_comments)); + CHECK(doc.load(STR(""), parse_fragment | parse_comments)); xml_node c = doc.first_child(); CHECK(c.type() == node_comment); @@ -123,7 +125,7 @@ TEST(parse_comments_parse_no_eol) TEST(parse_comments_parse_eol) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal | parse_comments | parse_eol)); + CHECK(doc.load(STR(""), parse_fragment | parse_comments | parse_eol)); xml_node c = doc.first_child(); CHECK(c.type() == node_comment); @@ -134,7 +136,7 @@ TEST(parse_comments_error) { xml_document doc; - unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_comments, parse_minimal | parse_comments | parse_eol}; + unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_comments, parse_fragment | parse_comments | parse_eol}; for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i) { @@ -152,21 +154,21 @@ TEST(parse_comments_error) TEST(parse_cdata_skip) { xml_document doc; - CHECK(doc.load(STR(""), 
parse_minimal)); + CHECK(doc.load(STR(""), parse_fragment)); CHECK(!doc.first_child()); } TEST(parse_cdata_skip_contents) { xml_document doc; - CHECK(doc.load(STR("hello, world!"), parse_minimal)); + CHECK(doc.load(STR("hello, world!"), parse_fragment)); CHECK_NODE(doc, STR("hello, world!")); } TEST(parse_cdata_parse) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal | parse_cdata)); + CHECK(doc.load(STR(""), parse_fragment | parse_cdata)); xml_node c1 = doc.first_child(); xml_node c2 = doc.last_child(); @@ -183,7 +185,7 @@ TEST(parse_cdata_parse) TEST(parse_cdata_parse_no_eol) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal | parse_cdata)); + CHECK(doc.load(STR(""), parse_fragment | parse_cdata)); xml_node c = doc.first_child(); CHECK(c.type() == node_cdata); @@ -193,7 +195,7 @@ TEST(parse_cdata_parse_no_eol) TEST(parse_cdata_parse_eol) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal | parse_cdata | parse_eol)); + CHECK(doc.load(STR(""), parse_fragment | parse_cdata | parse_eol)); xml_node c = doc.first_child(); CHECK(c.type() == node_cdata); @@ -204,7 +206,7 @@ TEST(parse_cdata_error) { xml_document doc; - unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_cdata, parse_minimal | parse_cdata | parse_eol}; + unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_cdata, parse_fragment | parse_cdata | parse_eol}; for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i) { @@ -229,7 +231,7 @@ TEST(parse_cdata_error) TEST(parse_ws_pcdata_skip) { xml_document doc; - CHECK(doc.load(STR(" "), parse_minimal)); + CHECK(doc.load(STR(" "), parse_fragment)); CHECK(!doc.first_child()); CHECK(doc.load(STR(" "), parse_minimal)); @@ -286,8 +288,6 @@ TEST(parse_ws_pcdata_permutations) test_data_t test_data[] = { // external pcdata should be discarded (whitespace or not) - {7, STR("ext1"), STR(""), 1}, - {7, STR(" "), STR(""), 1}, {7, STR("ext1"), STR(""), 2}, {7, STR("ext1ext2"), STR(""), 2}, {7, STR(" 
"), STR(""), 2}, @@ -314,11 +314,13 @@ TEST(parse_ws_pcdata_permutations) {4, STR("\t\t\n\n"), STR("\n\n"), 3}, // error case: terminate PCDATA in the middle {7, STR("abcdef"), STR("abcdef"), -3}, - {7, STR(" "), STR(" "), -3}, + {5, STR(" "), STR(""), -2}, + {2, STR(" "), STR(" "), -3}, // error case: terminate PCDATA as early as possible {7, STR(""), STR(""), -2}, {7, STR("a"), STR("a"), -3}, - {7, STR(" "), STR(" "), -3}, + {5, STR(" "), STR(""), -2}, + {2, STR(" "), STR(" "), -3}, }; for (size_t i = 0; i < sizeof(test_data) / sizeof(test_data[0]); ++i) @@ -342,6 +344,57 @@ TEST(parse_ws_pcdata_permutations) } } +TEST(parse_ws_pcdata_fragment_permutations) +{ + struct test_data_t + { + unsigned int mask; // 1 = default flags, 2 = parse_ws_pcdata, 4 = parse_ws_pcdata_single + const pugi::char_t* source; + const pugi::char_t* result; + int nodes; // negative if parsing should fail + }; + + test_data_t test_data[] = + { + // external pcdata should be preserved + {7, STR("ext1"), STR("ext1"), 2}, + {5, STR(" "), STR(""), 1}, + {2, STR(" "), STR(" "), 2}, + {7, STR("ext1"), STR("ext1"), 3}, + {7, STR("ext2"), STR("ext2"), 3}, + {7, STR("ext1ext2"), STR("ext1ext2"), 4}, + {7, STR("ext1ext2ext3"), STR("ext1ext2ext3"), 6}, + {5, STR(" "), STR(""), 2}, + {2, STR(" "), STR(" "), 3}, + {5, STR(" "), STR(""), 2}, + {2, STR(" "), STR(" "), 3}, + {5, STR(" "), STR(""), 2}, + {2, STR(" "), STR(" "), 4}, + {5, STR(" "), STR(""), 3}, + {2, STR(" "), STR(" "), 6}, + }; + + for (size_t i = 0; i < sizeof(test_data) / sizeof(test_data[0]); ++i) + { + const test_data_t& td = test_data[i]; + + for (int flag = 0; flag < 3; ++flag) + { + if (td.mask & (1 << flag)) + { + unsigned int flags[] = {parse_default, parse_default | parse_ws_pcdata, parse_default | parse_ws_pcdata_single}; + + xml_document doc; + CHECK((td.nodes > 0) == doc.load(td.source, flags[flag] | parse_fragment)); + CHECK_NODE(doc, td.result); + + int nodes = get_tree_node_count(doc); + CHECK((td.nodes < 0 ? 
-td.nodes : td.nodes) == nodes); + } + } + } +} + TEST(parse_pcdata_no_eol) { xml_document doc; @@ -685,14 +738,14 @@ TEST(parse_tag_error) TEST(parse_declaration_cases) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal | parse_pi)); + CHECK(doc.load(STR(""), parse_fragment | parse_pi)); CHECK(!doc.first_child()); } TEST(parse_declaration_attr_cases) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal | parse_pi)); + CHECK(doc.load(STR(""), parse_fragment | parse_pi)); CHECK(!doc.first_child()); } @@ -700,7 +753,7 @@ TEST(parse_declaration_skip) { xml_document doc; - unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_pi}; + unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_pi}; for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i) { @@ -717,7 +770,7 @@ TEST(parse_declaration_skip) TEST(parse_declaration_parse) { xml_document doc; - CHECK(doc.load(STR(""), parse_minimal | parse_declaration)); + CHECK(doc.load(STR(""), parse_fragment | parse_declaration)); xml_node d1 = doc.first_child(); xml_node d2 = doc.last_child(); @@ -734,7 +787,7 @@ TEST(parse_declaration_error) { xml_document doc; - unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_declaration}; + unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_declaration}; for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i) { @@ -746,14 +799,15 @@ TEST(parse_declaration_error) CHECK(doc.load(STR(""), flags).status == status_bad_pi); } - CHECK(doc.load(STR(""), parse_minimal | parse_declaration).status == status_bad_attribute); - CHECK(doc.load(STR(""), parse_minimal | parse_declaration).status == status_bad_pi); + CHECK(doc.load(STR(""), parse_fragment | parse_declaration).status == status_bad_attribute); + CHECK(doc.load(STR(""), parse_fragment | parse_declaration).status == status_bad_pi); } TEST(parse_empty) { xml_document doc; - CHECK(doc.load(STR("")) && !doc.first_child()); + 
CHECK(doc.load(STR("")).status == status_no_document_element && !doc.first_child()); + CHECK(doc.load(STR(""), parse_fragment) && !doc.first_child()); } TEST(parse_out_of_memory) @@ -843,3 +897,81 @@ TEST(parse_result_default) CHECK(result.offset == 0); CHECK(result.encoding == encoding_auto); } + +TEST(parse_bom_fragment) +{ + struct test_data_t + { + xml_encoding encoding; + const char* data; + size_t size; + const char_t* text; + }; + + const test_data_t data[] = + { + { encoding_utf8, "\xef\xbb\xbf", 3, STR("") }, + { encoding_utf8, "\xef\xbb\xbftest", 7, STR("test") }, + { encoding_utf16_be, "\xfe\xff", 2, STR("") }, + { encoding_utf16_be, "\xfe\xff\x00t\x00o\x00s\x00t", 10, STR("tost") }, + { encoding_utf16_le, "\xff\xfe", 2, STR("") }, + { encoding_utf16_le, "\xff\xfet\x00o\x00s\x00t\x00", 10, STR("tost") }, + { encoding_utf32_be, "\x00\x00\xfe\xff", 4, STR("") }, + { encoding_utf32_be, "\x00\x00\xfe\xff\x00\x00\x00t\x00\x00\x00o\x00\x00\x00s\x00\x00\x00t", 20, STR("tost") }, + { encoding_utf32_le, "\xff\xfe\x00\x00", 4, STR("") }, + { encoding_utf32_le, "\xff\xfe\x00\x00t\x00\x00\x00o\x00\x00\x00s\x00\x00\x00t\x00\x00\x00", 20, STR("tost") }, + }; + + for (size_t i = 0; i < sizeof(data) / sizeof(data[0]); ++i) + { + xml_document doc; + CHECK(doc.load_buffer(data[i].data, data[i].size, parse_fragment, data[i].encoding)); + CHECK_STRING(doc.text().get(), data[i].text); + CHECK(save_narrow(doc, format_no_declaration | format_raw | format_write_bom, data[i].encoding) == std::string(data[i].data, data[i].size)); + } +} + +TEST(parse_bom_fragment_invalid_utf8) +{ + xml_document doc; + + CHECK(doc.load_buffer("\xef\xbb\xbb", 3, parse_fragment, encoding_utf8)); + + const char_t* value = doc.text().get(); + +#ifdef PUGIXML_WCHAR_MODE + CHECK(value[0] == wchar_cast(0xfefb) && value[1] == 0); +#else + CHECK_STRING(value, "\xef\xbb\xbb"); +#endif +} + +TEST(parse_bom_fragment_invalid_utf16) +{ + xml_document doc; + + CHECK(doc.load_buffer("\xff\xfe", 2, parse_fragment, 
encoding_utf16_be)); + + const char_t* value = doc.text().get(); + +#ifdef PUGIXML_WCHAR_MODE + CHECK(value[0] == wchar_cast(0xfffe) && value[1] == 0); +#else + CHECK_STRING(value, "\xef\xbf\xbe"); +#endif +} + +TEST(parse_bom_fragment_invalid_utf32) +{ + xml_document doc; + + CHECK(doc.load_buffer("\xff\xff\x00\x00", 4, parse_fragment, encoding_utf32_le)); + + const char_t* value = doc.text().get(); + +#ifdef PUGIXML_WCHAR_MODE + CHECK(value[0] == wchar_cast(0xffff) && value[1] == 0); +#else + CHECK_STRING(value, "\xef\xbf\xbf"); +#endif +} diff --git a/tests/test_parse_doctype.cpp b/tests/test_parse_doctype.cpp index d7a3726..8976890 100644 --- a/tests/test_parse_doctype.cpp +++ b/tests/test_parse_doctype.cpp @@ -20,7 +20,7 @@ static xml_parse_result load_concat(xml_document& doc, const char_t* a, const ch strcat(buffer, c); #endif - return doc.load(buffer); + return doc.load(buffer, parse_fragment); } static bool test_doctype_wf(const char_t* decl) @@ -31,9 +31,9 @@ static bool test_doctype_wf(const char_t* decl) if (!load_concat(doc, decl) || !doc.first_child().empty()) return false; // pcdata pre/postfix - if (!load_concat(doc, STR("a"), decl) || !doc.first_child().empty()) return false; - if (!load_concat(doc, decl, STR("b")) || !doc.first_child().empty()) return false; - if (!load_concat(doc, STR("a"), decl, STR("b")) || !doc.first_child().empty()) return false; + if (!load_concat(doc, STR("a"), decl) || !test_node(doc, STR("a"), STR(""), format_raw)) return false; + if (!load_concat(doc, decl, STR("b")) || !test_node(doc, STR("b"), STR(""), format_raw)) return false; + if (!load_concat(doc, STR("a"), decl, STR("b")) || !test_node(doc, STR("ab"), STR(""), format_raw)) return false; // node pre/postfix if (!load_concat(doc, STR(""), decl) || !test_node(doc, STR(""), STR(""), format_raw)) return false; @@ -41,7 +41,7 @@ static bool test_doctype_wf(const char_t* decl) if (!load_concat(doc, STR(""), decl, STR("")) || !test_node(doc, STR(""), STR(""), 
format_raw)) return false; // check load-store contents preservation - CHECK(doc.load(decl, parse_doctype)); + CHECK(doc.load(decl, parse_doctype | parse_fragment)); CHECK_NODE(doc, decl); return true; @@ -281,8 +281,8 @@ TEST(parse_doctype_xmlconf_oasis_1) // not actually a doctype :) xml_document doc; - CHECK(doc.load(STR(" "), parse_full) && doc.first_child().type() == node_comment && doc.last_child().type() == node_comment && doc.first_child().next_sibling() == doc.last_child()); - CHECK(doc.load(STR(" &a%b&#c?>"), parse_full) && doc.first_child().type() == node_pi && doc.first_child() == doc.last_child()); + CHECK(doc.load(STR(" "), parse_full | parse_fragment) && doc.first_child().type() == node_comment && doc.last_child().type() == node_comment && doc.first_child().next_sibling() == doc.last_child()); + CHECK(doc.load(STR(" &a%b&#c?>"), parse_full | parse_fragment) && doc.first_child().type() == node_pi && doc.first_child() == doc.last_child()); } TEST(parse_doctype_xmlconf_xmltest_1) @@ -299,7 +299,7 @@ TEST(parse_doctype_xmlconf_xmltest_1) TEST_DOCTYPE_WF(" \"> ]>"); } -TEST_XML_FLAGS(parse_doctype_value, " \"> ]>", parse_minimal | parse_doctype) +TEST_XML_FLAGS(parse_doctype_value, " \"> ]>", parse_fragment | parse_doctype) { xml_node n = doc.first_child(); diff --git a/tests/test_write.cpp b/tests/test_write.cpp index de6f03d..465d111 100644 --- a/tests/test_write.cpp +++ b/tests/test_write.cpp @@ -25,19 +25,19 @@ TEST_XML(write_pcdata, "text") CHECK_NODE_EX(doc, STR("\n\t\n\t\t\n\t\ttext\n\t\n\n"), STR("\t"), format_indent); } -TEST_XML(write_cdata, "") +TEST_XML_FLAGS(write_cdata, "", parse_cdata | parse_fragment) { CHECK_NODE(doc, STR("")); CHECK_NODE_EX(doc, STR("\n"), STR(""), 0); } -TEST_XML(write_cdata_empty, "") +TEST_XML_FLAGS(write_cdata_empty, "", parse_cdata | parse_fragment) { CHECK_NODE(doc, STR("")); CHECK_NODE_EX(doc, STR("\n"), STR(""), 0); } -TEST_XML(write_cdata_escape, "") +TEST_XML_FLAGS(write_cdata_escape, "", parse_cdata | 
parse_fragment) { CHECK_NODE(doc, STR("")); @@ -51,26 +51,25 @@ TEST_XML(write_cdata_inner, "") CHECK_NODE_EX(doc, STR("\n"), STR(""), 0); } - -TEST_XML_FLAGS(write_comment, "", parse_default | parse_comments) +TEST_XML_FLAGS(write_comment, "", parse_comments | parse_fragment) { CHECK_NODE(doc, STR("")); CHECK_NODE_EX(doc, STR("\n"), STR(""), 0); } -TEST_XML_FLAGS(write_pi, "", parse_default | parse_pi) +TEST_XML_FLAGS(write_pi, "", parse_pi | parse_fragment) { CHECK_NODE(doc, STR("")); CHECK_NODE_EX(doc, STR("\n"), STR(""), 0); } -TEST_XML_FLAGS(write_declaration, "", parse_default | parse_declaration) +TEST_XML_FLAGS(write_declaration, "", parse_declaration | parse_fragment) { CHECK_NODE(doc, STR("")); CHECK_NODE_EX(doc, STR("\n"), STR(""), 0); } -TEST_XML_FLAGS(write_doctype, "", parse_default | parse_doctype) +TEST_XML_FLAGS(write_doctype, "", parse_doctype | parse_fragment) { CHECK_NODE(doc, STR("")); CHECK_NODE_EX(doc, STR("\n"), STR(""), 0); -- cgit v1.2.3