From 47c15ad949eb6589ee14d208444b4e759a611143 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 11 Feb 2014 06:45:27 +0000 Subject: Implement document fragment parsing. Introduce a notable behavior change in default parsing mode: documents without a document element node are now considered invalid. This is technically a breaking change; however, the number of documents it affects is very small, all parsed data still persists, and lack of this check results in very confusing behavior in a number of cases. In order to be able to parse documents without an element node, a fragment parsing flag is introduced. Parsing a buffer in fragment mode treats the buffer as a fragment of a valid XML document. As a consequence, top-level PCDATA is added to the tree; additionally, there are no restrictions on the number of nodes -- so documents without a document element are considered valid. Due to the way parsing works internally, load_buffer_inplace occasionally cannot preserve the document contents if it's parsed in fragment mode. While unfortunate, this problem is fundamental; since the use case is relatively obscure, hopefully documenting this shortcoming will be enough. 
git-svn-id: https://pugixml.googlecode.com/svn/trunk@980 99668b35-9821-0410-8761-19e4c4f06640 --- tests/test_document.cpp | 135 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 35 deletions(-) (limited to 'tests/test_document.cpp') diff --git a/tests/test_document.cpp b/tests/test_document.cpp index 3ac8bf8..e6c7b00 100644 --- a/tests/test_document.cpp +++ b/tests/test_document.cpp @@ -249,7 +249,7 @@ TEST(document_load_file_empty) { pugi::xml_document doc; - CHECK(doc.load_file("tests/data/empty.xml")); + CHECK(doc.load_file("tests/data/empty.xml").status == status_no_document_element); CHECK(!doc.first_child()); } @@ -907,16 +907,52 @@ TEST(document_load_buffer_empty) xml_encoding encoding = encodings[i]; xml_document doc; - CHECK(doc.load_buffer(buffer, 0, parse_default, encoding) && !doc.first_child()); - CHECK(doc.load_buffer(0, 0, parse_default, encoding) && !doc.first_child()); + CHECK(doc.load_buffer(buffer, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); + CHECK(doc.load_buffer(0, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); - CHECK(doc.load_buffer_inplace(buffer, 0, parse_default, encoding) && !doc.first_child()); - CHECK(doc.load_buffer_inplace(0, 0, parse_default, encoding) && !doc.first_child()); + CHECK(doc.load_buffer_inplace(buffer, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); + CHECK(doc.load_buffer_inplace(0, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); void* own_buffer = pugi::get_memory_allocation_function()(1); - CHECK(doc.load_buffer_inplace_own(own_buffer, 0, parse_default, encoding) && !doc.first_child()); - CHECK(doc.load_buffer_inplace_own(0, 0, parse_default, encoding) && !doc.first_child()); + CHECK(doc.load_buffer_inplace_own(own_buffer, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); + 
CHECK(doc.load_buffer_inplace_own(0, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); + } +} + +TEST(document_load_buffer_empty_fragment) +{ + xml_encoding encodings[] = + { + encoding_auto, + encoding_utf8, + encoding_utf16_le, + encoding_utf16_be, + encoding_utf16, + encoding_utf32_le, + encoding_utf32_be, + encoding_utf32, + encoding_wchar, + encoding_latin1 + }; + + char buffer[1]; + + for (unsigned int i = 0; i < sizeof(encodings) / sizeof(encodings[0]); ++i) + { + xml_encoding encoding = encodings[i]; + + xml_document doc; + CHECK(doc.load_buffer(buffer, 0, parse_fragment, encoding) && !doc.first_child()); + CHECK(doc.load_buffer(0, 0, parse_fragment, encoding) && !doc.first_child()); + + CHECK(doc.load_buffer_inplace(buffer, 0, parse_fragment, encoding) && !doc.first_child()); + CHECK(doc.load_buffer_inplace(0, 0, parse_fragment, encoding) && !doc.first_child()); + + void* own_buffer = pugi::get_memory_allocation_function()(1); + + CHECK(doc.load_buffer_inplace_own(own_buffer, 0, parse_fragment, encoding) && !doc.first_child()); + CHECK(doc.load_buffer_inplace_own(0, 0, parse_fragment, encoding) && !doc.first_child()); } } @@ -933,13 +969,27 @@ TEST(document_progressive_truncation) { char* truncated_data = buffer + original_size - i; - memcpy(truncated_data, original_data, i); + // default flags + { + memcpy(truncated_data, original_data, i); - xml_document doc; - bool result = doc.load_buffer_inplace(truncated_data, i); + xml_document doc; + bool result = doc.load_buffer_inplace(truncated_data, i); + + // only eof is parseable + CHECK((i >= 3325) ? 
result : !result); + } + + // fragment mode + { + memcpy(truncated_data, original_data, i); + + xml_document doc; + bool result = doc.load_buffer_inplace(truncated_data, i, parse_default | parse_fragment); - // some truncate locations are parseable - those that come after declaration, declaration + doctype, declaration + doctype + comment and eof - CHECK(((i - 21) < 3 || (i - 66) < 3 || (i - 95) < 3 || i >= 3325) ? result : !result); + // some truncate locations are parseable - those that come after declaration, declaration + doctype, declaration + doctype + comment and eof + CHECK(((i - 21) < 3 || (i - 66) < 3 || (i - 95) < 3 || i >= 3325) ? result : !result); + } } delete[] buffer; @@ -953,12 +1003,29 @@ TEST(document_load_buffer_short) xml_document doc; - CHECK(doc.load_buffer(data, 4)); - CHECK(doc.load_buffer(data + 1, 3)); - CHECK(doc.load_buffer(data + 2, 2)); - CHECK(doc.load_buffer(data + 3, 1)); - CHECK(doc.load_buffer(data + 4, 0)); - CHECK(doc.load_buffer(0, 0)); + CHECK(doc.load_buffer(data, 4).status == status_no_document_element); + CHECK(doc.load_buffer(data + 1, 3).status == status_no_document_element); + CHECK(doc.load_buffer(data + 2, 2).status == status_no_document_element); + CHECK(doc.load_buffer(data + 3, 1).status == status_no_document_element); + CHECK(doc.load_buffer(data + 4, 0).status == status_no_document_element); + CHECK(doc.load_buffer(0, 0).status == status_no_document_element); + + delete[] data; +} + +TEST(document_load_buffer_short_fragment) +{ + char* data = new char[4]; + memcpy(data, "abcd", 4); + + xml_document doc; + + CHECK(doc.load_buffer(data, 4, parse_fragment) && test_string_equal(doc.text().get(), STR("abcd"))); + CHECK(doc.load_buffer(data + 1, 3, parse_fragment) && test_string_equal(doc.text().get(), STR("bcd"))); + CHECK(doc.load_buffer(data + 2, 2, parse_fragment) && test_string_equal(doc.text().get(), STR("cd"))); + CHECK(doc.load_buffer(data + 3, 1, parse_fragment) && test_string_equal(doc.text().get(), 
STR("d"))); + CHECK(doc.load_buffer(data + 4, 0, parse_fragment) && !doc.first_child()); + CHECK(doc.load_buffer(0, 0, parse_fragment) && !doc.first_child()); delete[] data; } @@ -970,12 +1037,12 @@ TEST(document_load_buffer_inplace_short) xml_document doc; - CHECK(doc.load_buffer_inplace(data, 4)); - CHECK(doc.load_buffer_inplace(data + 1, 3)); - CHECK(doc.load_buffer_inplace(data + 2, 2)); - CHECK(doc.load_buffer_inplace(data + 3, 1)); - CHECK(doc.load_buffer_inplace(data + 4, 0)); - CHECK(doc.load_buffer_inplace(0, 0)); + CHECK(doc.load_buffer_inplace(data, 4).status == status_no_document_element); + CHECK(doc.load_buffer_inplace(data + 1, 3).status == status_no_document_element); + CHECK(doc.load_buffer_inplace(data + 2, 2).status == status_no_document_element); + CHECK(doc.load_buffer_inplace(data + 3, 1).status == status_no_document_element); + CHECK(doc.load_buffer_inplace(data + 4, 0).status == status_no_document_element); + CHECK(doc.load_buffer_inplace(0, 0).status == status_no_document_element); delete[] data; } @@ -1006,7 +1073,7 @@ TEST_XML_FLAGS(document_element, "", parse_comments) +TEST_XML_FLAGS(document_element_absent, "", parse_comments | parse_fragment) { CHECK(doc.document_element() == xml_node()); } @@ -1070,16 +1137,6 @@ TEST_XML(document_reset_copy_self, "") CHECK_NODE(doc, STR("")); } -struct document_data_t -{ - xml_encoding encoding; - - const unsigned char* data; - size_t size; -}; - -#include - TEST(document_load_buffer_utf_truncated) { const unsigned char utf8[] = {'<', 0xe2, 0x82, 0xac, '/', '>'}; @@ -1088,6 +1145,14 @@ TEST(document_load_buffer_utf_truncated) const unsigned char utf32_be[] = {0, 0, 0, '<', 0, 0, 0x20, 0xac, 0, 0, 0, '/', 0, 0, 0, '>'}; const unsigned char utf32_le[] = {'<', 0, 0, 0, 0xac, 0x20, 0, 0, '/', 0, 0, 0, '>', 0, 0, 0}; + struct document_data_t + { + xml_encoding encoding; + + const unsigned char* data; + size_t size; + }; + const document_data_t data[] = { { encoding_utf8, utf8, sizeof(utf8) }, -- cgit 
v1.2.3