From 47c15ad949eb6589ee14d208444b4e759a611143 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 11 Feb 2014 06:45:27 +0000 Subject: Implement document fragment parsing. Introduce a notable behavior change in default parsing mode: documents without a document element node are now considered invalid. This is technically a breaking change; however, the number of documents it affects is very small, all parsed data still persists, and the lack of this check results in very confusing behavior in a number of cases. In order to be able to parse documents without an element node, a fragment parsing flag is introduced. Parsing a buffer in fragment mode treats the buffer as a fragment of a valid XML document. As a consequence, top-level PCDATA is added to the tree; additionally, there are no restrictions on the number of nodes -- so documents without a document element are considered valid. Due to the way parsing works internally, load_buffer_inplace occasionally cannot preserve the document contents if it is parsed in fragment mode. While unfortunate, this problem is fundamental; since the use case is relatively obscure, hopefully documenting this shortcoming will be enough. 
git-svn-id: https://pugixml.googlecode.com/svn/trunk@980 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 78 ++++++++++++++++++++++++++++++++++++++++++++------------- src/pugixml.hpp | 8 +++++- 2 files changed, 67 insertions(+), 19 deletions(-) (limited to 'src') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 926458e..1893125 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -2199,7 +2199,7 @@ PUGI__NS_BEGIN char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel) { - assert(s[0] == '<' && s[1] == '!'); + assert((s[0] == '<' || s[0] == 0) && s[1] == '!'); s++; while (*s) @@ -2331,6 +2331,9 @@ PUGI__NS_BEGIN s = parse_doctype_group(s, endch, true); if (!s) return s; + assert((*s == 0 && endch == '>') || *s == '>'); + if (*s) *s++ = 0; + if (PUGI__OPTSET(parse_doctype)) { while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark; @@ -2339,9 +2342,6 @@ PUGI__NS_BEGIN cursor->value = mark; - assert((*s == 0 && endch == '>') || *s == '>'); - if (*s) *s++ = 0; - PUGI__POPNODE(); } } @@ -2629,7 +2629,7 @@ PUGI__NS_BEGIN PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here. - if (*s == '<') + if (*s == '<' || !*s) { // We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one assert(mark != s); @@ -2640,13 +2640,13 @@ PUGI__NS_BEGIN } else if (PUGI__OPTSET(parse_ws_pcdata_single)) { - if (s[1] != '/' || cursor->first_child) continue; + if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue; } } s = mark; - if (cursor->parent) + if (cursor->parent || PUGI__OPTSET(parse_fragment)) { PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree. cursor->value = s; // Save the offset. @@ -2676,14 +2676,43 @@ PUGI__NS_BEGIN return s; } + #ifdef PUGIXML_WCHAR_MODE + static char_t* parse_skip_bom(char_t* s) + { + return (s[0] == 0xfeff) ? s + 1 : s; + } + #else + static char_t* parse_skip_bom(char_t* s) + { + return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? 
s + 3 : s; + } + #endif + + static bool has_element_node_siblings(xml_node_struct* node) + { + while (node) + { + xml_node_type type = static_cast((node->header & impl::xml_memory_page_type_mask) + 1); + if (type == node_element) return true; + + node = node->next_sibling; + } + + return false; + } + static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk) { // allocator object is a part of document object xml_allocator& alloc = *static_cast(xmldoc); // early-out for empty documents - if (length == 0) return make_parse_result(status_ok); + if (length == 0) + return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element); + // get last child of the root before parsing + xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0; + // create parser on stack xml_parser parser(alloc); @@ -2691,24 +2720,35 @@ PUGI__NS_BEGIN char_t endch = buffer[length - 1]; buffer[length - 1] = 0; + // skip BOM to make sure it does not end up as part of parse output + char_t* buffer_data = parse_skip_bom(buffer); + // perform actual parsing - parser.parse_tree(buffer, root, optmsk, endch); + parser.parse_tree(buffer_data, root, optmsk, endch); + + // update allocator state + alloc = parser.alloc; xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? 
parser.error_offset - buffer : 0); assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length); - // roll back offset if it occurs on a null terminator in the source buffer - if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0) - result.offset--; + if (result) + { + // since we removed last character, we have to handle the only possible false positive (stray <) + if (endch == '<') + return make_parse_result(status_unrecognized_tag, length - 1); - // update allocator state - alloc = parser.alloc; + // check if there are any element nodes parsed + xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child; - // since we removed last character, we have to handle the only possible false positive - if (result && endch == '<') + if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed)) + return make_parse_result(status_no_document_element, length - 1); + } + else { - // there's no possible well-formed document with < at the end - return make_parse_result(status_unrecognized_tag, length - 1); + // roll back offset if it occurs on a null terminator in the source buffer + if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0) + result.offset--; } return result; @@ -5469,6 +5509,8 @@ namespace pugi case status_append_invalid_root: return "Unable to append nodes: root is not an element or document"; + case status_no_document_element: return "No document element found"; + default: return "Unknown error"; } } diff --git a/src/pugixml.hpp b/src/pugixml.hpp index e19a4a3..e5009fe 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -151,6 +151,10 @@ namespace pugi // This flag is off by default; turning it on may result in slower parsing and more memory consumption. 
const unsigned int parse_ws_pcdata_single = 0x0400; + // This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document + // is a valid document. This flag is off by default. + const unsigned int parse_fragment = 0x0800; + // The default parsing mode. // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. @@ -880,7 +884,9 @@ namespace pugi status_bad_end_element, // Parsing error occurred while parsing end element tag status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag) - status_append_invalid_root // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) + status_append_invalid_root, // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) + + status_no_document_element // Parsing resulted in a document without element nodes }; // Parsing result -- cgit v1.2.3