From 47c15ad949eb6589ee14d208444b4e759a611143 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 11 Feb 2014 06:45:27 +0000 Subject: Implement document fragment parsing. Introduce a notable behavior change in default parsing mode: documents without a document element node are now considered invalid. This is technically a breaking change, however the amount of documents it affects is very small, all parsed data still persists, and lack of this check results in very confusing behavior in a number of cases. In order to be able to parse documents without an element node, a fragment parsing flag is introduced. Parsing a buffer in fragment mode treats the buffer as a fragment of a valid XML. As a consequence, top-level PCDATA is added to the tree; additionally, there are no restrictions on the number of nodes -- so documents without a document element are considered valid. Due to the way parsing works internally, load_buffer_inplace occasionally can not preserve the document contents if it's parsed in a fragment mode. While unfortunate, this problem is fundamental; since the use case is relatively obscure, hopefully documenting this shortcoming will be enough. git-svn-id: https://pugixml.googlecode.com/svn/trunk@980 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'src/pugixml.hpp') diff --git a/src/pugixml.hpp b/src/pugixml.hpp index e19a4a3..e5009fe 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -151,6 +151,10 @@ namespace pugi // This flag is off by default; turning it on may result in slower parsing and more memory consumption. const unsigned int parse_ws_pcdata_single = 0x0400; + // This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document + // is a valid document. This flag is off by default. + const unsigned int parse_fragment = 0x0800; + // The default parsing mode. // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. @@ -880,7 +884,9 @@ namespace pugi status_bad_end_element, // Parsing error occurred while parsing end element tag status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag) - status_append_invalid_root // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) + status_append_invalid_root, // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) + + status_no_document_element // Parsing resulted in a document without element nodes }; // Parsing result -- cgit v1.2.3