summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2014-02-11 06:45:27 +0000
committer Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2014-02-11 06:45:27 +0000
commit 47c15ad949eb6589ee14d208444b4e759a611143 (patch)
tree 35822cba8d2d3c6e5384c960ff8ea503bf3cf235 /src
parent 5fa25a878aa472530cfa981d374d6e9fe4e12c7c (diff)
Implement document fragment parsing.
Introduce a notable behavior change in the default parsing mode: documents without a document element node are now considered invalid. This is technically a breaking change; however, the number of documents it affects is very small, all parsed data still persists, and the lack of this check results in very confusing behavior in a number of cases. In order to be able to parse documents without an element node, a fragment parsing flag is introduced. Parsing a buffer in fragment mode treats the buffer as a fragment of a valid XML document. As a consequence, top-level PCDATA is added to the tree; additionally, there are no restrictions on the number of nodes -- so documents without a document element are considered valid. Due to the way parsing works internally, load_buffer_inplace occasionally cannot preserve the document contents if the buffer is parsed in fragment mode. While unfortunate, this problem is fundamental; since the use case is relatively obscure, hopefully documenting this shortcoming will be enough.
git-svn-id: https://pugixml.googlecode.com/svn/trunk@980 99668b35-9821-0410-8761-19e4c4f06640
Diffstat (limited to 'src')
-rw-r--r-- src/pugixml.cpp 78
-rw-r--r-- src/pugixml.hpp 8
2 files changed, 67 insertions, 19 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index 926458e..1893125 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -2199,7 +2199,7 @@ PUGI__NS_BEGIN
char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel)
{
- assert(s[0] == '<' && s[1] == '!');
+ assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
s++;
while (*s)
@@ -2331,6 +2331,9 @@ PUGI__NS_BEGIN
s = parse_doctype_group(s, endch, true);
if (!s) return s;
+ assert((*s == 0 && endch == '>') || *s == '>');
+ if (*s) *s++ = 0;
+
if (PUGI__OPTSET(parse_doctype))
{
while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark;
@@ -2339,9 +2342,6 @@ PUGI__NS_BEGIN
cursor->value = mark;
- assert((*s == 0 && endch == '>') || *s == '>');
- if (*s) *s++ = 0;
-
PUGI__POPNODE();
}
}
@@ -2629,7 +2629,7 @@ PUGI__NS_BEGIN
PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here.
- if (*s == '<')
+ if (*s == '<' || !*s)
{
// We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
assert(mark != s);
@@ -2640,13 +2640,13 @@ PUGI__NS_BEGIN
}
else if (PUGI__OPTSET(parse_ws_pcdata_single))
{
- if (s[1] != '/' || cursor->first_child) continue;
+ if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue;
}
}
s = mark;
- if (cursor->parent)
+ if (cursor->parent || PUGI__OPTSET(parse_fragment))
{
PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
cursor->value = s; // Save the offset.
@@ -2676,14 +2676,43 @@ PUGI__NS_BEGIN
return s;
}
+ #ifdef PUGIXML_WCHAR_MODE
+ static char_t* parse_skip_bom(char_t* s) // Wide-character build: returns s advanced by one unit when it starts with 0xFEFF (the byte order mark), otherwise s unchanged.
+ {
+ return (s[0] == 0xfeff) ? s + 1 : s;
+ }
+ #else
+ static char_t* parse_skip_bom(char_t* s) // Non-wide build: returns s advanced by three units when it starts with the bytes EF BB BF (the UTF-8 byte order mark), otherwise s unchanged.
+ {
+ return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? s + 3 : s; // && short-circuits, so s[1] and s[2] are read only after the preceding units matched
+ }
+ #endif
+
+ static bool has_element_node_siblings(xml_node_struct* node) // Returns true if node itself or any node reachable through next_sibling is of type node_element; false otherwise.
+ {
+ while (node) // a null starting node falls straight through to "return false"
+ {
+ xml_node_type type = static_cast<xml_node_type>((node->header & impl::xml_memory_page_type_mask) + 1); // node type is recovered from the masked header bits plus one
+ if (type == node_element) return true; // stop at the first element found
+
+ node = node->next_sibling; // walk forward only; earlier siblings are never visited
+ }
+
+ return false;
+ }
+
static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk)
{
// allocator object is a part of document object
xml_allocator& alloc = *static_cast<xml_allocator*>(xmldoc);
// early-out for empty documents
- if (length == 0) return make_parse_result(status_ok);
+ if (length == 0)
+ return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element);
+ // get last child of the root before parsing
+ xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0;
+
// create parser on stack
xml_parser parser(alloc);
@@ -2691,24 +2720,35 @@ PUGI__NS_BEGIN
char_t endch = buffer[length - 1];
buffer[length - 1] = 0;
+ // skip BOM to make sure it does not end up as part of parse output
+ char_t* buffer_data = parse_skip_bom(buffer);
+
// perform actual parsing
- parser.parse_tree(buffer, root, optmsk, endch);
+ parser.parse_tree(buffer_data, root, optmsk, endch);
+
+ // update allocator state
+ alloc = parser.alloc;
xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);
- // roll back offset if it occurs on a null terminator in the source buffer
- if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0)
- result.offset--;
+ if (result)
+ {
+ // since we removed last character, we have to handle the only possible false positive (stray <)
+ if (endch == '<')
+ return make_parse_result(status_unrecognized_tag, length - 1);
- // update allocator state
- alloc = parser.alloc;
+ // check if there are any element nodes parsed
+ xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child;
- // since we removed last character, we have to handle the only possible false positive
- if (result && endch == '<')
+ if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed))
+ return make_parse_result(status_no_document_element, length - 1);
+ }
+ else
{
- // there's no possible well-formed document with < at the end
- return make_parse_result(status_unrecognized_tag, length - 1);
+ // roll back offset if it occurs on a null terminator in the source buffer
+ if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0)
+ result.offset--;
}
return result;
@@ -5469,6 +5509,8 @@ namespace pugi
case status_append_invalid_root: return "Unable to append nodes: root is not an element or document";
+ case status_no_document_element: return "No document element found";
+
default: return "Unknown error";
}
}
diff --git a/src/pugixml.hpp b/src/pugixml.hpp
index e19a4a3..e5009fe 100644
--- a/src/pugixml.hpp
+++ b/src/pugixml.hpp
@@ -151,6 +151,10 @@ namespace pugi
// This flag is off by default; turning it on may result in slower parsing and more memory consumption.
const unsigned int parse_ws_pcdata_single = 0x0400;
+ // This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document
+ // is a valid document. This flag is off by default.
+ const unsigned int parse_fragment = 0x0800;
+
// The default parsing mode.
// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
@@ -880,7 +884,9 @@ namespace pugi
status_bad_end_element, // Parsing error occurred while parsing end element tag
status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
- status_append_invalid_root // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer)
+ status_append_invalid_root, // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer)
+
+ status_no_document_element // Parsing resulted in a document without element nodes
};
// Parsing result