From 2874f6f21dc22efab1a2884fe463c5461955a225 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Fri, 8 Jan 2016 08:37:26 -0800 Subject: Add initial support for parse_embed_pcdata When this flag is true, PCDATA value is saved to the parent element instead of allocating a new node. This prevents some documents from round-tripping since it loses information, but can provide a significant memory reduction and parsing speedup for some documents. --- src/pugixml.cpp | 17 +++++++++++++---- src/pugixml.hpp | 5 +++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 35c0d8e..de87dcf 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -3360,13 +3360,22 @@ PUGI__NS_BEGIN if (cursor->parent || PUGI__OPTSET(parse_fragment)) { - PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree. - cursor->value = s; // Save the offset. + if (!PUGI__OPTSET(parse_embed_pcdata)) + { + PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree. + + cursor->value = s; // Save the offset. + + PUGI__POPNODE(); // Pop since this is a standalone. + } + else + { + if (cursor->parent && !cursor->value) + cursor->value = s; // Save the offset. + } s = strconv_pcdata(s); - PUGI__POPNODE(); // Pop since this is a standalone. - if (!*s) break; } else diff --git a/src/pugixml.hpp b/src/pugixml.hpp index 540e6ba..4ed6f55 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -158,6 +158,11 @@ namespace pugi // is a valid document. This flag is off by default. const unsigned int parse_fragment = 0x1000; + // This flag determines if plain character data is be stored in the parent element's value. This significantly changes the structure of + // the document and does not allow some documents to round-trip; this flag is only recommended for parsing documents with a lot of + // PCDATA nodes in a very memory-constrained environment. This flag is off by default. + const unsigned int parse_embed_pcdata = 0x2000; + // The default parsing mode. // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. -- cgit v1.2.3 From 8b01f8923c047edf904c766c59ac359b807e7643 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Fri, 8 Jan 2016 08:40:56 -0800 Subject: Support xml_node::child_value/text for parse_embed_pcdata --- src/pugixml.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index de87dcf..8c5b9e1 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -5464,6 +5464,10 @@ namespace pugi if (impl::is_text_node(i) && i->value) return i->value; + // element nodes can have value if parse_embed_pcdata was used + if (PUGI__NODETYPE(_root) == node_element && _root->value) + return _root->value; + return PUGIXML_TEXT(""); } @@ -6211,6 +6215,10 @@ namespace pugi if (impl::is_text_node(node)) return node; + // element nodes can have value if parse_embed_pcdata was used + if (PUGI__NODETYPE(_root) == node_element && _root->value) + return _root; + return 0; } -- cgit v1.2.3 From 85d8b225f2276333001dc0f96179bfef012277ae Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Fri, 8 Jan 2016 08:41:38 -0800 Subject: Support XPath string value for parse_embed_pcdata --- src/pugixml.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 8c5b9e1..c018359 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -7653,6 +7653,10 @@ PUGI__NS_BEGIN { xpath_string result; + // element nodes can have value if parse_embed_pcdata was used + if (n.value()[0]) + result.append(xpath_string::from_const(n.value()), alloc); + xml_node cur = n.first_child(); while (cur && cur != n) -- cgit v1.2.3 From df2a0ad28b24681fdc39275b5260132b0a3e6918 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Sat, 9 Jan 2016 17:46:42 -0800 Subject: Implement output support for embedded PCDATA values This is a bit awkward since preserving correct indentation structure requires a bit of extra work, and the closing tag has to be written by _start function to correctly process the rest of the tree. --- src/pugixml.cpp | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index c018359..90c677e 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -4018,17 +4018,40 @@ PUGI__NS_BEGIN if (node->first_attribute) node_output_attributes(writer, node, indent, indent_length, flags, depth); - if (!node->first_child) + // element nodes can have value if parse_embed_pcdata was used + if (!node->value) { - writer.write(' ', '/', '>'); + if (!node->first_child) + { + writer.write(' ', '/', '>'); - return false; + return false; + } + else + { + writer.write('>'); + + return true; + } } else { writer.write('>'); - return true; + text_output(writer, node->value, ctx_special_pcdata, flags); + + if (!node->first_child) + { + writer.write('<', '/'); + writer.write_string(name); + writer.write('>'); + + return false; + } + else + { + return true; + } } } @@ -4136,6 +4159,10 @@ PUGI__NS_BEGIN if (node_output_start(writer, node, indent, indent_length, flags, depth)) { + // element nodes can have value if parse_embed_pcdata was used + if (node->value) + indent_flags = 0; + node = node->first_child; depth++; continue; -- cgit v1.2.3 From bcddf36559c4293d34fd275a4d392982fae94998 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 12 Jan 2016 20:01:44 -0800 Subject: Only save first PCDATA contents in the element This change fixes an important ordering issue - if element node has a PCDATA child *after* other elements, it's impossible to tell which order the children were in. Since the goal of PCDATA embedding is to save memory when it's the only child, only apply the optimization to the first child. This seems to fix all roundtripping issues so the only caveat is that the DOM structure is different. --- src/pugixml.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 90c677e..f447e97 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -3360,7 +3360,11 @@ PUGI__NS_BEGIN if (cursor->parent || PUGI__OPTSET(parse_fragment)) { - if (!PUGI__OPTSET(parse_embed_pcdata)) + if (PUGI__OPTSET(parse_embed_pcdata) && cursor->parent && !cursor->first_child && !cursor->value) + { + cursor->value = s; // Save the offset. + } + else { PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree. @@ -3368,11 +3372,6 @@ PUGI__NS_BEGIN PUGI__POPNODE(); // Pop since this is a standalone. } - else - { - if (cursor->parent && !cursor->value) - cursor->value = s; // Save the offset. - } s = strconv_pcdata(s); -- cgit v1.2.3 From fc6c8633ddd60ff72592986938fa9e0a31f508d1 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 12 Jan 2016 20:16:29 -0800 Subject: tests: Add test for parse_embed_pcdata --- tests/test_parse.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index 2c3f125..47f774e 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -1139,3 +1139,46 @@ TEST(parse_fuzz_doctype) xml_document doc; CHECK(doc.load_buffer(data, sizeof(data)).status == status_bad_doctype); } + +TEST(parse_embed_pcdata) +{ + // parse twice - once with default and once with embed_pcdata flags + for (int i = 0; i < 2; ++i) + { + unsigned int flags = (i == 0) ? parse_default : parse_default | parse_embed_pcdata; + + xml_document doc; + xml_parse_result res = doc.load_string(STR("valuevalue1value2outertext"), flags); + CHECK(res); + + xml_node child = doc.child(STR("node")).child(STR("child")); + + // parse_embed_pcdata omits PCDATA nodes so DOM is different + if (flags & parse_embed_pcdata) + { + CHECK_STRING(doc.child(STR("node")).child(STR("key")).value(), STR("value")); + CHECK(!doc.child(STR("node")).child(STR("key")).first_child()); + } + else + { + CHECK_STRING(doc.child(STR("node")).child(STR("key")).value(), STR("")); + CHECK(doc.child(STR("node")).child(STR("key")).first_child()); + CHECK_STRING(doc.child(STR("node")).child(STR("key")).first_child().value(), STR("value")); + } + + // higher-level APIs work the same though + CHECK_STRING(child.text().get(), STR("outer")); + CHECK_STRING(child.child(STR("inner1")).text().get(), STR("value1")); + + CHECK_STRING(child.child_value(), STR("outer")); + CHECK_STRING(child.child_value(STR("inner2")), STR("value2")); + + #ifndef PUGIXML_NO_XPATH + CHECK_XPATH_NUMBER(doc, STR("count(node/child/*[starts-with(., 'value')])"), 2); + #endif + + CHECK_NODE(doc, STR("valuevalue1value2outertext")); + CHECK_NODE_EX(doc, STR("\nvalue\n\nvalue1\nvalue2outer\ntext\n\n\n"), STR("\t"), 0); + CHECK_NODE_EX(doc, STR("\n\tvalue\n\t\n\t\tvalue1\n\t\tvalue2outer\n\ttext\n\t\n\n"), STR("\t"), format_indent); + } +} \ No newline at end of file -- cgit v1.2.3 From 71d3a797f4c1991070afb868e584f1ab3e8ad669 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 12 Jan 2016 20:18:12 -0800 Subject: Adjust parse_embed_pcdata documentation Since round-tripping should not be a problem any more don't mention it. --- src/pugixml.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pugixml.hpp b/src/pugixml.hpp index 4ed6f55..e561490 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -159,8 +159,8 @@ namespace pugi const unsigned int parse_fragment = 0x1000; // This flag determines if plain character data is be stored in the parent element's value. This significantly changes the structure of - // the document and does not allow some documents to round-trip; this flag is only recommended for parsing documents with a lot of - // PCDATA nodes in a very memory-constrained environment. This flag is off by default. + // the document; this flag is only recommended for parsing documents with many PCDATA nodes in memory-constrained environments. + // This flag is off by default. const unsigned int parse_embed_pcdata = 0x2000; // The default parsing mode. -- cgit v1.2.3 From 85238132d3e3029d0454d5e16222a7d48b26e381 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 12 Jan 2016 20:38:45 -0800 Subject: docs: Add parse_embed_pcdata documentation --- docs/manual.adoc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/manual.adoc b/docs/manual.adoc index ccf3fe7..1d8d88a 100644 --- a/docs/manual.adoc +++ b/docs/manual.adoc @@ -746,6 +746,9 @@ These flags control the resulting tree contents: * [[parse_ws_pcdata_single]]`parse_ws_pcdata_single` determines if whitespace-only PCDATA nodes that have no sibling nodes are to be put in DOM tree. In some cases application needs to parse the whitespace-only contents of nodes, i.e. ` `, but is not interested in whitespace markup elsewhere. It is possible to use <> flag in this case, but it results in excessive allocations and complicates document processing; this flag can be used to avoid that. As an example, after parsing XML string ` ` with `parse_ws_pcdata_single` flag set, `` element will have one child ``, and `` element will have one child with type <> and value `" "`. This flag has no effect if <> is enabled. This flag is *off* by default. +* [[parse_embed_pcdata]]`parse_embed_pcdata` determines if PCDATA contents is to be saved as element values. Normally element nodes have names but not values; this flag forces the parser to store the contents as a value if PCDATA is the first child of the element node (otherwise PCDATA node is created as usual). This can significantly reduce the memory required for documents with many PCDATA nodes. To retrieve the data you can use `xml_node::value()` on the element nodes or any of the higher-level functions like `child_value` or `text`. This flag is *off* by default. +Since this flag significantly changes the DOM structure it is only recommended for parsing documents with many PCDATA nodes in memory-constrained environments. This flag is *off* by default. + * [[parse_fragment]]`parse_fragment` determines if document should be treated as a fragment of a valid XML. Parsing document as a fragment leads to top-level PCDATA content (i.e. text that is not located inside a node) to be added to a tree, and additionally treats documents without element nodes as valid. This flag is *off* by default. CAUTION: Using in-place parsing (<>) with `parse_fragment` flag may result in the loss of the last character of the buffer if it is a part of PCDATA. Since PCDATA values are null-terminated strings, the only way to resolve this is to provide a null-terminated buffer as an input to `load_buffer_inplace` - i.e. `doc.load_buffer_inplace("test\0", 5, pugi::parse_default | pugi::parse_fragment)`. @@ -2611,6 +2614,7 @@ const unsigned int +++parse_pi+++ const unsigned int +++parse_trim_pcdata+++ const unsigned int +++parse_ws_pcdata+++ const unsigned int +++parse_ws_pcdata_single+++ +const unsigned int +++parse_embed_pcdata+++ const unsigned int +++parse_wconv_attribute+++ const unsigned int +++parse_wnorm_attribute+++ ---- -- cgit v1.2.3 From 4f3be7616729cbf0c8768caf861331d710d457a8 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 12 Jan 2016 20:41:37 -0800 Subject: Preserve order semantics for child_value/text when using parse_embed_pcdata The performance cost is probably negligible and this means we treat embedded value as the first child consistently. --- src/pugixml.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index f447e97..158a24d 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -5486,14 +5486,14 @@ namespace pugi { if (!_root) return PUGIXML_TEXT(""); - for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) - if (impl::is_text_node(i) && i->value) - return i->value; - // element nodes can have value if parse_embed_pcdata was used if (PUGI__NODETYPE(_root) == node_element && _root->value) return _root->value; + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (impl::is_text_node(i) && i->value) + return i->value; + return PUGIXML_TEXT(""); } @@ -6237,14 +6237,14 @@ namespace pugi { if (!_root || impl::is_text_node(_root)) return _root; - for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling) - if (impl::is_text_node(node)) - return node; - // element nodes can have value if parse_embed_pcdata was used if (PUGI__NODETYPE(_root) == node_element && _root->value) return _root; + for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling) + if (impl::is_text_node(node)) + return node; + return 0; } -- cgit v1.2.3