Merge pull request #79 from zeux/embed-pcdata

Add parse_embed_pcdata flag. This flag determines if plain character data is to be stored in the parent element's value. This significantly changes the structure of the document; this flag is only recommended for parsing documents with a lot of PCDATA nodes in a very memory-constrained environment. Most high-level APIs continue to work; code that inspects DOM using first_child()/value() will have to be adapted.
author: Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2016-01-14 07:52:40 -0800
committer: Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2016-01-14 07:52:40 -0800
commit: c388dbeba4f5de655ca74eb21d0a6d29c5eaaee2 (patch)
tree: 2e4f67bf33ac0f4b982831b4cc31f61d50cec836
parent: ad3b492c1a4b3bf3a3163aa2af1641f422dba33f (diff)
parent: 4f3be7616729cbf0c8768caf861331d710d457a8 (diff)
4 files changed, 107 insertions, 8 deletions
diff --git a/docs/manual.adoc b/docs/manual.adoc
index ccf3fe7..1d8d88a 100644
--- a/docs/manual.adoc
+++ b/docs/manual.adoc
@@ -746,6 +746,9 @@ These flags control the resulting tree contents:
 
 * [[parse_ws_pcdata_single]]`parse_ws_pcdata_single` determines if whitespace-only PCDATA nodes that have no sibling nodes are to be put in DOM tree. In some cases application needs to parse the whitespace-only contents of nodes, i.e. `<node>  </node>`, but is not interested in whitespace markup elsewhere. It is possible to use <<parse_ws_pcdata,parse_ws_pcdata>> flag in this case, but it results in excessive allocations and complicates document processing; this flag can be used to avoid that. As an example, after parsing XML string `<node> <a>  </a> </node>` with `parse_ws_pcdata_single` flag set, `<node>` element will have one child `<a>`, and `<a>` element will have one child with type <<node_pcdata,node_pcdata>> and value `"  "`. This flag has no effect if <<parse_ws_pcdata,parse_ws_pcdata>> is enabled. This flag is *off* by default.
 
+* [[parse_embed_pcdata]]`parse_embed_pcdata` determines if PCDATA contents are to be saved as element values. Normally element nodes have names but not values; this flag forces the parser to store the contents as a value if PCDATA is the first child of the element node (otherwise a PCDATA node is created as usual). This can significantly reduce the memory required for documents with many PCDATA nodes. To retrieve the data you can use `xml_node::value()` on the element nodes or any of the higher-level functions like `child_value` or `text`.
+Since this flag significantly changes the DOM structure it is only recommended for parsing documents with many PCDATA nodes in memory-constrained environments. This flag is *off* by default.
+
 * [[parse_fragment]]`parse_fragment` determines if document should be treated as a fragment of a valid XML. Parsing document as a fragment leads to top-level PCDATA content (i.e. text that is not located inside a node) to be added to a tree, and additionally treats documents without element nodes as valid. This flag is *off* by default.
 
 CAUTION: Using in-place parsing (<<xml_document::load_buffer_inplace,load_buffer_inplace>>) with `parse_fragment` flag may result in the loss of the last character of the buffer if it is a part of PCDATA. Since PCDATA values are null-terminated strings, the only way to resolve this is to provide a null-terminated buffer as an input to `load_buffer_inplace` - i.e. `doc.load_buffer_inplace("test\0", 5, pugi::parse_default | pugi::parse_fragment)`.
@@ -2611,6 +2614,7 @@ const unsigned int +++<a href="#parse_pi">parse_pi</a>+++
 const unsigned int +++<a href="#parse_trim_pcdata">parse_trim_pcdata</a>+++
 const unsigned int +++<a href="#parse_ws_pcdata">parse_ws_pcdata</a>+++
 const unsigned int +++<a href="#parse_ws_pcdata_single">parse_ws_pcdata_single</a>+++
+const unsigned int +++<a href="#parse_embed_pcdata">parse_embed_pcdata</a>+++
 const unsigned int +++<a href="#parse_wconv_attribute">parse_wconv_attribute</a>+++
 const unsigned int +++<a href="#parse_wnorm_attribute">parse_wnorm_attribute</a>+++
 ----
diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index 35c0d8e..158a24d 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -3360,13 +3360,21 @@ PUGI__NS_BEGIN
 							
 					if (cursor->parent || PUGI__OPTSET(parse_fragment))
 					{
-						PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
-						cursor->value = s; // Save the offset.
+						if (PUGI__OPTSET(parse_embed_pcdata) && cursor->parent && !cursor->first_child && !cursor->value)
+						{
+							cursor->value = s; // Save the offset.
+						}
+						else
+						{
+							PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
+
+							cursor->value = s; // Save the offset.
+
+							PUGI__POPNODE(); // Pop since this is a standalone.
+						}
 
 						s = strconv_pcdata(s);
 								
-						PUGI__POPNODE(); // Pop since this is a standalone.
-						
 						if (!*s) break;
 					}
 					else
@@ -4009,17 +4017,40 @@ PUGI__NS_BEGIN
 		if (node->first_attribute)
 			node_output_attributes(writer, node, indent, indent_length, flags, depth);
 
-		if (!node->first_child)
+		// element nodes can have value if parse_embed_pcdata was used
+		if (!node->value)
 		{
-			writer.write(' ', '/', '>');
+			if (!node->first_child)
+			{
+				writer.write(' ', '/', '>');
 
-			return false;
+				return false;
+			}
+			else
+			{
+				writer.write('>');
+
+				return true;
+			}
 		}
 		else
 		{
 			writer.write('>');
 
-			return true;
+			text_output(writer, node->value, ctx_special_pcdata, flags);
+
+			if (!node->first_child)
+			{
+				writer.write('<', '/');
+				writer.write_string(name);
+				writer.write('>');
+
+				return false;
+			}
+			else
+			{
+				return true;
+			}
 		}
 	}
 
@@ -4127,6 +4158,10 @@ PUGI__NS_BEGIN
 
 					if (node_output_start(writer, node, indent, indent_length, flags, depth))
 					{
+						// element nodes can have value if parse_embed_pcdata was used
+						if (node->value)
+							indent_flags = 0;
+
 						node = node->first_child;
 						depth++;
 						continue;
@@ -5451,6 +5486,10 @@ namespace pugi
 	{
 		if (!_root) return PUGIXML_TEXT("");
 		
+		// element nodes can have value if parse_embed_pcdata was used
+		if (PUGI__NODETYPE(_root) == node_element && _root->value)
+			return _root->value;
+
 		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
 			if (impl::is_text_node(i) && i->value)
 				return i->value;
@@ -6198,6 +6237,10 @@ namespace pugi
 	{
 		if (!_root || impl::is_text_node(_root)) return _root;
 
+		// element nodes can have value if parse_embed_pcdata was used
+		if (PUGI__NODETYPE(_root) == node_element && _root->value)
+			return _root;
+
 		for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling)
 			if (impl::is_text_node(node))
 				return node;
@@ -7636,6 +7679,10 @@ PUGI__NS_BEGIN
 			{
 				xpath_string result;
 
+				// element nodes can have value if parse_embed_pcdata was used
+				if (n.value()[0])
+					result.append(xpath_string::from_const(n.value()), alloc);
+
 				xml_node cur = n.first_child();
 				
 				while (cur && cur != n)
diff --git a/src/pugixml.hpp b/src/pugixml.hpp
index 540e6ba..e561490 100644
--- a/src/pugixml.hpp
+++ b/src/pugixml.hpp
@@ -158,6 +158,11 @@ namespace pugi
 	// is a valid document. This flag is off by default.
 	const unsigned int parse_fragment = 0x1000;
 
+	// This flag determines if plain character data is to be stored in the parent element's value. This significantly changes the structure of
+	// the document; this flag is only recommended for parsing documents with many PCDATA nodes in memory-constrained environments.
+	// This flag is off by default.
+	const unsigned int parse_embed_pcdata = 0x2000;
+
 	// The default parsing mode.
 	// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
 	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp
index 2c3f125..47f774e 100644
--- a/tests/test_parse.cpp
+++ b/tests/test_parse.cpp
@@ -1139,3 +1139,46 @@ TEST(parse_fuzz_doctype)
 	xml_document doc;
 	CHECK(doc.load_buffer(data, sizeof(data)).status == status_bad_doctype);
 }
+
+TEST(parse_embed_pcdata)
+{
+	// parse twice - once with default and once with embed_pcdata flags
+	for (int i = 0; i < 2; ++i)
+	{
+		unsigned int flags = (i == 0) ? parse_default : parse_default | parse_embed_pcdata;
+
+		xml_document doc;
+		xml_parse_result res = doc.load_string(STR("<node><key>value</key><child><inner1>value1</inner1><inner2>value2</inner2>outer</child><two>text<data /></two></node>"), flags);
+		CHECK(res);
+
+		xml_node child = doc.child(STR("node")).child(STR("child"));
+
+		// parse_embed_pcdata omits PCDATA nodes so DOM is different
+		if (flags & parse_embed_pcdata)
+		{
+			CHECK_STRING(doc.child(STR("node")).child(STR("key")).value(), STR("value"));
+			CHECK(!doc.child(STR("node")).child(STR("key")).first_child());
+		}
+		else
+		{
+			CHECK_STRING(doc.child(STR("node")).child(STR("key")).value(), STR(""));
+			CHECK(doc.child(STR("node")).child(STR("key")).first_child());
+			CHECK_STRING(doc.child(STR("node")).child(STR("key")).first_child().value(), STR("value"));
+		}
+
+		// higher-level APIs work the same though
+		CHECK_STRING(child.text().get(), STR("outer"));
+		CHECK_STRING(child.child(STR("inner1")).text().get(), STR("value1"));
+
+		CHECK_STRING(child.child_value(), STR("outer"));
+		CHECK_STRING(child.child_value(STR("inner2")), STR("value2"));
+
+	#ifndef PUGIXML_NO_XPATH
+		CHECK_XPATH_NUMBER(doc, STR("count(node/child/*[starts-with(., 'value')])"), 2);
+	#endif
+
+		CHECK_NODE(doc, STR("<node><key>value</key><child><inner1>value1</inner1><inner2>value2</inner2>outer</child><two>text<data /></two></node>"));
+		CHECK_NODE_EX(doc, STR("<node>\n<key>value</key>\n<child>\n<inner1>value1</inner1>\n<inner2>value2</inner2>outer</child>\n<two>text<data />\n</two>\n</node>\n"), STR("\t"), 0);
+		CHECK_NODE_EX(doc, STR("<node>\n\t<key>value</key>\n\t<child>\n\t\t<inner1>value1</inner1>\n\t\t<inner2>value2</inner2>outer</child>\n\t<two>text<data />\n\t</two>\n</node>\n"), STR("\t"), format_indent);
+	}
+}
+\ No newline at end of file
author	Arseny Kapoulkine <arseny.kapoulkine@gmail.com>	2016-01-14 07:52:40 -0800
committer	Arseny Kapoulkine <arseny.kapoulkine@gmail.com>	2016-01-14 07:52:40 -0800
commit	c388dbeba4f5de655ca74eb21d0a6d29c5eaaee2 (patch)
tree	2e4f67bf33ac0f4b982831b4cc31f61d50cec836
parent	ad3b492c1a4b3bf3a3163aa2af1641f422dba33f (diff)
parent	4f3be7616729cbf0c8768caf861331d710d457a8 (diff)