diff options
author | Arseny Kapoulkine <arseny.kapoulkine@gmail.com> | 2015-03-18 08:34:23 -0700 |
---|---|---|
committer | Arseny Kapoulkine <arseny.kapoulkine@gmail.com> | 2015-03-18 09:59:17 -0700 |
commit | 5f996eba6deaa804bf4caced8acc65d8626720d6 (patch) | |
tree | 6f950e655956c17b657f1239ab0a9f655bf83c87 | |
parent | 51da129b50a0b99ee85af20cc4a4b77f6bc823ff (diff) |
Do not emit surrounding whitespace for text nodes
Previously we omitted extra whitespace for single PCDATA/CDATA children, but in
mixed content there was extra indentation before/after text nodes.
One of the problems with that is that the text that you saved is not exactly
the same as the parsing result using default flags (parse_trim_pcdata helps).
Another problem is that parse-format cycles do not have a fixed point for mixed
content - the result expands indefinitely. Some XML libraries, like Python
minidom, have the same issue, but this is definitely a problem.
Pretty-printing mixed content is hard. It seems that the only other sensible
choice is to switch mixed content nodes to raw formatting. In a way the code in
this change is a weaker version of that - it removes indentation around text
nodes but still keeps it around element siblings/children.
Thus we can switch to mixed-raw formatting at some point later, which will be
a superset of the current behavior.
To do this we have to either switch at the first text node (.NET XmlDocument
does that), or scan the children of each element for a possible text node and
switch before we output the first child.
The former behavior seems non-intuitive (and a bit broken); unfortunately, the
latter behavior can cost up to 20% of the output time for trees *without* mixed
content.
Fixes #13.
-rw-r--r-- | src/pugixml.cpp | 124 | ||||
-rw-r--r-- | tests/test_write.cpp | 53 | ||||
-rw-r--r-- | tests/writer_string.cpp | 6 |
3 files changed, 111 insertions, 72 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 4269335..ac90c5f 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -3511,61 +3511,28 @@ PUGI__NS_BEGIN if (node->first_attribute) node_output_attributes(writer, node, flags); - if (flags & format_raw) + if (!node->first_child) { - if (!node->first_child) - writer.write(' ', '/', '>'); - else - { - writer.write('>'); + writer.write(' ', '/', '>'); - return true; - } + return false; } else { - xml_node_struct* first = node->first_child; - - if (!first) - writer.write(' ', '/', '>', '\n'); - else if (!first->next_sibling && (PUGI__NODETYPE(first) == node_pcdata || PUGI__NODETYPE(first) == node_cdata)) - { - writer.write('>'); - - const char_t* value = first->value ? first->value : PUGIXML_TEXT(""); - - if (PUGI__NODETYPE(first) == node_pcdata) - text_output(writer, value, ctx_special_pcdata, flags); - else - text_output_cdata(writer, value); - - writer.write('<', '/'); - writer.write_string(name); - writer.write('>', '\n'); - } - else - { - writer.write('>', '\n'); + writer.write('>'); - return true; - } + return true; } - - return false; } - PUGI__FN void node_output_end(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags) + PUGI__FN void node_output_end(xml_buffered_writer& writer, xml_node_struct* node) { const char_t* default_name = PUGIXML_TEXT(":anonymous"); const char_t* name = node->name ? node->name : default_name; writer.write('<', '/'); writer.write_string(name); - - if (flags & format_raw) - writer.write('>'); - else - writer.write('>', '\n'); + writer.write('>'); } PUGI__FN void node_output_simple(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags) @@ -3576,17 +3543,14 @@ PUGI__NS_BEGIN { case node_pcdata: text_output(writer, node->value ? node->value : PUGIXML_TEXT(""), ctx_special_pcdata, flags); - if ((flags & format_raw) == 0) writer.write('\n'); break; case node_cdata: text_output_cdata(writer, node->value ? node->value : PUGIXML_TEXT("")); - if ((flags & format_raw) == 0) writer.write('\n'); break; case node_comment: node_output_comment(writer, node->value ? node->value : PUGIXML_TEXT("")); - if ((flags & format_raw) == 0) writer.write('\n'); break; case node_pi: @@ -3600,7 +3564,6 @@ PUGI__NS_BEGIN } writer.write('?', '>'); - if ((flags & format_raw) == 0) writer.write('\n'); break; case node_declaration: @@ -3608,7 +3571,6 @@ PUGI__NS_BEGIN writer.write_string(node->name ? node->name : default_name); node_output_attributes(writer, node, flags); writer.write('?', '>'); - if ((flags & format_raw) == 0) writer.write('\n'); break; case node_doctype: @@ -3622,7 +3584,6 @@ PUGI__NS_BEGIN } writer.write('>'); - if ((flags & format_raw) == 0) writer.write('\n'); break; default: @@ -3630,9 +3591,16 @@ PUGI__NS_BEGIN } } + enum indent_flags_t + { + indent_newline = 1, + indent_indent = 2 + }; + PUGI__FN void node_output(xml_buffered_writer& writer, xml_node_struct* root, const char_t* indent, unsigned int flags, unsigned int depth) { size_t indent_length = ((flags & (format_indent | format_raw)) == format_indent) ? strlength(indent) : 0; + unsigned int indent_flags = indent_indent; xml_node_struct* node = root; @@ -3641,29 +3609,47 @@ PUGI__NS_BEGIN assert(node); // begin writing current node - if (indent_length) - text_output_indent(writer, indent, indent_length, depth); + if (PUGI__NODETYPE(node) == node_pcdata || PUGI__NODETYPE(node) == node_cdata) + { + node_output_simple(writer, node, flags); - if (PUGI__NODETYPE(node) == node_element) + indent_flags = 0; + } + else { - if (node_output_start(writer, node, flags)) + if ((indent_flags & indent_newline) && (flags & format_raw) == 0) + writer.write('\n'); + + if ((indent_flags & indent_indent) && indent_length) + text_output_indent(writer, indent, indent_length, depth); + + if (PUGI__NODETYPE(node) == node_element) { - node = node->first_child; - depth++; - continue; + indent_flags = indent_newline | indent_indent; + + if (node_output_start(writer, node, flags)) + { + node = node->first_child; + depth++; + continue; + } } - } - else if (PUGI__NODETYPE(node) == node_document) - { - if (node->first_child) + else if (PUGI__NODETYPE(node) == node_document) { - node = node->first_child; - continue; + indent_flags = indent_indent; + + if (node->first_child) + { + node = node->first_child; + continue; + } + } + else + { + node_output_simple(writer, node, flags); + + indent_flags = indent_newline | indent_indent; } - } - else - { - node_output_simple(writer, node, flags); } // continue to the next node @@ -3682,14 +3668,22 @@ PUGI__NS_BEGIN { depth--; - if (indent_length) + if ((indent_flags & indent_newline) && (flags & format_raw) == 0) + writer.write('\n'); + + if ((indent_flags & indent_indent) && indent_length) text_output_indent(writer, indent, indent_length, depth); - node_output_end(writer, node, flags); + node_output_end(writer, node); + + indent_flags = indent_newline | indent_indent; } } } while (node != root); + + if ((indent_flags & indent_newline) && (flags & format_raw) == 0) + writer.write('\n'); } PUGI__FN bool has_declaration(xml_node_struct* node) diff --git a/tests/test_write.cpp b/tests/test_write.cpp index 59cdb3e..a61e1cf 100644 --- a/tests/test_write.cpp +++ b/tests/test_write.cpp @@ -22,19 +22,19 @@ TEST_XML(write_indent, "<node attr='1'><child><sub>text</sub></child></node>") TEST_XML(write_pcdata, "<node attr='1'><child><sub/>text</child></node>") { - CHECK_NODE_EX(doc, STR("<node attr=\"1\">\n\t<child>\n\t\t<sub />\n\t\ttext\n\t</child>\n</node>\n"), STR("\t"), format_indent); + CHECK_NODE_EX(doc, STR("<node attr=\"1\">\n\t<child>\n\t\t<sub />text</child>\n</node>\n"), STR("\t"), format_indent); } TEST_XML_FLAGS(write_cdata, "<![CDATA[value]]>", parse_cdata | parse_fragment) { CHECK_NODE(doc, STR("<![CDATA[value]]>")); - CHECK_NODE_EX(doc, STR("<![CDATA[value]]>\n"), STR(""), 0); + CHECK_NODE_EX(doc, STR("<![CDATA[value]]>"), STR(""), 0); } TEST_XML_FLAGS(write_cdata_empty, "<![CDATA[]]>", parse_cdata | parse_fragment) { CHECK_NODE(doc, STR("<![CDATA[]]>")); - CHECK_NODE_EX(doc, STR("<![CDATA[]]>\n"), STR(""), 0); + CHECK_NODE_EX(doc, STR("<![CDATA[]]>"), STR(""), 0); } TEST_XML_FLAGS(write_cdata_escape, "<![CDATA[value]]>", parse_cdata | parse_fragment) @@ -527,5 +527,50 @@ TEST(write_pcdata_null) doc.first_child().append_child(node_pcdata); - CHECK_NODE_EX(doc, STR("<node>\n\t\n\t\n</node>\n"), STR("\t"), format_indent); + CHECK_NODE_EX(doc, STR("<node></node>\n"), STR("\t"), format_indent); +} + +TEST(write_pcdata_whitespace_fixedpoint) +{ + const char_t* data = STR("<node> test <child>\n <sub/>\n </child>\n</node>"); + + static const unsigned int flags_parse[] = + { + 0, + parse_ws_pcdata, + parse_ws_pcdata_single, + parse_trim_pcdata + }; + + static const unsigned int flags_format[] = + { + 0, + format_raw, + format_indent + }; + + for (unsigned int i = 0; i < sizeof(flags_parse) / sizeof(flags_parse[0]); ++i) + { + xml_document doc; + CHECK(doc.load_string(data, flags_parse[i])); + + for (unsigned int j = 0; j < sizeof(flags_format) / sizeof(flags_format[0]); ++j) + { + std::string saved = write_narrow(doc, flags_format[j], encoding_auto); + + xml_document rdoc; + CHECK(rdoc.load_buffer(&saved[0], saved.size(), flags_parse[i])); + + std::string rsaved = write_narrow(rdoc, flags_format[j], encoding_auto); + + CHECK(saved == rsaved); + } + } +} + +TEST_XML_FLAGS(write_mixed, "<node><child1/><child2>pre<![CDATA[data]]>mid<!--comment--><test/>post<?pi value?>fin</child2><child3/></node>", parse_full) +{ + CHECK_NODE(doc, "<node><child1 /><child2>pre<![CDATA[data]]>mid<!--comment--><test />post<?pi value?>fin</child2><child3 /></node>"); + CHECK_NODE_EX(doc, "<node>\n<child1 />\n<child2>pre<![CDATA[data]]>mid<!--comment-->\n<test />post<?pi value?>fin</child2>\n<child3 />\n</node>\n", STR("\t"), 0); + CHECK_NODE_EX(doc, "<node>\n\t<child1 />\n\t<child2>pre<![CDATA[data]]>mid<!--comment-->\n\t\t<test />post<?pi value?>fin</child2>\n\t<child3 />\n</node>\n", STR("\t"), format_indent); } diff --git a/tests/writer_string.cpp b/tests/writer_string.cpp index 661c792..26bca8d 100644 --- a/tests/writer_string.cpp +++ b/tests/writer_string.cpp @@ -45,7 +45,7 @@ std::string save_narrow(const pugi::xml_document& doc, unsigned int flags, pugi: { xml_writer_string writer; - doc.save(writer, STR(""), flags, encoding); + doc.save(writer, STR("\t"), flags, encoding); return writer.as_narrow(); } @@ -59,7 +59,7 @@ std::string write_narrow(pugi::xml_node node, unsigned int flags, pugi::xml_enco { xml_writer_string writer; - node.print(writer, STR(""), flags, encoding); + node.print(writer, STR("\t"), flags, encoding); return writer.as_narrow(); } @@ -73,7 +73,7 @@ std::basic_string<wchar_t> write_wide(pugi::xml_node node, unsigned int flags, p { xml_writer_string writer; - node.print(writer, STR(""), flags, encoding); + node.print(writer, STR("\t"), flags, encoding); return writer.as_wide(); } |