From febe4f0209f86225ebeedfb0874feb3cb96e7c89 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine
Date: Wed, 1 Oct 2014 07:02:52 +0000
Subject: Implement copyless copy

Now copying nodes or attributes does not copy names/values if the source strings are in a document buffer.

As a result, several nodes can now share the same string in document buffer - to support this we 'taint' both source and destination with a special 'shared' bit. Tainting disables offset_debug() and fast-path document order comparison; it also prevents strcpy_insitu from reusing the document buffer memory for the copied node.

The downsides include slower XPath queries in some (rare) cases and slightly higher memory consumption in some (rare) cases.

XPath queries can execute slower if a lot of old nodes were copied to new nodes *and* a query only touches old nodes (so it used to benefit a lot from fast comparison path) *and* a query produces unsorted node sets that need to be sorted later (both are relatively rare).

Higher memory consumption is possible if a lot of nodes were copied and all nodes (both new and old) have their contents modified 'in place' -- previously we could modify the old node in place and the new node required one allocation on copy, and now both nodes have to have their data allocated during modification. This should also be rare.

On the bright side, in a lot of cases copying of string data can be avoided - this makes the copy much faster and the document now occupies less memory. For example, some uses of append_buffer are now actually slower compared to building up a document by copying a template from the same document and modifying the copy slightly. In one of the internal benchmarks copying is now 4x faster (the difference can be more dramatic with more string contents and less markup).

git-svn-id: https://pugixml.googlecode.com/svn/trunk@1032 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 73 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 8e61182..3979eb9 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1654,13 +1654,15 @@ PUGI__NS_BEGIN } #endif - inline bool strcpy_insitu_allow(size_t length, uintptr_t allocated, char_t* target) + inline bool strcpy_insitu_allow(size_t length, uintptr_t header, uintptr_t header_mask, char_t* target) { - assert(target); + // never reuse shared memory + if (header & xml_memory_page_name_or_value_shared_mask) return false; + size_t target_length = strlength(target); // always reuse document buffer memory if possible - if (!allocated) return target_length >= length; + if ((header & header_mask) == 0) return target_length >= length; // reuse heap memory if waste is not too great const size_t reuse_threshold = 32; @@ -1687,7 +1689,7 @@ PUGI__NS_BEGIN return true; } - else if (dest && strcpy_insitu_allow(source_length, header & header_mask, dest)) + else if (dest && strcpy_insitu_allow(source_length, header, header_mask, dest)) { // we can reuse old buffer, so just copy the new data (including zero terminator) memcpy(dest, source, (source_length + 1) * sizeof(char_t)); @@ -3605,42 +3607,55 @@ PUGI__NS_BEGIN return true; } - PUGI__FN void node_copy_contents(xml_node dest, const xml_node source) + PUGI__FN void node_copy_string(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char_t* source, uintptr_t& source_header, xml_allocator* alloc) { - assert(dest.type() == source.type()); - - switch (source.type()) - { - case node_element: - case node_declaration: + if (source) { - dest.set_name(source.name()); + if (alloc && (source_header & header_mask) == 0) + { + dest = source; - for (xml_attribute a = source.first_attribute(); a; a = a.next_attribute()) - 
dest.append_attribute(a.name()).set_value(a.value()); - break; + // since strcpy_insitu can reuse document buffer memory we need to mark both source and dest as shared + header |= xml_memory_page_name_or_value_shared_mask; + source_header |= xml_memory_page_name_or_value_shared_mask; + } + else + strcpy_insitu(dest, header, header_mask, source); } + } - case node_pcdata: - case node_cdata: - case node_comment: - case node_doctype: - dest.set_value(source.value()); - break; + PUGI__FN void node_copy_contents(xml_allocator* alloc, xml_node dest, const xml_node source) + { + assert(dest.type() == source.type()); - case node_pi: - dest.set_name(source.name()); - dest.set_value(source.value()); - break; + xml_node_struct* dn = dest.internal_object(); + xml_node_struct* sn = source.internal_object(); - default: - assert(!"Invalid node type"); + node_copy_string(dn->name, dn->header, xml_memory_page_name_allocated_mask, sn->name, sn->header, alloc); + node_copy_string(dn->value, dn->header, xml_memory_page_value_allocated_mask, sn->value, sn->header, alloc); + + for (xml_attribute_struct* sa = sn->first_attribute; sa; sa = sa->next_attribute) + { + xml_attribute_struct* da = impl::append_new_attribute(dn, impl::get_allocator(dn)); + + node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, alloc); + node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, alloc); } } + PUGI__FN xml_allocator* node_get_shared_allocator(const xml_node lhs, const xml_node rhs) + { + xml_allocator& la = impl::get_allocator(lhs.internal_object()); + xml_allocator& ra = impl::get_allocator(rhs.internal_object()); + + return (&la == &ra) ? 
&la : 0; + } + PUGI__FN void node_copy_tree(xml_node dest, const xml_node source) { - node_copy_contents(dest, source); + xml_allocator* alloc = node_get_shared_allocator(dest, source); + + node_copy_contents(alloc, dest, source); xml_node destit = dest; xml_node sourceit = source.first_child(); @@ -3651,7 +3666,7 @@ PUGI__NS_BEGIN { xml_node copy = destit.append_child(sourceit.type()); - node_copy_contents(copy, sourceit); + node_copy_contents(alloc, copy, sourceit); if (sourceit.first_child()) { -- cgit v1.2.3