From 1e57d99484bcf976fd07400483a9c01203c07b8e Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Sun, 29 Aug 2010 15:21:23 +0000 Subject: XPath: Replaced std::string with xpath_string, refactored normalize_space, namespace_uri and translate git-svn-id: http://pugixml.googlecode.com/svn/trunk@657 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixpath.cpp | 461 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 339 insertions(+), 122 deletions(-) diff --git a/src/pugixpath.cpp b/src/pugixpath.cpp index d67eac6..45d227d 100644 --- a/src/pugixpath.cpp +++ b/src/pugixpath.cpp @@ -58,11 +58,177 @@ namespace pugi { namespace impl { + size_t strlen(const char_t* s); bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count); void widen_ascii(wchar_t* dest, const char* source); } } +namespace +{ + using namespace pugi; + + class xpath_string + { + const char_t* _buffer; + bool _uses_heap; + + static char_t* duplicate_string(const char_t* string, size_t length) + { + char_t* result = static_cast(get_memory_allocation_function()((length + 1) * sizeof(char_t))); + if (!result) return 0; // $$ out of memory + + memcpy(result, string, length * sizeof(char_t)); + result[length] = 0; + + return result; + } + + static char_t* duplicate_string(const char_t* string) + { + return duplicate_string(string, impl::strlen(string)); + } + + public: + xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false) + { + } + + ~xpath_string() + { + if (_uses_heap) get_memory_deallocation_function()(const_cast(_buffer)); + } + + explicit xpath_string(const char_t* str, bool use_heap = true) + { + if (*str) + { + _buffer = use_heap ? duplicate_string(str) : str; + _uses_heap = use_heap; + } + else + { + _buffer = PUGIXML_TEXT(""); + _uses_heap = false; + } + } + + xpath_string(const char_t* begin, const char_t* end) + { + assert(begin <= end); + + if (begin != end) + { + _buffer = duplicate_string(begin, static_cast(end - begin)); + _uses_heap = true; + } + else + { + _buffer = PUGIXML_TEXT(""); + _uses_heap = false; + } + } + + xpath_string(const xpath_string& o) + { + _buffer = o._uses_heap ? duplicate_string(o._buffer) : o._buffer; + _uses_heap = o._uses_heap; + } + + xpath_string& operator=(const xpath_string& o) + { + xpath_string temp(o); + swap(temp); + + return *this; + } + + void swap(xpath_string& o) + { + std::swap(_buffer, o._buffer); + std::swap(_uses_heap, o._uses_heap); + } + + void append(const xpath_string& o) + { + // skip empty sources + if (!*o._buffer) return; + + // fast append for empty target and constant source + if (!*_buffer && !o._uses_heap) + { + _buffer = o._buffer; + _uses_heap = false; + } + else + { + // need to make heap copy + size_t target_length = impl::strlen(_buffer); + size_t source_length = impl::strlen(o._buffer); + size_t length = target_length + source_length; + + // allocate new buffer + char_t* result = static_cast(get_memory_allocation_function()((length + 1) * sizeof(char_t))); + if (!result) return; // $$ out of memory + + // append both strings in the new buffer + memcpy(result, _buffer, target_length * sizeof(char_t)); + memcpy(result + target_length, o._buffer, source_length * sizeof(char_t)); + result[length] = 0; + + // destroy old buffer + if (_uses_heap) get_memory_deallocation_function()(const_cast(_buffer)); + + // finalize + _buffer = result; + _uses_heap = true; + } + } + + const char_t* c_str() const + { + return _buffer; + } + + size_t length() const + { + return impl::strlen(_buffer); + } + + char_t* data() + { + // make private heap copy + if (!_uses_heap) + { + _buffer = duplicate_string(_buffer); + _uses_heap = true; + } + + return const_cast(_buffer); + } + + bool empty() const + { + return *_buffer == 0; + } + + bool operator==(const xpath_string& o) const + { + return impl::strequal(_buffer, o._buffer); + } + + bool operator!=(const xpath_string& o) const + { + return !impl::strequal(_buffer, o._buffer); + } + }; + + xpath_string xpath_string_const(const char_t* str) + { + return xpath_string(str, false); + } +} + namespace { using namespace pugi; @@ -118,14 +284,23 @@ namespace #ifdef PUGIXML_WCHAR_MODE return wcschr(s, c); #else - return ::strchr(s, c); + return strchr(s, c); #endif } - string_t string_value(const xpath_node& na) + const char_t* find_substring(const char_t* s, const char_t* p) + { + #ifdef PUGIXML_WCHAR_MODE + return wcsstr(s, p); + #else + return strstr(s, p); + #endif + } + + xpath_string string_value(const xpath_node& na) { if (na.attribute()) - return na.attribute().value(); + return xpath_string_const(na.attribute().value()); else { const xml_node& n = na.node(); @@ -136,19 +311,19 @@ namespace case node_cdata: case node_comment: case node_pi: - return n.value(); + return xpath_string_const(n.value()); case node_document: case node_element: { - string_t result; + xpath_string result; xml_node cur = n.first_child(); while (cur && cur != n) { if (cur.type() == node_pcdata || cur.type() == node_cdata) - result += cur.value(); + result.append(xpath_string_const(cur.value())); if (cur.first_child()) cur = cur.first_child(); @@ -167,7 +342,7 @@ namespace } default: - return string_t(); + return xpath_string(); } } } @@ -391,11 +566,11 @@ namespace } #endif - string_t convert_number_to_string(double value) + xpath_string convert_number_to_string(double value) { // try special number conversion const char_t* special = convert_number_to_string_special(value); - if (special) return special; + if (special) return xpath_string_const(special); // get mantissa + exponent form char mantissa_buffer[64]; @@ -451,7 +626,7 @@ namespace assert(s < result + sizeof(result) / sizeof(result[0])); *s = 0; - return string_t(result); + return xpath_string(result); } bool check_string_to_number_format(const char_t* string) @@ -535,30 +710,51 @@ namespace return (value >= -0.5 && value <= 0) ? ceil(value) : floor(value + 0.5); } - const char_t* local_name(const char_t* name) + const char_t* qualified_name(const xpath_node& node) + { + return node.attribute() ? node.attribute().name() : node.node().name(); + } + + const char_t* local_name(const xpath_node& node) { + const char_t* name = qualified_name(node); const char_t* p = find_char(name, ':'); return p ? p + 1 : name; } - - const char_t* namespace_uri(const xml_node& node) + + struct namespace_uri_predicate { - const char_t* pos = find_char(node.name(), ':'); - - string_t ns = PUGIXML_TEXT("xmlns"); - - if (pos) + const char_t* prefix; + size_t prefix_length; + + namespace_uri_predicate(const char_t* name) { - ns += ':'; - ns.append(node.name(), pos); + const char_t* pos = find_char(name, ':'); + + prefix = pos ? name : 0; + prefix_length = pos ? static_cast(pos - name) : 0; } + + bool operator()(const xml_attribute& a) const + { + const char_t* name = a.name(); + + if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return false; + + return prefix ? name[5] == ':' && impl::strequalrange(name + 6, prefix, prefix_length) : name[5] == 0; + } + }; + + const char_t* namespace_uri(const xml_node& node) + { + namespace_uri_predicate pred = node.name(); xml_node p = node; while (p) { - xml_attribute a = p.attribute(ns.c_str()); + xml_attribute a = p.find_attribute(pred); if (a) return a.value(); @@ -570,19 +766,16 @@ namespace const char_t* namespace_uri(const xml_attribute& attr, const xml_node& parent) { - const char_t* pos = find_char(attr.name(), ':'); + namespace_uri_predicate pred = attr.name(); // Default namespace does not apply to attributes - if (!pos) return PUGIXML_TEXT(""); - - string_t ns = PUGIXML_TEXT("xmlns:"); - ns.append(attr.name(), pos); + if (!pred.prefix) return PUGIXML_TEXT(""); xml_node p = parent; while (p) { - xml_attribute a = p.attribute(ns.c_str()); + xml_attribute a = p.find_attribute(pred); if (a) return a.value(); @@ -592,6 +785,62 @@ namespace return PUGIXML_TEXT(""); } + const char_t* namespace_uri(const xpath_node& node) + { + return node.attribute() ? namespace_uri(node.attribute(), node.parent()) : namespace_uri(node.node()); + } + + void normalize_space(char_t* buffer) + { + char_t* write = buffer; + + for (char_t* it = buffer; *it; ) + { + char_t ch = *it++; + + if (IS_CHARTYPEX(ch, ctx_space)) + { + // replace whitespace sequence with single space + while (IS_CHARTYPEX(*it, ctx_space)) it++; + + // avoid leading spaces + if (write != buffer) *write++ = ' '; + } + else *write++ = ch; + } + + // remove trailing space + if (write != buffer && IS_CHARTYPEX(write[-1], ctx_space)) write--; + + // zero-terminate + *write = 0; + } + + void translate(char_t* buffer, const char_t* from, const char_t* to) + { + size_t to_length = impl::strlen(to); + + char_t* write = buffer; + + while (*buffer) + { + #ifdef __DMC__ + volatile // explicitly store to local to work around DMC bug (it loads 4 bytes from buffer otherwise) + #endif + char_t ch = *buffer++; + + const char_t* pos = find_char(from, ch); + + if (!pos) + *write++ = ch; // do not process + else if (static_cast(pos - from) < to_length) + *write++ = to[pos - from]; // replace + } + + // zero-terminate + *write = 0; + } + struct equal_to { template bool operator()(const T& lhs, const T& rhs) const @@ -1456,7 +1705,7 @@ namespace pugi } else if (lt == xpath_type_string) { - string_t l = lhs->eval_string(c); + xpath_string l = lhs->eval_string(c); xpath_node_set rs = rhs->eval_node_set(c); for (xpath_node_set::const_iterator ri = rs.begin(); ri != rs.end(); ++ri) @@ -2068,10 +2317,10 @@ namespace pugi case ast_func_contains: { - string_t lr = _left->eval_string(c); - string_t rr = _right->eval_string(c); + xpath_string lr = _left->eval_string(c); + xpath_string rr = _right->eval_string(c); - return rr.empty() || lr.find(rr) != string_t::npos; + return find_substring(lr.c_str(), rr.c_str()) != 0; } case ast_func_boolean: @@ -2090,7 +2339,7 @@ namespace pugi { if (c.n.attribute()) return false; - string_t lang = _left->eval_string(c); + xpath_string lang = _left->eval_string(c); for (xml_node n = c.n.node(); n; n = n.parent()) { @@ -2170,10 +2419,10 @@ namespace pugi return (double)_left->eval_node_set(c).size(); case ast_func_string_length_0: - return (double)string_value(c.n).size(); + return (double)string_value(c.n).length(); case ast_func_string_length_1: - return (double)_left->eval_string(c).size(); + return (double)_left->eval_string(c).length(); case ast_func_number_0: return convert_string_to_number(string_value(c.n).c_str()); @@ -2232,19 +2481,18 @@ namespace pugi } } - string_t eval_string(const xpath_context& c) + xpath_string eval_string(const xpath_context& c) { switch (_type) { case ast_string_constant: - return _data.string; + return xpath_string_const(_data.string); case ast_func_local_name_0: { xpath_node na = c.n; - if (na.attribute()) return local_name(na.attribute().name()); - else return local_name(na.node().name()); + return xpath_string_const(local_name(na)); } case ast_func_local_name_1: @@ -2252,16 +2500,14 @@ namespace pugi xpath_node_set ns = _left->eval_node_set(c); xpath_node na = ns.first(); - if (na.attribute()) return local_name(na.attribute().name()); - else return local_name(na.node().name()); + return xpath_string_const(local_name(na)); } case ast_func_name_0: { xpath_node na = c.n; - if (na.attribute()) return na.attribute().name(); - else return na.node().name(); + return xpath_string_const(qualified_name(na)); } case ast_func_name_1: @@ -2269,16 +2515,14 @@ namespace pugi xpath_node_set ns = _left->eval_node_set(c); xpath_node na = ns.first(); - if (na.attribute()) return na.attribute().name(); - else return na.node().name(); + return xpath_string_const(qualified_name(na)); } case ast_func_namespace_uri_0: { xpath_node na = c.n; - if (na.attribute()) return namespace_uri(na.attribute(), na.parent()); - else return namespace_uri(na.node()); + return xpath_string_const(namespace_uri(na)); } case ast_func_namespace_uri_1: @@ -2286,8 +2530,7 @@ namespace pugi xpath_node_set ns = _left->eval_node_set(c); xpath_node na = ns.first(); - if (na.attribute()) return namespace_uri(na.attribute(), na.parent()); - else return namespace_uri(na.node()); + return xpath_string_const(namespace_uri(na)); } case ast_func_string_0: @@ -2298,122 +2541,96 @@ namespace pugi case ast_func_concat: { - string_t r = _left->eval_string(c); + xpath_string r = _left->eval_string(c); for (xpath_ast_node* n = _right; n; n = n->_next) - r += n->eval_string(c); + r.append(n->eval_string(c)); return r; } case ast_func_substring_before: { - string_t s = _left->eval_string(c); - string_t::size_type pos = s.find(_right->eval_string(c)); + xpath_string s = _left->eval_string(c); + xpath_string p = _right->eval_string(c); + + const char_t* pos = find_substring(s.c_str(), p.c_str()); - if (pos == string_t::npos) return string_t(); - else return string_t(s.begin(), s.begin() + pos); + return pos ? xpath_string(s.c_str(), pos) : xpath_string(); } case ast_func_substring_after: { - string_t s = _left->eval_string(c); - string_t p = _right->eval_string(c); + xpath_string s = _left->eval_string(c); + xpath_string p = _right->eval_string(c); - string_t::size_type pos = s.find(p); + const char_t* pos = find_substring(s.c_str(), p.c_str()); - if (pos == string_t::npos) return string_t(); - else return string_t(s.begin() + pos + p.length(), s.end()); + return pos ? xpath_string(pos + p.length()) : xpath_string(); } case ast_func_substring_2: { - string_t s = _left->eval_string(c); + xpath_string s = _left->eval_string(c); double first = round_nearest(_right->eval_number(c)); - if (is_nan(first)) return string_t(); // NaN - else if (first >= s.length() + 1) return string_t(); + if (is_nan(first)) return xpath_string(); // NaN + else if (first >= s.length() + 1) return xpath_string(); size_t pos = first < 1 ? 1 : (size_t)first; - return s.substr(pos - 1); + return xpath_string(s.c_str() + (pos - 1)); } case ast_func_substring_3: { - string_t s = _left->eval_string(c); + xpath_string s = _left->eval_string(c); + size_t s_length = s.length(); + double first = round_nearest(_right->eval_number(c)); double last = first + round_nearest(_right->_next->eval_number(c)); - if (is_nan(first) || is_nan(last)) return string_t(); - else if (first >= s.length() + 1) return string_t(); - else if (first >= last) return string_t(); + if (is_nan(first) || is_nan(last)) return xpath_string(); + else if (first >= s_length + 1) return xpath_string(); + else if (first >= last) return xpath_string(); size_t pos = first < 1 ? 1 : (size_t)first; - size_t end = last >= s.length() + 1 ? s.length() + 1 : (size_t)last; + size_t end = last >= s_length + 1 ? s_length + 1 : (size_t)last; size_t size_requested = end - pos; - size_t size_to_end = s.length() - pos + 1; + size_t size_to_end = s_length - pos + 1; + + size_t size = size_requested < size_to_end ? size_requested : size_to_end; - return s.substr(pos - 1, size_requested < size_to_end ? size_requested : size_to_end); + return xpath_string(s.c_str() + pos - 1, s.c_str() + pos - 1 + size); } case ast_func_normalize_space_0: + { + xpath_string s = string_value(c.n); + + normalize_space(s.data()); + + return s; + } + case ast_func_normalize_space_1: { - string_t s = _type == ast_func_normalize_space_0 ? string_value(c.n) : _left->eval_string(c); - - string_t r; - r.reserve(s.size()); - - for (string_t::const_iterator it = s.begin(); it != s.end(); ++it) - { - if (IS_CHARTYPEX(*it, ctx_space)) - { - if (!r.empty() && r[r.size() - 1] != ' ') - r += ' '; - } - else - { - #ifdef __DMC__ - volatile // explicitly store to local to work around DMC bug (it loads 4 bytes from &*it otherwise) - #endif - char_t ch = *it; + xpath_string s = _left->eval_string(c); - r += ch; - } - } - - string_t::size_type pos = r.find_last_not_of(' '); - if (pos == string_t::npos) r = string_t(); - else r.erase(r.begin() + pos + 1, r.end()); + normalize_space(s.data()); - return r; + return s; } case ast_func_translate: { - string_t s = _left->eval_string(c); - string_t from = _right->eval_string(c); - string_t to = _right->_next->eval_string(c); - - for (string_t::iterator it = s.begin(); it != s.end(); ) - { - #ifdef __DMC__ - volatile // explicitly store to local to work around DMC bug (it loads 4 bytes from &*it otherwise) - #endif - char_t ch = *it; + xpath_string s = _left->eval_string(c); + xpath_string from = _right->eval_string(c); + xpath_string to = _right->_next->eval_string(c); + + translate(s.data(), from.c_str(), to.c_str()); - string_t::size_type pos = from.find(ch); - - if (pos == string_t::npos) - ++it; - else if (pos >= to.length()) - it = s.erase(it); - else - *it++ = to[pos]; - } - return s; } @@ -2422,7 +2639,7 @@ namespace pugi switch (_rettype) { case xpath_type_boolean: - return eval_boolean(c) ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"); + return xpath_string_const(eval_boolean(c) ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false")); case xpath_type_number: return convert_number_to_string(eval_number(c)); @@ -2430,12 +2647,12 @@ namespace pugi case xpath_type_node_set: { xpath_node_set ns = eval_node_set(c); - return ns.empty() ? string_t() : string_value(ns.first()); + return ns.empty() ? xpath_string() : string_value(ns.first()); } default: assert(!"Wrong expression for return type string"); - return string_t(); + return xpath_string(); } } } @@ -3083,9 +3300,9 @@ namespace pugi // QName or NCName:* else { - const char_t* colon_pos = std::char_traits::find(nt_name.begin, static_cast(nt_name.end - nt_name.begin), ':'); + const char_t* colon_pos = std::find(nt_name.begin, nt_name.end, ':'); - if (colon_pos && colon_pos + 2 == nt_name.end && colon_pos[1] == '*') // NCName:* + if (colon_pos + 2 == nt_name.end && colon_pos[1] == '*') // NCName:* { nt_name.end--; // erase * @@ -3490,7 +3707,7 @@ namespace pugi xpath_context c(n, 1, 1); - return _root->eval_string(c); + return _root->eval_string(c).c_str(); } xpath_node_set xpath_query::evaluate_node_set(const xml_node& n) const -- cgit v1.2.3