From d5e29292c67c24b5cd7ac4f6afce2f8ec293144a Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Sun, 26 Oct 2014 09:37:18 -0700 Subject: XPath: Optimize constant filters/predicates If a filter/predicate expression is a constant, we don't need to evaluate it for every nodeset element - we can evaluate it once and pick the right element or keep/discard the entire collection. If the expression is 1, we can early out on first node when evaluating the node set - queries like following::item[1] are now significantly faster. Additionally this change refactors filters/predicates to have additional metadata describing the expression type in _test field that is filled during optimization. Note that predicate_constant selection right now is very simple (but captures most common use cases except for maybe [last()]). --- src/pugixml.cpp | 116 +++++++++++++++++++++++++++++++++++++-------- tests/test_xpath_paths.cpp | 38 +++++++++++++++ 2 files changed, 133 insertions(+), 21 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 109a635..ac36a31 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1,4 +1,5 @@ /** +{ * pugixml parser - version 1.4 * -------------------------------------------------------- * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) @@ -8199,7 +8200,6 @@ PUGI__NS_BEGIN ast_op_union, // left | right ast_predicate, // apply predicate to set; next points to next predicate ast_filter, // select * from left where right - ast_filter_posinv, // select * from left where right; proximity position invariant ast_string_constant, // string constant ast_number_constant, // number constant ast_variable, // variable @@ -8275,6 +8275,14 @@ PUGI__NS_BEGIN nodetest_all_in_namespace }; + enum predicate_t + { + predicate_default, + predicate_posinv, + predicate_constant, + predicate_constant_one + }; + enum nodeset_eval_t { nodeset_eval_all, @@ -8296,8 +8304,10 @@ PUGI__NS_BEGIN char _type; char _rettype; - // for ast_step / ast_predicate + // for ast_step char _axis; + + // for ast_step/ast_predicate/ast_filter char _test; // tree node structure @@ -8494,7 +8504,7 @@ PUGI__NS_BEGIN size_t size = ns.size() - first; xpath_node* last = ns.begin() + first; - + // remove_if... or well, sort of for (xpath_node* it = last; it != ns.end(); ++it, ++i) { @@ -8509,17 +8519,57 @@ PUGI__NS_BEGIN if (once) break; } } - else if (expr->eval_boolean(c, stack)) + else { - *last++ = *it; + if (expr->eval_boolean(c, stack)) + { + *last++ = *it; - if (once) break; + if (once) break; + } } } ns.truncate(last); } + static void apply_predicate_const(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack) + { + assert(ns.size() >= first); + + size_t size = ns.size() - first; + + xpath_node* last = ns.begin() + first; + + xpath_context c(xpath_node(), 1, size); + + if (expr->rettype() == xpath_type_number) + { + double er = expr->eval_number(c, stack); + + if (er >= 1.0 && er <= size) + { + size_t eri = static_cast(er); + + if (er == eri) + { + xpath_node r = last[eri - 1]; + + *last++ = r; + } + } + } + else + { + if (expr->eval_boolean(c, stack)) + { + last += size; + } + } + + ns.truncate(last); + } + void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, nodeset_eval_t eval) { if (ns.size() == first) return; @@ -8528,7 +8578,12 @@ PUGI__NS_BEGIN for (xpath_ast_node* pred = _right; pred; pred = pred->_next) { - apply_predicate(ns, first, pred->_left, stack, !pred->_next && last_once); + assert(pred->_type == ast_predicate); + + if (pred->_test == predicate_constant || pred->_test == predicate_constant_one) + apply_predicate_const(ns, first, pred->_left, stack); + else + apply_predicate(ns, first, pred->_left, stack, !pred->_next && last_once); } } @@ -8938,9 +8993,9 @@ PUGI__NS_BEGIN bool axis_reverse = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling); bool once = - (axis == axis_attribute && _test == nodetest_name) - ? true - : !_right && eval_once(!axis_reverse, eval); + (axis == axis_attribute && _test == nodetest_name) || + (!_right && eval_once(!axis_reverse, eval)) || + (_right && !_right->_next && _right->_test == predicate_constant_one); xpath_node_set_raw ns; ns.set_type(axis_reverse ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted); @@ -9007,6 +9062,7 @@ PUGI__NS_BEGIN xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents): _type(static_cast(type)), _rettype(xpath_type_node_set), _axis(static_cast(axis)), _test(static_cast(test)), _left(left), _right(0), _next(0) { + assert(type == ast_step); _data.nodetest = contents; } @@ -9583,27 +9639,33 @@ PUGI__NS_BEGIN xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack, eval); xpath_node_set_raw rs = _right->eval_node_set(c, stack, eval); - + // we can optimize merging two sorted sets, but this is a very rare operation, so don't bother rs.set_type(xpath_node_set::type_unsorted); rs.append(ls.begin(), ls.end(), stack.result); rs.remove_duplicates(); - + return rs; } case ast_filter: - case ast_filter_posinv: { - xpath_node_set_raw set = _left->eval_node_set(c, stack, nodeset_eval_all); + xpath_node_set_raw set = _left->eval_node_set(c, stack, _test == predicate_constant_one ? nodeset_eval_first : nodeset_eval_all); // either expression is a number or it contains position() call; sort by document order - if (_type == ast_filter) set.sort_do(); + if (_test != predicate_posinv) set.sort_do(); - bool once = eval_once(set.type() == xpath_node_set::type_sorted, eval); + if (_test == predicate_constant || _test == predicate_constant_one) + { + apply_predicate_const(set, 0, _right, stack); + } + else + { + bool once = eval_once(set.type() == xpath_node_set::type_sorted, eval); - apply_predicate(set, 0, _right, stack, once); + apply_predicate(set, 0, _right, stack, once); + } return set; } @@ -9706,6 +9768,21 @@ PUGI__NS_BEGIN if (_right) _right->optimize(alloc); if (_next) _next->optimize(alloc); + // Classify filter/predicate ops to perform various optimizations during evaluation + if (_type == ast_filter || _type == ast_predicate) + { + xpath_ast_node* expr = (_type == ast_filter) ? _right : _left; + + if (expr->_type == ast_number_constant && expr->_data.number == 1.0) + _test = predicate_constant_one; + else if (expr->_type == ast_number_constant || expr->_type == ast_variable) + _test = predicate_constant; + else if (expr->rettype() != xpath_type_number && expr->is_posinv_expr()) + _test = predicate_posinv; + else + _test = predicate_default; + } + // Replace descendant-or-self::node()/child::foo with descendant::foo // The former is a full form of //foo, the latter is much faster since it executes the node test immediately // Do a similar kind of replacement for self/descendant/descendant-or-self axes @@ -9762,7 +9839,6 @@ PUGI__NS_BEGIN case ast_predicate: case ast_filter: - case ast_filter_posinv: return true; default: @@ -10206,9 +10282,7 @@ PUGI__NS_BEGIN if (n->rettype() != xpath_type_node_set) throw_error("Predicate has to be applied to node set"); - bool posinv = expr->rettype() != xpath_type_number && expr->is_posinv_expr(); - - n = new (alloc_node()) xpath_ast_node(posinv ? ast_filter_posinv : ast_filter, xpath_type_node_set, n, expr); + n = new (alloc_node()) xpath_ast_node(ast_filter, xpath_type_node_set, n, expr); if (_lexer.current() != lex_close_square_brace) throw_error("Unmatched square brace"); diff --git a/tests/test_xpath_paths.cpp b/tests/test_xpath_paths.cpp index ee2401a..046592a 100644 --- a/tests/test_xpath_paths.cpp +++ b/tests/test_xpath_paths.cpp @@ -437,6 +437,44 @@ TEST_XML(xpath_paths_predicate_number, "") +{ + CHECK_XPATH_NODESET(doc, STR("node/chapter[0.999999999999999]")); + CHECK_XPATH_NODESET(doc, STR("node/chapter[1]")) % 3; + CHECK_XPATH_NODESET(doc, STR("node/chapter[1.000000000000001]")); + CHECK_XPATH_NODESET(doc, STR("node/chapter[1.999999999999999]")); + CHECK_XPATH_NODESET(doc, STR("node/chapter[2]")) % 4; + CHECK_XPATH_NODESET(doc, STR("node/chapter[2.000000000000001]")); + CHECK_XPATH_NODESET(doc, STR("node/chapter[4.999999999999999]")); + CHECK_XPATH_NODESET(doc, STR("node/chapter[5]")) % 7; + CHECK_XPATH_NODESET(doc, STR("node/chapter[5.000000000000001]")); +} + +TEST_XML(xpath_paths_predicate_number_out_of_range, "") +{ + xml_node n = doc.child(STR("node")).child(STR("chapter")).next_sibling().next_sibling(); + + CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[0]")); + CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[-1]")); + CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[-1000000000000]")); + CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[-1 div 0]")); + CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[1000000000000]")); + CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[1 div 0]")); + CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[0 div 0]")); +} + +TEST_XML(xpath_paths_predicate_constant_boolean, "") +{ + xml_node n = doc.child(STR("node")).child(STR("chapter")).next_sibling().next_sibling(); + + xpath_variable_set set; + set.set(STR("true"), true); + set.set(STR("false"), false); + + CHECK_XPATH_NODESET_VAR(n, STR("following-sibling::chapter[$false]"), &set); + CHECK_XPATH_NODESET_VAR(n, STR("following-sibling::chapter[$true]"), &set) % 6 % 7; +} + TEST_XML(xpath_paths_predicate_several, "") { xml_node n = doc.child(STR("node")); -- cgit v1.2.3