summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArseny Kapoulkine <arseny.kapoulkine@gmail.com>2014-10-26 09:37:18 -0700
committerArseny Kapoulkine <arseny.kapoulkine@gmail.com>2014-10-26 09:37:18 -0700
commitd5e29292c67c24b5cd7ac4f6afce2f8ec293144a (patch)
tree9e454ea5ac1c27697aecbb4dac5d4f542d6c9ceb
parentf31c14e1fdcbd6e406e7e2bea21b303853e028e3 (diff)
XPath: Optimize constant filters/predicates
If a filter/predicate expression is a constant, we don't need to evaluate it for every nodeset element - we can evaluate it once and pick the right element or keep/discard the entire collection. If the expression is 1, we can early out on first node when evaluating the node set - queries like following::item[1] are now significantly faster. Additionally this change refactors filters/predicates to have additional metadata describing the expression type in _test field that is filled during optimization. Note that predicate_constant selection right now is very simple (but captures most common use cases except for maybe [last()]).
-rw-r--r--src/pugixml.cpp116
-rw-r--r--tests/test_xpath_paths.cpp38
2 files changed, 133 insertions, 21 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index 109a635..ac36a31 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -1,4 +1,5 @@
/**
+{
* pugixml parser - version 1.4
* --------------------------------------------------------
* Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
@@ -8199,7 +8200,6 @@ PUGI__NS_BEGIN
ast_op_union, // left | right
ast_predicate, // apply predicate to set; next points to next predicate
ast_filter, // select * from left where right
- ast_filter_posinv, // select * from left where right; proximity position invariant
ast_string_constant, // string constant
ast_number_constant, // number constant
ast_variable, // variable
@@ -8275,6 +8275,14 @@ PUGI__NS_BEGIN
nodetest_all_in_namespace
};
+ enum predicate_t
+ {
+ predicate_default,
+ predicate_posinv,
+ predicate_constant,
+ predicate_constant_one
+ };
+
enum nodeset_eval_t
{
nodeset_eval_all,
@@ -8296,8 +8304,10 @@ PUGI__NS_BEGIN
char _type;
char _rettype;
- // for ast_step / ast_predicate
+ // for ast_step
char _axis;
+
+ // for ast_step/ast_predicate/ast_filter
char _test;
// tree node structure
@@ -8494,7 +8504,7 @@ PUGI__NS_BEGIN
size_t size = ns.size() - first;
xpath_node* last = ns.begin() + first;
-
+
// remove_if... or well, sort of
for (xpath_node* it = last; it != ns.end(); ++it, ++i)
{
@@ -8509,17 +8519,57 @@ PUGI__NS_BEGIN
if (once) break;
}
}
- else if (expr->eval_boolean(c, stack))
+ else
{
- *last++ = *it;
+ if (expr->eval_boolean(c, stack))
+ {
+ *last++ = *it;
- if (once) break;
+ if (once) break;
+ }
}
}
ns.truncate(last);
}
+ static void apply_predicate_const(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack)
+ {
+ assert(ns.size() >= first);
+
+ size_t size = ns.size() - first;
+
+ xpath_node* last = ns.begin() + first;
+
+ xpath_context c(xpath_node(), 1, size);
+
+ if (expr->rettype() == xpath_type_number)
+ {
+ double er = expr->eval_number(c, stack);
+
+ if (er >= 1.0 && er <= size)
+ {
+ size_t eri = static_cast<size_t>(er);
+
+ if (er == eri)
+ {
+ xpath_node r = last[eri - 1];
+
+ *last++ = r;
+ }
+ }
+ }
+ else
+ {
+ if (expr->eval_boolean(c, stack))
+ {
+ last += size;
+ }
+ }
+
+ ns.truncate(last);
+ }
+
void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, nodeset_eval_t eval)
{
if (ns.size() == first) return;
@@ -8528,7 +8578,12 @@ PUGI__NS_BEGIN
for (xpath_ast_node* pred = _right; pred; pred = pred->_next)
{
- apply_predicate(ns, first, pred->_left, stack, !pred->_next && last_once);
+ assert(pred->_type == ast_predicate);
+
+ if (pred->_test == predicate_constant || pred->_test == predicate_constant_one)
+ apply_predicate_const(ns, first, pred->_left, stack);
+ else
+ apply_predicate(ns, first, pred->_left, stack, !pred->_next && last_once);
}
}
@@ -8938,9 +8993,9 @@ PUGI__NS_BEGIN
bool axis_reverse = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling);
bool once =
- (axis == axis_attribute && _test == nodetest_name)
- ? true
- : !_right && eval_once(!axis_reverse, eval);
+ (axis == axis_attribute && _test == nodetest_name) ||
+ (!_right && eval_once(!axis_reverse, eval)) ||
+ (_right && !_right->_next && _right->_test == predicate_constant_one);
xpath_node_set_raw ns;
ns.set_type(axis_reverse ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted);
@@ -9007,6 +9062,7 @@ PUGI__NS_BEGIN
xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents):
_type(static_cast<char>(type)), _rettype(xpath_type_node_set), _axis(static_cast<char>(axis)), _test(static_cast<char>(test)), _left(left), _right(0), _next(0)
{
+ assert(type == ast_step);
_data.nodetest = contents;
}
@@ -9583,27 +9639,33 @@ PUGI__NS_BEGIN
xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack, eval);
xpath_node_set_raw rs = _right->eval_node_set(c, stack, eval);
-
+
// we can optimize merging two sorted sets, but this is a very rare operation, so don't bother
rs.set_type(xpath_node_set::type_unsorted);
rs.append(ls.begin(), ls.end(), stack.result);
rs.remove_duplicates();
-
+
return rs;
}
case ast_filter:
- case ast_filter_posinv:
{
- xpath_node_set_raw set = _left->eval_node_set(c, stack, nodeset_eval_all);
+ xpath_node_set_raw set = _left->eval_node_set(c, stack, _test == predicate_constant_one ? nodeset_eval_first : nodeset_eval_all);
// either expression is a number or it contains position() call; sort by document order
- if (_type == ast_filter) set.sort_do();
+ if (_test != predicate_posinv) set.sort_do();
- bool once = eval_once(set.type() == xpath_node_set::type_sorted, eval);
+ if (_test == predicate_constant || _test == predicate_constant_one)
+ {
+ apply_predicate_const(set, 0, _right, stack);
+ }
+ else
+ {
+ bool once = eval_once(set.type() == xpath_node_set::type_sorted, eval);
- apply_predicate(set, 0, _right, stack, once);
+ apply_predicate(set, 0, _right, stack, once);
+ }
return set;
}
@@ -9706,6 +9768,21 @@ PUGI__NS_BEGIN
if (_right) _right->optimize(alloc);
if (_next) _next->optimize(alloc);
+ // Classify filter/predicate ops to perform various optimizations during evaluation
+ if (_type == ast_filter || _type == ast_predicate)
+ {
+ xpath_ast_node* expr = (_type == ast_filter) ? _right : _left;
+
+ if (expr->_type == ast_number_constant && expr->_data.number == 1.0)
+ _test = predicate_constant_one;
+ else if (expr->_type == ast_number_constant || expr->_type == ast_variable)
+ _test = predicate_constant;
+ else if (expr->rettype() != xpath_type_number && expr->is_posinv_expr())
+ _test = predicate_posinv;
+ else
+ _test = predicate_default;
+ }
+
// Replace descendant-or-self::node()/child::foo with descendant::foo
// The former is a full form of //foo, the latter is much faster since it executes the node test immediately
// Do a similar kind of replacement for self/descendant/descendant-or-self axes
@@ -9762,7 +9839,6 @@ PUGI__NS_BEGIN
case ast_predicate:
case ast_filter:
- case ast_filter_posinv:
return true;
default:
@@ -10206,9 +10282,7 @@ PUGI__NS_BEGIN
if (n->rettype() != xpath_type_node_set) throw_error("Predicate has to be applied to node set");
- bool posinv = expr->rettype() != xpath_type_number && expr->is_posinv_expr();
-
- n = new (alloc_node()) xpath_ast_node(posinv ? ast_filter_posinv : ast_filter, xpath_type_node_set, n, expr);
+ n = new (alloc_node()) xpath_ast_node(ast_filter, xpath_type_node_set, n, expr);
if (_lexer.current() != lex_close_square_brace)
throw_error("Unmatched square brace");
diff --git a/tests/test_xpath_paths.cpp b/tests/test_xpath_paths.cpp
index ee2401a..046592a 100644
--- a/tests/test_xpath_paths.cpp
+++ b/tests/test_xpath_paths.cpp
@@ -437,6 +437,44 @@ TEST_XML(xpath_paths_predicate_number, "<node><chapter/><chapter/><chapter/><cha
CHECK_XPATH_NODESET(n, STR("preceding-sibling::chapter[2]")) % 3;
}
+TEST_XML(xpath_paths_predicate_number_boundary, "<node><chapter/><chapter/><chapter/><chapter/><chapter/></node>")
+{
+ CHECK_XPATH_NODESET(doc, STR("node/chapter[0.999999999999999]"));
+ CHECK_XPATH_NODESET(doc, STR("node/chapter[1]")) % 3;
+ CHECK_XPATH_NODESET(doc, STR("node/chapter[1.000000000000001]"));
+ CHECK_XPATH_NODESET(doc, STR("node/chapter[1.999999999999999]"));
+ CHECK_XPATH_NODESET(doc, STR("node/chapter[2]")) % 4;
+ CHECK_XPATH_NODESET(doc, STR("node/chapter[2.000000000000001]"));
+ CHECK_XPATH_NODESET(doc, STR("node/chapter[4.999999999999999]"));
+ CHECK_XPATH_NODESET(doc, STR("node/chapter[5]")) % 7;
+ CHECK_XPATH_NODESET(doc, STR("node/chapter[5.000000000000001]"));
+}
+
+TEST_XML(xpath_paths_predicate_number_out_of_range, "<node><chapter/><chapter/><chapter/><chapter/><chapter/></node>")
+{
+ xml_node n = doc.child(STR("node")).child(STR("chapter")).next_sibling().next_sibling();
+
+ CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[0]"));
+ CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[-1]"));
+ CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[-1000000000000]"));
+ CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[-1 div 0]"));
+ CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[1000000000000]"));
+ CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[1 div 0]"));
+ CHECK_XPATH_NODESET(n, STR("following-sibling::chapter[0 div 0]"));
+}
+
+TEST_XML(xpath_paths_predicate_constant_boolean, "<node><chapter/><chapter/><chapter/><chapter/><chapter/></node>")
+{
+ xml_node n = doc.child(STR("node")).child(STR("chapter")).next_sibling().next_sibling();
+
+ xpath_variable_set set;
+ set.set(STR("true"), true);
+ set.set(STR("false"), false);
+
+ CHECK_XPATH_NODESET_VAR(n, STR("following-sibling::chapter[$false]"), &set);
+ CHECK_XPATH_NODESET_VAR(n, STR("following-sibling::chapter[$true]"), &set) % 6 % 7;
+}
+
TEST_XML(xpath_paths_predicate_several, "<node><employee/><employee secretary=''/><employee assistant=''/><employee secretary='' assistant=''/><employee assistant='' secretary=''/></node>")
{
xml_node n = doc.child(STR("node"));