From 9fa82b15f53f0f20363f50b5b1adf1a762ed96d6 Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Fri, 4 Jun 2010 18:50:26 +0000 Subject: Optimized attribute parsing; behavior of parse_wconv changed, it now assumes that parse_eol is set git-svn-id: http://pugixml.googlecode.com/svn/trunk@503 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 174 ++++++++++++++++++++++++++++++++++----------------- src/pugixml.hpp | 5 +- tests/test_parse.cpp | 2 +- 3 files changed, 121 insertions(+), 60 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 5b668b0..7911689 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -935,7 +935,7 @@ namespace { ct_parse_pcdata = 1, // \0, &, \r, < ct_parse_attr = 2, // \0, &, \r, ', " - ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, space, tab + ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, tab ct_space = 8, // \r, \n, space, tab ct_parse_cdata = 16, // \0, ], >, \r ct_parse_comment = 32, // \0, -, >, \r @@ -947,7 +947,7 @@ namespace { 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31 - 12, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47 + 8, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0, // 48-63 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192, // 80-95 @@ -1020,19 +1020,6 @@ namespace template const bool opt2_to_type<_1, _2>::o1 = _1; template const bool opt2_to_type<_1, _2>::o2 = _2; - template struct opt4_to_type - { - static const bool o1; - static const bool o2; - static const bool o3; - static const bool o4; - }; - - template const bool opt4_to_type<_1, _2, _3, _4>::o1 = _1; - template const bool opt4_to_type<_1, _2, _3, _4>::o2 = _2; - template const bool opt4_to_type<_1, _2, _3, _4>::o3 = _3; - template const bool opt4_to_type<_1, _2, _3, _4>::o4 = _4; - bool is_little_endian() { unsigned int ui = 1; @@ -1628,19 +1615,16 @@ namespace typedef char_t* (*strconv_attribute_t)(char_t*, char_t); - template struct strconv_attribute_impl + template struct strconv_attribute_impl { - static char_t* parse(char_t* s, char_t end_quote) + static char_t* parse_wnorm(char_t* s, char_t end_quote) { - const bool opt_wconv = opt4::o1; - const bool opt_wnorm = opt4::o2; - const bool opt_eol = opt4::o3; - const bool opt_escape = opt4::o4; + const bool opt_escape = opt1::o1; gap g; // trim leading whitespaces - if (opt_wnorm && IS_CHARTYPE(*s, ct_space)) + if (IS_CHARTYPE(*s, ct_space)) { char_t* str = s; @@ -1652,22 +1636,18 @@ namespace while (true) { - while (!IS_CHARTYPE(*s, (opt_wnorm || opt_wconv) ? ct_parse_attr_ws : ct_parse_attr)) ++s; + while (!IS_CHARTYPE(*s, ct_parse_attr_ws | ct_space)) ++s; if (*s == end_quote) { char_t* str = g.flush(s); - if (opt_wnorm) - { - do *str-- = 0; - while (IS_CHARTYPE(*str, ct_space)); - } - else *str = 0; + do *str-- = 0; + while (IS_CHARTYPE(*str, ct_space)); return s + 1; } - else if (opt_wnorm && IS_CHARTYPE(*s, ct_space)) + else if (IS_CHARTYPE(*s, ct_space)) { *s++ = ' '; @@ -1679,21 +1659,73 @@ namespace g.push(s, str - s); } } - else if (opt_wconv && IS_CHARTYPE(*s, ct_space)) + else if (opt_escape && *s == '&') + { + s = strconv_escape(s, g); + } + else if (!*s) + { + return 0; + } + else ++s; + } + } + + static char_t* parse_wconv(char_t* s, char_t end_quote) + { + const bool opt_escape = opt1::o1; + + gap g; + + while (true) + { + while (!IS_CHARTYPE(*s, ct_parse_attr_ws)) ++s; + + if (*s == end_quote) + { + *g.flush(s) = 0; + + return s + 1; + } + else if (IS_CHARTYPE(*s, ct_space)) { - if (opt_eol) + if (*s == '\r') { - if (*s == '\r') - { - *s++ = ' '; - - if (*s == '\n') g.push(s, 1); - } - else *s++ = ' '; + *s++ = ' '; + + if (*s == '\n') g.push(s, 1); } else *s++ = ' '; } - else if (opt_eol && *s == '\r') + else if (opt_escape && *s == '&') + { + s = strconv_escape(s, g); + } + else if (!*s) + { + return 0; + } + else ++s; + } + } + + static char_t* parse_eol(char_t* s, char_t end_quote) + { + const bool opt_escape = opt1::o1; + + gap g; + + while (true) + { + while (!IS_CHARTYPE(*s, ct_parse_attr)) ++s; + + if (*s == end_quote) + { + *g.flush(s) = 0; + + return s + 1; + } + else if (*s == '\r') { *s++ = '\n'; @@ -1710,30 +1742,58 @@ namespace else ++s; } } + + static char_t* parse_simple(char_t* s, char_t end_quote) + { + const bool opt_escape = opt1::o1; + + gap g; + + while (true) + { + while (!IS_CHARTYPE(*s, ct_parse_attr)) ++s; + + if (*s == end_quote) + { + *g.flush(s) = 0; + + return s + 1; + } + else if (opt_escape && *s == '&') + { + s = strconv_escape(s, g); + } + else if (!*s) + { + return 0; + } + else ++s; + } + } }; strconv_attribute_t get_strconv_attribute(unsigned int optmask) { - STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x80); + STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40); switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes) { - case 0: return strconv_attribute_impl >::parse; - case 1: return strconv_attribute_impl >::parse; - case 2: return strconv_attribute_impl >::parse; - case 3: return strconv_attribute_impl >::parse; - case 4: return strconv_attribute_impl >::parse; - case 5: return strconv_attribute_impl >::parse; - case 6: return strconv_attribute_impl >::parse; - case 7: return strconv_attribute_impl >::parse; - case 8: return strconv_attribute_impl >::parse; - case 9: return strconv_attribute_impl >::parse; - case 10: return strconv_attribute_impl >::parse; - case 11: return strconv_attribute_impl >::parse; - case 12: return strconv_attribute_impl >::parse; - case 13: return strconv_attribute_impl >::parse; - case 14: return strconv_attribute_impl >::parse; - case 15: return strconv_attribute_impl >::parse; + case 0: return strconv_attribute_impl >::parse_simple; + case 1: return strconv_attribute_impl >::parse_simple; + case 2: return strconv_attribute_impl >::parse_eol; + case 3: return strconv_attribute_impl >::parse_eol; + case 4: return strconv_attribute_impl >::parse_wconv; + case 5: return strconv_attribute_impl >::parse_wconv; + case 6: return strconv_attribute_impl >::parse_wconv; + case 7: return strconv_attribute_impl >::parse_wconv; + case 8: return strconv_attribute_impl >::parse_wnorm; + case 9: return strconv_attribute_impl >::parse_wnorm; + case 10: return strconv_attribute_impl >::parse_wnorm; + case 11: return strconv_attribute_impl >::parse_wnorm; + case 12: return strconv_attribute_impl >::parse_wnorm; + case 13: return strconv_attribute_impl >::parse_wnorm; + case 14: return strconv_attribute_impl >::parse_wnorm; + case 15: return strconv_attribute_impl >::parse_wnorm; default: return 0; // should not get here } } diff --git a/src/pugixml.hpp b/src/pugixml.hpp index 6f4cece..398dd77 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -235,17 +235,18 @@ namespace pugi #if !defined(__INTEL_COMPILER) || __INTEL_COMPILER > 800 PUGIXML_DEPRECATED #endif - const unsigned int parse_wnorm_attribute = 0x0040; + const unsigned int parse_wnorm_attribute = 0x0080; /** * This flag determines if attribute value normalization should be performed for all attributes. * This means, that whitespace characters (new line, tab and space) are replaced with space (' '). * Note, that the actions performed while this flag is on are also performed if parse_wnorm_attribute * is on, so this flag has no effect if parse_wnorm_attribute flag is set. + * New line characters are always treated as if parse_eol is set, i.e. \r\n is converted to single space. * * This flag is on by default. */ - const unsigned int parse_wconv_attribute = 0x0080; + const unsigned int parse_wconv_attribute = 0x0040; /** * This flag determines if XML document declaration (this node has the form of in XML) diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index 400942f..c2f56e5 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -351,7 +351,7 @@ TEST(parse_attribute_no_eol_wconv) { xml_document doc; CHECK(doc.load(STR(""), parse_minimal | parse_wconv_attribute)); - CHECK_STRING(doc.child(STR("node")).attribute(STR("id")).value(), STR(" val1 val2 val3 val4 ")); + CHECK_STRING(doc.child(STR("node")).attribute(STR("id")).value(), STR(" val1 val2 val3 val4 ")); } TEST(parse_attribute_eol_wconv) -- cgit v1.2.3