From 58be69c574390715e4f72ed7e835b236958f9d9c Mon Sep 17 00:00:00 2001
From: "arseny.kapoulkine"
pugixml is just another XML parser. This is a successor to
pugxml (well, to be honest, the only part
-that is left as is is wildcard matching code, the rest was either heavily refactored or rewritten
+that is left as is is wildcard matching code; the rest was either heavily refactored or rewritten
from scratch). The main features (call it USP) are:
Introduction
@@ -59,7 +59,7 @@ mode, with the exception of DTD related issues and XML namespaces)
like expat will; it will try to recover the state even if meeting an error (like finding matching
tags for closing ones); it will parse files with data in wrong encoding; and so on)
A couple of words on flag usage. The parsing options are just a set of bits, with each bit corresponding to one flag. You can turn the flag on by OR-ing the options value with this flag's constant:
- parse_w3c | parse_wnorm_pcdata
+ parse_w3c | parse_wnorm_attribute
or turn the flag off by AND-ing the options value with the NEGation of this flag's constant:
diff --git a/src/pugixml.cpp b/src/pugixml.cpp index da53c66..cf74eed 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -3,8 +3,6 @@ // Pug Improved XML Parser - Version 0.2 // -------------------------------------------------------- // Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) -// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing -// conversion functions. // This work is based on the pugxml parser, which is: // Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) // Released into the Public Domain. Use at your own risk. @@ -120,29 +118,30 @@ namespace ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, space, tab ct_space = 8, // \r, \n, space, tab ct_parse_cdata = 16, // \0, ], >, \r - ct_parse_comment = 32 // \0, -, >, \r + ct_parse_comment = 32, // \0, -, >, \r + ct_symbol = 64 // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, . }; - + static unsigned char chartype_table[256] = { - 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31 - 12, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 32, 0, 0, // 32-47 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 48, 0, // 48-63 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 64-79 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, // 80-95 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 96-111 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 112-127 - - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31 + 12, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 
64, 0, // 32-47 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 1, 0, 48, 0, // 48-63 + 0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 64-79 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 0, 16, 0, 64, // 80-95 + 0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 96-111 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 0, 0, 0, 0, // 112-127 + + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }; bool is_chartype(char c, chartype ct) @@ -275,9 +274,6 @@ namespace pugi struct xml_parser_impl { xml_allocator& alloc; - bool chartype_symbol_table[256]; - - bool chartype_symbol(char c) const { return chartype_symbol_table[(unsigned char)c]; } struct gap { @@ -724,8 +720,6 @@ namespace pugi { for (unsigned int c = 0; c < 256; ++c) { - chartype_symbol_table[c] = c > 127 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || c == '_' || c == ':' || c == '-' || c == '.'; } } @@ -756,10 +750,10 @@ namespace pugi if(*s == '?') // '...' { ++s; - if(chartype_symbol(*s) && OPTSET(parse_pi)) + if(is_chartype(*s, ct_symbol) && OPTSET(parse_pi)) { mark = s; - SCANWHILE(chartype_symbol(*s)); // Read PI target + SCANWHILE(is_chartype(*s, ct_symbol)); // Read PI target ENDSEG(); PUSHNODE(node_pi); // Append a new node on the tree. @@ -900,12 +894,12 @@ namespace pugi continue; } } - else if(chartype_symbol(*s)) // '<#...' + else if(is_chartype(*s, ct_symbol)) // '<#...' { cursor = append_node(cursor); // Append a new node to the tree. 
cursor->name = s; - SCANWHILE(chartype_symbol(*s)); // Scan for a terminator. + SCANWHILE(is_chartype(*s, ct_symbol)); // Scan for a terminator. ENDSEG(); // Save char in 'ch', terminate & step over. if (*s!=0 && ch == '/') // 'name = s; // Save the offset. - SCANWHILE(chartype_symbol(*s)); // Scan for a terminator. + SCANWHILE(is_chartype(*s, ct_symbol)); // Scan for a terminator. ENDSEG(); // Save char in 'ch', terminate & step over. if(*s!=0 && is_chartype(ch, ct_space)) SKIPWS(); // Eat any whitespace. if(*s!=0 && (ch == '=' || *s == '=')) // '<... #=...' @@ -1040,7 +1034,7 @@ namespace pugi if (name) { - while (*tagname && chartype_symbol(*tagname)) + while (*tagname && is_chartype(*tagname, ct_symbol)) { if (*tagname++ != *name++) goto TAG_NEXTMATCH; } @@ -1063,7 +1057,7 @@ namespace pugi char* name = cursor->name; if (!name) return s; - while (*s && chartype_symbol(*s)) + while (*s && is_chartype(*s, ct_symbol)) { if (*s++ != *name++) return s; } diff --git a/src/pugixml.hpp b/src/pugixml.hpp index b7ded37..de7939e 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -3,8 +3,6 @@ // Pug Improved XML Parser - Version 0.2 // -------------------------------------------------------- // Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) -// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing -// conversion functions. // This work is based on the pugxml parser, which is: // Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) // Released into the Public Domain. Use at your own risk. -- cgit v1.2.3