summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/pugixml.cpp651
-rw-r--r--src/pugixml.hpp4
2 files changed, 431 insertions, 224 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index de3a548..da53c66 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -2,7 +2,9 @@
//
// Pug Improved XML Parser - Version 0.2
// --------------------------------------------------------
-// Copyright (C) 2006, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+// Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing
+// conversion functions.
// This work is based on the pugxml parser, which is:
// Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
// Released into the Public Domain. Use at your own risk.
@@ -110,6 +112,43 @@ namespace
const unsigned char BYTE_MASK_READ = 0x3F;
const unsigned char FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
}
+
+ enum chartype
+ {
+ ct_parse_pcdata = 1, // \0, &, \r, <
+ ct_parse_attr = 2, // \0, &, \r, ', "
+ ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, space, tab
+ ct_space = 8, // \r, \n, space, tab
+ ct_parse_cdata = 16, // \0, ], >, \r
+ ct_parse_comment = 32 // \0, -, >, \r
+
+ };
+
+ static unsigned char chartype_table[256] =
+ {
+ 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31
+ 12, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 32, 0, 0, // 32-47
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 48, 0, // 48-63
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 64-79
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, // 80-95
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 96-111
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 112-127
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+
+ bool is_chartype(char c, chartype ct)
+ {
+ return !!(chartype_table[static_cast<unsigned char>(c)] & ct);
+ }
}
namespace pugi
@@ -237,185 +276,341 @@ namespace pugi
{
xml_allocator& alloc;
bool chartype_symbol_table[256];
-
+
bool chartype_symbol(char c) const { return chartype_symbol_table[(unsigned char)c]; }
- static bool chartype_space(char c) { return c < '!' && c > 0; }
- static bool chartype_enter(char c) { return c == '<'; }
- static bool chartype_leave(char c) { return c == '>'; }
- static bool chartype_close(char c) { return c == '/'; }
- static bool chartype_equals(char c) { return c == '='; }
- static bool chartype_special(char c) { return c == '!'; }
- static bool chartype_pi(char c) { return c == '?'; }
- static bool chartype_dash(char c) { return c == '-'; }
- static bool chartype_quote(char c) { return c == '"' || c == '\''; }
- static bool chartype_lbracket(char c) { return c == '['; }
- static bool chartype_rbracket(char c) { return c == ']'; }
-
- template <bool opt_escape, bool opt_wnorm, bool opt_wconv, bool opt_eol> static void strconv_t(char** s)
+ struct gap
{
- if (!s || !*s) return;
-
- if (!opt_escape && !opt_wnorm && !opt_wconv && !opt_eol) return;
-
- // Trim whitespaces
- if (opt_wnorm) while (chartype_space(**s)) ++(*s);
+ char* end;
+ size_t size;
- char* str = *s;
+ gap(): end(0), size(0)
+ {
+ }
- // Skip usual symbols
- if (opt_escape || opt_wnorm || opt_wconv || opt_eol)
+ // Push new gap, move s count bytes further (skipping the gap).
+ // Collapse previous gap.
+ void push(char*& s, size_t count)
{
- while (*str)
+ if (end) // there was a gap already; collapse it
{
- if (opt_escape && *str == '&') break;
- if ((opt_wnorm || opt_wconv || opt_eol) && chartype_space(*str)) break;
-
- ++str;
+ // Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
+ memmove(end - size, end, s - end);
}
+
+ s += count; // end of current gap
+
+ // "merge" two gaps
+ end = s;
+ size += count;
}
+
+ // Collapse all gaps, return past-the-end pointer
+ char* flush(char* s)
+ {
+ if (end)
+ {
+ // Move [old_gap_end, current_pos) to [old_gap_start, ...)
+ memmove(end - size, end, s - end);
- char* lastpos = str;
-
- if (!*str) return;
+ return s - size;
+ }
+ else return s;
+ }
+ };
+
+ static char* strconv_escape(char* s, gap& g)
+ {
+ char* stre = s + 1;
- while (*str)
+ switch (*stre)
{
- if (*str == '&' && opt_escape) // &
+ case '#': // &#...
{
- char* stre = str + 1;
+ unsigned int ucsc = 0;
+
+ ++stre;
- switch (*stre)
+ if (*stre == 'x') // &#x... (hex code)
{
- case '#': // &#...
+ ++stre;
+
+ while (*stre)
{
- unsigned int ucsc = 0;
-
- ++stre;
+ if (*stre >= '0' && *stre <= '9')
+ ucsc = 16 * ucsc + (*stre++ - '0');
+ else if (*stre >= 'A' && *stre <= 'F')
+ ucsc = 16 * ucsc + (*stre++ - 'A' + 10);
+ else if (*stre >= 'a' && *stre <= 'f')
+ ucsc = 16 * ucsc + (*stre++ - 'a' + 10);
+ else break;
+ }
- if (*stre == 'x') // &#x... (hex code)
- {
- ++stre;
-
- while (*stre)
- {
- if (*stre >= '0' && *stre <= '9')
- ucsc = 16 * ucsc + (*stre++ - '0');
- else if (*stre >= 'A' && *stre <= 'F')
- ucsc = 16 * ucsc + (*stre++ - 'A' + 10);
- else if (*stre >= 'a' && *stre <= 'f')
- ucsc = 16 * ucsc + (*stre++ - 'a' + 10);
- else break;
- }
+ if (*stre == ';') ++stre;
+ }
+ else // &#... (dec code)
+ {
+ while (*stre >= '0' && *stre <= '9')
+ ucsc = 10 * ucsc + (*stre++ - '0');
- if (*stre == ';') ++stre;
- }
- else // &#... (dec code)
- {
- while (*stre >= '0' && *stre <= '9')
- ucsc = 10 * ucsc + (*stre++ - '0');
+ if (*stre == ';') ++stre;
+ }
- if (*stre == ';') ++stre;
- }
+ s = strutf16_utf8(s, ucsc);
+
+ g.push(s, stre - s);
+ return stre;
+ }
+ case 'a': // &a
+ {
+ ++stre;
- str = stre;
- lastpos = strutf16_utf8(lastpos, ucsc);
- continue;
- }
- case 'a': // &a
+ if (*stre == 'm') // &am
+ {
+ if (*++stre == 'p' && *++stre == ';') // &amp;
{
+ *s++ = '&';
++stre;
-
- if (*stre == 'm') // &am
- {
- if (*++stre == 'p' && *++stre == ';') // &amp;
- {
- *lastpos++ = '&';
- str = ++stre;
- continue;
- }
- }
- else if (*stre == 'p') // &ap
- {
- if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // &apos;
- {
- *lastpos++ = '\'';
- str = ++stre;
- continue;
- }
- }
- break;
- }
- case 'g': // &g
- {
- if (*++stre == 't' && *++stre == ';') // &gt;
- {
- *lastpos++ = '>';
- str = ++stre;
- continue;
- }
- break;
- }
- case 'l': // &l
- {
- if (*++stre == 't' && *++stre == ';') // &lt;
- {
- *lastpos++ = '<';
- str = ++stre;
- continue;
- }
- break;
+
+ g.push(s, stre - s);
+ return stre;
}
- case 'q': // &q
+ }
+ else if (*stre == 'p') // &ap
+ {
+ if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // &apos;
{
- if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // &quot;
- {
- *lastpos++ = '"';
- str = ++stre;
- continue;
- }
- break;
+ *s++ = '\'';
+ ++stre;
+
+ g.push(s, stre - s);
+ return stre;
}
}
+ break;
}
- else if (chartype_space(*str) && opt_wnorm)
+ case 'g': // &g
{
- *lastpos++ = ' ';
-
- while (chartype_space(*str)) ++str;
-
- continue;
+ if (*++stre == 't' && *++stre == ';') // &gt;
+ {
+ *s++ = '>';
+ ++stre;
+
+ g.push(s, stre - s);
+ return stre;
+ }
+ break;
}
- else if (chartype_space(*str) && opt_wconv)
+ case 'l': // &l
{
- if (*str == 0x0d && *(str + 1) == 0x0a) ++str;
-
- ++str;
- *lastpos++ = ' ';
+ if (*++stre == 't' && *++stre == ';') // &lt;
+ {
+ *s++ = '<';
+ ++stre;
+
+ g.push(s, stre - s);
+ return stre;
+ }
+ break;
+ }
+ case 'q': // &q
+ {
+ if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // &quot;
+ {
+ *s++ = '"';
+ ++stre;
+
+ g.push(s, stre - s);
+ return stre;
+ }
+ break;
+ }
+ }
+
+ return stre;
+ }
- continue;
+ static char* strconv_comment(char* s)
+ {
+ if (!*s) return 0;
+
+ gap g;
+
+ while (true)
+ {
+ while (!is_chartype(*s, ct_parse_comment)) ++s;
+
+ if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+ {
+ *s++ = '\n'; // replace first one with 0x0a
+
+ if (*s == '\n') g.push(s, 1);
}
- else if (*str == 0x0d && !opt_wnorm && opt_eol)
+ else if (*s == '-' && *(s+1) == '-' && *(s+2) == '>') // comment ends here
{
- if (*(str + 1) == 0x0a) ++str;
- ++str;
- *lastpos++ = 0x0a;
+ *g.flush(s) = 0;
+
+ return s + 3;
+ }
+ else if (*s == 0)
+ {
+ return 0;
+ }
+ else ++s;
+ }
+ }
- continue;
+ static char* strconv_cdata(char* s)
+ {
+ if (!*s) return 0;
+
+ gap g;
+
+ while (true)
+ {
+ while (!is_chartype(*s, ct_parse_cdata)) ++s;
+
+ if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+ {
+ *s++ = '\n'; // replace first one with 0x0a
+
+ if (*s == '\n') g.push(s, 1);
}
+ else if (*s == ']' && *(s+1) == ']' && *(s+2) == '>') // CDATA ends here
+ {
+ *g.flush(s) = 0;
+
+ return s + 1;
+ }
+ else if (*s == 0)
+ {
+ return 0;
+ }
+ else ++s;
+ }
+ }
+
+ template <bool opt_escape, bool opt_eol> static char* strconv_pcdata(char* s)
+ {
+ if (!*s) return 0;
+
+ gap g;
+
+ while (true)
+ {
+ while (!is_chartype(*s, ct_parse_pcdata)) ++s;
- *lastpos++ = *str++;
+ if (opt_eol && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+ {
+ *s++ = '\n'; // replace first one with 0x0a
+
+ if (*s == '\n') g.push(s, 1);
+ }
+ else if (opt_escape && *s == '&')
+ {
+ s = strconv_escape(s, g);
+ }
+ else if (*s == '<') // PCDATA ends here
+ {
+ *g.flush(s) = 0;
+
+ return s + 1;
+ }
+ else if (*s == 0)
+ {
+ return 0;
+ }
+ else ++s;
}
+ }
+
+ static char* strconv_pcdata(char* s, unsigned int opt_escape, unsigned int opt_eol)
+ {
+ if (opt_escape)
+ return opt_eol ? strconv_pcdata<true, true>(s) : strconv_pcdata<true, false>(s);
+ else
+ return opt_eol ? strconv_pcdata<false, true>(s) : strconv_pcdata<false, false>(s);
+ }
+ template <bool opt_escape, bool opt_wnorm, bool opt_wconv, bool opt_eol> static char* strconv_attr(char* s, char end_quote)
+ {
+ if (!*s) return 0;
+
+ gap g;
+
+ // Trim whitespaces
if (opt_wnorm)
{
- do *lastpos-- = 0;
- while (chartype_space(*lastpos));
+ char* str = s;
+
+ while (is_chartype(*str, ct_space)) ++str;
+
+ if (str != s)
+ g.push(s, str - s);
+ }
+
+ while (true)
+ {
+ while (!is_chartype(*s, (opt_wnorm || opt_wconv) ? ct_parse_attr_ws : ct_parse_attr)) ++s;
+
+ if (opt_escape && *s == '&')
+ {
+ s = strconv_escape(s, g);
+ }
+ else if (opt_wnorm && is_chartype(*s, ct_space))
+ {
+ *s++ = ' ';
+
+ if (is_chartype(*s, ct_space))
+ {
+ char* str = s + 1;
+ while (is_chartype(*str, ct_space)) ++str;
+
+ g.push(s, str - s);
+ }
+ }
+ else if (opt_wconv && is_chartype(*s, ct_space))
+ {
+ if (opt_eol)
+ {
+ if (*s == '\r')
+ {
+ *s++ = ' ';
+
+ if (*s == '\n') g.push(s, 1);
+ }
+ else *s++ = ' ';
+ }
+ else *s++ = ' ';
+ }
+ else if (opt_eol && *s == '\r')
+ {
+ *s++ = '\n';
+
+ if (*s == '\n') g.push(s, 1);
+ }
+ else if (*s == end_quote)
+ {
+ char* str = g.flush(s);
+
+ if (opt_wnorm)
+ {
+ do *str-- = 0;
+ while (is_chartype(*str, ct_space));
+ }
+ else *str = 0;
+
+ return s + 1;
+ }
+ else if (!*s)
+ {
+ return 0;
+ }
+ else ++s;
}
- else *lastpos = 0;
}
- static void strconv_setup(void (*&func)(char**), unsigned int opt_escape, unsigned int opt_wnorm, unsigned int opt_wconv, unsigned int opt_eol)
+ static void strconv_attr_setup(char* (*&func)(char*, char), unsigned int opt_escape, unsigned int opt_wnorm, unsigned int opt_wconv, unsigned int opt_eol)
{
if (opt_eol)
{
@@ -423,26 +618,26 @@ namespace pugi
{
if (opt_escape)
{
- if (opt_wnorm) func = &strconv_t<true, true, true, true>;
- else func = &strconv_t<true, false, true, true>;
+ if (opt_wnorm) func = &strconv_attr<true, true, true, true>;
+ else func = &strconv_attr<true, false, true, true>;
}
else
{
- if (opt_wnorm) func = &strconv_t<false, true, true, true>;
- else func = &strconv_t<false, false, true, true>;
+ if (opt_wnorm) func = &strconv_attr<false, true, true, true>;
+ else func = &strconv_attr<false, false, true, true>;
}
}
else
{
if (opt_escape)
{
- if (opt_wnorm) func = &strconv_t<true, true, false, true>;
- else func = &strconv_t<true, false, false, true>;
+ if (opt_wnorm) func = &strconv_attr<true, true, false, true>;
+ else func = &strconv_attr<true, false, false, true>;
}
else
{
- if (opt_wnorm) func = &strconv_t<false, true, false, true>;
- else func = &strconv_t<false, false, false, true>;
+ if (opt_wnorm) func = &strconv_attr<false, true, false, true>;
+ else func = &strconv_attr<false, false, false, true>;
}
}
}
@@ -452,26 +647,26 @@ namespace pugi
{
if (opt_escape)
{
- if (opt_wnorm) func = &strconv_t<true, true, true, false>;
- else func = &strconv_t<true, false, true, false>;
+ if (opt_wnorm) func = &strconv_attr<true, true, true, false>;
+ else func = &strconv_attr<true, false, true, false>;
}
else
{
- if (opt_wnorm) func = &strconv_t<false, true, true, false>;
- else func = &strconv_t<false, false, true, false>;
+ if (opt_wnorm) func = &strconv_attr<false, true, true, false>;
+ else func = &strconv_attr<false, false, true, false>;
}
}
else
{
if (opt_escape)
{
- if (opt_wnorm) func = &strconv_t<true, true, false, false>;
- else func = &strconv_t<true, false, false, false>;
+ if (opt_wnorm) func = &strconv_attr<true, true, false, false>;
+ else func = &strconv_attr<true, false, false, false>;
}
else
{
- if (opt_wnorm) func = &strconv_t<false, true, false, false>;
- else func = &strconv_t<false, false, false, false>;
+ if (opt_wnorm) func = &strconv_attr<false, true, false, false>;
+ else func = &strconv_attr<false, false, false, false>;
}
}
}
@@ -517,7 +712,7 @@ namespace pugi
}
// Parser utilities.
- #define SKIPWS() { while(chartype_space(*s)) ++s; if(*s==0) return s; }
+ #define SKIPWS() { while(is_chartype(*s, ct_space)) ++s; if(*s==0) return s; }
#define OPTSET(OPT) ( optmsk & OPT )
#define PUSHNODE(TYPE) { cursor = append_node(cursor,TYPE); }
#define POPNODE() { cursor = cursor->parent; }
@@ -543,11 +738,9 @@ namespace pugi
{
if(!s || !xmldoc) return s;
- void (*strconv_pcdata)(char**);
- void (*strconv_attribute)(char**);
+ char* (*strconv_attribute)(char*, char);
- strconv_setup(strconv_attribute, OPTSET(parse_escapes), OPTSET(parse_wnorm_attribute), OPTSET(parse_wconv_attribute), OPTSET(parse_eol));
- strconv_setup(strconv_pcdata, OPTSET(parse_escapes), false, false, OPTSET(parse_eol));
+ strconv_attr_setup(strconv_attribute, OPTSET(parse_escapes), OPTSET(parse_wnorm_attribute), OPTSET(parse_wconv_attribute), OPTSET(parse_eol));
char ch = 0; // Current char, in cases where we must null-terminate before we test.
xml_node_struct* cursor = xmldoc; // Tree node cursor.
@@ -555,12 +748,12 @@ namespace pugi
while(*s!=0)
{
LOC_SEARCH: // Obliviously search for next element.
- SCANFOR(chartype_enter(*s)); // Find the next '<'.
- if(chartype_enter(*s))
+ SCANFOR(*s == '<'); // Find the next '<'.
+ if(*s == '<')
{
++s;
LOC_CLASSIFY: // What kind of element?
- if(chartype_pi(*s)) // '<?...'
+ if(*s == '?') // '<?...'
{
++s;
if(chartype_symbol(*s) && OPTSET(parse_pi))
@@ -573,7 +766,7 @@ namespace pugi
cursor->name = mark;
- if (chartype_space(ch))
+ if (is_chartype(ch, ct_space))
{
SKIPWS();
@@ -581,7 +774,7 @@ namespace pugi
}
else mark = 0;
- SCANFOR(chartype_pi(*s) && chartype_leave(*(s+1))); // Look for '?>'.
+ SCANFOR(*s == '?' && *(s+1) == '>'); // Look for '?>'.
ENDSEG();
cursor->value = mark;
@@ -592,18 +785,18 @@ namespace pugi
}
else // Bad PI or parse_pi not set.
{
- SCANFOR(chartype_pi(*s) && chartype_leave(*(s+1))); // Look for '?>'.
+ SCANFOR(*s == '?' && *(s+1) == '>'); // Look for '?>'.
++s;
goto LOC_LEAVE;
}
}
- else if(chartype_special(*s)) // '<!...'
+ else if(*s == '!') // '<!...'
{
++s;
- if(chartype_dash(*s)) // '<!-...'
+ if(*s == '-') // '<!-...'
{
++s;
- if(chartype_dash(*s)) // '<!--...'
+ if(*s == '-') // '<!--...'
{
++s;
@@ -613,36 +806,53 @@ namespace pugi
cursor->value = s; // Save the offset.
}
- // Scan for terminating '-->'.
- SCANFOR(chartype_dash(*s) && chartype_dash(*(s+1)) && chartype_leave(*(s+2)));
+ if (OPTSET(parse_eol) && OPTSET(parse_comments))
+ {
+ s = strconv_comment(s);
+
+ if (!s) return s;
+ }
+ else
+ {
+ // Scan for terminating '-->'.
+ SCANFOR(*s == '-' && *(s+1) == '-' && *(s+2) == '>');
+
+ if (OPTSET(parse_comments))
+ *s = 0; // Zero-terminate this segment at the first terminating '-'.
+
+ s += 2; // Step over the '\0-'.
+ }
if (OPTSET(parse_comments))
{
- *s = 0; // Zero-terminate this segment at the first terminating '-'.
POPNODE(); // Pop since this is a standalone.
}
- s += 2; // Step over the '\0-'.
goto LOC_LEAVE; // Look for any following PCDATA.
}
}
- else if(chartype_lbracket(*s))
+ else if(*s == '[')
{
// '<![CDATA[...'
- if(*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && chartype_lbracket(*++s))
+ if(*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
{
++s;
if(OPTSET(parse_cdata))
{
PUSHNODE(node_cdata); // Append a new node on the tree.
cursor->value = s; // Save the offset.
- // Scan for terminating ']]>'.
- SCANFOR(chartype_rbracket(*s) && chartype_rbracket(*(s+1)) && chartype_leave(*(s+2)));
- ENDSEG(); // Zero-terminate this segment.
if (OPTSET(parse_eol))
{
- strconv_t<false, false, false, true>(&cursor->value);
+ s = strconv_cdata(s);
+
+ if (!s) return s;
+ }
+ else
+ {
+ // Scan for terminating ']]>'.
+ SCANFOR(*s == ']' && *(s+1) == ']' && *(s+2) == '>');
+ ENDSEG(); // Zero-terminate this segment.
}
POPNODE(); // Pop since this is a standalone.
@@ -650,7 +860,7 @@ namespace pugi
else // Flagged for discard, but we still have to scan for the terminator.
{
// Scan for terminating ']]>'.
- SCANFOR(chartype_rbracket(*s) && chartype_rbracket(*(s+1)) && chartype_leave(*(s+2)));
+ SCANFOR(*s == ']' && *(s+1) == ']' && *(s+2) == '>');
++s;
}
++s; // Step over the last ']'.
@@ -663,27 +873,27 @@ namespace pugi
++s;
SKIPWS(); // Eat any whitespace.
LOC_DOCTYPE:
- SCANWHILE(chartype_quote(*s) || chartype_lbracket(*s) || chartype_leave(*s));
- if(chartype_quote(*s)) // '...SYSTEM "..."
+ SCANWHILE(*s == '\'' || *s == '"' || *s == '[' || *s == '>');
+ if(*s == '\'' || *s == '"') // '...SYSTEM "..."
{
ch = *s++;
SCANFOR(*s == ch);
++s;
goto LOC_DOCTYPE;
}
- if(chartype_lbracket(*s)) // '...[...'
+ if(*s == '[') // '...[...'
{
++s;
unsigned int bd = 1; // Bracket depth counter.
while(*s!=0) // Loop till we're out of all brackets.
{
- if(chartype_rbracket(*s)) --bd;
- else if(chartype_lbracket(*s)) ++bd;
+ if(*s == ']') --bd;
+ else if(*s == '[') ++bd;
if(bd == 0) break;
++s;
}
// Note: 's' now points to end of DTD, i.e.: ']'.
- SCANFOR(chartype_leave(*s));
+ SCANFOR(*s == '>');
continue;
}
// Fall-through
@@ -697,19 +907,19 @@ namespace pugi
cursor->name = s;
SCANWHILE(chartype_symbol(*s)); // Scan for a terminator.
ENDSEG(); // Save char in 'ch', terminate & step over.
- if (*s!=0 && chartype_close(ch)) // '</...'
+ if (*s!=0 && ch == '/') // '</...'
{
- SCANFOR(chartype_leave(*s)); // Scan for '>'
+ SCANFOR(*s == '>'); // Scan for '>'
POPNODE(); // Pop.
goto LOC_LEAVE;
}
- else if(*s!=0 && !chartype_space(ch))
+ else if(*s!=0 && !is_chartype(ch, ct_space))
{
- if (!chartype_leave(ch)) SCANWHILE(!chartype_leave(*s));
+ if (ch != '>') SCANWHILE(*s != '>');
if (!*s) return s;
goto LOC_PCDATA; // No attributes, so scan for PCDATA.
}
- else if(*s!=0 && chartype_space(ch))
+ else if(*s!=0 && is_chartype(ch, ct_space))
{
SKIPWS(); // Eat any whitespace.
LOC_ATTRIBUTE:
@@ -719,34 +929,34 @@ namespace pugi
a->name = s; // Save the offset.
SCANWHILE(chartype_symbol(*s)); // Scan for a terminator.
ENDSEG(); // Save char in 'ch', terminate & step over.
- if(*s!=0 && chartype_space(ch)) SKIPWS(); // Eat any whitespace.
- if(*s!=0 && (chartype_equals(ch) || chartype_equals(*s))) // '<... #=...'
+ if(*s!=0 && is_chartype(ch, ct_space)) SKIPWS(); // Eat any whitespace.
+ if(*s!=0 && (ch == '=' || *s == '=')) // '<... #=...'
{
- if(chartype_equals(*s)) ++s;
+ if(*s == '=') ++s;
SKIPWS(); // Eat any whitespace.
- if(chartype_quote(*s)) // '<... #="...'
+ if(*s == '\'' || *s == '"') // '<... #="...'
{
ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
++s; // Step over the quote.
a->value = s; // Save the offset.
- SCANFOR(*s == ch); // Scan for the terminating quote
- ENDSEG(); // Save char in 'ch', terminate & step over.
- strconv_attribute(&a->value);
+ s = strconv_attribute(s, ch);
+
+ if (!s) return s;
- if(chartype_leave(*s))
+ if(*s == '>')
{
++s;
goto LOC_PCDATA;
}
- else if(chartype_close(*s))
+ else if(*s == '/')
{
++s;
POPNODE();
SKIPWS(); // Eat any whitespace.
goto LOC_LEAVE;
}
- else if(chartype_space(*s)) // This may indicate a following attribute.
+ else if(is_chartype(*s, ct_space)) // This may indicate a following attribute.
{
SKIPWS(); // Eat any whitespace.
goto LOC_ATTRIBUTE; // Go scan for additional attributes.
@@ -756,19 +966,19 @@ namespace pugi
goto LOC_ATTRIBUTE;
}
- SCANWHILE(!chartype_leave(*s) && !chartype_close(*s));
+ SCANWHILE(*s != '>' && *s != '/');
}
LOC_LEAVE:
- if(chartype_leave(*s)) // '...>'
+ if(*s == '>') // '...>'
{
++s; // Step over the '>'.
LOC_PCDATA: // '>...<'
mark = s; // Save this offset while searching for a terminator.
SKIPWS(); // Eat whitespace if no genuine PCDATA here.
// We hit a '<...', with only whitespace, so don't bother storing anything.
- if((mark == s || !OPTSET(parse_ws_pcdata)) && chartype_enter(*s))
+ if((mark == s || !OPTSET(parse_ws_pcdata)) && *s == '<')
{
- if(chartype_close(*(s+1))) // '</...'
+ if(*(s+1) == '/') // '</...'
{
++s;
goto LOC_CLOSE;
@@ -784,34 +994,29 @@ namespace pugi
{
PUSHNODE(node_pcdata); // Append a new node on the tree.
cursor->value = s; // Save the offset.
- }
-
- while (*s && !chartype_enter(*s)) ++s; // '...<'
-
- if (preserve)
- {
- if (*s) ENDSEG(); // Save char in 'ch', terminate & step over.
- strconv_pcdata(&cursor->value);
+ s = strconv_pcdata(s, OPTSET(parse_escapes), OPTSET(parse_eol));
+
+ if (!s) return s;
POPNODE(); // Pop since this is a standalone.
}
-
- if (!*s) return s;
-
- if(chartype_enter(ch)) // Did we hit a '<...'?
+ else
{
- if(chartype_close(*s)) goto LOC_CLOSE;
- else if(chartype_special(*s)) goto LOC_CLASSIFY; // We hit a '<!...'. We must test this here if we want comments intermixed w/PCDATA.
- else if(*s) goto LOC_CLASSIFY;
- else return s;
+ SCANFOR(*s == '<'); // '...<'
}
+
+ // We're after '<...', otherwise we would not get here
+ if(*s == '/') goto LOC_CLOSE;
+ else if(*s == '!') goto LOC_CLASSIFY; // We hit a '<!...'. We must test this here if we want comments intermixed w/PCDATA.
+ else if(*s) goto LOC_CLASSIFY;
+ else return s;
}
// Fall-through A.
- else if(chartype_close(*s)) // '.../'
+ else if(*s == '/') // '.../'
{
++s;
- if(chartype_leave(*s)) // '.../>'
+ if(*s == '>') // '.../>'
{
POPNODE(); // Pop.
goto LOC_LEAVE;
@@ -819,7 +1024,7 @@ namespace pugi
}
}
// Fall-through B.
- else if(chartype_close(*s)) // '.../'
+ else if(*s == '/') // '.../'
{
LOC_CLOSE:
++s;
@@ -868,7 +1073,7 @@ namespace pugi
}
else
{
- SCANFOR(chartype_leave(*s)); // '...>'
+ SCANFOR(*s == '>'); // '...>'
POPNODE(); // Pop.
}
diff --git a/src/pugixml.hpp b/src/pugixml.hpp
index a802e3d..b7ded37 100644
--- a/src/pugixml.hpp
+++ b/src/pugixml.hpp
@@ -2,7 +2,9 @@
//
// Pug Improved XML Parser - Version 0.2
// --------------------------------------------------------
-// Copyright (C) 2006, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+// Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing
+// conversion functions.
// This work is based on the pugxml parser, which is:
// Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
// Released into the Public Domain. Use at your own risk.