From 624b5702d75d63dde56fbbc89680358241035aa7 Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Fri, 11 Jun 2010 20:39:57 +0000 Subject: Rewritten numeric character reference parsing (fixed &#; and &#x; parsing), added more character reference tests git-svn-id: http://pugixml.googlecode.com/svn/trunk@512 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 51 +++++++++++++++++++++++++++++++-------------------- tests/test_parse.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 20 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 242202e..68a68d0 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1374,37 +1374,48 @@ namespace { unsigned int ucsc = 0; - ++stre; - - if (*stre == 'x') // &#x... (hex code) + if (stre[1] == 'x') // &#x... (hex code) { - ++stre; - - while (*stre) + stre += 2; + + char_t ch = *stre; + + if (ch == ';') return stre; + + for (;;) { - if (*stre >= '0' && *stre <= '9') - ucsc = 16 * ucsc + (*stre++ - '0'); - else if (*stre >= 'A' && *stre <= 'F') - ucsc = 16 * ucsc + (*stre++ - 'A' + 10); - else if (*stre >= 'a' && *stre <= 'f') - ucsc = 16 * ucsc + (*stre++ - 'a' + 10); - else if (*stre == ';') + if (static_cast(ch - '0') <= 9) + ucsc = 16 * ucsc + (ch - '0'); + else if (static_cast((ch | ' ') - 'a') <= 5) + ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10); + else if (ch == ';') break; else // cancel return stre; - } - if (*stre != ';') return stre; - + ch = *++stre; + } + ++stre; } else // &#... (dec code) { - while (*stre >= '0' && *stre <= '9') - ucsc = 10 * ucsc + (*stre++ - '0'); + char_t ch = *++stre; - if (*stre != ';') return stre; - + if (ch == ';') return stre; + + for (;;) + { + if (static_cast(ch - '0') <= 9) + ucsc = 10 * ucsc + (ch - '0'); + else if (ch == ';') + break; + else // cancel + return stre; + + ch = *++stre; + } + ++stre; } diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index c2f56e5..50f8867 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -281,6 +281,47 @@ TEST(parse_escapes_code) CHECK_STRING(doc.child_value(STR("node")), STR("\01 ")); } +TEST(parse_escapes_code_exhaustive_dec) +{ + xml_document doc; + CHECK(doc.load(STR("&#/; &#:;&#a;&#A; "), parse_minimal | parse_escapes)); + CHECK_STRING(doc.child_value(STR("node")), STR("&#/;\x1\x2\x3\x4\x5\x6\x7\x8\x9&#:;&#a;&#A; ")); +} + +TEST(parse_escapes_code_exhaustive_hex) +{ + xml_document doc; + CHECK(doc.load(STR("&#x/; &#x:;&#x@; &#xG;&#x`; &#xg;"), parse_minimal | parse_escapes)); + CHECK_STRING(doc.child_value(STR("node")), STR("&#x/;\x1\x2\x3\x4\x5\x6\x7\x8\x9&#x:;&#x@;\xa\xb\xc\xd\xe\xf&#xG;&#x`;\xa\xb\xc\xd\xe\xf&#xg;")); +} + +TEST(parse_escapes_code_restore) +{ + xml_document doc; + CHECK(doc.load(STR("  - - "), parse_minimal | parse_escapes)); + CHECK_STRING(doc.child_value(STR("node")), STR("  - - ")); +} + +TEST(parse_escapes_char_restore) +{ + xml_document doc; + + CHECK(doc.load(STR("&q &qu &quo " "), parse_minimal | parse_escapes)); + CHECK_STRING(doc.child_value(STR("node")), STR("&q &qu &quo " ")); + + CHECK(doc.load(STR("&a &ap &apo &apos "), parse_minimal | parse_escapes)); + CHECK_STRING(doc.child_value(STR("node")), STR("&a &ap &apo &apos ")); + + CHECK(doc.load(STR("&a &am & "), parse_minimal | parse_escapes)); + CHECK_STRING(doc.child_value(STR("node")), STR("&a &am & ")); + + CHECK(doc.load(STR("&l < "), parse_minimal | parse_escapes)); + CHECK_STRING(doc.child_value(STR("node")), STR("&l < ")); + + CHECK(doc.load(STR("&g > "), parse_minimal | parse_escapes)); + CHECK_STRING(doc.child_value(STR("node")), STR("&g > ")); +} + TEST(parse_escapes_unicode) { xml_document doc; @@ -314,6 +355,13 @@ TEST(parse_escapes_error) CHECK(!doc.load(STR("