From 0a747e6c1aba8218bda284477701044328fc50bb Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 25 Feb 2014 03:41:54 +0000 Subject: Add parse_trim_pcdata parse option. git-svn-id: https://pugixml.googlecode.com/svn/trunk@987 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 45 ++++++++++++++++++++++++++++++++------------- src/pugixml.hpp | 5 ++++- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 634192a..754f92f 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1875,19 +1875,27 @@ PUGI__NS_BEGIN typedef char_t* (*strconv_pcdata_t)(char_t*); - template struct strconv_pcdata_impl + template struct strconv_pcdata_impl { static char_t* parse(char_t* s) { gap g; - + + char_t* begin = s; + while (true) { while (!PUGI__IS_CHARTYPE(*s, ct_parse_pcdata)) ++s; if (*s == '<') // PCDATA ends here { - *g.flush(s) = 0; + char_t* end = g.flush(s); + + if (opt_trim::value) + while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space)) + --end; + + *end = 0; return s + 1; } @@ -1903,8 +1911,14 @@ PUGI__NS_BEGIN } else if (*s == 0) { - *g.flush(s) = 0; - + char_t* end = g.flush(s); + + if (opt_trim::value) + while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space)) + --end; + + *end = 0; + return s; } else ++s; @@ -1914,14 +1928,18 @@ PUGI__NS_BEGIN PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask) { - PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20); + PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800); - switch ((optmask >> 4) & 3) // get bitmask for flags (eol escapes) + switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) // get bitmask for flags (eol escapes trim) { - case 0: return strconv_pcdata_impl::parse; - case 1: return strconv_pcdata_impl::parse; - case 2: return strconv_pcdata_impl::parse; - case 3: return strconv_pcdata_impl::parse; + case 0: return strconv_pcdata_impl::parse; + case 1: return strconv_pcdata_impl::parse; + case 2: return strconv_pcdata_impl::parse; + case 3: return strconv_pcdata_impl::parse; + case 4: return strconv_pcdata_impl::parse; + case 5: return strconv_pcdata_impl::parse; + case 6: return strconv_pcdata_impl::parse; + case 7: return strconv_pcdata_impl::parse; default: assert(false); return 0; // should not get here } } @@ -2636,7 +2654,7 @@ PUGI__NS_BEGIN // We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one assert(mark != s); - if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single)) + if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single) || PUGI__OPTSET(parse_trim_pcdata)) { continue; } @@ -2646,7 +2664,8 @@ PUGI__NS_BEGIN } } - s = mark; + if (!PUGI__OPTSET(parse_trim_pcdata)) + s = mark; if (cursor->parent || PUGI__OPTSET(parse_fragment)) { diff --git a/src/pugixml.hpp b/src/pugixml.hpp index e5009fe..b912127 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -151,9 +151,12 @@ namespace pugi // This flag is off by default; turning it on may result in slower parsing and more memory consumption. const unsigned int parse_ws_pcdata_single = 0x0400; + // This flag determines if leading and trailing whitespace is to be removed from plain character data. This flag is off by default. + const unsigned int parse_trim_pcdata = 0x0800; + // This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document // is a valid document. This flag is off by default. - const unsigned int parse_fragment = 0x0800; + const unsigned int parse_fragment = 0x1000; // The default parsing mode. // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, -- cgit v1.2.3