From 2777da9faad5a5afe927e7afa0b6cf04e0f34671 Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Fri, 5 Jan 2007 20:05:10 +0000 Subject: Parsing flags refactoring (removed trim flags, eol flags merged together, escapes flags merged together, removed wnorm_pcdata flag, changed wnorm_attribute flag (it's space normalization + trimming now), fixed default flags, changed documentation accordingly git-svn-id: http://pugixml.googlecode.com/svn/trunk@26 99668b35-9821-0410-8761-19e4c4f06640 --- docs/index.html | 39 ++++-------------- src/pugixml.cpp | 124 ++++++++++++++------------------------------------------ src/pugixml.hpp | 29 +++++-------- 3 files changed, 50 insertions(+), 142 deletions(-) diff --git a/docs/index.html b/docs/index.html index 02c8ae9..4066dc4 100644 --- a/docs/index.html +++ b/docs/index.html @@ -275,30 +275,13 @@ So, these are the processing flags:

@@ -329,7 +306,7 @@ correctly). This is controlled by parse_match_end_tags, which is on by de
  • just treat the tag as a closing tag for the node (so that <foo> ... </bar> will be parsed as <foo> ... </foo>). This is the fastest way, and this is what pugxml is doing, but it can corrupt your DOM tree. This way is chosen if both parse_check_end_tags and -parsse_match_end_tags are off. +parse_match_end_tags are off. Note, that these 2 flags are mutually exclusive.

    diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 63bd36a..de3a548 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -252,14 +252,14 @@ namespace pugi static bool chartype_lbracket(char c) { return c == '['; } static bool chartype_rbracket(char c) { return c == ']'; } - template static void strconv_t(char** s) + template static void strconv_t(char** s) { if (!s || !*s) return; - if (!opt_trim && !opt_escape && !opt_wnorm && !opt_wconv && !opt_eol) return; + if (!opt_escape && !opt_wnorm && !opt_wconv && !opt_eol) return; // Trim whitespaces - if (opt_trim) while (chartype_space(**s)) ++(*s); + if (opt_wnorm) while (chartype_space(**s)) ++(*s); char* str = *s; @@ -270,6 +270,7 @@ namespace pugi { if (opt_escape && *str == '&') break; if ((opt_wnorm || opt_wconv || opt_eol) && chartype_space(*str)) break; + ++str; } } @@ -406,7 +407,7 @@ namespace pugi *lastpos++ = *str++; } - if (opt_trim) + if (opt_wnorm) { do *lastpos-- = 0; while (chartype_space(*lastpos)); @@ -414,66 +415,34 @@ namespace pugi else *lastpos = 0; } - static void strconv_setup(void (*&func)(char**), unsigned int opt_trim, unsigned int opt_escape, unsigned int opt_wnorm, unsigned int opt_wconv, unsigned int opt_eol) + static void strconv_setup(void (*&func)(char**), unsigned int opt_escape, unsigned int opt_wnorm, unsigned int opt_wconv, unsigned int opt_eol) { if (opt_eol) { if (opt_wconv) { - if (opt_trim) + if (opt_escape) { - if (opt_escape) - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } - else - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } + if (opt_wnorm) func = &strconv_t; + else func = &strconv_t; } else { - if (opt_escape) - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } - else - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } + if (opt_wnorm) func = &strconv_t; + else func = &strconv_t; } } else { - if (opt_trim) + if (opt_escape) { - if (opt_escape) - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } - else - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } + if (opt_wnorm) func = &strconv_t; + else func = &strconv_t; } else { - if (opt_escape) - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } - else - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } + if (opt_wnorm) func = &strconv_t; + else func = &strconv_t; } } } @@ -481,64 +450,33 @@ namespace pugi { if (opt_wconv) { - if (opt_trim) + if (opt_escape) { - if (opt_escape) - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } - else - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } + if (opt_wnorm) func = &strconv_t; + else func = &strconv_t; } else { - if (opt_escape) - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } - else - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } + if (opt_wnorm) func = &strconv_t; + else func = &strconv_t; } } else { - if (opt_trim) + if (opt_escape) { - if (opt_escape) - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } - else - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } + if (opt_wnorm) func = &strconv_t; + else func = &strconv_t; } else { - if (opt_escape) - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } - else - { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; - } + if (opt_wnorm) func = &strconv_t; + else func = &strconv_t; } } } } + // Allocate & append a new xml_node_struct onto the given parent. // \param parent - pointer to parent node. // \param type - desired node type. @@ -608,8 +546,8 @@ namespace pugi void (*strconv_pcdata)(char**); void (*strconv_attribute)(char**); - strconv_setup(strconv_attribute, OPTSET(parse_trim_attribute), OPTSET(parse_escapes_attribute), OPTSET(parse_wnorm_attribute), OPTSET(parse_wconv_attribute), OPTSET(parse_eol_attribute)); - strconv_setup(strconv_pcdata, OPTSET(parse_trim_pcdata), OPTSET(parse_escapes_pcdata), OPTSET(parse_wnorm_pcdata), false, OPTSET(parse_eol_pcdata)); + strconv_setup(strconv_attribute, OPTSET(parse_escapes), OPTSET(parse_wnorm_attribute), OPTSET(parse_wconv_attribute), OPTSET(parse_eol)); + strconv_setup(strconv_pcdata, OPTSET(parse_escapes), false, false, OPTSET(parse_eol)); char ch = 0; // Current char, in cases where we must null-terminate before we test. xml_node_struct* cursor = xmldoc; // Tree node cursor. @@ -702,9 +640,9 @@ namespace pugi SCANFOR(chartype_rbracket(*s) && chartype_rbracket(*(s+1)) && chartype_leave(*(s+2))); ENDSEG(); // Zero-terminate this segment. - if (OPTSET(parse_eol_cdata)) + if (OPTSET(parse_eol)) { - strconv_t(&cursor->value); + strconv_t(&cursor->value); } POPNODE(); // Pop since this is a standalone. diff --git a/src/pugixml.hpp b/src/pugixml.hpp index 22e5c52..a802e3d 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -48,28 +48,21 @@ namespace pugi const unsigned int parse_pi = 0x00000001; ///< Parse '' const unsigned int parse_comments = 0x00000002; ///< Parse '' const unsigned int parse_cdata = 0x00000004; ///< Parse '' - const unsigned int parse_ws_pcdata = 0x00000008; ///< Skip PCDATA that consists only of whitespaces - const unsigned int parse_ext_pcdata = 0x00000010; ///< Skip PCDATA that is outside all tags (i.e. root) - const unsigned int parse_trim_pcdata = 0x00000020; ///< Trim '>...<' - const unsigned int parse_trim_attribute = 0x00000040; ///< Trim 'foo="..."'. - const unsigned int parse_escapes_pcdata = 0x00000080; ///< Parse <, >, &, ", ', &#.. sequences - const unsigned int parse_escapes_attribute = 0x00000100; ///< Parse <, >, &, ", ', &#.. sequences - const unsigned int parse_wnorm_pcdata = 0x00000200; ///< Normalize spaces in pcdata - const unsigned int parse_wnorm_attribute = 0x00000400; ///< Normalize spaces in attributes - const unsigned int parse_wconv_attribute = 0x00000800; ///< Convert space-like characters to spaces in attributes (only if wnorm is not set) - const unsigned int parse_eol_pcdata = 0x00001000; ///< Perform EOL handling in pcdata - const unsigned int parse_eol_attribute = 0x00002000; ///< Perform EOL handling in attrobites - const unsigned int parse_eol_cdata = 0x00004000; ///< Perform EOL handling in CDATA sections - const unsigned int parse_check_end_tags = 0x00010000; ///< Check start and end tag names and return error if names mismatch - const unsigned int parse_match_end_tags = 0x00020000; ///< Try to find corresponding start tag for an end tag + const unsigned int parse_ws_pcdata = 0x00000008; ///< Do not skip PCDATA that consists only of whitespaces + const unsigned int parse_ext_pcdata = 0x00000010; ///< Do not skip PCDATA that is outside all tags (i.e. root) + const unsigned int parse_escapes = 0x00000020; ///< Parse <, >, &, ", ', &#.. sequences + const unsigned int parse_wnorm_attribute = 0x00000080; ///< Normalize spaces in attributes (convert space-like characters to spaces + merge adjacent spaces + trim leading/trailing spaces) + const unsigned int parse_wconv_attribute = 0x00000100; ///< Convert space-like characters to spaces in attributes (only if wnorm is not set) + const unsigned int parse_eol = 0x00000200; ///< Perform EOL handling + const unsigned int parse_check_end_tags = 0x00000400; ///< Check start and end tag names and return error if names mismatch + const unsigned int parse_match_end_tags = 0x00000800; ///< Try to find corresponding start tag for an end tag ///< Set all flags, except parse_ws_pcdata, parse_trim_attribute, parse_pi and parse_comments - const unsigned int parse_default = 0x00FFFFFF & ~parse_ws_pcdata & ~parse_trim_attribute & ~parse_pi & ~parse_comments; + const unsigned int parse_default = parse_cdata | parse_ext_pcdata | parse_escapes | parse_wconv_attribute | parse_eol | parse_check_end_tags; const unsigned int parse_noset = 0x80000000; ///< Parse with flags in xml_parser const unsigned int parse_w3c = parse_pi | parse_comments | parse_cdata | - parse_escapes_pcdata | parse_escapes_attribute | - parse_wconv_attribute | parse_check_end_tags | - parse_ws_pcdata | parse_eol_cdata; + parse_escapes | parse_wconv_attribute | + parse_check_end_tags | parse_ws_pcdata | parse_eol; /// Forward declarations struct xml_attribute_struct; -- cgit v1.2.3