From 2777da9faad5a5afe927e7afa0b6cf04e0f34671 Mon Sep 17 00:00:00 2001
From: "arseny.kapoulkine"
Date: Fri, 5 Jan 2007 20:05:10 +0000
Subject: Parsing flags refactoring (removed trim flags, eol flags merged
together, escapes flags merged together, removed wnorm_pcdata flag, changed
wnorm_attribute flag (it's space normalization + trimming now), fixed default
flags, changed documentation accordingly)
git-svn-id: http://pugixml.googlecode.com/svn/trunk@26 99668b35-9821-0410-8761-19e4c4f06640
---
docs/index.html | 39 ++++--------------
src/pugixml.cpp | 124 ++++++++++++++------------------------------------------
src/pugixml.hpp | 29 +++++--------
3 files changed, 50 insertions(+), 142 deletions(-)
diff --git a/docs/index.html b/docs/index.html
index 02c8ae9..4066dc4 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -275,30 +275,13 @@ So, these are the processing flags:
-- If parse_trim_pcdata is on, then the trimming of leading/trailing space-like characters
-is performed for PCDATA content
-
Default value: on
-
In W3C mode: off
-- If parse_trim_attribute is on, then the trimming of leading/trailing space-like characters
-is performed for attribute values (this is non-standard behavior and is here only for compatibility
-reasons (PugXML had this flag).
-
Default value: off
-
In W3C mode: off
-- If parse_escapes_pcdata is on, then the character reference expansion is done for PCDATA
-content (replacing &lt; with <, &#x4c; with L, etc.).
-
Default value: on
-
In W3C mode: on
-- If parse_escapes_attribute is on, then the character reference expansion is done for
-attribute values (replacing &lt; with <, &#x4c; with L, etc.).
+
- If parse_escapes is on, then the character reference expansion is done for PCDATA content
+and for attribute values (replacing &lt; with <, &#x4c; with L, etc.).
Default value: on
In W3C mode: on
-- If parse_wnorm_pcdata is on, then the whitespace normalisation is done for PCDATA content
-(this includes replacing any space-like character by a space character and converting sequences of
-spaces into a single space)
-
Default value: on
-
In W3C mode: off
- If parse_wnorm_attribute is on, then the whitespace normalisation is done for attribute
-values
+values (this includes replacing any space-like character by a space character, converting sequences of
+spaces into a single space and trimming of leading/trailing spaces)
Default value: on
In W3C mode: off
- If parse_wconv_attribute is on, then the whitespace conversion is done for attribute
@@ -306,15 +289,9 @@ values (this is a subset of whitespace normalization, and includes only replacin
with spaces). If parse_wnorm_attribute is on, this flag has no effect.
Default value: on
In W3C mode: on
-- If parse_eol_pcdata is on, then the end-of-line handling is done for PCDATA content (this
-includes converting any pair of 0x0d 0x0a characters to a single 0x0a and converting any standalone
-0x0d to 0x0a).
-
Default value: on
-
In W3C mode: on
-- If parse_eol_attribute is on, then the end-of-line handling is done for attribute values.
-
Default value: on
-
In W3C mode: on
-- If parse_eol_cdata is on, then the end-of-line handling is done for CDATA content.
+
- If parse_eol is on, then the end-of-line handling is done for PCDATA/CDATA content and for
+attribute values (this includes converting any pair of 0x0d 0x0a characters to a single 0x0a and
+converting any standalone 0x0d to 0x0a).
Default value: on
In W3C mode: on
@@ -329,7 +306,7 @@ correctly). This is controlled by parse_match_end_tags, which is on by de
just treat the tag as a closing tag for the node (so that <foo> ... </bar> will
be parsed as <foo> ... </foo>). This is the fastest way, and this is what pugxml
is doing, but it can corrupt your DOM tree. This way is chosen if both parse_check_end_tags and
-parsse_match_end_tags are off.
+parse_match_end_tags are off.
Note, that these 2 flags are mutually exclusive.
diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index 63bd36a..de3a548 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -252,14 +252,14 @@ namespace pugi
static bool chartype_lbracket(char c) { return c == '['; }
static bool chartype_rbracket(char c) { return c == ']'; }
- template static void strconv_t(char** s)
+ template static void strconv_t(char** s)
{
if (!s || !*s) return;
- if (!opt_trim && !opt_escape && !opt_wnorm && !opt_wconv && !opt_eol) return;
+ if (!opt_escape && !opt_wnorm && !opt_wconv && !opt_eol) return;
// Trim whitespaces
- if (opt_trim) while (chartype_space(**s)) ++(*s);
+ if (opt_wnorm) while (chartype_space(**s)) ++(*s);
char* str = *s;
@@ -270,6 +270,7 @@ namespace pugi
{
if (opt_escape && *str == '&') break;
if ((opt_wnorm || opt_wconv || opt_eol) && chartype_space(*str)) break;
+
++str;
}
}
@@ -406,7 +407,7 @@ namespace pugi
*lastpos++ = *str++;
}
- if (opt_trim)
+ if (opt_wnorm)
{
do *lastpos-- = 0;
while (chartype_space(*lastpos));
@@ -414,66 +415,34 @@ namespace pugi
else *lastpos = 0;
}
- static void strconv_setup(void (*&func)(char**), unsigned int opt_trim, unsigned int opt_escape, unsigned int opt_wnorm, unsigned int opt_wconv, unsigned int opt_eol)
+ static void strconv_setup(void (*&func)(char**), unsigned int opt_escape, unsigned int opt_wnorm, unsigned int opt_wconv, unsigned int opt_eol)
{
if (opt_eol)
{
if (opt_wconv)
{
- if (opt_trim)
+ if (opt_escape)
{
- if (opt_escape)
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
- else
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
+ if (opt_wnorm) func = &strconv_t;
+ else func = &strconv_t;
}
else
{
- if (opt_escape)
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
- else
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
+ if (opt_wnorm) func = &strconv_t;
+ else func = &strconv_t;
}
}
else
{
- if (opt_trim)
+ if (opt_escape)
{
- if (opt_escape)
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
- else
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
+ if (opt_wnorm) func = &strconv_t;
+ else func = &strconv_t;
}
else
{
- if (opt_escape)
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
- else
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
+ if (opt_wnorm) func = &strconv_t;
+ else func = &strconv_t;
}
}
}
@@ -481,64 +450,33 @@ namespace pugi
{
if (opt_wconv)
{
- if (opt_trim)
+ if (opt_escape)
{
- if (opt_escape)
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
- else
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
+ if (opt_wnorm) func = &strconv_t;
+ else func = &strconv_t;
}
else
{
- if (opt_escape)
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
- else
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
+ if (opt_wnorm) func = &strconv_t;
+ else func = &strconv_t;
}
}
else
{
- if (opt_trim)
+ if (opt_escape)
{
- if (opt_escape)
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
- else
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
+ if (opt_wnorm) func = &strconv_t;
+ else func = &strconv_t;
}
else
{
- if (opt_escape)
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
- else
- {
- if (opt_wnorm) func = &strconv_t;
- else func = &strconv_t;
- }
+ if (opt_wnorm) func = &strconv_t;
+ else func = &strconv_t;
}
}
}
}
+
// Allocate & append a new xml_node_struct onto the given parent.
// \param parent - pointer to parent node.
// \param type - desired node type.
@@ -608,8 +546,8 @@ namespace pugi
void (*strconv_pcdata)(char**);
void (*strconv_attribute)(char**);
- strconv_setup(strconv_attribute, OPTSET(parse_trim_attribute), OPTSET(parse_escapes_attribute), OPTSET(parse_wnorm_attribute), OPTSET(parse_wconv_attribute), OPTSET(parse_eol_attribute));
- strconv_setup(strconv_pcdata, OPTSET(parse_trim_pcdata), OPTSET(parse_escapes_pcdata), OPTSET(parse_wnorm_pcdata), false, OPTSET(parse_eol_pcdata));
+ strconv_setup(strconv_attribute, OPTSET(parse_escapes), OPTSET(parse_wnorm_attribute), OPTSET(parse_wconv_attribute), OPTSET(parse_eol));
+ strconv_setup(strconv_pcdata, OPTSET(parse_escapes), false, false, OPTSET(parse_eol));
char ch = 0; // Current char, in cases where we must null-terminate before we test.
xml_node_struct* cursor = xmldoc; // Tree node cursor.
@@ -702,9 +640,9 @@ namespace pugi
SCANFOR(chartype_rbracket(*s) && chartype_rbracket(*(s+1)) && chartype_leave(*(s+2)));
ENDSEG(); // Zero-terminate this segment.
- if (OPTSET(parse_eol_cdata))
+ if (OPTSET(parse_eol))
{
- strconv_t(&cursor->value);
+ strconv_t(&cursor->value);
}
POPNODE(); // Pop since this is a standalone.
diff --git a/src/pugixml.hpp b/src/pugixml.hpp
index 22e5c52..a802e3d 100644
--- a/src/pugixml.hpp
+++ b/src/pugixml.hpp
@@ -48,28 +48,21 @@ namespace pugi
const unsigned int parse_pi = 0x00000001; ///< Parse '<?...?>'
const unsigned int parse_comments = 0x00000002; ///< Parse '<!--...-->'
const unsigned int parse_cdata = 0x00000004; ///< Parse '<![CDATA[...]]>'
- const unsigned int parse_ws_pcdata = 0x00000008; ///< Skip PCDATA that consists only of whitespaces
- const unsigned int parse_ext_pcdata = 0x00000010; ///< Skip PCDATA that is outside all tags (i.e. root)
- const unsigned int parse_trim_pcdata = 0x00000020; ///< Trim '>...<'
- const unsigned int parse_trim_attribute = 0x00000040; ///< Trim 'foo="..."'.
- const unsigned int parse_escapes_pcdata = 0x00000080; ///< Parse &lt;, &gt;, &amp;, &quot;, &apos;, &#.. sequences
- const unsigned int parse_escapes_attribute = 0x00000100; ///< Parse &lt;, &gt;, &amp;, &quot;, &apos;, &#.. sequences
- const unsigned int parse_wnorm_pcdata = 0x00000200; ///< Normalize spaces in pcdata
- const unsigned int parse_wnorm_attribute = 0x00000400; ///< Normalize spaces in attributes
- const unsigned int parse_wconv_attribute = 0x00000800; ///< Convert space-like characters to spaces in attributes (only if wnorm is not set)
- const unsigned int parse_eol_pcdata = 0x00001000; ///< Perform EOL handling in pcdata
- const unsigned int parse_eol_attribute = 0x00002000; ///< Perform EOL handling in attrobites
- const unsigned int parse_eol_cdata = 0x00004000; ///< Perform EOL handling in CDATA sections
- const unsigned int parse_check_end_tags = 0x00010000; ///< Check start and end tag names and return error if names mismatch
- const unsigned int parse_match_end_tags = 0x00020000; ///< Try to find corresponding start tag for an end tag
+ const unsigned int parse_ws_pcdata = 0x00000008; ///< Do not skip PCDATA that consists only of whitespaces
+ const unsigned int parse_ext_pcdata = 0x00000010; ///< Do not skip PCDATA that is outside all tags (i.e. root)
+ const unsigned int parse_escapes = 0x00000020; ///< Parse &lt;, &gt;, &amp;, &quot;, &apos;, &#.. sequences
+ const unsigned int parse_wnorm_attribute = 0x00000080; ///< Normalize spaces in attributes (convert space-like characters to spaces + merge adjacent spaces + trim leading/trailing spaces)
+ const unsigned int parse_wconv_attribute = 0x00000100; ///< Convert space-like characters to spaces in attributes (only if wnorm is not set)
+ const unsigned int parse_eol = 0x00000200; ///< Perform EOL handling
+ const unsigned int parse_check_end_tags = 0x00000400; ///< Check start and end tag names and return error if names mismatch
+ const unsigned int parse_match_end_tags = 0x00000800; ///< Try to find corresponding start tag for an end tag
///< Set all flags, except parse_ws_pcdata, parse_trim_attribute, parse_pi and parse_comments
- const unsigned int parse_default = 0x00FFFFFF & ~parse_ws_pcdata & ~parse_trim_attribute & ~parse_pi & ~parse_comments;
+ const unsigned int parse_default = parse_cdata | parse_ext_pcdata | parse_escapes | parse_wconv_attribute | parse_eol | parse_check_end_tags;
const unsigned int parse_noset = 0x80000000; ///< Parse with flags in xml_parser
const unsigned int parse_w3c = parse_pi | parse_comments | parse_cdata |
- parse_escapes_pcdata | parse_escapes_attribute |
- parse_wconv_attribute | parse_check_end_tags |
- parse_ws_pcdata | parse_eol_cdata;
+ parse_escapes | parse_wconv_attribute |
+ parse_check_end_tags | parse_ws_pcdata | parse_eol;
/// Forward declarations
struct xml_attribute_struct;
--
cgit v1.2.3