Optimized attribute parsing; behavior of parse_wconv_attribute changed: it now assumes that parse_eol is set (i.e. \r\n inside an attribute value is converted to a single space)

git-svn-id: http://pugixml.googlecode.com/svn/trunk@503 99668b35-9821-0410-8761-19e4c4f06640
author: arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640> 2010-06-04 18:50:26 +0000
committer: arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640> 2010-06-04 18:50:26 +0000
commit: 9fa82b15f53f0f20363f50b5b1adf1a762ed96d6 (patch)
tree: e01329629402f8d57eb41ebd55f06a58663a7121
parent: f9c78551437bace4404cafdde632af947309161c (diff)
3 files changed, 121 insertions, 60 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index 5b668b0..7911689 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -935,7 +935,7 @@ namespace
 	{
 		ct_parse_pcdata = 1,	// \0, &, \r, <
 		ct_parse_attr = 2,		// \0, &, \r, ', "
-		ct_parse_attr_ws = 4,	// \0, &, \r, ', ", \n, space, tab
+		ct_parse_attr_ws = 4,	// \0, &, \r, ', ", \n, tab
 		ct_space = 8,			// \r, \n, space, tab
 		ct_parse_cdata = 16,	// \0, ], >, \r
 		ct_parse_comment = 32,	// \0, -, >, \r
@@ -947,7 +947,7 @@ namespace
 	{
 		55,  0,   0,   0,   0,   0,   0,   0,      0,   12,  12,  0,   0,   63,  0,   0,   // 0-15
 		0,   0,   0,   0,   0,   0,   0,   0,      0,   0,   0,   0,   0,   0,   0,   0,   // 16-31
-		12,  0,   6,   0,   0,   0,   7,   6,      0,   0,   0,   0,   0,   96,  64,  0,   // 32-47
+		8,   0,   6,   0,   0,   0,   7,   6,      0,   0,   0,   0,   0,   96,  64,  0,   // 32-47
 		64,  64,  64,  64,  64,  64,  64,  64,     64,  64,  192, 0,   1,   0,   48,  0,   // 48-63
 		0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 64-79
 		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0,   0,   16,  0,   192, // 80-95
@@ -1020,19 +1020,6 @@ namespace
 	template <bool _1, bool _2> const bool opt2_to_type<_1, _2>::o1 = _1;
 	template <bool _1, bool _2> const bool opt2_to_type<_1, _2>::o2 = _2;
 
-	template <bool _1, bool _2, bool _3, bool _4> struct opt4_to_type
-	{
-		static const bool o1;
-		static const bool o2;
-		static const bool o3;
-		static const bool o4;
-	};
-
-	template <bool _1, bool _2, bool _3, bool _4> const bool opt4_to_type<_1, _2, _3, _4>::o1 = _1;
-	template <bool _1, bool _2, bool _3, bool _4> const bool opt4_to_type<_1, _2, _3, _4>::o2 = _2;
-	template <bool _1, bool _2, bool _3, bool _4> const bool opt4_to_type<_1, _2, _3, _4>::o3 = _3;
-	template <bool _1, bool _2, bool _3, bool _4> const bool opt4_to_type<_1, _2, _3, _4>::o4 = _4;
-
 	bool is_little_endian()
 	{
 		unsigned int ui = 1;
@@ -1628,19 +1615,16 @@ namespace
 
 	typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
 	
-	template <typename opt4> struct strconv_attribute_impl
+	template <typename opt1> struct strconv_attribute_impl
 	{
-		static char_t* parse(char_t* s, char_t end_quote)
+		static char_t* parse_wnorm(char_t* s, char_t end_quote)
 		{
-			const bool opt_wconv = opt4::o1;
-			const bool opt_wnorm = opt4::o2;
-			const bool opt_eol = opt4::o3;
-			const bool opt_escape = opt4::o4;
+			const bool opt_escape = opt1::o1;
 
 			gap g;
 
 			// trim leading whitespaces
-			if (opt_wnorm && IS_CHARTYPE(*s, ct_space))
+			if (IS_CHARTYPE(*s, ct_space))
 			{
 				char_t* str = s;
 				
@@ -1652,22 +1636,18 @@ namespace
 
 			while (true)
 			{
-				while (!IS_CHARTYPE(*s, (opt_wnorm || opt_wconv) ? ct_parse_attr_ws : ct_parse_attr)) ++s;
+				while (!IS_CHARTYPE(*s, ct_parse_attr_ws | ct_space)) ++s;
 				
 				if (*s == end_quote)
 				{
 					char_t* str = g.flush(s);
 					
-					if (opt_wnorm)
-					{
-						do *str-- = 0;
-						while (IS_CHARTYPE(*str, ct_space));
-					}
-					else *str = 0;
+					do *str-- = 0;
+					while (IS_CHARTYPE(*str, ct_space));
 				
 					return s + 1;
 				}
-				else if (opt_wnorm && IS_CHARTYPE(*s, ct_space))
+				else if (IS_CHARTYPE(*s, ct_space))
 				{
 					*s++ = ' ';
 		
@@ -1679,21 +1659,73 @@ namespace
 						g.push(s, str - s);
 					}
 				}
-				else if (opt_wconv && IS_CHARTYPE(*s, ct_space))
+				else if (opt_escape && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (!*s)
+				{
+					return 0;
+				}
+				else ++s;
+			}
+		}
+
+		static char_t* parse_wconv(char_t* s, char_t end_quote)
+		{
+			const bool opt_escape = opt1::o1;
+
+			gap g;
+
+			while (true)
+			{
+				while (!IS_CHARTYPE(*s, ct_parse_attr_ws)) ++s;
+				
+				if (*s == end_quote)
+				{
+					*g.flush(s) = 0;
+				
+					return s + 1;
+				}
+				else if (IS_CHARTYPE(*s, ct_space))
 				{
-					if (opt_eol)
+					if (*s == '\r')
 					{
-						if (*s == '\r')
-						{
-							*s++ = ' ';
-					
-							if (*s == '\n') g.push(s, 1);
-						}
-						else *s++ = ' ';
+						*s++ = ' ';
+				
+						if (*s == '\n') g.push(s, 1);
 					}
 					else *s++ = ' ';
 				}
-				else if (opt_eol && *s == '\r')
+				else if (opt_escape && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (!*s)
+				{
+					return 0;
+				}
+				else ++s;
+			}
+		}
+
+		static char_t* parse_eol(char_t* s, char_t end_quote)
+		{
+			const bool opt_escape = opt1::o1;
+
+			gap g;
+
+			while (true)
+			{
+				while (!IS_CHARTYPE(*s, ct_parse_attr)) ++s;
+				
+				if (*s == end_quote)
+				{
+					*g.flush(s) = 0;
+				
+					return s + 1;
+				}
+				else if (*s == '\r')
 				{
 					*s++ = '\n';
 					
@@ -1710,30 +1742,58 @@ namespace
 				else ++s;
 			}
 		}
+
+		static char_t* parse_simple(char_t* s, char_t end_quote)
+		{
+			const bool opt_escape = opt1::o1;
+
+			gap g;
+
+			while (true)
+			{
+				while (!IS_CHARTYPE(*s, ct_parse_attr)) ++s;
+				
+				if (*s == end_quote)
+				{
+					*g.flush(s) = 0;
+				
+					return s + 1;
+				}
+				else if (opt_escape && *s == '&')
+				{
+					s = strconv_escape(s, g);
+				}
+				else if (!*s)
+				{
+					return 0;
+				}
+				else ++s;
+			}
+		}
 	};
 
 	strconv_attribute_t get_strconv_attribute(unsigned int optmask)
 	{
-		STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x80);
+		STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40);
 		
 		switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
 		{
-		case 0:  return strconv_attribute_impl<opt4_to_type<0, 0, 0, 0> >::parse;
-		case 1:  return strconv_attribute_impl<opt4_to_type<0, 0, 0, 1> >::parse;
-		case 2:  return strconv_attribute_impl<opt4_to_type<0, 0, 1, 0> >::parse;
-		case 3:  return strconv_attribute_impl<opt4_to_type<0, 0, 1, 1> >::parse;
-		case 4:  return strconv_attribute_impl<opt4_to_type<0, 1, 0, 0> >::parse;
-		case 5:  return strconv_attribute_impl<opt4_to_type<0, 1, 0, 1> >::parse;
-		case 6:  return strconv_attribute_impl<opt4_to_type<0, 1, 1, 0> >::parse;
-		case 7:  return strconv_attribute_impl<opt4_to_type<0, 1, 1, 1> >::parse;
-		case 8:  return strconv_attribute_impl<opt4_to_type<1, 0, 0, 0> >::parse;
-		case 9:  return strconv_attribute_impl<opt4_to_type<1, 0, 0, 1> >::parse;
-		case 10: return strconv_attribute_impl<opt4_to_type<1, 0, 1, 0> >::parse;
-		case 11: return strconv_attribute_impl<opt4_to_type<1, 0, 1, 1> >::parse;
-		case 12: return strconv_attribute_impl<opt4_to_type<1, 1, 0, 0> >::parse;
-		case 13: return strconv_attribute_impl<opt4_to_type<1, 1, 0, 1> >::parse;
-		case 14: return strconv_attribute_impl<opt4_to_type<1, 1, 1, 0> >::parse;
-		case 15: return strconv_attribute_impl<opt4_to_type<1, 1, 1, 1> >::parse;
+		case 0:  return strconv_attribute_impl<opt1_to_type<0> >::parse_simple;
+		case 1:  return strconv_attribute_impl<opt1_to_type<1> >::parse_simple;
+		case 2:  return strconv_attribute_impl<opt1_to_type<0> >::parse_eol;
+		case 3:  return strconv_attribute_impl<opt1_to_type<1> >::parse_eol;
+		case 4:  return strconv_attribute_impl<opt1_to_type<0> >::parse_wconv;
+		case 5:  return strconv_attribute_impl<opt1_to_type<1> >::parse_wconv;
+		case 6:  return strconv_attribute_impl<opt1_to_type<0> >::parse_wconv;
+		case 7:  return strconv_attribute_impl<opt1_to_type<1> >::parse_wconv;
+		case 8:  return strconv_attribute_impl<opt1_to_type<0> >::parse_wnorm;
+		case 9:  return strconv_attribute_impl<opt1_to_type<1> >::parse_wnorm;
+		case 10: return strconv_attribute_impl<opt1_to_type<0> >::parse_wnorm;
+		case 11: return strconv_attribute_impl<opt1_to_type<1> >::parse_wnorm;
+		case 12: return strconv_attribute_impl<opt1_to_type<0> >::parse_wnorm;
+		case 13: return strconv_attribute_impl<opt1_to_type<1> >::parse_wnorm;
+		case 14: return strconv_attribute_impl<opt1_to_type<0> >::parse_wnorm;
+		case 15: return strconv_attribute_impl<opt1_to_type<1> >::parse_wnorm;
 		default: return 0; // should not get here
 		}
 	}
diff --git a/src/pugixml.hpp b/src/pugixml.hpp
index 6f4cece..398dd77 100644
--- a/src/pugixml.hpp
+++ b/src/pugixml.hpp
@@ -235,17 +235,18 @@ namespace pugi
 #if !defined(__INTEL_COMPILER) || __INTEL_COMPILER > 800
 	PUGIXML_DEPRECATED
 #endif
- 	const unsigned int parse_wnorm_attribute	= 0x0040;
+ 	const unsigned int parse_wnorm_attribute	= 0x0080;
 
  	/**
  	 * This flag determines if attribute value normalization should be performed for all attributes.
  	 * This means, that whitespace characters (new line, tab and space) are replaced with space (' ').
  	 * Note, that the actions performed while this flag is on are also performed if parse_wnorm_attribute
  	 * is on, so this flag has no effect if parse_wnorm_attribute flag is set.
+	 * New line characters are always treated as if parse_eol is set, i.e. \r\n is converted to a single space.
  	 * 
  	 * This flag is on by default.
  	 */
- 	const unsigned int parse_wconv_attribute	= 0x0080;
+ 	const unsigned int parse_wconv_attribute	= 0x0040;
 	
 	/**
 	 * This flag determines if XML document declaration (this node has the form of <?xml ... ?> in XML)
diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp
index 400942f..c2f56e5 100644
--- a/tests/test_parse.cpp
+++ b/tests/test_parse.cpp
@@ -351,7 +351,7 @@ TEST(parse_attribute_no_eol_wconv)
 {
 	xml_document doc;
 	CHECK(doc.load(STR("<node id=' \t\r\rval1  \rval2\r\nval3\nval4\r\r'/>"), parse_minimal | parse_wconv_attribute));
-	CHECK_STRING(doc.child(STR("node")).attribute(STR("id")).value(), STR("    val1   val2  val3 val4  "));
+	CHECK_STRING(doc.child(STR("node")).attribute(STR("id")).value(), STR("    val1   val2 val3 val4  "));
 }
 
 TEST(parse_attribute_eol_wconv)
author	arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640>	2010-06-04 18:50:26 +0000
committer	arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640>	2010-06-04 18:50:26 +0000
commit	9fa82b15f53f0f20363f50b5b1adf1a762ed96d6 (patch)
tree	e01329629402f8d57eb41ebd55f06a58663a7121
parent	f9c78551437bace4404cafdde632af947309161c (diff)