From 7bda2cb529550fae678224102e1c5d0697dacf04 Mon Sep 17 00:00:00 2001
From: "arseny.kapoulkine"
 <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640>
Date: Mon, 10 May 2010 19:15:44 +0000
Subject: Implemented better DOCTYPE parsing, added more DOCTYPE tests

git-svn-id: http://pugixml.googlecode.com/svn/trunk@409 99668b35-9821-0410-8761-19e4c4f06640
---
 src/pugixml.cpp | 152 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 114 insertions(+), 38 deletions(-)

(limited to 'src/pugixml.cpp')

diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index aa10bbc..dcb25f7 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -1739,6 +1739,116 @@ namespace
 		{
 		}
 
+		// A DOCTYPE declaration consists of nested sections of three kinds, one kind per line below:
+		//   1. <!-- ... -->, <? ... ?>, "...", '...'
+		//   2. <![...]]>
+		//   3. <!...>
+		// Sections of the first kind cannot contain nested sections.
+		// Sections of the second kind can contain nested sections of the same kind only.
+		// Sections of the third kind can contain sections of every kind, including their own.
+		xml_parse_result parse_doctype_primitive(char_t*& s, char_t* buffer_start)
+		{ // Skips one non-nesting section ("...", '...', <? ... ?> or <!-- ... -->) starting at s; s is advanced past it.
+			if (*s == '"' || *s == '\'')
+			{
+				// quoted string: remember the opening quote and scan for the matching one
+				char_t ch = *s++;
+				SCANFOR(*s == ch);
+				if (!*s) THROW_ERROR(status_bad_doctype, s); // input ended before the closing quote
+
+				s++; // step over the closing quote
+			}
+			else if (s[0] == '<' && s[1] == '?')
+			{
+				// <? ... ?>: skip the two-character opener, then scan for the two-character closer
+				s += 2;
+				SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
+				if (!*s) THROW_ERROR(status_bad_doctype, s); // input ended before "?>"
+
+				s += 2;
+			}
+			else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
+			{
+				s += 4; // skip the four-character opener "<!--"
+				SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
+				if (!*s) THROW_ERROR(status_bad_doctype, s); // input ended before "-->"
+				// the closer "-->" is three characters long; advancing by four would swallow the character that follows the comment
+				s += 3;
+			}
+			else THROW_ERROR(status_bad_doctype, s); // anything else (e.g. a '<' that opens no known section) is rejected
+			// NOTE(review): buffer_start is not referenced directly; presumably THROW_ERROR (defined elsewhere) uses it - confirm
+			THROW_ERROR(status_ok, s);
+		}
+
+		xml_parse_result parse_doctype_ignore(char_t*& s, char_t* buffer_start)
+		{ // Skips one "<![ ... ]]>" section starting at s, including nested sections of the same kind; s is advanced past "]]>".
+			assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
+			s++; // step past '<' so the loop below does not match this section's own opener as a nested one
+
+			while (*s)
+			{
+				if (s[0] == '<' && s[1] == '!' && s[2] == '[')
+				{
+					// nested "<![" section: recurse so that its "]]>" is not taken for ours
+					xml_parse_result res = parse_doctype_ignore(s, buffer_start);
+
+					if (!res) return res; // propagate a failed nested parse unchanged
+				}
+				else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
+				{
+					// end of this section: skip the three-character closer "]]>"
+					s += 3;
+
+					THROW_ERROR(status_ok, s); // NOTE(review): THROW_ERROR is defined elsewhere; presumably returns a result with this status - confirm
+				}
+				else s++;
+			}
+
+			THROW_ERROR(status_bad_doctype, s); // reached the terminating zero before the matching "]]>"
+		}
+
+		xml_parse_result parse_doctype(char_t*& s, char_t* buffer_start, char_t endch, bool toplevel)
+		{ // Skips one "<! ... >" declaration starting at s, recursing into nested sections; s is advanced past the closing '>'.
+			assert(s[0] == '<' && s[1] == '!');
+			s++; // step past '<' so the loop below does not match this declaration's own "<!" as a nested one
+
+			while (*s)
+			{
+				if (s[0] == '<' && s[1] == '!' && s[2] != '-') // "<!-" is excluded here, so "<!--" comments reach the primitive branch
+				{
+					if (s[2] == '[')
+					{
+						// "<![" section: handled separately because it ends at "]]>" rather than at '>'
+						xml_parse_result res = parse_doctype_ignore(s, buffer_start);
+
+						if (!res) return res; // propagate a failed nested parse unchanged
+					}
+					else
+					{
+						// any other "<!...>" declaration: recurse; nested calls are never top-level
+						xml_parse_result res = parse_doctype(s, buffer_start, endch, false);
+
+						if (!res) return res; // propagate a failed nested parse unchanged
+					}
+				}
+				else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
+				{
+					// unknown tag (forbidden), or some primitive group (quoted string, <? ... ?>, <!-- ... -->)
+					xml_parse_result res = parse_doctype_primitive(s, buffer_start);
+
+					if (!res) return res; // propagate a failed primitive parse unchanged
+				}
+				else if (*s == '>')
+				{
+					s++; // '>' closes this declaration
+
+					THROW_ERROR(status_ok, s); // NOTE(review): THROW_ERROR is defined elsewhere; presumably returns a result with this status - confirm
+				}
+				else s++;
+			}
+			// Reached the terminating zero: accepted only for the top-level call with endch == '>' (presumably the final '>' was overwritten by the terminator - confirm with caller).
+			THROW_ERROR((toplevel && endch == '>') ? status_ok : status_bad_doctype, s);
+		}
+
 		xml_parse_result parse_exclamation(char_t*& ref_s, xml_node_struct* cursor, unsigned int optmsk, char_t* buffer_start, char_t endch)
 		{
 			// load into registers
@@ -1831,47 +1941,13 @@ namespace
 			}
 			else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && ENDSWITH(s[6], 'E'))
 			{
-				if (s[6] != 'E') THROW_ERROR(status_bad_doctype, s); 
+				if (s[6] != 'E') THROW_ERROR(status_bad_doctype, s);
 
-			LOC_DOCTYPE:
-				SCANFOR(*s == '\'' || *s == '"' || *s == '[' || *s == '>');
-				if (*s == 0 && endch != '>') THROW_ERROR(status_bad_doctype, s);
+				s -= 2;
 
-				if (*s == '\'' || *s == '"') // '...SYSTEM "..."
-				{
-					ch = *s++;
-					SCANFOR(*s == ch);
-					if (*s == 0 && endch != '>') THROW_ERROR(status_bad_doctype, s);
+				xml_parse_result res = parse_doctype(s, buffer_start, endch, true);
 
-					s += (*s != 0);
-					goto LOC_DOCTYPE;
-				}
-
-				if(*s == '[') // '...[...'
-				{
-					++s;
-					unsigned int bd = 1; // Bracket depth counter.
-					while (*s!=0) // Loop till we're out of all brackets.
-					{
-						if (*s == ']') --bd;
-						else if (*s == '[') ++bd;
-						if (bd == 0) break;
-						++s;
-					}
-
-					if (bd != 0) THROW_ERROR(status_bad_doctype, s);
-				}
-
-				SCANFOR(*s == '>');
-
-				if (*s == 0)
-				{
-					if (endch != '>') THROW_ERROR(status_bad_doctype, s);
-				}
-				else
-				{
-					++s;
-				}
+				if (!res) return res;
 			}
 			else if (*s == 0 && endch == '-') THROW_ERROR(status_bad_comment, s);
 			else if (*s == 0 && endch == '[') THROW_ERROR(status_bad_cdata, s);
-- 
cgit v1.2.3