From 7bda2cb529550fae678224102e1c5d0697dacf04 Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Mon, 10 May 2010 19:15:44 +0000 Subject: Implemented better DOCTYPE parsing, added more DOCTYPE tests git-svn-id: http://pugixml.googlecode.com/svn/trunk@409 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 152 ++++++++++++++++++++++++++++++++----------- tests/test_parse.cpp | 29 --------- tests/test_parse_doctype.cpp | 92 ++++++++++++++++++++++++++ 3 files changed, 206 insertions(+), 67 deletions(-) create mode 100644 tests/test_parse_doctype.cpp diff --git a/src/pugixml.cpp b/src/pugixml.cpp index aa10bbc..dcb25f7 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1739,6 +1739,116 @@ namespace { } + // DOCTYPE consists of nested sections of the following possible types: + // , , "...", '...' + // + // + // First group can not contain nested groups + // Second group can contain nested groups of the same type + // Third group can contain all other groups + xml_parse_result parse_doctype_primitive(char_t*& s, char_t* buffer_start) + { + if (*s == '"' || *s == '\'') + { + // quoted string + char_t ch = *s++; + SCANFOR(*s == ch); + if (!*s) THROW_ERROR(status_bad_doctype, s); + + s++; + } + else if (s[0] == '<' && s[1] == '?') + { + // + s += 2; + SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype + if (!*s) THROW_ERROR(status_bad_doctype, s); + + s += 2; + } + else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-') + { + s += 4; + SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype + if (!*s) THROW_ERROR(status_bad_doctype, s); + + s += 4; + } + else THROW_ERROR(status_bad_doctype, s); + + THROW_ERROR(status_ok, s); + } + + xml_parse_result parse_doctype_ignore(char_t*& s, char_t* buffer_start) + { + assert(s[0] == '<' && s[1] == '!' && s[2] == '['); + s++; + + while (*s) + { + if (s[0] == '<' && s[1] == '!' && s[2] == '[') + { + // nested ignore section + xml_parse_result res = parse_doctype_ignore(s, buffer_start); + + if (!res) return res; + } + else if (s[0] == ']' && s[1] == ']' && s[2] == '>') + { + // ignore section end + s += 3; + + THROW_ERROR(status_ok, s); + } + else s++; + } + + THROW_ERROR(status_bad_doctype, s); + } + + xml_parse_result parse_doctype(char_t*& s, char_t* buffer_start, char_t endch, bool toplevel) + { + assert(s[0] == '<' && s[1] == '!'); + s++; + + while (*s) + { + if (s[0] == '<' && s[1] == '!' && s[2] != '-') + { + if (s[2] == '[') + { + // ignore + xml_parse_result res = parse_doctype_ignore(s, buffer_start); + + if (!res) return res; + } + else + { + // some control group + xml_parse_result res = parse_doctype(s, buffer_start, endch, false); + + if (!res) return res; + } + } + else if (s[0] == '<' || s[0] == '"' || s[0] == '\'') + { + // unknown tag (forbidden), or some primitive group + xml_parse_result res = parse_doctype_primitive(s, buffer_start); + + if (!res) return res; + } + else if (*s == '>') + { + s++; + + THROW_ERROR(status_ok, s); + } + else s++; + } + + THROW_ERROR((toplevel && endch == '>') ? status_ok : status_bad_doctype, s); + } + xml_parse_result parse_exclamation(char_t*& ref_s, xml_node_struct* cursor, unsigned int optmsk, char_t* buffer_start, char_t endch) { // load into registers @@ -1831,47 +1941,13 @@ namespace } else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && ENDSWITH(s[6], 'E')) { - if (s[6] != 'E') THROW_ERROR(status_bad_doctype, s); + if (s[6] != 'E') THROW_ERROR(status_bad_doctype, s); - LOC_DOCTYPE: - SCANFOR(*s == '\'' || *s == '"' || *s == '[' || *s == '>'); - if (*s == 0 && endch != '>') THROW_ERROR(status_bad_doctype, s); + s -= 2; - if (*s == '\'' || *s == '"') // '...SYSTEM "..." - { - ch = *s++; - SCANFOR(*s == ch); - if (*s == 0 && endch != '>') THROW_ERROR(status_bad_doctype, s); + xml_parse_result res = parse_doctype(s, buffer_start, endch, true); - s += (*s != 0); - goto LOC_DOCTYPE; - } - - if(*s == '[') // '...[...' - { - ++s; - unsigned int bd = 1; // Bracket depth counter. - while (*s!=0) // Loop till we're out of all brackets. - { - if (*s == ']') --bd; - else if (*s == '[') ++bd; - if (bd == 0) break; - ++s; - } - - if (bd != 0) THROW_ERROR(status_bad_doctype, s); - } - - SCANFOR(*s == '>'); - - if (*s == 0) - { - if (endch != '>') THROW_ERROR(status_bad_doctype, s); - } - else - { - ++s; - } + if (!res) return res; } else if (*s == 0 && endch == '-') THROW_ERROR(status_bad_comment, s); else if (*s == 0 && endch == '[') THROW_ERROR(status_bad_cdata, s); diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index 0719e5d..fb0dd23 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -497,35 +497,6 @@ TEST(parse_declaration_error) CHECK(doc.load(STR(""), parse_minimal | parse_declaration).status == status_bad_attribute); } -TEST(parse_doctype_skip) -{ - xml_document doc; - CHECK(doc.load(STR("")) && !doc.first_child()); - CHECK(doc.load(STR("")) && !doc.first_child()); - CHECK(doc.load(STR("")) && !doc.first_child()); - CHECK(doc.load(STR("")) && !doc.first_child()); - CHECK(doc.load(STR("")) && !doc.first_child()); - CHECK(doc.load(STR("]>")) && !doc.first_child()); - - CHECK(doc.load(STR("]>"))); - CHECK_NODE(doc, STR("")); -} - -TEST(parse_doctype_error) -{ - xml_document doc; - CHECK(doc.load(STR("")).status == status_bad_doctype); - CHECK(doc.load(STR("]")).status == status_bad_doctype); - CHECK(doc.load(STR("] ")).status == status_bad_doctype); -} - TEST(parse_empty) { xml_document doc; diff --git a/tests/test_parse_doctype.cpp b/tests/test_parse_doctype.cpp new file mode 100644 index 0000000..35015ff --- /dev/null +++ b/tests/test_parse_doctype.cpp @@ -0,0 +1,92 @@ +#include "common.hpp" + +#include + +bool test_doctype_wf(const std::basic_string& decl) +{ + xml_document doc; + + // standalone + if (!doc.load(decl.c_str()) || doc.first_child()) return false; + + // pcdata pre/postfix + if (!doc.load(("a" + decl).c_str()) || doc.first_child()) return false; + if (!doc.load((decl + "b").c_str()) || doc.first_child()) return false; + if (!doc.load(("a" + decl + "b").c_str()) || doc.first_child()) return false; + + // node pre/postfix + if (!doc.load(("" + decl).c_str()) || !test_node(doc, STR(""), STR(""), format_raw)) return false; + if (!doc.load((decl + "").c_str()) || !test_node(doc, STR(""), STR(""), format_raw)) return false; + if (!doc.load(("" + decl + "").c_str()) || !test_node(doc, STR(""), STR(""), format_raw)) return false; + + return true; +} + +bool test_doctype_nwf(const std::basic_string& decl) +{ + xml_document doc; + + // standalone + if (doc.load(decl.c_str()).status != status_bad_doctype) return false; + + // pcdata postfix + if (doc.load((decl + "b").c_str()).status != status_bad_doctype) return false; + + // node postfix + if (doc.load((decl + "").c_str()).status != status_bad_doctype) return false; + + return true; +} + +#define TEST_DOCTYPE_WF(contents) CHECK(test_doctype_wf(STR(contents))) +#define TEST_DOCTYPE_NWF(contents) CHECK(test_doctype_nwf(STR(contents))) + +TEST(parse_doctype_skip) +{ + TEST_DOCTYPE_WF(""); + TEST_DOCTYPE_WF(""); + TEST_DOCTYPE_WF(""); + TEST_DOCTYPE_WF(""); + TEST_DOCTYPE_WF(""); + TEST_DOCTYPE_WF("]>"); +} + +TEST(parse_doctype_error) +{ + TEST_DOCTYPE_NWF(""); + TEST_DOCTYPE_NWF("]"); + TEST_DOCTYPE_NWF("] "); +} + +// Examples from W3C recommendations +TEST(parse_doctype_w3c_wf) +{ + TEST_DOCTYPE_WF(""); + TEST_DOCTYPE_WF(" ]>"); + TEST_DOCTYPE_WF(" ]>"); + TEST_DOCTYPE_WF(" ]]> ]]>]>"); + TEST_DOCTYPE_WF(" ]>"); + TEST_DOCTYPE_WF(" ]>"); +} + +TEST(parse_doctype_w3c_nwf) +{ + TEST_DOCTYPE_NWF(""); + TEST_DOCTYPE_NWF(" ]"); + TEST_DOCTYPE_NWF(""); + TEST_DOCTYPE_NWF(" ]"); + TEST_DOCTYPE_NWF(""); + TEST_DOCTYPE_NWF("