Use a null-terminated buffer for parsing as often as possible.

Parsing used to work on a non-null-terminated buffer, inserting a fake null terminator to increase performance. This makes it impossible to implement fragment parsing that preserves PCDATA contents (as witnessed by some tests for boundary conditions that actually depended on this behavior). Since almost all uses result in us allocating an internal buffer anyway, the new policy is to make sure all buffers that are allocated by pugixml are null-terminated - the only exception now is external calls to load_buffer_inplace that don't trigger encoding conversion.

git-svn-id: https://pugixml.googlecode.com/svn/trunk@977 99668b35-9821-0410-8761-19e4c4f06640
author: Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2014-02-10 16:57:04 +0000
committer: Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2014-02-10 16:57:04 +0000
commit: 79fb68ac4177206e063f8f29113abbe82ac49698 (patch)
tree: c9f07c8d3b6bc82c944c8c16c72f50bf374e7b6d /tests
parent: 9ba26b94c74a03ac937a5d5972f8f12a2916f301 (diff)
2 files changed, 106 insertions, 5 deletions
diff --git a/tests/test_document.cpp b/tests/test_document.cpp
index 7adc2a1..adc4bdb 100644
--- a/tests/test_document.cpp
+++ b/tests/test_document.cpp
@@ -1069,3 +1069,104 @@ TEST_XML(document_reset_copy_self, "<node><child/></node>")
     CHECK(!doc.first_child());
     CHECK_NODE(doc, STR(""));
 }
+
+struct document_data_t // one encoded input document: encoding tag plus a view of its raw bytes
+{
+    xml_encoding encoding; // encoding value handed to load_buffer for this input
+
+    const unsigned char* data; // first byte of the encoded document (points at a caller-owned array)
+    size_t size; // length of data in bytes
+};
+
+#include <stdio.h>
+
+TEST(document_load_buffer_utf_truncated) // loads every byte-length prefix of "<\u20ac/>" in five encodings
+{
+	const unsigned char utf8[] = {'<', 0xe2, 0x82, 0xac, '/', '>'}; // U+20AC appears as the bytes e2 82 ac
+	const unsigned char utf16_be[] = {0, '<', 0x20, 0xac, 0, '/', 0, '>'}; // two bytes per unit, high byte first
+	const unsigned char utf16_le[] = {'<', 0, 0xac, 0x20, '/', 0, '>', 0}; // two bytes per unit, low byte first
+	const unsigned char utf32_be[] = {0, 0, 0, '<', 0, 0, 0x20, 0xac, 0, 0, 0, '/', 0, 0, 0, '>'}; // four bytes per unit, high byte first
+	const unsigned char utf32_le[] = {'<', 0, 0, 0, 0xac, 0x20, 0, 0, '/', 0, 0, 0, '>', 0, 0, 0}; // four bytes per unit, low byte first
+
+	const document_data_t data[] = // one row per encoding: tag, bytes, byte count
+	{
+		{ encoding_utf8, utf8, sizeof(utf8) },
+		{ encoding_utf16_be, utf16_be, sizeof(utf16_be) },
+		{ encoding_utf16_le, utf16_le, sizeof(utf16_le) },
+		{ encoding_utf32_be, utf32_be, sizeof(utf32_be) },
+		{ encoding_utf32_le, utf32_le, sizeof(utf32_le) },
+	};
+
+	for (size_t i = 0; i < sizeof(data) / sizeof(data[0]); ++i) // visit each encoding in turn
+	{
+		const document_data_t& d = data[i];
+
+		for (size_t j = 0; j <= d.size; ++j) // j is the prefix length in bytes; j == d.size is the complete document
+		{
+			char* buffer = new char[j]; // exactly j bytes, with no room for a terminator
+			memcpy(buffer, d.data, j);
+
+			xml_document doc;
+			xml_parse_result res = doc.load_buffer(buffer, j, parse_default, d.encoding); // encoding is given explicitly
+
+			if (j == d.size) // complete document: must parse and yield a node named U+20AC
+			{
+				CHECK(res);
+
+				const char_t* name = doc.first_child().name();
+
+			#ifdef PUGIXML_WCHAR_MODE
+				CHECK(name[0] == 0x20ac && name[1] == 0); // name is the single unit 0x20ac followed by the terminator
+			#else
+				CHECK_STRING(name, "\xe2\x82\xac"); // name is the UTF-8 byte sequence e2 82 ac
+			#endif
+			}
+			else // truncated prefix
+			{
+				CHECK(!res || !doc.first_child()); // either the parse reports failure or no child node exists
+			}
+
+			delete[] buffer; // pairs with new char[j] above
+		}
+	}
+}
+
+#ifndef PUGIXML_NO_STL
+TEST(document_load_stream_truncated) // feeds every byte-length prefix of a UTF-32 BE "<\u20ac/>" through an istringstream
+{
+	const unsigned char utf32_be[] = {0, 0, 0, '<', 0, 0, 0x20, 0xac, 0, 0, 0, '/', 0, 0, 0, '>'}; // four units of four bytes each
+
+	for (size_t i = 0; i <= sizeof(utf32_be); ++i) // i is the prefix length in bytes
+	{
+		std::string prefix(reinterpret_cast<const char*>(utf32_be), i); // pointer+count constructor keeps the embedded zero bytes
+		std::istringstream iss(prefix);
+
+		xml_document doc;
+		xml_parse_result res = doc.load(iss); // no encoding argument is passed; NOTE(review): presumably relies on auto-detection - confirm
+
+		if (i == sizeof(utf32_be)) // complete document must parse
+		{
+			CHECK(res);
+		}
+		else // truncated prefix
+		{
+			CHECK(!res || !doc.first_child()); // either the parse reports failure or no child node exists
+
+			if (i < 8) // fewer than two complete units, i.e. '<' and U+20AC were not both fully read
+			{
+				CHECK(!doc.first_child()); // no node may have been created
+			}
+			else // both '<' and U+20AC were fully read, so a node named U+20AC is expected despite the truncation
+			{
+				const char_t* name = doc.first_child().name();
+
+			#ifdef PUGIXML_WCHAR_MODE
+				CHECK(name[0] == 0x20ac && name[1] == 0); // name is the single unit 0x20ac followed by the terminator
+			#else
+				CHECK_STRING(name, "\xe2\x82\xac"); // name is the UTF-8 byte sequence e2 82 ac
+			#endif
+			}
+		}
+	}
+}
+#endif
+\ No newline at end of file
diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp
index 9a8bdf1..c165a65 100644
--- a/tests/test_parse.cpp
+++ b/tests/test_parse.cpp
@@ -313,12 +313,12 @@ TEST(parse_ws_pcdata_permutations)
         // current implementation of parse_ws_pcdata_single has an unfortunate bug; reproduce it here
         {4, STR("<node>\t\t<!---->\n\n</node>"), STR("<node>\n\n</node>"), 3},
         // error case: terminate PCDATA in the middle
-        {7, STR("<node>abcdef"), STR("<node>abcde</node>"), -3},
-        {7, STR("<node>      "), STR("<node>     </node>"), -3},
+        {7, STR("<node>abcdef"), STR("<node>abcdef</node>"), -3},
+        {7, STR("<node>      "), STR("<node>      </node>"), -3},
         // error case: terminate PCDATA as early as possible
         {7, STR("<node>"), STR("<node />"), -2},
-        {7, STR("<node>a"), STR("<node />"), -2},
-        {7, STR("<node> "), STR("<node />"), -2},
+        {7, STR("<node>a"), STR("<node>a</node>"), -3},
+        {7, STR("<node> "), STR("<node> </node>"), -3},
     };
 
     for (size_t i = 0; i < sizeof(test_data) / sizeof(test_data[0]); ++i)
@@ -805,7 +805,7 @@ TEST(parse_error_offset)
 
 	CHECK_OFFSET("<3d/>", parse_default, status_unrecognized_tag, 1);
 	CHECK_OFFSET(" <3d/>", parse_default, status_unrecognized_tag, 2);
-	CHECK_OFFSET(" <", parse_default, status_unrecognized_tag, 2);
+	CHECK_OFFSET(" <", parse_default, status_unrecognized_tag, 1);
 
 	CHECK_OFFSET("<?pi", parse_default, status_bad_pi, 3);
 	CHECK_OFFSET("<?pi", parse_default | parse_pi, status_bad_pi, 3);
author	Arseny Kapoulkine <arseny.kapoulkine@gmail.com>	2014-02-10 16:57:04 +0000
committer	Arseny Kapoulkine <arseny.kapoulkine@gmail.com>	2014-02-10 16:57:04 +0000
commit	79fb68ac4177206e063f8f29113abbe82ac49698 (patch)
tree	c9f07c8d3b6bc82c944c8c16c72f50bf374e7b6d /tests
parent	9ba26b94c74a03ac937a5d5972f8f12a2916f301 (diff)