From 79fb68ac4177206e063f8f29113abbe82ac49698 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine <arseny.kapoulkine@gmail.com>
Date: Mon, 10 Feb 2014 16:57:04 +0000
Subject: Use a null-terminated buffer for parsing as often as possible.

Parsing used to work on a buffer that was not null-terminated, inserting a fake null terminator to increase performance.
This makes it impossible to implement fragment parsing that preserves PCDATA contents (as witnessed by some
tests for boundary conditions that actually depended on this behavior).

Since almost all uses result in us allocating an internal buffer anyway, the new policy is to make sure all buffers
that are allocated by pugixml are null-terminated - the only exception now is external calls to load_buffer_inplace
that don't trigger encoding conversion.

git-svn-id: https://pugixml.googlecode.com/svn/trunk@977 99668b35-9821-0410-8761-19e4c4f06640
---
 tests/test_document.cpp | 101 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

(limited to 'tests/test_document.cpp')
diff --git a/tests/test_document.cpp b/tests/test_document.cpp
index 7adc2a1..adc4bdb 100644
--- a/tests/test_document.cpp
+++ b/tests/test_document.cpp
@@ -1069,3 +1069,104 @@ TEST_XML(document_reset_copy_self, "<node><child/></node>")
     CHECK(!doc.first_child());
     CHECK_NODE(doc, STR(""));
 }
+
+struct document_data_t // one test input: raw encoded bytes plus the encoding they are loaded with
+{
+    xml_encoding encoding; // encoding value handed to load_buffer for this input
+
+    const unsigned char* data; // start of the encoded byte sequence
+    size_t size; // length of data in bytes (filled from sizeof of the source array)
+};
+
+#include <stdio.h>
+
+TEST(document_load_buffer_utf_truncated) // loads every byte-prefix of a one-element document, once per encoding
+{
+	const unsigned char utf8[] = {'<', 0xe2, 0x82, 0xac, '/', '>'}; // empty element named U+20AC, as UTF-8
+	const unsigned char utf16_be[] = {0, '<', 0x20, 0xac, 0, '/', 0, '>'}; // same document, UTF-16 big-endian
+	const unsigned char utf16_le[] = {'<', 0, 0xac, 0x20, '/', 0, '>', 0}; // same document, UTF-16 little-endian
+	const unsigned char utf32_be[] = {0, 0, 0, '<', 0, 0, 0x20, 0xac, 0, 0, 0, '/', 0, 0, 0, '>'}; // UTF-32 big-endian
+	const unsigned char utf32_le[] = {'<', 0, 0, 0, 0xac, 0x20, 0, 0, '/', 0, 0, 0, '>', 0, 0, 0}; // UTF-32 little-endian
+
+	const document_data_t data[] = // table driving the outer loop: (encoding, bytes, byte count)
+	{
+		{ encoding_utf8, utf8, sizeof(utf8) },
+		{ encoding_utf16_be, utf16_be, sizeof(utf16_be) },
+		{ encoding_utf16_le, utf16_le, sizeof(utf16_le) },
+		{ encoding_utf32_be, utf32_be, sizeof(utf32_be) },
+		{ encoding_utf32_le, utf32_le, sizeof(utf32_le) },
+	};
+
+	for (size_t i = 0; i < sizeof(data) / sizeof(data[0]); ++i) // one pass per table entry
+	{
+		const document_data_t& d = data[i];
+
+		for (size_t j = 0; j <= d.size; ++j) // j = prefix length in bytes; j == d.size is the complete document
+		{
+			char* buffer = new char[j]; // exactly j bytes: nothing after the prefix, no terminator
+			memcpy(buffer, d.data, j);
+
+			xml_document doc;
+			xml_parse_result res = doc.load_buffer(buffer, j, parse_default, d.encoding); // encoding given explicitly
+
+			if (j == d.size) // complete input: must load and expose the expected element name
+			{
+				CHECK(res);
+
+				const char_t* name = doc.first_child().name();
+
+			#ifdef PUGIXML_WCHAR_MODE
+				CHECK(name[0] == 0x20ac && name[1] == 0); // wide build: name is the single unit 0x20ac
+			#else
+				CHECK_STRING(name, "\xe2\x82\xac"); // otherwise: name is the UTF-8 byte sequence for U+20AC
+			#endif
+			}
+			else // truncated input: either the load reports failure or the document has no first child
+			{
+				CHECK(!res || !doc.first_child());
+			}
+
+			delete[] buffer; // NOTE(review): presumably skipped if a CHECK above aborts the test - confirm CHECK semantics
+		}
+	}
+}
+
+#ifndef PUGIXML_NO_STL
+TEST(document_load_stream_truncated) // loads every byte-prefix of a UTF-32 BE document through an input stream
+{
+	const unsigned char utf32_be[] = {0, 0, 0, '<', 0, 0, 0x20, 0xac, 0, 0, 0, '/', 0, 0, 0, '>'}; // empty element named U+20AC
+
+	for (size_t i = 0; i <= sizeof(utf32_be); ++i) // i = prefix length in bytes; i == sizeof(utf32_be) is the full document
+	{
+		std::string prefix(reinterpret_cast<const char*>(utf32_be), i); // explicit length keeps the embedded zero bytes
+		std::istringstream iss(prefix);
+
+		xml_document doc;
+		xml_parse_result res = doc.load(iss); // no encoding argument is passed here, unlike the load_buffer test
+
+		if (i == sizeof(utf32_be)) // complete input: the load must succeed
+		{
+			CHECK(res);
+		}
+		else // truncated input
+		{
+			CHECK(!res || !doc.first_child()); // either the load failed or no node was produced
+
+			if (i < 8) // fewer than two full 4-byte units ('<' and U+20AC): no node expected at all
+			{
+				CHECK(!doc.first_child());
+			}
+			else // 8..15 bytes: a first child named U+20AC is expected, so with the check above res must be a failure
+			{
+				const char_t* name = doc.first_child().name();
+
+			#ifdef PUGIXML_WCHAR_MODE
+				CHECK(name[0] == 0x20ac && name[1] == 0); // wide build: name is the single unit 0x20ac
+			#else
+				CHECK_STRING(name, "\xe2\x82\xac"); // otherwise: name is the UTF-8 byte sequence for U+20AC
+			#endif
+			}
+		}
+	}
+}
+#endif
\ No newline at end of file
-- 
cgit v1.2.3