From f542c5ebb8068ccd4f9176684eb62183afbe7e5c Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Thu, 6 May 2010 20:28:36 +0000 Subject: Integrated changes from unicode branch to trunk git-svn-id: http://pugixml.googlecode.com/svn/trunk@383 99668b35-9821-0410-8761-19e4c4f06640 --- tests/test_unicode.cpp | 133 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 107 insertions(+), 26 deletions(-) (limited to 'tests/test_unicode.cpp') diff --git a/tests/test_unicode.cpp b/tests/test_unicode.cpp index 61e23aa..ea2494b 100644 --- a/tests/test_unicode.cpp +++ b/tests/test_unicode.cpp @@ -1,39 +1,80 @@ +#ifndef PUGIXML_NO_STL + #include "common.hpp" -// letters taken from http://www.utf8-chartable.de/ +#include <string> -#ifdef __DMC__ -#define U_LITERALS // DMC does not understand \x01234 (it parses first three digits), but understands \u01234 -#endif +// letters taken from http://www.utf8-chartable.de/ -inline wchar_t wchar_cast(unsigned int value) +TEST(as_wide_empty) { - return static_cast<wchar_t>(value); // to avoid C4310 on MSVC + CHECK(as_wide("") == L""); } -#ifndef PUGIXML_NO_STL -TEST(as_utf16) +TEST(as_wide_valid_basic) { // valid 1-byte, 2-byte and 3-byte inputs #ifdef U_LITERALS - CHECK(as_utf16("?\xd0\x80\xe2\x80\xbd") == L"?\u0400\u203D"); + CHECK(as_wide("?\xd0\x80\xe2\x80\xbd") == L"?\u0400\u203D"); #else - CHECK(as_utf16("?\xd0\x80\xe2\x80\xbd") == L"?\x0400\x203D"); + CHECK(as_wide("?\xd0\x80\xe2\x80\xbd") == L"?\x0400\x203D"); #endif +} - // invalid 1-byte input - CHECK(as_utf16("\xb0") == L" "); - +TEST(as_wide_valid_astral) +{ // valid 4-byte input - std::wstring b4 = as_utf16("\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf"); - CHECK(b4.size() == 3 && b4[0] == wchar_cast(0x97624) && b4[1] == L' ' && b4[2] == wchar_cast(0x1003ff)); + std::wstring b4 = as_wide("\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf"); + + size_t wcharsize = sizeof(wchar_t); + + if (wcharsize == 4) + { + CHECK(b4.size() == 3 && b4[0] == wchar_cast(0x97624) && b4[1] == L' ' && b4[2] == 
wchar_cast(0x1003ff)); + } + else + { + CHECK(b4.size() == 5 && b4[0] == 0xda1d && b4[1] == 0xde24 && b4[2] == L' ' && b4[3] == 0xdbc0 && b4[4] == 0xdfff); + } +} + +TEST(as_wide_invalid) +{ + // invalid 1-byte input + CHECK(as_wide("a\xb0") == L"a"); + CHECK(as_wide("a\xb0_") == L"a_"); + + // invalid 2-byte input + CHECK(as_wide("a\xc0") == L"a"); + CHECK(as_wide("a\xd0") == L"a"); + CHECK(as_wide("a\xc0_") == L"a_"); + CHECK(as_wide("a\xd0_") == L"a_"); + + // invalid 3-byte input + CHECK(as_wide("a\xe2\x80") == L"a"); + CHECK(as_wide("a\xe2") == L"a"); + CHECK(as_wide("a\xe2\x80_") == L"a_"); + CHECK(as_wide("a\xe2_") == L"a_"); + + // invalid 4-byte input + CHECK(as_wide("a\xf2\x97\x98") == L"a"); + CHECK(as_wide("a\xf2\x97") == L"a"); + CHECK(as_wide("a\xf2") == L"a"); + CHECK(as_wide("a\xf2\x97\x98_") == L"a_"); + CHECK(as_wide("a\xf2\x97_") == L"a_"); + CHECK(as_wide("a\xf2_") == L"a_"); // invalid 5-byte input - std::wstring b5 = as_utf16("\xf8\nbcd"); - CHECK(b5 == L" \nbcd"); + std::wstring b5 = as_wide("\xf8\nbcd"); + CHECK(b5 == L"\nbcd"); +} + +TEST(as_utf8_empty) +{ + CHECK(as_utf8(L"") == ""); } -TEST(as_utf8) +TEST(as_utf8_valid_basic) { // valid 1-byte, 2-byte and 3-byte outputs #ifdef U_LITERALS @@ -41,16 +82,56 @@ TEST(as_utf8) #else CHECK(as_utf8(L"?\x0400\x203D") == "?\xd0\x80\xe2\x80\xbd"); #endif - +} + +TEST(as_utf8_valid_astral) +{ // valid 4-byte output -#if 0 - // requires 4-byte wchar_t :( - CHECK(as_utf8(L"\x97624 \x1003ff") == "\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf"); -#endif + size_t wcharsize = sizeof(wchar_t); + + if (wcharsize == 4) + { + std::wstring s; + s.resize(3); + s[0] = wchar_cast(0x97624); + s[1] = ' '; + s[2] = wchar_cast(0x1003ff); + + CHECK(as_utf8(s.c_str()) == "\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf"); + } + else + { + #ifdef U_LITERALS + CHECK(as_utf8(L"\uda1d\ude24 \udbc0\udfff") == "\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf"); + #else + CHECK(as_utf8(L"\xda1d\xde24 \xdbc0\xdfff") == "\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf"); + #endif + 
} } -#endif -TEST_XML(parse_bom_utf8, "\xef\xbb\xbf") +TEST(as_utf8_invalid) { - CHECK_NODE(doc, ""); + size_t wcharsize = sizeof(wchar_t); + + if (wcharsize == 2) + { + // check non-terminated degenerate handling + #ifdef U_LITERALS + CHECK(as_utf8(L"a\uda1d") == "a"); + CHECK(as_utf8(L"a\uda1d_") == "a_"); + #else + CHECK(as_utf8(L"a\xda1d") == "a"); + CHECK(as_utf8(L"a\xda1d_") == "a_"); + #endif + + // check incorrect leading code + #ifdef U_LITERALS + CHECK(as_utf8(L"a\ude24") == "a"); + CHECK(as_utf8(L"a\ude24_") == "a_"); + #else + CHECK(as_utf8(L"a\xde24") == "a"); + CHECK(as_utf8(L"a\xde24_") == "a_"); + #endif + } } +#endif -- cgit v1.2.3