From a590a69bda9069b3d4a54e86a2472c613106ff48 Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Wed, 22 Sep 2010 19:05:31 +0000 Subject: Minor UTF8 conversion refactoring, added as_utf8 and as_wide overloads with string arguments git-svn-id: http://pugixml.googlecode.com/svn/trunk@749 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 148 +++++++++++++++++++++++++++---------------------- src/pugixml.hpp | 3 +- tests/test_unicode.cpp | 14 +++++ 3 files changed, 98 insertions(+), 67 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index c65fc90..4720f45 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1272,6 +1272,74 @@ namespace } #endif + size_t as_utf8_begin(const wchar_t* str, size_t length) + { + STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); + + // get length in utf8 characters + return sizeof(wchar_t) == 2 ? + utf_decoder::decode_utf16_block(reinterpret_cast(str), length, 0) : + utf_decoder::decode_utf32_block(reinterpret_cast(str), length, 0); + } + + void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length) + { + STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); + + // convert to utf8 + uint8_t* begin = reinterpret_cast(buffer); + uint8_t* end = sizeof(wchar_t) == 2 ? + utf_decoder::decode_utf16_block(reinterpret_cast(str), length, begin) : + utf_decoder::decode_utf32_block(reinterpret_cast(str), length, begin); + + assert(begin + size == end); + (void)!end; + + // zero-terminate + buffer[size] = 0; + } + +#ifndef PUGIXML_NO_STL + std::string as_utf8_impl(const wchar_t* str, size_t length) + { + // first pass: get length in utf8 characters + size_t size = as_utf8_begin(str, length); + + // allocate resulting string + std::string result; + result.resize(size); + + // second pass: convert to utf8 + if (size > 0) as_utf8_end(&result[0], size, str, length); + + return result; + } + + std::wstring as_wide_impl(const char* str, size_t size) + { + const uint8_t* data = reinterpret_cast(str); + + // first pass: get length in wchar_t units + size_t length = utf_decoder::decode_utf8_block(data, size, 0); + + // allocate resulting string + std::wstring result; + result.resize(length); + + // second pass: convert to wchar_t + if (length > 0) + { + wchar_writer::value_type begin = reinterpret_cast(&result[0]); + wchar_writer::value_type end = utf_decoder::decode_utf8_block(data, size, begin); + + assert(begin + length == end); + (void)!end; + } + + return result; + } +#endif + inline bool strcpy_insitu_allow(size_t length, uintptr_t allocated, char_t* target) { assert(target); @@ -3096,33 +3164,16 @@ namespace { assert(str); - STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); - - size_t length = wcslen(str); - // first pass: get length in utf8 characters - size_t size = sizeof(wchar_t) == 2 ? - utf_decoder::decode_utf16_block(reinterpret_cast(str), length, 0) : - utf_decoder::decode_utf32_block(reinterpret_cast(str), length, 0); + size_t length = wcslen(str); + size_t size = as_utf8_begin(str, length); // allocate resulting string char* result = static_cast(global_allocate(size + 1)); if (!result) return 0; // second pass: convert to utf8 - if (size > 0) - { - uint8_t* begin = reinterpret_cast(result); - uint8_t* end = sizeof(wchar_t) == 2 ? - utf_decoder::decode_utf16_block(reinterpret_cast(str), length, begin) : - utf_decoder::decode_utf32_block(reinterpret_cast(str), length, begin); - - assert(begin + size == end); - (void)!end; - } - - // zero-terminate - result[size] = 0; + as_utf8_end(result, size, str, length); return result; } @@ -4504,59 +4555,24 @@ namespace pugi { assert(str); - STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); - - size_t length = wcslen(str); - - // first pass: get length in utf8 characters - size_t size = sizeof(wchar_t) == 2 ? - utf_decoder::decode_utf16_block(reinterpret_cast(str), length, 0) : - utf_decoder::decode_utf32_block(reinterpret_cast(str), length, 0); - - // allocate resulting string - std::string result; - result.resize(size); - - // second pass: convert to utf8 - if (size > 0) - { - uint8_t* begin = reinterpret_cast(&result[0]); - uint8_t* end = sizeof(wchar_t) == 2 ? - utf_decoder::decode_utf16_block(reinterpret_cast(str), length, begin) : - utf_decoder::decode_utf32_block(reinterpret_cast(str), length, begin); - - assert(begin + result.size() == end); - (void)!end; - } + return as_utf8_impl(str, wcslen(str)); + } - return result; + std::string PUGIXML_FUNCTION as_utf8(const std::wstring& str) + { + return as_utf8_impl(str.c_str(), str.size()); } std::wstring PUGIXML_FUNCTION as_wide(const char* str) { assert(str); - const uint8_t* data = reinterpret_cast(str); - size_t size = strlen(str); - - // first pass: get length in wchar_t - size_t length = utf_decoder::decode_utf8_block(data, size, 0); - - // allocate resulting string - std::wstring result; - result.resize(length); - - // second pass: convert to wchar_t - if (length > 0) - { - wchar_writer::value_type begin = reinterpret_cast(&result[0]); - wchar_writer::value_type end = utf_decoder::decode_utf8_block(data, size, begin); - - assert(begin + result.size() == end); - (void)!end; - } - - return result; + return as_wide_impl(str, strlen(str)); + } + + std::wstring PUGIXML_FUNCTION as_wide(const std::string& str) + { + return as_wide_impl(str.c_str(), str.size()); } #endif diff --git a/src/pugixml.hpp b/src/pugixml.hpp index 2624362..814b0b7 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -713,7 +713,6 @@ namespace pugi // Destructor, invalidates all node/attribute handles to this document ~xml_document(); - public: #ifndef PUGIXML_NO_STL // Load document from stream. xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); @@ -1036,9 +1035,11 @@ namespace pugi #ifndef PUGIXML_NO_STL // Convert wide string to UTF8 std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const wchar_t* str); + std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const std::basic_string, std::allocator >& str); // Convert UTF8 to wide string std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const char* str); + std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const std::basic_string, std::allocator >& str); #endif // Memory allocation function interface; returns pointer to allocated memory or NULL on failure diff --git a/tests/test_unicode.cpp b/tests/test_unicode.cpp index 0b656a3..e01e031 100644 --- a/tests/test_unicode.cpp +++ b/tests/test_unicode.cpp @@ -69,6 +69,13 @@ TEST(as_wide_invalid) CHECK(b5 == L"\nbcd"); } +TEST(as_wide_string) +{ + std::string s = "abcd"; + + CHECK(as_wide(s) == L"abcd"); +} + TEST(as_utf8_empty) { CHECK(as_utf8(L"") == ""); @@ -134,4 +141,11 @@ TEST(as_utf8_invalid) #endif } } + +TEST(as_utf8_string) +{ + std::wstring s = L"abcd"; + + CHECK(as_utf8(s) == "abcd"); +} #endif -- cgit v1.2.3