diff options
author | arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640> | 2010-09-22 19:05:31 +0000 |
---|---|---|
committer | arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640> | 2010-09-22 19:05:31 +0000 |
commit | a590a69bda9069b3d4a54e86a2472c613106ff48 (patch) | |
tree | 76ae86f39e2e269b9728ab735e1a7fcb2bbf7d8f | |
parent | ac31030886749eafe01d5102ea3f5eb303a9af3d (diff) |
Minor UTF8 conversion refactoring, added as_utf8 and as_wide overloads with string arguments
git-svn-id: http://pugixml.googlecode.com/svn/trunk@749 99668b35-9821-0410-8761-19e4c4f06640
-rw-r--r-- | src/pugixml.cpp | 148 | ||||
-rw-r--r-- | src/pugixml.hpp | 3 | ||||
-rw-r--r-- | tests/test_unicode.cpp | 14 |
3 files changed, 98 insertions, 67 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp index c65fc90..4720f45 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1272,6 +1272,74 @@ namespace } #endif + size_t as_utf8_begin(const wchar_t* str, size_t length) + { + STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); + + // get length in utf8 characters + return sizeof(wchar_t) == 2 ? + utf_decoder<utf8_counter>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, 0) : + utf_decoder<utf8_counter>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, 0); + } + + void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length) + { + STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); + + // convert to utf8 + uint8_t* begin = reinterpret_cast<uint8_t*>(buffer); + uint8_t* end = sizeof(wchar_t) == 2 ? + utf_decoder<utf8_writer>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, begin) : + utf_decoder<utf8_writer>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, begin); + + assert(begin + size == end); + (void)!end; + + // zero-terminate + buffer[size] = 0; + } + +#ifndef PUGIXML_NO_STL + std::string as_utf8_impl(const wchar_t* str, size_t length) + { + // first pass: get length in utf8 characters + size_t size = as_utf8_begin(str, length); + + // allocate resulting string + std::string result; + result.resize(size); + + // second pass: convert to utf8 + if (size > 0) as_utf8_end(&result[0], size, str, length); + + return result; + } + + std::wstring as_wide_impl(const char* str, size_t size) + { + const uint8_t* data = reinterpret_cast<const uint8_t*>(str); + + // first pass: get length in wchar_t units + size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0); + + // allocate resulting string + std::wstring result; + result.resize(length); + + // second pass: convert to wchar_t + if (length > 0) + { + wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]); + wchar_writer::value_type end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, begin); + + assert(begin + length == end); + (void)!end; + } + + return result; + } +#endif + inline bool strcpy_insitu_allow(size_t length, uintptr_t allocated, char_t* target) { assert(target); @@ -3096,33 +3164,16 @@ namespace { assert(str); - STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); - - size_t length = wcslen(str); - // first pass: get length in utf8 characters - size_t size = sizeof(wchar_t) == 2 ? - utf_decoder<utf8_counter>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, 0) : - utf_decoder<utf8_counter>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, 0); + size_t length = wcslen(str); + size_t size = as_utf8_begin(str, length); // allocate resulting string char* result = static_cast<char*>(global_allocate(size + 1)); if (!result) return 0; // second pass: convert to utf8 - if (size > 0) - { - uint8_t* begin = reinterpret_cast<uint8_t*>(result); - uint8_t* end = sizeof(wchar_t) == 2 ? - utf_decoder<utf8_writer>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, begin) : - utf_decoder<utf8_writer>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, begin); - - assert(begin + size == end); - (void)!end; - } - - // zero-terminate - result[size] = 0; + as_utf8_end(result, size, str, length); return result; } @@ -4504,59 +4555,24 @@ namespace pugi { assert(str); - STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); - - size_t length = wcslen(str); - - // first pass: get length in utf8 characters - size_t size = sizeof(wchar_t) == 2 ? - utf_decoder<utf8_counter>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, 0) : - utf_decoder<utf8_counter>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, 0); - - // allocate resulting string - std::string result; - result.resize(size); - - // second pass: convert to utf8 - if (size > 0) - { - uint8_t* begin = reinterpret_cast<uint8_t*>(&result[0]); - uint8_t* end = sizeof(wchar_t) == 2 ? - utf_decoder<utf8_writer>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, begin) : - utf_decoder<utf8_writer>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, begin); - - assert(begin + result.size() == end); - (void)!end; - } + return as_utf8_impl(str, wcslen(str)); + } - return result; + std::string PUGIXML_FUNCTION as_utf8(const std::wstring& str) + { + return as_utf8_impl(str.c_str(), str.size()); } std::wstring PUGIXML_FUNCTION as_wide(const char* str) { assert(str); - const uint8_t* data = reinterpret_cast<const uint8_t*>(str); - size_t size = strlen(str); - - // first pass: get length in wchar_t - size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0); - - // allocate resulting string - std::wstring result; - result.resize(length); - - // second pass: convert to wchar_t - if (length > 0) - { - wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]); - wchar_writer::value_type end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, begin); - - assert(begin + result.size() == end); - (void)!end; - } - - return result; + return as_wide_impl(str, strlen(str)); + } + + std::wstring PUGIXML_FUNCTION as_wide(const std::string& str) + { + return as_wide_impl(str.c_str(), str.size()); } #endif diff --git a/src/pugixml.hpp b/src/pugixml.hpp index 2624362..814b0b7 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -713,7 +713,6 @@ namespace pugi // Destructor, invalidates all node/attribute handles to this document ~xml_document(); - public: #ifndef PUGIXML_NO_STL // Load document from stream. xml_parse_result load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); @@ -1036,9 +1035,11 @@ namespace pugi #ifndef PUGIXML_NO_STL // Convert wide string to UTF8 std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str); + std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str); // Convert UTF8 to wide string std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str); + std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str); #endif // Memory allocation function interface; returns pointer to allocated memory or NULL on failure diff --git a/tests/test_unicode.cpp b/tests/test_unicode.cpp index 0b656a3..e01e031 100644 --- a/tests/test_unicode.cpp +++ b/tests/test_unicode.cpp @@ -69,6 +69,13 @@ TEST(as_wide_invalid) CHECK(b5 == L"\nbcd"); } +TEST(as_wide_string) +{ + std::string s = "abcd"; + + CHECK(as_wide(s) == L"abcd"); +} + TEST(as_utf8_empty) { CHECK(as_utf8(L"") == ""); @@ -134,4 +141,11 @@ TEST(as_utf8_invalid) #endif } } + +TEST(as_utf8_string) +{ + std::wstring s = L"abcd"; + + CHECK(as_utf8(s) == "abcd"); +} #endif |