summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640> 2010-09-22 19:05:31 +0000
committer: arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640> 2010-09-22 19:05:31 +0000
commit: a590a69bda9069b3d4a54e86a2472c613106ff48 (patch)
tree: 76ae86f39e2e269b9728ab735e1a7fcb2bbf7d8f
parent: ac31030886749eafe01d5102ea3f5eb303a9af3d (diff)
Minor UTF8 conversion refactoring, added as_utf8 and as_wide overloads with string arguments
git-svn-id: http://pugixml.googlecode.com/svn/trunk@749 99668b35-9821-0410-8761-19e4c4f06640
-rw-r--r-- src/pugixml.cpp 148
-rw-r--r-- src/pugixml.hpp 3
-rw-r--r-- tests/test_unicode.cpp 14
3 files changed, 98 insertions, 67 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index c65fc90..4720f45 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -1272,6 +1272,74 @@ namespace
}
#endif
+ size_t as_utf8_begin(const wchar_t* str, size_t length) // first conversion pass: returns the UTF-8 output size for `length` wchar_t units starting at str
+ {
+ STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); // the branches below only cover 2-byte and 4-byte wchar_t
+
+ // run the counting decoder (0 is passed as the output argument), reading the input as UTF-16 when wchar_t is 2 bytes and as UTF-32 otherwise; callers use the result as the number of chars to allocate, so it is a size in UTF-8 code units rather than in characters
+ return sizeof(wchar_t) == 2 ?
+ utf_decoder<utf8_counter>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, 0) :
+ utf_decoder<utf8_counter>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, 0);
+ }
+
+ void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length) // second conversion pass: encodes `length` wchar_t units of str as UTF-8 into buffer and zero-terminates the result
+ {
+ STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
+
+ // convert to utf8, reading the input as UTF-16 or UTF-32 depending on sizeof(wchar_t)
+ uint8_t* begin = reinterpret_cast<uint8_t*>(buffer);
+ uint8_t* end = sizeof(wchar_t) == 2 ?
+ utf_decoder<utf8_writer>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, begin) :
+ utf_decoder<utf8_writer>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, begin);
+
+ assert(begin + size == end); // the writer must have advanced by exactly `size` bytes, so size has to match what as_utf8_begin returned for the same input
+ (void)!end; // presumably keeps `end` referenced when assert is compiled out -- TODO confirm
+
+ // zero-terminate: this writes buffer[size], so buffer must have room for size + 1 chars
+ buffer[size] = 0;
+ }
+
+#ifndef PUGIXML_NO_STL
+ std::string as_utf8_impl(const wchar_t* str, size_t length) // converts `length` wchar_t units starting at str into a UTF-8 encoded std::string
+ {
+ // first pass: get the number of chars the UTF-8 encoding occupies
+ size_t size = as_utf8_begin(str, length);
+
+ // allocate resulting string with one spare element: as_utf8_end zero-terminates by writing buffer[size], and that write must land inside the string's own elements
+ std::string result;
+ result.resize(size + 1);
+
+ // second pass: convert to utf8; the string is never empty at this point, so &result[0] is valid even when size == 0
+ as_utf8_end(&result[0], size, str, length);
+
+ // drop the spare element that only existed to hold the terminator
+ result.resize(size);
+
+ return result;
+ }
+
+ std::wstring as_wide_impl(const char* str, size_t size) // converts `size` bytes of UTF-8 text starting at str into a std::wstring
+ {
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(str);
+
+ // first pass: get length in wchar_t units (counting decoder, 0 is passed as the output argument)
+ size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
+
+ // allocate resulting string
+ std::wstring result;
+ result.resize(length);
+
+ // second pass: convert to wchar_t; skipped for empty output so that &result[0] is only taken on a non-empty string
+ if (length > 0)
+ {
+ wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]); // value_type is used here as a pointer into the wstring's elements
+ wchar_writer::value_type end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, begin);
+
+ assert(begin + length == end); // both passes must agree on the output length
+ (void)!end; // presumably keeps `end` referenced when assert is compiled out -- TODO confirm
+ }
+
+ return result;
+ }
+#endif
+
inline bool strcpy_insitu_allow(size_t length, uintptr_t allocated, char_t* target)
{
assert(target);
@@ -3096,33 +3164,16 @@ namespace
{
assert(str);
- STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
-
- size_t length = wcslen(str);
-
// first pass: get length in utf8 characters
- size_t size = sizeof(wchar_t) == 2 ?
- utf_decoder<utf8_counter>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, 0) :
- utf_decoder<utf8_counter>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, 0);
+ size_t length = wcslen(str);
+ size_t size = as_utf8_begin(str, length);
// allocate resulting string
char* result = static_cast<char*>(global_allocate(size + 1));
if (!result) return 0;
// second pass: convert to utf8
- if (size > 0)
- {
- uint8_t* begin = reinterpret_cast<uint8_t*>(result);
- uint8_t* end = sizeof(wchar_t) == 2 ?
- utf_decoder<utf8_writer>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, begin) :
- utf_decoder<utf8_writer>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, begin);
-
- assert(begin + size == end);
- (void)!end;
- }
-
- // zero-terminate
- result[size] = 0;
+ as_utf8_end(result, size, str, length);
return result;
}
@@ -4504,59 +4555,24 @@ namespace pugi
{
assert(str);
- STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
-
- size_t length = wcslen(str);
-
- // first pass: get length in utf8 characters
- size_t size = sizeof(wchar_t) == 2 ?
- utf_decoder<utf8_counter>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, 0) :
- utf_decoder<utf8_counter>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, 0);
-
- // allocate resulting string
- std::string result;
- result.resize(size);
-
- // second pass: convert to utf8
- if (size > 0)
- {
- uint8_t* begin = reinterpret_cast<uint8_t*>(&result[0]);
- uint8_t* end = sizeof(wchar_t) == 2 ?
- utf_decoder<utf8_writer>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, begin) :
- utf_decoder<utf8_writer>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, begin);
-
- assert(begin + result.size() == end);
- (void)!end;
- }
+ return as_utf8_impl(str, wcslen(str));
+ }
- return result;
+ std::string PUGIXML_FUNCTION as_utf8(const std::wstring& str) // std::wstring overload: converts the string's contents to UTF-8
+ {
+ return as_utf8_impl(str.c_str(), str.size()); // length comes from str.size() rather than from scanning for a terminator
}
std::wstring PUGIXML_FUNCTION as_wide(const char* str)
{
assert(str);
- const uint8_t* data = reinterpret_cast<const uint8_t*>(str);
- size_t size = strlen(str);
-
- // first pass: get length in wchar_t
- size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
-
- // allocate resulting string
- std::wstring result;
- result.resize(length);
-
- // second pass: convert to wchar_t
- if (length > 0)
- {
- wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]);
- wchar_writer::value_type end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, begin);
-
- assert(begin + result.size() == end);
- (void)!end;
- }
-
- return result;
+ return as_wide_impl(str, strlen(str));
+ }
+
+ std::wstring PUGIXML_FUNCTION as_wide(const std::string& str) // std::string overload: converts UTF-8 contents to a wide string
+ {
+ return as_wide_impl(str.c_str(), str.size()); // byte count comes from str.size() rather than from strlen
}
#endif
diff --git a/src/pugixml.hpp b/src/pugixml.hpp
index 2624362..814b0b7 100644
--- a/src/pugixml.hpp
+++ b/src/pugixml.hpp
@@ -713,7 +713,6 @@ namespace pugi
// Destructor, invalidates all node/attribute handles to this document
~xml_document();
- public:
#ifndef PUGIXML_NO_STL
// Load document from stream.
xml_parse_result load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
@@ -1036,9 +1035,11 @@ namespace pugi
#ifndef PUGIXML_NO_STL
// Convert wide string to UTF8
std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
+ std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
// Convert UTF8 to wide string
std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
+ std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
#endif
// Memory allocation function interface; returns pointer to allocated memory or NULL on failure
diff --git a/tests/test_unicode.cpp b/tests/test_unicode.cpp
index 0b656a3..e01e031 100644
--- a/tests/test_unicode.cpp
+++ b/tests/test_unicode.cpp
@@ -69,6 +69,13 @@ TEST(as_wide_invalid)
CHECK(b5 == L"\nbcd");
}
+TEST(as_wide_string) // exercises the as_wide overload that takes a std::string
+{
+ std::string s = "abcd";
+
+ CHECK(as_wide(s) == L"abcd"); // ASCII-only input is expected to come back as the same four wide characters
+}
+
TEST(as_utf8_empty)
{
CHECK(as_utf8(L"") == "");
@@ -134,4 +141,11 @@ TEST(as_utf8_invalid)
#endif
}
}
+
+TEST(as_utf8_string) // exercises the as_utf8 overload that takes a std::wstring
+{
+ std::wstring s = L"abcd";
+
+ CHECK(as_utf8(s) == "abcd"); // ASCII-only input is expected to come back as the same four chars
+}
#endif