From a0769dfe380ad7e4bb3c47dc6b32099e3a4918be Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Tue, 20 Dec 2011 09:45:10 +0000 Subject: Introduced encoding_latin1 support (conversion on loading, conversion on saving, encoding name in declaration in document::save) git-svn-id: http://pugixml.googlecode.com/svn/trunk@829 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 127 +++++++++++++++++++++++++++++++++++++++- src/pugixml.hpp | 3 +- tests/data/latintest_latin1.xml | 1 + tests/data/latintest_utf8.xml | 1 + tests/test_document.cpp | 67 +++++++++++++++++---- tests/test_write.cpp | 2 + 6 files changed, 189 insertions(+), 12 deletions(-) create mode 100644 tests/data/latintest_latin1.xml create mode 100644 tests/data/latintest_utf8.xml diff --git a/src/pugixml.cpp b/src/pugixml.cpp index a6196dc..eb924a2 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -754,6 +754,27 @@ namespace } }; + struct latin1_writer + { + typedef uint8_t* value_type; + + static value_type low(value_type result, uint32_t ch) + { + *result = static_cast(ch > 255 ? '?' : ch); + + return result + 1; + } + + static value_type high(value_type result, uint32_t ch) + { + (void)ch; + + *result = '?'; + + return result + 1; + } + }; + template struct wchar_selector; template <> struct wchar_selector<2> @@ -904,6 +925,16 @@ namespace return result; } + + static inline typename Traits::value_type decode_latin1_block(const uint8_t* data, size_t size, typename Traits::value_type result) + { + for (size_t i = 0; i < size; ++i) + { + result = Traits::low(result, data[i]); + } + + return result; + } }; template inline void convert_utf_endian_swap(T* result, const T* data, size_t length) @@ -1172,6 +1203,27 @@ namespace return true; } + bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size) + { + const uint8_t* data = static_cast(contents); + + // get length in wchar_t units + out_length = size; + + // allocate buffer of suitable length + out_buffer = static_cast(global_allocate((out_length > 0 ? out_length : 1) * sizeof(char_t))); + if (!out_buffer) return false; + + // convert latin1 input to wchar_t + wchar_writer::value_type out_begin = reinterpret_cast(out_buffer); + wchar_writer::value_type out_end = utf_decoder::decode_latin1_block(data, size, out_begin); + + assert(out_end == out_begin + out_length); + (void)!out_end; + + return true; + } + bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable) { // get native encoding @@ -1206,6 +1258,9 @@ namespace convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true()); } + // source encoding is latin1 + if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size); + assert(!"Invalid encoding"); return false; } @@ -1254,6 +1309,48 @@ namespace return true; } + size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size) + { + for (size_t i = 0; i < size; ++i) + if (data[i] > 127) + return i; + + return size; + } + + bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) + { + const uint8_t* data = static_cast(contents); + + // get size of prefix that does not need utf8 conversion + size_t prefix_length = get_latin1_7bit_prefix_length(data, size); + assert(prefix_length <= size); + + const uint8_t* postfix = data + prefix_length; + size_t postfix_length = size - prefix_length; + + // if no conversion is needed, just return the original buffer + if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); + + // first pass: get length in utf8 units + out_length = prefix_length + utf_decoder::decode_latin1_block(postfix, postfix_length, 0); + + // allocate buffer of suitable length + out_buffer = static_cast(global_allocate((out_length > 0 ? out_length : 1) * sizeof(char_t))); + if (!out_buffer) return false; + + // second pass: convert latin1 input to utf8 + memcpy(out_buffer, data, prefix_length); + + uint8_t* out_begin = reinterpret_cast(out_buffer); + uint8_t* out_end = utf_decoder::decode_latin1_block(postfix, postfix_length, out_begin + prefix_length); + + assert(out_end == out_begin + out_length); + (void)!out_end; + + return true; + } + bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable) { // fast path: no conversion required @@ -1279,6 +1376,9 @@ namespace convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true()); } + // source encoding is latin1 + if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable); + assert(!"Invalid encoding"); return false; } @@ -2580,6 +2680,18 @@ namespace return static_cast(end - dest) * sizeof(uint32_t); } + // convert to latin1 + if (encoding == encoding_latin1) + { + uint8_t* dest = reinterpret_cast(result); + + uint8_t* end = sizeof(wchar_t) == 2 ? + utf_decoder::decode_utf16_block(reinterpret_cast(data), length, dest) : + utf_decoder::decode_utf32_block(reinterpret_cast(data), length, dest); + + return static_cast(end - dest); + } + assert(!"Invalid encoding"); return 0; } @@ -2632,6 +2744,14 @@ namespace return static_cast(end - dest) * sizeof(uint32_t); } + if (encoding == encoding_latin1) + { + uint8_t* dest = reinterpret_cast(result); + uint8_t* end = utf_decoder::decode_utf8_block(reinterpret_cast(data), length, dest); + + return static_cast(end - dest); + } + assert(!"Invalid encoding"); return 0; } @@ -2822,6 +2942,9 @@ namespace writer.write("\xff\xfe\x00\x00", 4); break; + case encoding_latin1: + break; + default: assert(!"Invalid encoding"); } @@ -4806,7 +4929,9 @@ namespace pugi if (!(flags & format_no_declaration) && !has_declaration(*this)) { - buffered_writer.write(PUGIXML_TEXT("")); + buffered_writer.write(PUGIXML_TEXT("'); if (!(flags & format_raw)) buffered_writer.write('\n'); } diff --git a/src/pugixml.hpp b/src/pugixml.hpp index 1946bfb..d0a8623 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -194,7 +194,8 @@ namespace pugi encoding_utf32_le, // Little-endian UTF32 encoding_utf32_be, // Big-endian UTF32 encoding_utf32, // UTF32 with native endianness - encoding_wchar // The same encoding wchar_t has (either UTF16 or UTF32) + encoding_wchar, // The same encoding wchar_t has (either UTF16 or UTF32) + encoding_latin1 }; // Formatting flags diff --git a/tests/data/latintest_latin1.xml b/tests/data/latintest_latin1.xml new file mode 100644 index 0000000..3336f0c --- /dev/null +++ b/tests/data/latintest_latin1.xml @@ -0,0 +1 @@ +
00000535351010MüllerJörg
<Test>
10
<Test 2>
20
This is a text.
\ No newline at end of file diff --git a/tests/data/latintest_utf8.xml b/tests/data/latintest_utf8.xml new file mode 100644 index 0000000..3efad30 --- /dev/null +++ b/tests/data/latintest_utf8.xml @@ -0,0 +1 @@ +
00000535351010MüllerJörg
<Test>
10
<Test 2>
20
This is a text.
\ No newline at end of file diff --git a/tests/test_document.cpp b/tests/test_document.cpp index a799fbf..a49efcd 100644 --- a/tests/test_document.cpp +++ b/tests/test_document.cpp @@ -307,6 +307,7 @@ TEST_XML(document_save_bom, "") CHECK(test_save_narrow(doc, flags, encoding_utf16_le, "\xff\xfe<\x00n\x00 \x00/\x00>\x00", 12)); CHECK(test_save_narrow(doc, flags, encoding_utf32_be, "\x00\x00\xfe\xff\x00\x00\x00<\x00\x00\x00n\x00\x00\x00 \x00\x00\x00/\x00\x00\x00>", 24)); CHECK(test_save_narrow(doc, flags, encoding_utf32_le, "\xff\xfe\x00\x00<\x00\x00\x00n\x00\x00\x00 \x00\x00\x00/\x00\x00\x00>\x00\x00\x00", 24)); + CHECK(test_save_narrow(doc, flags, encoding_latin1, "", 5)); // encodings synonyms CHECK(save_narrow(doc, flags, encoding_utf16) == save_narrow(doc, flags, (is_little_endian() ? encoding_utf16_le : encoding_utf16_be))); @@ -371,6 +372,15 @@ TEST_XML(document_save_declaration_present_last, "") CHECK(writer.as_string() == STR("\n\n\n")); } +TEST_XML(document_save_declaration_latin1, "") +{ + xml_writer_string writer; + + doc.save(writer, STR(""), pugi::format_default, encoding_latin1); + + CHECK(writer.as_narrow() == "\n\n"); +} + struct temp_file { char path[512]; @@ -704,18 +714,19 @@ static bool load_file_in_memory(const char* path, char*& data, size_t& size) return true; } -TEST(document_contents_preserve) +struct file_data_t { - struct file_t - { - const char* path; - xml_encoding encoding; + const char* path; + xml_encoding encoding; + + char* data; + size_t size; +}; - char* data; - size_t size; - }; - file_t files[] = +TEST(document_contents_preserve) +{ + file_data_t files[] = { {"tests/data/utftest_utf16_be_clean.xml", encoding_utf16_be, 0, 0}, {"tests/data/utftest_utf16_le_clean.xml", encoding_utf16_le, 0, 0}, @@ -751,6 +762,41 @@ TEST(document_contents_preserve) } } +TEST(document_contents_preserve_latin1) +{ + file_data_t files[] = + { + {"tests/data/latintest_utf8.xml", encoding_utf8, 0, 0}, + {"tests/data/latintest_latin1.xml", encoding_latin1, 0, 0} + }; + + // load files in memory + for (unsigned int i = 0; i < sizeof(files) / sizeof(files[0]); ++i) + { + CHECK(load_file_in_memory(files[i].path, files[i].data, files[i].size)); + } + + // convert each file to each format and compare bitwise + for (unsigned int src = 0; src < sizeof(files) / sizeof(files[0]); ++src) + { + for (unsigned int dst = 0; dst < sizeof(files) / sizeof(files[0]); ++dst) + { + // parse into document (preserve comments, declaration and whitespace pcdata) + xml_document doc; + CHECK(doc.load_buffer(files[src].data, files[src].size, parse_default | parse_ws_pcdata | parse_declaration | parse_comments, files[src].encoding)); + + // compare saved document with the original (raw formatting, without extra declaration, write bom if it was in original file) + CHECK(test_save_narrow(doc, format_raw | format_no_declaration | format_write_bom, files[dst].encoding, files[dst].data, files[dst].size)); + } + } + + // cleanup + for (unsigned int j = 0; j < sizeof(files) / sizeof(files[0]); ++j) + { + delete[] files[j].data; + } +} + static bool test_parse_fail(const void* buffer, size_t size, xml_encoding encoding = encoding_utf8) { // copy buffer to heap (to enable out-of-bounds checks) @@ -811,7 +857,8 @@ TEST(document_load_buffer_empty) encoding_utf32_le, encoding_utf32_be, encoding_utf32, - encoding_wchar + encoding_wchar, + encoding_latin1 }; char buffer[1]; diff --git a/tests/test_write.cpp b/tests/test_write.cpp index 094bf59..93f5bd9 100644 --- a/tests/test_write.cpp +++ b/tests/test_write.cpp @@ -189,6 +189,8 @@ TEST(write_encodings) { CHECK(v.size() == 10 && v[0] == '<' && v[1] == 0x54 && v[2] == 0xA2 && v[3] == 0x20AC && v[4] == 0xd852 && v[5] == 0xdf62 && v[6] == ' ' && v[7] == '/' && v[8] == '>' && v[9] == '\n'); } + + CHECK(test_write_narrow(doc, format_default, encoding_latin1, "<\x54\xA2?? />\n", 9)); } #ifdef PUGIXML_WCHAR_MODE -- cgit v1.2.3