From 41fb880bf0c3246df50103c6ef3cf91d0fd5eefc Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 31 Jan 2017 07:50:39 -0800 Subject: tests: Add coverage tests for encoding detection Enumerate successfull cases and also cases where the detection stops half-way and results in a different detected encoding. --- tests/test_parse.cpp | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index ba45a45..f94a565 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -1206,3 +1206,83 @@ TEST(parse_encoding_detect_latin1) CHECK(doc.load_buffer(test3, sizeof(test3)).encoding == encoding_latin1); CHECK(doc.load_buffer(test4, sizeof(test4)).encoding == encoding_latin1); } + +TEST(parse_encoding_detect_auto) +{ + struct data_t + { + const char* contents; + size_t size; + xml_encoding encoding; + }; + + const data_t data[] = + { + // BOM + { "\x00\x00\xfe\xff", 4, encoding_utf32_be }, + { "\xff\xfe\x00\x00", 4, encoding_utf32_le }, + { "\xfe\xff ", 4, encoding_utf16_be }, + { "\xff\xfe ", 4, encoding_utf16_le }, + { "\xef\xbb\xbf ", 4, encoding_utf8 }, + // automatic tag detection for < or ", 16, encoding_utf32_be }, + { "<\x00\x00\x00n\x00\x00\x00/\x00\x00\x00>\x00\x00\x00", 16, encoding_utf32_le }, + { "\x00<\x00?\x00n\x00?\x00>", 10, encoding_utf16_be }, + { "<\x00?\x00n\x00?\x00>\x00", 10, encoding_utf16_le }, + { "\x00<\x00n\x00/\x00>", 8, encoding_utf16_be }, + { "<\x00n\x00/\x00>\x00", 8, encoding_utf16_le }, + // ", 25, encoding_latin1 }, + }; + + for (size_t i = 0; i < sizeof(data) / sizeof(data[0]); ++i) + { + xml_document doc; + xml_parse_result result = doc.load_buffer(data[i].contents, data[i].size, parse_fragment); + + CHECK(result); + CHECK(result.encoding == data[i].encoding); + } +} + +TEST(parse_encoding_detect_auto_incomplete) +{ + struct data_t + { + const char* contents; + size_t size; + xml_encoding encoding; + }; + + const data_t data[] = + { + // BOM + { "\x00\x00\xfe ", 4, encoding_utf8 }, + { "\x00\x00 ", 4, encoding_utf8 }, + { "\xff\xfe\x00 ", 4, encoding_utf16_le }, + { "\xfe ", 4, encoding_utf8 }, + { "\xff ", 4, encoding_utf8 }, + { "\xef\xbb ", 4, encoding_utf8 }, + { "\xef ", 4, encoding_utf8 }, + // automatic tag detection for < or \x00", 8, encoding_utf16_le }, + { "\x00", 8, encoding_utf16_be }, + { "<\x00?n/\x00>\x00", 8, encoding_utf16_le }, + { "\x00 ", 8, encoding_utf8 }, + // ", 25, encoding_utf8 }, + { "", 25, encoding_utf8 }, + { "", 25, encoding_utf8 }, + { "<_ABC encoding='latin1'/>", 25, encoding_utf8 }, + }; + + for (size_t i = 0; i < sizeof(data) / sizeof(data[0]); ++i) + { + xml_document doc; + xml_parse_result result = doc.load_buffer(data[i].contents, data[i].size, parse_fragment); + + CHECK(result); + CHECK(result.encoding == data[i].encoding); + } +} -- cgit v1.2.3