From a09d5c0b8e2ca13df6f73af061a455069672f17c Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Mon, 9 Aug 2010 13:40:24 +0000 Subject: Added null pointer assertion to load_buffer, refactored get_buffer_encoding git-svn-id: http://pugixml.googlecode.com/svn/trunk@633 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) (limited to 'src/pugixml.cpp') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 4efb259..e42f64c 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1034,6 +1034,30 @@ namespace return is_little_endian() ? encoding_utf32_le : encoding_utf32_be; } + xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3) + { + // look for BOM in first few bytes + if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be; + if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le; + if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be; + if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le; + if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8; + + // look for <, (contents); - // look for BOM in first few bytes - if (size > 4 && data[0] == 0 && data[1] == 0 && data[2] == 0xfe && data[3] == 0xff) return encoding_utf32_be; - if (size > 4 && data[0] == 0xff && data[1] == 0xfe && data[2] == 0 && data[3] == 0) return encoding_utf32_le; - if (size > 2 && data[0] == 0xfe && data[1] == 0xff) return encoding_utf16_be; - if (size > 2 && data[0] == 0xff && data[1] == 0xfe) return encoding_utf16_le; - if (size > 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) return encoding_utf8; - - // look for <, 4 && data[0] == 0 && data[1] == 0 && data[2] == 0 && data[3] == 0x3c) return encoding_utf32_be; - if (size > 4 && data[0] == 0x3c && data[1] == 0 && data[2] == 0 && data[3] == 0) return encoding_utf32_le; - if (size > 4 && data[0] == 0 && data[1] == 0x3c && data[2] == 0 && data[3] == 0x3f) return encoding_utf16_be; - if (size > 4 && data[0] == 0x3c && data[1] == 0 && data[2] == 0x3f && data[3] == 0) return encoding_utf16_le; - if (size > 4 && data[0] == 0x3c && data[1] == 0x3f && data[2] == 0x78 && data[3] == 0x6d) return encoding_utf8; - - // look for utf16 < followed by node name (this may fail, but is better than utf8 since it's zero terminated so early) - if (size > 2 && data[0] == 0 && data[1] == 0x3c) return encoding_utf16_be; - if (size > 2 && data[0] == 0x3c && data[1] == 0) return encoding_utf16_le; - - // no known BOM detected, assume utf8 - return encoding_utf8; + return guess_buffer_encoding(data[0], data[1], data[2], data[3]); } bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) @@ -4401,6 +4409,9 @@ namespace pugi { reset(); + // check input buffer + assert(contents || size == 0); + // get actual encoding xml_encoding buffer_encoding = get_buffer_encoding(encoding, contents, size); -- cgit v1.2.3