From 6e2521830f255c6b35ac302363a241274a336c1c Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Tue, 25 May 2010 16:50:03 +0000 Subject: Optimized utf8 decoding git-svn-id: http://pugixml.googlecode.com/svn/trunk@447 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index d404652..bf5253f 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -796,9 +796,7 @@ namespace pugi { const char8_t utf8_byte_mask = 0x3f; - const char8_t* end = data + size; - - while (data < end) + while (size) { char8_t lead = *data; @@ -807,29 +805,48 @@ namespace pugi { result = Traits::low(result, lead); data += 1; + size -= 1; + + // process aligned single-byte (ascii) blocks + if ((reinterpret_cast(data) & 3) == 0) + { + while (size >= 4 && (*reinterpret_cast(data) & 0x80808080) == 0) + { + result = Traits::low(result, data[0]); + result = Traits::low(result, data[1]); + result = Traits::low(result, data[2]); + result = Traits::low(result, data[3]); + data += 4; + size -= 4; + } + } } // 110xxxxx -> U+0080..U+07FF - else if ((unsigned)(lead - 0xC0) < 0x20 && (end - data) > 1 && (data[1] & 0xc0) == 0x80) + else if ((unsigned)(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80) { result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask)); data += 2; + size -= 2; } // 1110xxxx -> U+0800-U+FFFF - else if ((unsigned)(lead - 0xE0) < 0x10 && (end - data) > 2 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80) + else if ((unsigned)(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80) { result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask)); data += 3; + size -= 3; } // 11110xxx -> U+10000..U+10FFFF - else if ((unsigned)(lead - 0xF0) < 0x08 && (end - data) > 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80) + else if ((unsigned)(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80) { result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask)); data += 4; + size -= 4; } // 10xxxxxx or 11111xxx -> invalid else { data += 1; + size -= 1; } } -- cgit v1.2.3