From d2175179753d8333d283747ab1472a0f6bfb1a60 Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Thu, 6 May 2010 20:39:14 +0000 Subject: Changed version number to 0.6, merged Unicode utilities to pugixml.cpp git-svn-id: http://pugixml.googlecode.com/svn/trunk@384 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugiconfig.hpp | 2 +- src/pugiutf.hpp | 358 ----------------------------------------------------- src/pugixml.cpp | 321 ++++++++++++++++++++++++++++++++++++++++++++++- src/pugixml.hpp | 2 +- src/pugixpath.cpp | 2 +- 5 files changed, 321 insertions(+), 364 deletions(-) delete mode 100644 src/pugiutf.hpp (limited to 'src') diff --git a/src/pugiconfig.hpp b/src/pugiconfig.hpp index a62b7f4..b5c09ee 100644 --- a/src/pugiconfig.hpp +++ b/src/pugiconfig.hpp @@ -1,5 +1,5 @@ /** - * pugixml parser - version 0.5 + * pugixml parser - version 0.6 * -------------------------------------------------------- * Copyright (C) 2006-2009, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) * Report bugs and download new versions at http://code.google.com/p/pugixml/ diff --git a/src/pugiutf.hpp b/src/pugiutf.hpp deleted file mode 100644 index dfca940..0000000 --- a/src/pugiutf.hpp +++ /dev/null @@ -1,358 +0,0 @@ -/** - * pugixml parser - version 0.5 - * -------------------------------------------------------- - * Copyright (C) 2006-2009, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) - * Report bugs and download new versions at http://code.google.com/p/pugixml/ - * - * This library is distributed under the MIT License. See notice at the end - * of this file. - * - * This work is based on the pugxml parser, which is: - * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) - */ - -#ifndef HEADER_PUGIUTF_HPP -#define HEADER_PUGIUTF_HPP - -namespace pugi -{ - namespace impl - { - typedef unsigned char char8_t; - typedef unsigned short char16_t; - typedef unsigned int char32_t; - - inline char16_t endian_swap(char16_t value) - { - return static_cast(((value & 0xff) << 8) | (value >> 8)); - } - - inline char32_t endian_swap(char32_t value) - { - return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24); - } - - struct utf8_counter - { - typedef size_t value_type; - - static value_type low(value_type result, char32_t ch) - { - // U+0000..U+007F - if (ch < 0x80) return result + 1; - // U+0080..U+07FF - else if (ch < 0x800) return result + 2; - // U+0800..U+FFFF - else return result + 3; - } - - static value_type high(value_type result, char32_t) - { - // U+10000..U+10FFFF - return result + 4; - } - }; - - struct utf8_writer - { - typedef char8_t* value_type; - - static value_type low(value_type result, char32_t ch) - { - // U+0000..U+007F - if (ch < 0x80) - { - *result = static_cast(ch); - return result + 1; - } - // U+0080..U+07FF - else if (ch < 0x800) - { - result[0] = static_cast(0xC0 | (ch >> 6)); - result[1] = static_cast(0x80 | (ch & 0x3F)); - return result + 2; - } - // U+0800..U+FFFF - else - { - result[0] = static_cast(0xE0 | (ch >> 12)); - result[1] = static_cast(0x80 | ((ch >> 6) & 0x3F)); - result[2] = static_cast(0x80 | (ch & 0x3F)); - return result + 3; - } - } - - static value_type high(value_type result, char32_t ch) - { - // U+10000..U+10FFFF - result[0] = static_cast(0xF0 | (ch >> 18)); - result[1] = static_cast(0x80 | ((ch >> 12) & 0x3F)); - result[2] = static_cast(0x80 | ((ch >> 6) & 0x3F)); - result[3] = static_cast(0x80 | (ch & 0x3F)); - return result + 4; - } - - static value_type any(value_type result, char32_t ch) - { - return (ch < 0x10000) ? low(result, ch) : high(result, ch); - } - }; - - struct utf16_counter - { - typedef size_t value_type; - - static value_type low(value_type result, char32_t) - { - return result + 1; - } - - static value_type high(value_type result, char32_t) - { - return result + 2; - } - }; - - struct utf16_writer - { - typedef char16_t* value_type; - - static value_type low(value_type result, char32_t ch) - { - *result = static_cast(ch); - - return result + 1; - } - - static value_type high(value_type result, char32_t ch) - { - char32_t msh = (char32_t)(ch - 0x10000) >> 10; - char32_t lsh = (char32_t)(ch - 0x10000) & 0x3ff; - - result[0] = static_cast(0xD800 + msh); - result[1] = static_cast(0xDC00 + lsh); - - return result + 2; - } - - static value_type any(value_type result, char32_t ch) - { - return (ch < 0x10000) ? low(result, ch) : high(result, ch); - } - }; - - struct utf32_counter - { - typedef size_t value_type; - - static value_type low(value_type result, char32_t) - { - return result + 1; - } - - static value_type high(value_type result, char32_t) - { - return result + 1; - } - }; - - struct utf32_writer - { - typedef char32_t* value_type; - - static value_type low(value_type result, char32_t ch) - { - *result = ch; - - return result + 1; - } - - static value_type high(value_type result, char32_t ch) - { - *result = ch; - - return result + 1; - } - - static value_type any(value_type result, char32_t ch) - { - *result = ch; - - return result + 1; - } - }; - - template struct wchar_selector; - - template <> struct wchar_selector<2> - { - typedef char16_t type; - typedef utf16_counter counter; - typedef utf16_writer writer; - }; - - template <> struct wchar_selector<4> - { - typedef char32_t type; - typedef utf32_counter counter; - typedef utf32_writer writer; - }; - - typedef wchar_selector::counter wchar_counter; - typedef wchar_selector::writer wchar_writer; - - template static inline typename Traits::value_type decode_utf8_block(const char8_t* data, size_t size, typename Traits::value_type result, Traits = Traits()) - { - const char8_t utf8_byte_mask = 0x3f; - - const char8_t* end = data + size; - - while (data < end) - { - char8_t lead = *data; - - // 0xxxxxxx -> U+0000..U+007F - if (lead < 0x80) - { - result = Traits::low(result, lead); - data += 1; - } - // 110xxxxx -> U+0080..U+07FF - else if ((unsigned)(lead - 0xC0) < 0x20 && data + 1 < end && (data[1] & 0xc0) == 0x80) - { - result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask)); - data += 2; - } - // 1110xxxx -> U+0800-U+FFFF - else if ((unsigned)(lead - 0xE0) < 0x10 && data + 2 < end && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80) - { - result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask)); - data += 3; - } - // 11110xxx -> U+10000..U+10FFFF - else if ((unsigned)(lead - 0xF0) < 0x08 && data + 3 < end && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80) - { - result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask)); - data += 4; - } - // 10xxxxxx or 11111xxx -> invalid - else - { - data += 1; - } - } - - return result; - } - - template static inline typename Traits::value_type decode_utf16_block(const char16_t* data, size_t size, typename Traits::value_type result, opt1, Traits = Traits()) - { - const bool swap = opt1::o1; - - const char16_t* end = data + size; - - while (data < end) - { - char16_t lead = swap ? endian_swap(*data) : *data; - - // U+0000..U+D7FF - if (lead < 0xD800) - { - result = Traits::low(result, lead); - data += 1; - } - // U+E000..U+FFFF - else if ((unsigned)(lead - 0xE000) < 0x2000) - { - result = Traits::low(result, lead); - data += 1; - } - // surrogate pair lead - else if ((unsigned)(lead - 0xD800) < 0x400 && data + 1 < end) - { - char16_t next = swap ? endian_swap(data[1]) : data[1]; - - if ((unsigned)(next - 0xDC00) < 0x400) - { - result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff)); - data += 2; - } - else - { - data += 1; - } - } - else - { - data += 1; - } - } - - return result; - } - - template static inline typename Traits::value_type decode_utf32_block(const char32_t* data, size_t size, typename Traits::value_type result, opt1, Traits = Traits()) - { - const bool swap = opt1::o1; - - const char32_t* end = data + size; - - while (data < end) - { - char32_t lead = swap ? endian_swap(*data) : *data; - - // U+0000..U+FFFF - if (lead < 0x10000) - { - result = Traits::low(result, lead); - data += 1; - } - // U+10000..U+10FFFF - else - { - result = Traits::high(result, lead); - data += 1; - } - } - - return result; - } - - template inline void convert_utf_endian_swap(T* result, const T* data, size_t length) - { - for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]); - } - - inline void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length) - { - for (size_t i = 0; i < length; ++i) result[i] = static_cast(endian_swap(static_cast::type>(data[i]))); - } - } -} - -#endif - -/** - * Copyright (c) 2006-2009 Arseny Kapoulkine - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ diff --git a/src/pugixml.cpp b/src/pugixml.cpp index d67919b..099f85e 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1,5 +1,5 @@ /** - * pugixml parser - version 0.5 + * pugixml parser - version 0.6 * -------------------------------------------------------- * Copyright (C) 2006-2009, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) * Report bugs and download new versions at http://code.google.com/p/pugixml/ @@ -17,8 +17,6 @@ #error No exception mode can not be used with XPath support #endif -#include "pugiutf.hpp" - #include #include #include @@ -401,6 +399,323 @@ namespace pugi } } +// Unicode utilities +namespace pugi +{ + namespace impl + { + typedef unsigned char char8_t; + typedef unsigned short char16_t; + typedef unsigned int char32_t; + + inline char16_t endian_swap(char16_t value) + { + return static_cast(((value & 0xff) << 8) | (value >> 8)); + } + + inline char32_t endian_swap(char32_t value) + { + return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24); + } + + struct utf8_counter + { + typedef size_t value_type; + + static value_type low(value_type result, char32_t ch) + { + // U+0000..U+007F + if (ch < 0x80) return result + 1; + // U+0080..U+07FF + else if (ch < 0x800) return result + 2; + // U+0800..U+FFFF + else return result + 3; + } + + static value_type high(value_type result, char32_t) + { + // U+10000..U+10FFFF + return result + 4; + } + }; + + struct utf8_writer + { + typedef char8_t* value_type; + + static value_type low(value_type result, char32_t ch) + { + // U+0000..U+007F + if (ch < 0x80) + { + *result = static_cast(ch); + return result + 1; + } + // U+0080..U+07FF + else if (ch < 0x800) + { + result[0] = static_cast(0xC0 | (ch >> 6)); + result[1] = static_cast(0x80 | (ch & 0x3F)); + return result + 2; + } + // U+0800..U+FFFF + else + { + result[0] = static_cast(0xE0 | (ch >> 12)); + result[1] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[2] = static_cast(0x80 | (ch & 0x3F)); + return result + 3; + } + } + + static value_type high(value_type result, char32_t ch) + { + // U+10000..U+10FFFF + result[0] = static_cast(0xF0 | (ch >> 18)); + result[1] = static_cast(0x80 | ((ch >> 12) & 0x3F)); + result[2] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[3] = static_cast(0x80 | (ch & 0x3F)); + return result + 4; + } + + static value_type any(value_type result, char32_t ch) + { + return (ch < 0x10000) ? low(result, ch) : high(result, ch); + } + }; + + struct utf16_counter + { + typedef size_t value_type; + + static value_type low(value_type result, char32_t) + { + return result + 1; + } + + static value_type high(value_type result, char32_t) + { + return result + 2; + } + }; + + struct utf16_writer + { + typedef char16_t* value_type; + + static value_type low(value_type result, char32_t ch) + { + *result = static_cast(ch); + + return result + 1; + } + + static value_type high(value_type result, char32_t ch) + { + char32_t msh = (char32_t)(ch - 0x10000) >> 10; + char32_t lsh = (char32_t)(ch - 0x10000) & 0x3ff; + + result[0] = static_cast(0xD800 + msh); + result[1] = static_cast(0xDC00 + lsh); + + return result + 2; + } + + static value_type any(value_type result, char32_t ch) + { + return (ch < 0x10000) ? low(result, ch) : high(result, ch); + } + }; + + struct utf32_counter + { + typedef size_t value_type; + + static value_type low(value_type result, char32_t) + { + return result + 1; + } + + static value_type high(value_type result, char32_t) + { + return result + 1; + } + }; + + struct utf32_writer + { + typedef char32_t* value_type; + + static value_type low(value_type result, char32_t ch) + { + *result = ch; + + return result + 1; + } + + static value_type high(value_type result, char32_t ch) + { + *result = ch; + + return result + 1; + } + + static value_type any(value_type result, char32_t ch) + { + *result = ch; + + return result + 1; + } + }; + + template struct wchar_selector; + + template <> struct wchar_selector<2> + { + typedef char16_t type; + typedef utf16_counter counter; + typedef utf16_writer writer; + }; + + template <> struct wchar_selector<4> + { + typedef char32_t type; + typedef utf32_counter counter; + typedef utf32_writer writer; + }; + + typedef wchar_selector::counter wchar_counter; + typedef wchar_selector::writer wchar_writer; + + template static inline typename Traits::value_type decode_utf8_block(const char8_t* data, size_t size, typename Traits::value_type result, Traits = Traits()) + { + const char8_t utf8_byte_mask = 0x3f; + + const char8_t* end = data + size; + + while (data < end) + { + char8_t lead = *data; + + // 0xxxxxxx -> U+0000..U+007F + if (lead < 0x80) + { + result = Traits::low(result, lead); + data += 1; + } + // 110xxxxx -> U+0080..U+07FF + else if ((unsigned)(lead - 0xC0) < 0x20 && data + 1 < end && (data[1] & 0xc0) == 0x80) + { + result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask)); + data += 2; + } + // 1110xxxx -> U+0800-U+FFFF + else if ((unsigned)(lead - 0xE0) < 0x10 && data + 2 < end && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80) + { + result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask)); + data += 3; + } + // 11110xxx -> U+10000..U+10FFFF + else if ((unsigned)(lead - 0xF0) < 0x08 && data + 3 < end && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80) + { + result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask)); + data += 4; + } + // 10xxxxxx or 11111xxx -> invalid + else + { + data += 1; + } + } + + return result; + } + + template static inline typename Traits::value_type decode_utf16_block(const char16_t* data, size_t size, typename Traits::value_type result, opt1, Traits = Traits()) + { + const bool swap = opt1::o1; + + const char16_t* end = data + size; + + while (data < end) + { + char16_t lead = swap ? endian_swap(*data) : *data; + + // U+0000..U+D7FF + if (lead < 0xD800) + { + result = Traits::low(result, lead); + data += 1; + } + // U+E000..U+FFFF + else if ((unsigned)(lead - 0xE000) < 0x2000) + { + result = Traits::low(result, lead); + data += 1; + } + // surrogate pair lead + else if ((unsigned)(lead - 0xD800) < 0x400 && data + 1 < end) + { + char16_t next = swap ? endian_swap(data[1]) : data[1]; + + if ((unsigned)(next - 0xDC00) < 0x400) + { + result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff)); + data += 2; + } + else + { + data += 1; + } + } + else + { + data += 1; + } + } + + return result; + } + + template static inline typename Traits::value_type decode_utf32_block(const char32_t* data, size_t size, typename Traits::value_type result, opt1, Traits = Traits()) + { + const bool swap = opt1::o1; + + const char32_t* end = data + size; + + while (data < end) + { + char32_t lead = swap ? endian_swap(*data) : *data; + + // U+0000..U+FFFF + if (lead < 0x10000) + { + result = Traits::low(result, lead); + data += 1; + } + // U+10000..U+10FFFF + else + { + result = Traits::high(result, lead); + data += 1; + } + } + + return result; + } + + template inline void convert_utf_endian_swap(T* result, const T* data, size_t length) + { + for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]); + } + + inline void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length) + { + for (size_t i = 0; i < length; ++i) result[i] = static_cast(endian_swap(static_cast::type>(data[i]))); + } + } +} + namespace { using namespace pugi; diff --git a/src/pugixml.hpp b/src/pugixml.hpp index f2cb3d1..b2348e8 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -1,5 +1,5 @@ /** - * pugixml parser - version 0.5 + * pugixml parser - version 0.6 * -------------------------------------------------------- * Copyright (C) 2006-2009, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) * Report bugs and download new versions at http://code.google.com/p/pugixml/ diff --git a/src/pugixpath.cpp b/src/pugixpath.cpp index 0dc66e6..a867995 100644 --- a/src/pugixpath.cpp +++ b/src/pugixpath.cpp @@ -1,5 +1,5 @@ /** - * pugixml parser - version 0.5 + * pugixml parser - version 0.6 * -------------------------------------------------------- * Copyright (C) 2006-2009, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) * Report bugs and download new versions at http://code.google.com/p/pugixml/ -- cgit v1.2.3