From f542c5ebb8068ccd4f9176684eb62183afbe7e5c Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Thu, 6 May 2010 20:28:36 +0000 Subject: Integrated changes from unicode branch to trunk git-svn-id: http://pugixml.googlecode.com/svn/trunk@383 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugiconfig.hpp | 3 + src/pugiutf.hpp | 358 +++++++++ src/pugixml.cpp | 2060 ++++++++++++++++++++++++++++++++++++---------------- src/pugixml.hpp | 338 ++++++--- src/pugixpath.cpp | 498 +++++++------ 5 files changed, 2298 insertions(+), 959 deletions(-) create mode 100644 src/pugiutf.hpp (limited to 'src') diff --git a/src/pugiconfig.hpp b/src/pugiconfig.hpp index 517c959..a62b7f4 100644 --- a/src/pugiconfig.hpp +++ b/src/pugiconfig.hpp @@ -14,6 +14,9 @@ #ifndef HEADER_PUGICONFIG_HPP #define HEADER_PUGICONFIG_HPP +// Uncomment this to enable wchar_t mode +// #define PUGIXML_WCHAR_MODE + // Uncomment this to disable STL // #define PUGIXML_NO_STL diff --git a/src/pugiutf.hpp b/src/pugiutf.hpp new file mode 100644 index 0000000..dfca940 --- /dev/null +++ b/src/pugiutf.hpp @@ -0,0 +1,358 @@ +/** + * pugixml parser - version 0.5 + * -------------------------------------------------------- + * Copyright (C) 2006-2009, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at http://code.google.com/p/pugixml/ + * + * This library is distributed under the MIT License. See notice at the end + * of this file. 
+ * + * This work is based on the pugxml parser, which is: + * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) + */ + +#ifndef HEADER_PUGIUTF_HPP +#define HEADER_PUGIUTF_HPP + +namespace pugi +{ + namespace impl + { + typedef unsigned char char8_t; + typedef unsigned short char16_t; + typedef unsigned int char32_t; + + inline char16_t endian_swap(char16_t value) + { + return static_cast(((value & 0xff) << 8) | (value >> 8)); + } + + inline char32_t endian_swap(char32_t value) + { + return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24); + } + + struct utf8_counter + { + typedef size_t value_type; + + static value_type low(value_type result, char32_t ch) + { + // U+0000..U+007F + if (ch < 0x80) return result + 1; + // U+0080..U+07FF + else if (ch < 0x800) return result + 2; + // U+0800..U+FFFF + else return result + 3; + } + + static value_type high(value_type result, char32_t) + { + // U+10000..U+10FFFF + return result + 4; + } + }; + + struct utf8_writer + { + typedef char8_t* value_type; + + static value_type low(value_type result, char32_t ch) + { + // U+0000..U+007F + if (ch < 0x80) + { + *result = static_cast(ch); + return result + 1; + } + // U+0080..U+07FF + else if (ch < 0x800) + { + result[0] = static_cast(0xC0 | (ch >> 6)); + result[1] = static_cast(0x80 | (ch & 0x3F)); + return result + 2; + } + // U+0800..U+FFFF + else + { + result[0] = static_cast(0xE0 | (ch >> 12)); + result[1] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[2] = static_cast(0x80 | (ch & 0x3F)); + return result + 3; + } + } + + static value_type high(value_type result, char32_t ch) + { + // U+10000..U+10FFFF + result[0] = static_cast(0xF0 | (ch >> 18)); + result[1] = static_cast(0x80 | ((ch >> 12) & 0x3F)); + result[2] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[3] = static_cast(0x80 | (ch & 0x3F)); + return result + 4; + } + + static value_type any(value_type result, char32_t ch) + { + return (ch < 0x10000) ? 
low(result, ch) : high(result, ch); + } + }; + + struct utf16_counter + { + typedef size_t value_type; + + static value_type low(value_type result, char32_t) + { + return result + 1; + } + + static value_type high(value_type result, char32_t) + { + return result + 2; + } + }; + + struct utf16_writer + { + typedef char16_t* value_type; + + static value_type low(value_type result, char32_t ch) + { + *result = static_cast(ch); + + return result + 1; + } + + static value_type high(value_type result, char32_t ch) + { + char32_t msh = (char32_t)(ch - 0x10000) >> 10; + char32_t lsh = (char32_t)(ch - 0x10000) & 0x3ff; + + result[0] = static_cast(0xD800 + msh); + result[1] = static_cast(0xDC00 + lsh); + + return result + 2; + } + + static value_type any(value_type result, char32_t ch) + { + return (ch < 0x10000) ? low(result, ch) : high(result, ch); + } + }; + + struct utf32_counter + { + typedef size_t value_type; + + static value_type low(value_type result, char32_t) + { + return result + 1; + } + + static value_type high(value_type result, char32_t) + { + return result + 1; + } + }; + + struct utf32_writer + { + typedef char32_t* value_type; + + static value_type low(value_type result, char32_t ch) + { + *result = ch; + + return result + 1; + } + + static value_type high(value_type result, char32_t ch) + { + *result = ch; + + return result + 1; + } + + static value_type any(value_type result, char32_t ch) + { + *result = ch; + + return result + 1; + } + }; + + template struct wchar_selector; + + template <> struct wchar_selector<2> + { + typedef char16_t type; + typedef utf16_counter counter; + typedef utf16_writer writer; + }; + + template <> struct wchar_selector<4> + { + typedef char32_t type; + typedef utf32_counter counter; + typedef utf32_writer writer; + }; + + typedef wchar_selector::counter wchar_counter; + typedef wchar_selector::writer wchar_writer; + + template static inline typename Traits::value_type decode_utf8_block(const char8_t* data, size_t size, 
typename Traits::value_type result, Traits = Traits()) + { + const char8_t utf8_byte_mask = 0x3f; + + const char8_t* end = data + size; + + while (data < end) + { + char8_t lead = *data; + + // 0xxxxxxx -> U+0000..U+007F + if (lead < 0x80) + { + result = Traits::low(result, lead); + data += 1; + } + // 110xxxxx -> U+0080..U+07FF + else if ((unsigned)(lead - 0xC0) < 0x20 && data + 1 < end && (data[1] & 0xc0) == 0x80) + { + result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask)); + data += 2; + } + // 1110xxxx -> U+0800-U+FFFF + else if ((unsigned)(lead - 0xE0) < 0x10 && data + 2 < end && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80) + { + result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask)); + data += 3; + } + // 11110xxx -> U+10000..U+10FFFF + else if ((unsigned)(lead - 0xF0) < 0x08 && data + 3 < end && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80) + { + result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask)); + data += 4; + } + // 10xxxxxx or 11111xxx -> invalid + else + { + data += 1; + } + } + + return result; + } + + template static inline typename Traits::value_type decode_utf16_block(const char16_t* data, size_t size, typename Traits::value_type result, opt1, Traits = Traits()) + { + const bool swap = opt1::o1; + + const char16_t* end = data + size; + + while (data < end) + { + char16_t lead = swap ? endian_swap(*data) : *data; + + // U+0000..U+D7FF + if (lead < 0xD800) + { + result = Traits::low(result, lead); + data += 1; + } + // U+E000..U+FFFF + else if ((unsigned)(lead - 0xE000) < 0x2000) + { + result = Traits::low(result, lead); + data += 1; + } + // surrogate pair lead + else if ((unsigned)(lead - 0xD800) < 0x400 && data + 1 < end) + { + char16_t next = swap ? 
endian_swap(data[1]) : data[1]; + + if ((unsigned)(next - 0xDC00) < 0x400) + { + result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff)); + data += 2; + } + else + { + data += 1; + } + } + else + { + data += 1; + } + } + + return result; + } + + template static inline typename Traits::value_type decode_utf32_block(const char32_t* data, size_t size, typename Traits::value_type result, opt1, Traits = Traits()) + { + const bool swap = opt1::o1; + + const char32_t* end = data + size; + + while (data < end) + { + char32_t lead = swap ? endian_swap(*data) : *data; + + // U+0000..U+FFFF + if (lead < 0x10000) + { + result = Traits::low(result, lead); + data += 1; + } + // U+10000..U+10FFFF + else + { + result = Traits::high(result, lead); + data += 1; + } + } + + return result; + } + + template inline void convert_utf_endian_swap(T* result, const T* data, size_t length) + { + for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]); + } + + inline void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length) + { + for (size_t i = 0; i < length; ++i) result[i] = static_cast(endian_swap(static_cast::type>(data[i]))); + } + } +} + +#endif + +/** + * Copyright (c) 2006-2009 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 1bd6c68..d67919b 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -13,23 +13,27 @@ #include "pugixml.hpp" +#if !defined(PUGIXML_NO_XPATH) && defined(PUGIXML_NO_EXCEPTIONS) +#error No exception mode can not be used with XPath support +#endif + +#include "pugiutf.hpp" + #include #include #include #include - -// For placement new -#include - -#if !defined(PUGIXML_NO_XPATH) && defined(PUGIXML_NO_EXCEPTIONS) -#error No exception mode can not be used with XPath support -#endif +#include #ifndef PUGIXML_NO_STL # include # include +# include #endif +// For placement new +#include + #ifdef _MSC_VER # pragma warning(disable: 4127) // conditional expression is constant # pragma warning(disable: 4996) // this function or variable may be unsafe @@ -48,6 +52,7 @@ using std::memcpy; #define STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 
1 : -1] = {0}; (void)condition_failed[0]; } +// Memory allocation namespace { void* default_allocate(size_t size) @@ -64,6 +69,155 @@ namespace pugi::deallocation_function global_deallocate = default_deallocate; } +// String utilities prototypes +namespace pugi +{ + namespace impl + { + size_t strlen(const char_t* s); + void strcpy(char_t* dst, const char_t* src); + bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count); + void widen_ascii(wchar_t* dest, const char* source); + } +} + +// String utilities +namespace pugi +{ + namespace impl + { + // Get string length + size_t strlen(const char_t* s) + { + #ifdef PUGIXML_WCHAR_MODE + return wcslen(s); + #else + return ::strlen(s); + #endif + } + + // Copy one string into another + void strcpy(char_t* dst, const char_t* src) + { + #ifdef PUGIXML_WCHAR_MODE + wcscpy(dst, src); + #else + ::strcpy(dst, src); + #endif + } + + // Compare two strings + bool PUGIXML_FUNCTION strequal(const char_t* src, const char_t* dst) + { + #ifdef PUGIXML_WCHAR_MODE + return wcscmp(src, dst) == 0; + #else + return strcmp(src, dst) == 0; + #endif + } + + // Compare lhs with [rhs_begin, rhs_end) + bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count) + { + for (size_t i = 0; i < count; ++i) + if (lhs[i] != rhs[i]) + return false; + + return true; + } + + // Character set pattern match. + static bool strequalwild_cset(const char_t** src, const char_t** dst) + { + int find = 0, excl = 0, star = 0; + + if (**src == '!') + { + excl = 1; + ++(*src); + } + + while (**src != ']' || star == 1) + { + if (find == 0) + { + if (**src == '-' && *(*src-1) < *(*src+1) && *(*src+1) != ']' && star == 0) + { + if (**dst >= *(*src-1) && **dst <= *(*src+1)) + { + find = 1; + ++(*src); + } + } + else if (**src == **dst) find = 1; + } + ++(*src); + star = 0; + } + + if (excl == 1) find = (1 - find); + if (find == 1) ++(*dst); + + return find == 0; + } + + // Wildcard pattern match. 
+ static bool strequalwild_astr(const char_t** src, const char_t** dst) + { + int find = 1; + ++(*src); + while ((**dst != 0 && **src == '?') || **src == '*') + { + if(**src == '?') ++(*dst); + ++(*src); + } + while (**src == '*') ++(*src); + if (**dst == 0 && **src != 0) return 0; + if (**dst == 0 && **src == 0) return 1; + else + { + if (!impl::strequalwild(*src,*dst)) + { + do + { + ++(*dst); + while(**src != **dst && **src != '[' && **dst != 0) + ++(*dst); + } + while ((**dst != 0) ? !impl::strequalwild(*src,*dst) : 0 != (find=0)); + } + if (**dst == 0 && **src == 0) find = 1; + return find == 0; + } + } + + // Compare two strings, with globbing, and character sets. + bool PUGIXML_FUNCTION strequalwild(const char_t* src, const char_t* dst) + { + int find = 1; + for(; *src != 0 && find == 1 && *dst != 0; ++src) + { + switch (*src) + { + case '?': ++dst; break; + case '[': ++src; find = !strequalwild_cset(&src,&dst); break; + case '*': find = !strequalwild_astr(&src,&dst); --src; break; + default : find = (int) (*src == *dst); ++dst; + } + } + while (*src == '*' && find == 1) ++src; + return (find == 1 && *dst == 0 && *src == 0); + } + + // Convert string to wide string, assuming all symbols are ASCII + void widen_ascii(wchar_t* dest, const char* source) + { + for (const char* i = source; *i; ++i) *dest++ = *i; + *dest = 0; + } + } +} + namespace pugi { struct xml_document_struct; @@ -131,8 +285,8 @@ namespace pugi unsigned int name_allocated : 1; unsigned int value_allocated : 1; - char* name; ///< Pointer to attribute name. - char* value; ///< Pointer to attribute value. + char_t* name; ///< Pointer to attribute name. + char_t* value; ///< Pointer to attribute value. xml_attribute_struct* prev_attribute; ///< Previous attribute xml_attribute_struct* next_attribute; ///< Next attribute @@ -208,8 +362,8 @@ namespace pugi xml_node_struct* parent; ///< Pointer to parent - char* name; ///< Pointer to element name. 
- char* value; ///< Pointer to any associated string data. + char_t* name; ///< Pointer to element name. + char_t* value; ///< Pointer to any associated string data. xml_node_struct* first_child; ///< First child xml_node_struct* last_child; ///< Last child @@ -228,7 +382,7 @@ namespace pugi } xml_allocator allocator; - const char* buffer; + const char_t* buffer; }; xml_document_struct* xml_allocator::allocate_document() @@ -251,12 +405,7 @@ namespace { using namespace pugi; - const unsigned char UTF8_BYTE_MASK = 0xBF; - const unsigned char UTF8_BYTE_MARK = 0x80; - const unsigned char UTF8_BYTE_MASK_READ = 0x3F; - const unsigned char UTF8_FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - - enum chartype + enum chartype_t { ct_parse_pcdata = 1, // \0, &, \r, < ct_parse_attr = 2, // \0, &, \r, ', " @@ -288,182 +437,394 @@ namespace 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192 }; - - inline bool is_chartype(char c, chartype ct) + + inline bool is_chartype(char_t c, chartype_t ct) { + #ifdef PUGIXML_WCHAR_MODE + unsigned int ch = static_cast(c); + + return !!((ch < 128 ? 
chartype_table[ch] : chartype_table[128]) & ct); + #else return !!(chartype_table[static_cast(c)] & ct); + #endif + } + + enum output_chartype_t + { + oct_special_pcdata = 1, // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, > + oct_special_attr = 2 // Any symbol >= 0 and < 32 (except \t), &, <, >, " + }; + + const unsigned char output_chartype_table[256] = + { + 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 2, 3, 3, // 0-15 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 16-31 + 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 32-47 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, // 48-63 + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 64-128 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 128+ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + + inline bool is_output_chartype(char_t c, output_chartype_t ct) + { + #ifdef PUGIXML_WCHAR_MODE + unsigned int ch = static_cast(c); + + return !!((ch < 128 ? 
output_chartype_table[ch] : output_chartype_table[128]) & ct); + #else + return !!(output_chartype_table[static_cast(c)] & ct); + #endif + } + + template struct opt1_to_type + { + static const bool o1; + }; + + template const bool opt1_to_type<_1>::o1 = _1; + + template struct opt2_to_type + { + static const bool o1; + static const bool o2; + }; + + template const bool opt2_to_type<_1, _2>::o1 = _1; + template const bool opt2_to_type<_1, _2>::o2 = _2; + + template struct opt4_to_type + { + static const bool o1; + static const bool o2; + static const bool o3; + static const bool o4; + }; + + template const bool opt4_to_type<_1, _2, _3, _4>::o1 = _1; + template const bool opt4_to_type<_1, _2, _3, _4>::o2 = _2; + template const bool opt4_to_type<_1, _2, _3, _4>::o3 = _3; + template const bool opt4_to_type<_1, _2, _3, _4>::o4 = _4; + + bool is_little_endian() + { + unsigned int ui = 1; + + return *reinterpret_cast(&ui) == 1; + } + + encoding_t get_wchar_encoding() + { + STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); + + if (sizeof(wchar_t) == 2) + return is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + else + return is_little_endian() ? encoding_utf32_le : encoding_utf32_be; } - bool strcpy_insitu(char*& dest, bool& allocated, const char* source) + encoding_t get_buffer_encoding(encoding_t encoding, const void* contents, size_t size) { - size_t source_size = strlen(source); + // replace wchar encoding with utf implementation + if (encoding == encoding_wchar) return get_wchar_encoding(); - if (dest && strlen(dest) >= source_size) + // replace utf16 encoding with utf16 with specific endianness + if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + // replace utf32 encoding with utf32 with specific endianness + if (encoding == encoding_utf32) return is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; + + // only do autodetection if no explicit encoding is requested + if (encoding != encoding_auto) return encoding; + + // try to guess encoding (based on XML specification, Appendix F.1) + const impl::char8_t* data = static_cast(contents); + + // look for BOM in first few bytes + if (size > 4 && data[0] == 0 && data[1] == 0 && data[2] == 0xfe && data[3] == 0xff) return encoding_utf32_be; + if (size > 4 && data[0] == 0xff && data[1] == 0xfe && data[2] == 0 && data[3] == 0) return encoding_utf32_le; + if (size > 2 && data[0] == 0xfe && data[1] == 0xff) return encoding_utf16_be; + if (size > 2 && data[0] == 0xff && data[1] == 0xfe) return encoding_utf16_le; + if (size > 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf) return encoding_utf8; + + // look for <, 4 && data[0] == 0 && data[1] == 0 && data[2] == 0 && data[3] == 0x3c) return encoding_utf32_be; + if (size > 4 && data[0] == 0x3c && data[1] == 0 && data[2] == 0 && data[3] == 0) return encoding_utf32_le; + if (size > 4 && data[0] == 0 && data[1] == 0x3c && data[2] == 0 && data[3] == 0x3f) return encoding_utf16_be; + if (size > 4 && data[0] == 0x3c && data[1] == 0 && data[2] == 0x3f && data[3] == 0) return encoding_utf16_le; + if (size > 4 && data[0] == 0x3c && data[1] == 0x3f && data[2] == 0x78 && data[3] == 0x6d) return encoding_utf8; + + // look for utf16 < followed by node name (this may fail, but is better than utf8 since it's zero terminated so early) + if (size > 2 && data[0] == 0 && data[1] == 0x3c) return encoding_utf16_be; + if (size > 2 && data[0] == 0x3c && data[1] == 0) return encoding_utf16_le; + + // no known BOM detected, assume utf8 + return encoding_utf8; + } + + bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) + { + if (is_mutable) { - strcpy(dest, source); - - return true; + out_buffer = static_cast(const_cast(contents)); } else { - char* buf = 
static_cast(global_allocate(source_size + 1)); - if (!buf) return false; - - strcpy(buf, source); + void* buffer = global_allocate(size > 0 ? size : 1); + if (!buffer) return false; - if (allocated) global_deallocate(dest); - - dest = buf; - allocated = true; + memcpy(buffer, contents, size); - return true; + out_buffer = static_cast(buffer); } + + out_length = size / sizeof(char_t); + + return true; } - // Get the size that is needed for strutf16_utf8 applied to all s characters - size_t strutf16_utf8_size(const wchar_t* s) +#ifdef PUGIXML_WCHAR_MODE + inline bool need_endian_swap_utf(encoding_t le, encoding_t re) { - size_t length = 0; + return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) || + (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be); + } - for (; *s; ++s) - { - unsigned int ch = *s; + bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) + { + const char_t* data = static_cast(contents); + + out_buffer = is_mutable ? const_cast(data) : static_cast(global_allocate(size > 0 ? 
size : 1)); + out_length = size / sizeof(char_t); - if (ch < 0x80) length += 1; - else if (ch < 0x800) length += 2; - else if (ch < 0x10000) length += 3; - else if (ch < 0x200000) length += 4; - } + if (!out_buffer) return false; - return length; + impl::convert_wchar_endian_swap(out_buffer, data, out_length); + + return true; } - // Write utf16 char to stream, return position after the last written char - // \return position after last char - char* strutf16_utf8(char* s, unsigned int ch) + bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size) { - unsigned int length; + const impl::char8_t* data = static_cast(contents); - if (ch < 0x80) length = 1; - else if (ch < 0x800) length = 2; - else if (ch < 0x10000) length = 3; - else if (ch < 0x200000) length = 4; - else return s; - - s += length; + // first pass: get length in wchar_t units + out_length = impl::decode_utf8_block(data, size, 0); - // Scary scary fall throughs. - switch (length) - { - case 4: - *--s = (char)((ch | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); - ch >>= 6; - case 3: - *--s = (char)((ch | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); - ch >>= 6; - case 2: - *--s = (char)((ch | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); - ch >>= 6; - case 1: - *--s = (char)(ch | UTF8_FIRST_BYTE_MARK[length]); - } - - return s + length; + // allocate buffer of suitable length + out_buffer = static_cast(global_allocate((out_length > 0 ? 
out_length : 1) * sizeof(char_t))); + if (!out_buffer) return false; + + // second pass: convert utf8 input to wchar_t + impl::wchar_writer::value_type out_begin = reinterpret_cast(out_buffer); + impl::wchar_writer::value_type out_end = impl::decode_utf8_block(data, size, out_begin); + + assert(out_end == out_begin + out_length); + (void)!out_end; + + return true; } - // Get the size that is needed for strutf8_utf16 applied to all s characters - size_t strutf8_utf16_size(const char* s) + template bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt1) { - size_t length = 0; + const impl::char16_t* data = static_cast(contents); + size_t length = size / sizeof(impl::char16_t); - for (; *s; ++s) - { - unsigned char ch = static_cast(*s); + // first pass: get length in wchar_t units + out_length = impl::decode_utf16_block(data, length, 0, opt1()); - if (ch < 0x80 || (ch >= 0xC0 && ch < 0xFC)) ++length; - } + // allocate buffer of suitable length + out_buffer = static_cast(global_allocate((out_length > 0 ? out_length : 1) * sizeof(char_t))); + if (!out_buffer) return false; - return length; + // second pass: convert utf16 input to wchar_t + impl::wchar_writer::value_type out_begin = reinterpret_cast(out_buffer); + impl::wchar_writer::value_type out_end = impl::decode_utf16_block(data, length, out_begin, opt1()); + + assert(out_end == out_begin + out_length); + (void)!out_end; + + return true; + } + + template bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt1) + { + const impl::char32_t* data = static_cast(contents); + size_t length = size / sizeof(impl::char32_t); + + // first pass: get length in wchar_t units + out_length = impl::decode_utf32_block(data, length, 0, opt1()); + + // allocate buffer of suitable length + out_buffer = static_cast(global_allocate((out_length > 0 ? 
out_length : 1) * sizeof(char_t))); + if (!out_buffer) return false; + + // second pass: convert utf32 input to wchar_t + impl::wchar_writer::value_type out_begin = reinterpret_cast(out_buffer); + impl::wchar_writer::value_type out_end = impl::decode_utf32_block(data, length, out_begin, opt1()); + + assert(out_end == out_begin + out_length); + (void)!out_end; + + return true; } - // Read utf16 char from utf8 stream, return position after the last read char - // \return position after the last char - const char* strutf8_utf16(const char* s, unsigned int& ch) + bool convert_buffer(char_t*& out_buffer, size_t& out_length, encoding_t encoding, const void* contents, size_t size, bool is_mutable) { - unsigned int length; + // get native encoding + encoding_t wchar_encoding = get_wchar_encoding(); + + // fast path: no conversion required + if (encoding == wchar_encoding) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); - const unsigned char* str = reinterpret_cast(s); + // only endian-swapping is required + if (need_endian_swap_utf(encoding, wchar_encoding)) return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable); - if (*str < UTF8_BYTE_MARK) + // source encoding is utf8 + if (encoding == encoding_utf8) return convert_buffer_utf8(out_buffer, out_length, contents, size); + + // source encoding is utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) { - ch = *str; - return s + 1; + encoding_t native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + return (native_encoding == encoding) ? + convert_buffer_utf16(out_buffer, out_length, contents, size, opt1_to_type()) : + convert_buffer_utf16(out_buffer, out_length, contents, size, opt1_to_type()); } - else if (*str < 0xC0) + + // source encoding is utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { - ch = ' '; - return s + 1; + encoding_t native_encoding = is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; + + return (native_encoding == encoding) ? + convert_buffer_utf32(out_buffer, out_length, contents, size, opt1_to_type()) : + convert_buffer_utf32(out_buffer, out_length, contents, size, opt1_to_type()); } - else if (*str < 0xE0) length = 2; - else if (*str < 0xF0) length = 3; - else if (*str < 0xF8) length = 4; - else + + // invalid encoding combination (this can't happen) + assert(false); + + return false; + } +#else + template bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt1) + { + const impl::char16_t* data = static_cast(contents); + size_t length = size / sizeof(impl::char16_t); + + // first pass: get length in utf8 units + out_length = impl::decode_utf16_block(data, length, 0, opt1()); + + // allocate buffer of suitable length + out_buffer = static_cast(global_allocate((out_length > 0 ? out_length : 1) * sizeof(char_t))); + if (!out_buffer) return false; + + // second pass: convert utf16 input to utf8 + impl::char8_t* out_begin = reinterpret_cast(out_buffer); + impl::char8_t* out_end = impl::decode_utf16_block(data, length, out_begin, opt1()); + + assert(out_end == out_begin + out_length); + (void)!out_end; + + return true; + } + + template bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt1) + { + const impl::char32_t* data = static_cast(contents); + size_t length = size / sizeof(impl::char32_t); + + // first pass: get length in utf8 units + out_length = impl::decode_utf32_block(data, length, 0, opt1()); + + // allocate buffer of suitable length + out_buffer = static_cast(global_allocate((out_length > 0 ? 
out_length : 1) * sizeof(char_t))); + if (!out_buffer) return false; + + // second pass: convert utf32 input to utf8 + impl::char8_t* out_begin = reinterpret_cast(out_buffer); + impl::char8_t* out_end = impl::decode_utf32_block(data, length, out_begin, opt1()); + + assert(out_end == out_begin + out_length); + (void)!out_end; + + return true; + } + + bool convert_buffer(char_t*& out_buffer, size_t& out_length, encoding_t encoding, const void* contents, size_t size, bool is_mutable) + { + // fast path: no conversion required + if (encoding == encoding_utf8) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); + + // source encoding is utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) { - ch = ' '; - return s + 1; + encoding_t native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + return (native_encoding == encoding) ? + convert_buffer_utf16(out_buffer, out_length, contents, size, opt1_to_type()) : + convert_buffer_utf16(out_buffer, out_length, contents, size, opt1_to_type()); } - ch = (*str++ & ~UTF8_FIRST_BYTE_MARK[length]); - - // Scary scary fall throughs. - switch (length) + // source encoding is utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { - case 4: - ch <<= 6; - ch += (*str++ & UTF8_BYTE_MASK_READ); - case 3: - ch <<= 6; - ch += (*str++ & UTF8_BYTE_MASK_READ); - case 2: - ch <<= 6; - ch += (*str++ & UTF8_BYTE_MASK_READ); + encoding_t native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + return (native_encoding == encoding) ? 
+ convert_buffer_utf32(out_buffer, out_length, contents, size, opt1_to_type()) : + convert_buffer_utf32(out_buffer, out_length, contents, size, opt1_to_type()); } - - return reinterpret_cast(str); - } - template struct opt1_to_type - { - static const bool o1; - }; + // invalid encoding combination (this can't happen) + assert(false); - template const bool opt1_to_type<_1>::o1 = _1; + return false; + } +#endif - template struct opt2_to_type + bool strcpy_insitu(char_t*& dest, bool& allocated, const char_t* source) { - static const bool o1; - static const bool o2; - }; + size_t source_length = impl::strlen(source); - template const bool opt2_to_type<_1, _2>::o1 = _1; - template const bool opt2_to_type<_1, _2>::o2 = _2; + if (dest && impl::strlen(dest) >= source_length) + { + impl::strcpy(dest, source); + + return true; + } + else + { + char_t* buf = static_cast(global_allocate((source_length + 1) * sizeof(char_t))); + if (!buf) return false; - template struct opt4_to_type - { - static const bool o1; - static const bool o2; - static const bool o3; - static const bool o4; - }; + impl::strcpy(buf, source); - template const bool opt4_to_type<_1, _2, _3, _4>::o1 = _1; - template const bool opt4_to_type<_1, _2, _3, _4>::o2 = _2; - template const bool opt4_to_type<_1, _2, _3, _4>::o3 = _3; - template const bool opt4_to_type<_1, _2, _3, _4>::o4 = _4; + if (allocated) global_deallocate(dest); + + dest = buf; + allocated = true; + + return true; + } + } struct gap { - char* end; + char_t* end; size_t size; gap(): end(0), size(0) @@ -472,12 +833,12 @@ namespace // Push new gap, move s count bytes further (skipping the gap). // Collapse previous gap. - void push(char*& s, size_t count) + void push(char_t*& s, size_t count) { if (end) // there was a gap already; collapse it { // Move [old_gap_end, new_gap_start) to [old_gap_start, ...) 
- memmove(end - size, end, s - end); + memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); } s += count; // end of current gap @@ -488,12 +849,12 @@ namespace } // Collapse all gaps, return past-the-end pointer - char* flush(char* s) + char_t* flush(char_t* s) { if (end) { // Move [old_gap_end, current_pos) to [old_gap_start, ...) - memmove(end - size, end, s - end); + memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); return s - size; } @@ -501,9 +862,9 @@ namespace } }; - char* strconv_escape(char* s, gap& g) + char_t* strconv_escape(char_t* s, gap& g) { - char* stre = s + 1; + char_t* stre = s + 1; switch (*stre) { @@ -545,7 +906,11 @@ namespace ++stre; } - s = strutf16_utf8(s, ucsc); + #ifdef PUGIXML_WCHAR_MODE + s = reinterpret_cast(impl::wchar_writer::any(reinterpret_cast(s), ucsc)); + #else + s = reinterpret_cast(impl::utf8_writer::any(reinterpret_cast(s), ucsc)); + #endif g.push(s, stre - s); return stre; @@ -619,7 +984,10 @@ namespace return stre; } - char* strconv_comment(char* s) + // Utility macro for last character handling + #define ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e))) + + char_t* strconv_comment(char_t* s, char_t endch) { if (!*s) return 0; @@ -635,11 +1003,11 @@ namespace if (*s == '\n') g.push(s, 1); } - else if (*s == '-' && *(s+1) == '-' && *(s+2) == '>') // comment ends here + else if (s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>')) // comment ends here { *g.flush(s) = 0; - return s + 3; + return s + (s[2] == '>' ? 
3 : 2); } else if (*s == 0) { @@ -649,7 +1017,7 @@ namespace } } - char* strconv_cdata(char* s) + char_t* strconv_cdata(char_t* s, char_t endch) { if (!*s) return 0; @@ -665,7 +1033,7 @@ namespace if (*s == '\n') g.push(s, 1); } - else if (*s == ']' && *(s+1) == ']' && *(s+2) == '>') // CDATA ends here + else if (s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>')) // CDATA ends here { *g.flush(s) = 0; @@ -678,170 +1046,176 @@ namespace else ++s; } } + + typedef char_t* (*strconv_pcdata_t)(char_t*); - template char* strconv_pcdata_t(char* s, opt2) + template struct strconv_pcdata_impl { - assert(*s); - - const bool opt_eol = opt2::o1; - const bool opt_escape = opt2::o2; - - gap g; - - while (true) + static char_t* parse(char_t* s) { - while (!is_chartype(*s, ct_parse_pcdata)) ++s; - - if (*s == '<') // PCDATA ends here - { - *g.flush(s) = 0; - - return s + 1; - } - else if (opt_eol && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair - { - *s++ = '\n'; // replace first one with 0x0a - - if (*s == '\n') g.push(s, 1); - } - else if (opt_escape && *s == '&') - { - s = strconv_escape(s, g); - } - else if (*s == 0) + const bool opt_eol = opt2::o1; + const bool opt_escape = opt2::o2; + + gap g; + + while (true) { - return s; + while (!is_chartype(*s, ct_parse_pcdata)) ++s; + + if (*s == '<') // PCDATA ends here + { + *g.flush(s) = 0; + + return s + 1; + } + else if (opt_eol && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair + { + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') g.push(s, 1); + } + else if (opt_escape && *s == '&') + { + s = strconv_escape(s, g); + } + else if (*s == 0) + { + return s; + } + else ++s; } - else ++s; } - } - - char* strconv_pcdata(char* s, unsigned int optmask) + }; + + strconv_pcdata_t get_strconv_pcdata(unsigned int optmask) { STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20); switch ((optmask >> 4) & 3) // get bitmask for flags (eol escapes) { - case 0: return strconv_pcdata_t(s, opt2_to_type<0, 0>()); - 
case 1: return strconv_pcdata_t(s, opt2_to_type<0, 1>()); - case 2: return strconv_pcdata_t(s, opt2_to_type<1, 0>()); - case 3: return strconv_pcdata_t(s, opt2_to_type<1, 1>()); + case 0: return strconv_pcdata_impl >::parse; + case 1: return strconv_pcdata_impl >::parse; + case 2: return strconv_pcdata_impl >::parse; + case 3: return strconv_pcdata_impl >::parse; default: return 0; // should not get here } } - template char* strconv_attribute_t(char* s, char end_quote, opt4) + typedef char_t* (*strconv_attribute_t)(char_t*, char_t); + + template struct strconv_attribute_impl { - const bool opt_wconv = opt4::o1; - const bool opt_wnorm = opt4::o2; - const bool opt_eol = opt4::o3; - const bool opt_escape = opt4::o4; - - if (!*s) return 0; - - gap g; - - // trim leading whitespaces - if (opt_wnorm && is_chartype(*s, ct_space)) + static char_t* parse(char_t* s, char_t end_quote) { - char* str = s; - - do ++str; - while (is_chartype(*str, ct_space)); - - g.push(s, str - s); - } + const bool opt_wconv = opt4::o1; + const bool opt_wnorm = opt4::o2; + const bool opt_eol = opt4::o3; + const bool opt_escape = opt4::o4; - while (true) - { - while (!is_chartype(*s, (opt_wnorm || opt_wconv) ? ct_parse_attr_ws : ct_parse_attr)) ++s; - - if (*s == end_quote) + gap g; + + // trim leading whitespaces + if (opt_wnorm && is_chartype(*s, ct_space)) { - char* str = g.flush(s); + char_t* str = s; - if (opt_wnorm) - { - do *str-- = 0; - while (is_chartype(*str, ct_space)); - } - else *str = 0; - - return s + 1; + do ++str; + while (is_chartype(*str, ct_space)); + + g.push(s, str - s); } - else if (opt_wnorm && is_chartype(*s, ct_space)) + + while (true) { - *s++ = ' '; - - if (is_chartype(*s, ct_space)) + while (!is_chartype(*s, (opt_wnorm || opt_wconv) ? 
ct_parse_attr_ws : ct_parse_attr)) ++s; + + if (*s == end_quote) { - char* str = s + 1; - while (is_chartype(*str, ct_space)) ++str; + char_t* str = g.flush(s); - g.push(s, str - s); + if (opt_wnorm) + { + do *str-- = 0; + while (is_chartype(*str, ct_space)); + } + else *str = 0; + + return s + 1; } - } - else if (opt_wconv && is_chartype(*s, ct_space)) - { - if (opt_eol) + else if (opt_wnorm && is_chartype(*s, ct_space)) { - if (*s == '\r') + *s++ = ' '; + + if (is_chartype(*s, ct_space)) { - *s++ = ' '; - - if (*s == '\n') g.push(s, 1); + char_t* str = s + 1; + while (is_chartype(*str, ct_space)) ++str; + + g.push(s, str - s); + } + } + else if (opt_wconv && is_chartype(*s, ct_space)) + { + if (opt_eol) + { + if (*s == '\r') + { + *s++ = ' '; + + if (*s == '\n') g.push(s, 1); + } + else *s++ = ' '; } else *s++ = ' '; } - else *s++ = ' '; - } - else if (opt_eol && *s == '\r') - { - *s++ = '\n'; - - if (*s == '\n') g.push(s, 1); - } - else if (opt_escape && *s == '&') - { - s = strconv_escape(s, g); - } - else if (!*s) - { - return 0; + else if (opt_eol && *s == '\r') + { + *s++ = '\n'; + + if (*s == '\n') g.push(s, 1); + } + else if (opt_escape && *s == '&') + { + s = strconv_escape(s, g); + } + else if (!*s) + { + return 0; + } + else ++s; } - else ++s; } - } + }; - char* strconv_attribute(char* s, char end_quote, unsigned int optmask) + strconv_attribute_t get_strconv_attribute(unsigned int optmask) { STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wnorm_attribute == 0x40 && parse_wconv_attribute == 0x80); switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes) { - case 0: return strconv_attribute_t(s, end_quote, opt4_to_type<0, 0, 0, 0>()); - case 1: return strconv_attribute_t(s, end_quote, opt4_to_type<0, 0, 0, 1>()); - case 2: return strconv_attribute_t(s, end_quote, opt4_to_type<0, 0, 1, 0>()); - case 3: return strconv_attribute_t(s, end_quote, opt4_to_type<0, 0, 1, 1>()); - case 4: return strconv_attribute_t(s, 
end_quote, opt4_to_type<0, 1, 0, 0>()); - case 5: return strconv_attribute_t(s, end_quote, opt4_to_type<0, 1, 0, 1>()); - case 6: return strconv_attribute_t(s, end_quote, opt4_to_type<0, 1, 1, 0>()); - case 7: return strconv_attribute_t(s, end_quote, opt4_to_type<0, 1, 1, 1>()); - case 8: return strconv_attribute_t(s, end_quote, opt4_to_type<1, 0, 0, 0>()); - case 9: return strconv_attribute_t(s, end_quote, opt4_to_type<1, 0, 0, 1>()); - case 10: return strconv_attribute_t(s, end_quote, opt4_to_type<1, 0, 1, 0>()); - case 11: return strconv_attribute_t(s, end_quote, opt4_to_type<1, 0, 1, 1>()); - case 12: return strconv_attribute_t(s, end_quote, opt4_to_type<1, 1, 0, 0>()); - case 13: return strconv_attribute_t(s, end_quote, opt4_to_type<1, 1, 0, 1>()); - case 14: return strconv_attribute_t(s, end_quote, opt4_to_type<1, 1, 1, 0>()); - case 15: return strconv_attribute_t(s, end_quote, opt4_to_type<1, 1, 1, 1>()); + case 0: return strconv_attribute_impl >::parse; + case 1: return strconv_attribute_impl >::parse; + case 2: return strconv_attribute_impl >::parse; + case 3: return strconv_attribute_impl >::parse; + case 4: return strconv_attribute_impl >::parse; + case 5: return strconv_attribute_impl >::parse; + case 6: return strconv_attribute_impl >::parse; + case 7: return strconv_attribute_impl >::parse; + case 8: return strconv_attribute_impl >::parse; + case 9: return strconv_attribute_impl >::parse; + case 10: return strconv_attribute_impl >::parse; + case 11: return strconv_attribute_impl >::parse; + case 12: return strconv_attribute_impl >::parse; + case 13: return strconv_attribute_impl >::parse; + case 14: return strconv_attribute_impl >::parse; + case 15: return strconv_attribute_impl >::parse; default: return 0; // should not get here } } inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset, unsigned int line) { - xml_parse_result result = {status, offset, line}; + xml_parse_result result = {status, offset, line, 
encoding_auto}; return result; } @@ -866,11 +1240,11 @@ namespace { } - xml_parse_result parse_exclamation(char*& ref_s, xml_node_struct* cursor, unsigned int optmsk, char* buffer_start) + xml_parse_result parse_exclamation(char_t*& ref_s, xml_node_struct* cursor, unsigned int optmsk, char_t* buffer_start, char_t endch) { // load into registers - char* s = ref_s; - char ch = 0; + char_t* s = ref_s; + char_t ch = 0; // parse node contents, starting with exclamation mark ++s; @@ -891,20 +1265,20 @@ namespace if (OPTSET(parse_eol) && OPTSET(parse_comments)) { - s = strconv_comment(s); + s = strconv_comment(s, endch); if (!s) THROW_ERROR(status_bad_comment, cursor->value); } else { // Scan for terminating '-->'. - SCANFOR(*s == '-' && *(s+1) == '-' && *(s+2) == '>'); + SCANFOR(s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>')); CHECK_ERROR(status_bad_comment, s); if (OPTSET(parse_comments)) *s = 0; // Zero-terminate this segment at the first terminating '-'. - s += 3; // Step over the '\0->'. + s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'. } if (OPTSET(parse_comments)) @@ -928,18 +1302,17 @@ namespace if (OPTSET(parse_eol)) { - s = strconv_cdata(s); + s = strconv_cdata(s, endch); if (!s) THROW_ERROR(status_bad_cdata, cursor->value); } else { // Scan for terminating ']]>'. - SCANFOR(*s == ']' && *(s+1) == ']' && *(s+2) == '>'); + SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>')); CHECK_ERROR(status_bad_cdata, s); ENDSEG(); // Zero-terminate this segment. - CHECK_ERROR(status_bad_cdata, s); } POPNODE(); // Pop since this is a standalone. @@ -947,34 +1320,31 @@ namespace else // Flagged for discard, but we still have to scan for the terminator. { // Scan for terminating ']]>'. - SCANFOR(*s == ']' && *(s+1) == ']' && *(s+2) == '>'); + SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>')); CHECK_ERROR(status_bad_cdata, s); ++s; } - s += 2; // Step over the last ']>'. + s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'. 
} else THROW_ERROR(status_bad_cdata, s); } - else if (*s=='D' && *++s=='O' && *++s=='C' && *++s=='T' && *++s=='Y' && *++s=='P' && *++s=='E') + else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && ENDSWITH(s[6], 'E')) { - ++s; - - SKIPWS(); // Eat any whitespace. - CHECK_ERROR(status_bad_doctype, s); + if (s[6] != 'E') THROW_ERROR(status_bad_doctype, s); LOC_DOCTYPE: SCANFOR(*s == '\'' || *s == '"' || *s == '[' || *s == '>'); - CHECK_ERROR(status_bad_doctype, s); + if (*s == 0 && endch != '>') THROW_ERROR(status_bad_doctype, s); if (*s == '\'' || *s == '"') // '...SYSTEM "..." { ch = *s++; SCANFOR(*s == ch); - CHECK_ERROR(status_bad_doctype, s); + if (*s == 0 && endch != '>') THROW_ERROR(status_bad_doctype, s); - ++s; + s += (*s != 0); goto LOC_DOCTYPE; } @@ -989,13 +1359,23 @@ namespace if (bd == 0) break; ++s; } + + if (bd != 0) THROW_ERROR(status_bad_doctype, s); } SCANFOR(*s == '>'); - CHECK_ERROR(status_bad_doctype, s); - ++s; + if (*s == 0) + { + if (endch != '>') THROW_ERROR(status_bad_doctype, s); + } + else + { + ++s; + } } + else if (*s == 0 && endch == '-') THROW_ERROR(status_bad_comment, s); + else if (*s == 0 && endch == '[') THROW_ERROR(status_bad_cdata, s); else THROW_ERROR(status_unrecognized_tag, s); // store from registers @@ -1004,12 +1384,12 @@ namespace THROW_ERROR(status_ok, s); } - xml_parse_result parse_question(char*& ref_s, xml_node_struct*& ref_cursor, unsigned int optmsk, char* buffer_start) + xml_parse_result parse_question(char_t*& ref_s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t* buffer_start, char_t endch) { // load into registers - char* s = ref_s; + char_t* s = ref_s; xml_node_struct* cursor = ref_cursor; - char ch = 0; + char_t ch = 0; // parse node contents, starting with question mark ++s; @@ -1018,7 +1398,7 @@ namespace THROW_ERROR(status_bad_pi, s); else if (OPTSET(parse_pi) || OPTSET(parse_declaration)) { - char* mark = s; + char_t* mark = s; 
SCANWHILE(is_chartype(*s, ct_symbol)); // Read PI target CHECK_ERROR(status_bad_pi, s); @@ -1026,12 +1406,12 @@ namespace THROW_ERROR(status_bad_pi, s); ENDSEG(); - CHECK_ERROR(status_bad_pi, s); + if (*s == 0 && endch != '>') THROW_ERROR(status_bad_pi, s); if (ch == '?') // nothing except target present { - if (*s != '>') THROW_ERROR(status_bad_pi, s); - ++s; + if (!ENDSWITH(*s, '>')) THROW_ERROR(status_bad_pi, s); + s += (*s == '>'); // stricmp / strcasecmp is not portable if ((mark[0] == 'x' || mark[0] == 'X') && (mark[1] == 'm' || mark[1] == 'M') @@ -1068,7 +1448,7 @@ namespace // scan for tag end mark = s; - SCANFOR(*s == '?' && *(s+1) == '>'); // Look for '?>'. + SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>')); // Look for '?>'. CHECK_ERROR(status_bad_pi, s); // replace ending ? with / to terminate properly @@ -1095,13 +1475,12 @@ namespace mark = s; - SCANFOR(*s == '?' && *(s+1) == '>'); // Look for '?>'. + SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>')); // Look for '?>'. CHECK_ERROR(status_bad_pi, s); ENDSEG(); - CHECK_ERROR(status_bad_pi, s); - ++s; // Step over > + s += (*s == '>'); // Step over > if (OPTSET(parse_pi)) { @@ -1113,10 +1492,10 @@ namespace } else // not parsing PI { - SCANFOR(*s == '?' && *(s+1) == '>'); // Look for '?>'. + SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>')); // Look for '?>'. CHECK_ERROR(status_bad_pi, s); - s += 2; + s += (s[1] == '>' ? 
2 : 1); } // store from registers @@ -1126,19 +1505,16 @@ namespace THROW_ERROR(status_ok, s); } - xml_parse_result parse(char* s, xml_node_struct* xmldoc, unsigned int optmsk = parse_default) + xml_parse_result parse(char_t* s, xml_node_struct* xmldoc, unsigned int optmsk, char_t endch) { - if (!s || !xmldoc) return MAKE_PARSE_RESULT(status_internal_error); - - char* buffer_start = s; + strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk); + strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk); + + char_t* buffer_start = s; - // UTF-8 BOM - if ((unsigned char)*s == 0xEF && (unsigned char)*(s+1) == 0xBB && (unsigned char)*(s+2) == 0xBF) - s += 3; - - char ch = 0; + char_t ch = 0; xml_node_struct* cursor = xmldoc; - char* mark = s; + char_t* mark = s; while (*s != 0) { @@ -1154,8 +1530,6 @@ namespace cursor->name = s; SCANWHILE(is_chartype(*s, ct_symbol)); // Scan for a terminator. - CHECK_ERROR(status_bad_start_element, s); - ENDSEG(); // Save char in 'ch', terminate & step over. if (ch == '>') @@ -1199,7 +1573,7 @@ namespace ++s; // Step over the quote. a->value = s; // Save the offset. - s = strconv_attribute(s, ch, optmsk); + s = strconv_attribute(s, ch); if (!s) THROW_ERROR(status_bad_attribute, a->value); @@ -1215,14 +1589,19 @@ namespace else if (*s == '/') { ++s; - - if (*s != '>') THROW_ERROR(status_bad_start_element, s); - - POPNODE(); // Pop. - - ++s; - - break; + + if (*s == '>') + { + POPNODE(); + s++; + break; + } + else if (*s == 0 && endch == '>') + { + POPNODE(); + break; + } + else THROW_ERROR(status_bad_start_element, s); } else if (*s == '>') { @@ -1230,6 +1609,10 @@ namespace break; } + else if (*s == 0 && endch == '>') + { + break; + } else THROW_ERROR(status_bad_start_element, s); } @@ -1237,11 +1620,18 @@ namespace } else if (ch == '/') // '<#.../' { - if (*s != '>') THROW_ERROR(status_bad_start_element, s); + if (!ENDSWITH(*s, '>')) THROW_ERROR(status_bad_start_element, s); POPNODE(); // Pop. 
- ++s; + s += (*s == '>'); + } + else if (ch == 0) + { + // we stepped over null terminator, backtrack & handle closing tag + --s; + + if (endch != '>') THROW_ERROR(status_bad_start_element, s); } else THROW_ERROR(status_bad_start_element, s); } @@ -1249,9 +1639,7 @@ namespace { ++s; - if (!cursor) THROW_ERROR(status_bad_end_element, s); - - char* name = cursor->name; + char_t* name = cursor->name; if (!name) THROW_ERROR(status_end_element_mismatch, s); while (is_chartype(*s, ct_symbol)) @@ -1259,19 +1647,29 @@ namespace if (*s++ != *name++) THROW_ERROR(status_end_element_mismatch, s); } - if (*name) THROW_ERROR(status_end_element_mismatch, s); + if (*name) + { + if (*s == 0 && name[0] == endch && name[1] == 0) THROW_ERROR(status_bad_end_element, s); + else THROW_ERROR(status_end_element_mismatch, s); + } POPNODE(); // Pop. SKIPWS(); - CHECK_ERROR(status_bad_end_element, s); - if (*s != '>') THROW_ERROR(status_bad_end_element, s); - ++s; + if (*s == 0) + { + if (endch != '>') THROW_ERROR(status_bad_end_element, s); + } + else + { + if (*s != '>') THROW_ERROR(status_bad_end_element, s); + ++s; + } } else if (*s == '?') // 'value = s; // Save the offset. - s = strconv_pcdata(s, optmsk); - - if (!s) THROW_ERROR(status_bad_pcdata, cursor->value); + s = strconv_pcdata(s); POPNODE(); // Pop since this is a standalone. 
@@ -1324,101 +1721,208 @@ namespace } } + // check that last tag is closed if (cursor != xmldoc) THROW_ERROR(status_end_element_mismatch, s); THROW_ERROR(status_ok, s); } + + static xml_parse_result parse(char_t* buffer, size_t length, xml_node_struct* xmldoc, unsigned int optmsk) + { + // store buffer for offset_debug + static_cast(xmldoc)->buffer = buffer; + + // early-out for empty documents + if (length == 0) return MAKE_PARSE_RESULT(status_ok); + + // create parser on stack + xml_allocator& alloc = static_cast(xmldoc)->allocator; + + xml_parser parser(alloc); + + // save last character and make buffer zero-terminated (speeds up parsing) + char_t endch = buffer[length - 1]; + buffer[length - 1] = 0; + + // perform actual parsing + xml_parse_result result = parser.parse(buffer, xmldoc, optmsk, endch); + + // since we removed last character, we have to handle the only possible false positive + if (result && endch == '<') + { + char_t* buffer_start = buffer; + + // there's no possible well-formed document with < at the end + THROW_ERROR(status_unrecognized_tag, buffer_start + length); + } + + return result; + } private: xml_parser(const xml_parser&); const xml_parser& operator=(const xml_parser&); }; - // Compare lhs with [rhs_begin, rhs_end) - int strcmprange(const char* lhs, const char* rhs_begin, const char* rhs_end) - { - while (*lhs && rhs_begin != rhs_end && *lhs == *rhs_begin) + // Output facilities + encoding_t get_write_native_encoding() + { + #ifdef PUGIXML_WCHAR_MODE + return get_wchar_encoding(); + #else + return encoding_utf8; + #endif + } + + encoding_t get_write_encoding(encoding_t encoding) + { + // replace wchar encoding with utf implementation + if (encoding == encoding_wchar) return get_wchar_encoding(); + + // replace utf16 encoding with utf16 with specific endianness + if (encoding == encoding_utf16) return is_little_endian() ? 
encoding_utf16_le : encoding_utf16_be; + + // replace utf32 encoding with utf32 with specific endianness + if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + // only do autodetection if no explicit encoding is requested + if (encoding != encoding_auto) return encoding; + + // assume utf8 encoding + return encoding_utf8; + } + +#ifdef PUGIXML_WCHAR_MODE + size_t get_valid_length(const char_t* data, size_t length) + { + assert(length > 0); + + // discard last character if it's the lead of a surrogate pair + return (sizeof(wchar_t) == 2 && (unsigned)(static_cast(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length; + } + + size_t convert_buffer(char* result, const char_t* data, size_t length, encoding_t encoding) + { + // only endian-swapping is required + if (need_endian_swap_utf(encoding, get_wchar_encoding())) + { + impl::convert_wchar_endian_swap(reinterpret_cast(result), data, length); + + return length * sizeof(char_t); + } + + // convert to utf8 + if (encoding == encoding_utf8) + { + impl::char8_t* dest = reinterpret_cast(result); + + impl::char8_t* end = sizeof(wchar_t) == 2 ? + impl::decode_utf16_block(reinterpret_cast(data), length, dest, opt1_to_type()) : + impl::decode_utf32_block(reinterpret_cast(data), length, dest, opt1_to_type()); + + return static_cast(end - dest); + } + + // convert to utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) + { + impl::char16_t* dest = reinterpret_cast(result); + + // convert to native utf16 + impl::char16_t* end = impl::decode_utf32_block(reinterpret_cast(data), length, dest, opt1_to_type()); + + // swap if necessary + encoding_t native_encoding = is_little_endian() ? 
encoding_utf16_le : encoding_utf16_be; + + if (native_encoding != encoding) impl::convert_utf_endian_swap(dest, dest, static_cast(end - dest)); + + return static_cast(end - dest) * sizeof(impl::char16_t); + } + + // convert to utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { - ++lhs; - ++rhs_begin; + impl::char32_t* dest = reinterpret_cast(result); + + // convert to native utf32 + impl::char32_t* end = impl::decode_utf16_block(reinterpret_cast(data), length, dest, opt1_to_type()); + + // swap if necessary + encoding_t native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + if (native_encoding != encoding) impl::convert_utf_endian_swap(dest, dest, static_cast(end - dest)); + + return static_cast(end - dest) * sizeof(impl::char32_t); } - - if (rhs_begin == rhs_end && *lhs == 0) return 0; - else return 1; + + // invalid encoding combination (this can't happen) + assert(false); + + return 0; } - - // Character set pattern match. - int strcmpwild_cset(const char** src, const char** dst) +#else + size_t get_valid_length(const char_t* data, size_t length) { - int find = 0, excl = 0, star = 0; - - if (**src == '!') - { - excl = 1; - ++(*src); - } - - while (**src != ']' || star == 1) + assert(length > 4); + + for (size_t i = 1; i <= 4; ++i) { - if (find == 0) - { - if (**src == '-' && *(*src-1) < *(*src+1) && *(*src+1) != ']' && star == 0) - { - if (**dst >= *(*src-1) && **dst <= *(*src+1)) - { - find = 1; - ++(*src); - } - } - else if (**src == **dst) find = 1; - } - ++(*src); - star = 0; + impl::char8_t ch = static_cast(data[length - i]); + + // either a standalone character or a leading one + if ((ch & 0xc0) != 0x80) return length - i; } - if (excl == 1) find = (1 - find); - if (find == 1) ++(*dst); - - return find; + // there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk + return length; } - // Wildcard pattern match. 
- int strcmpwild_astr(const char** src, const char** dst) + size_t convert_buffer(char* result, const char_t* data, size_t length, encoding_t encoding) { - int find = 1; - ++(*src); - while ((**dst != 0 && **src == '?') || **src == '*') + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) { - if(**src == '?') ++(*dst); - ++(*src); + impl::char16_t* dest = reinterpret_cast(result); + + // convert to native utf16 + impl::char16_t* end = impl::decode_utf8_block(reinterpret_cast(data), length, dest); + + // swap if necessary + encoding_t native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + if (native_encoding != encoding) impl::convert_utf_endian_swap(dest, dest, static_cast(end - dest)); + + return static_cast(end - dest) * sizeof(impl::char16_t); } - while (**src == '*') ++(*src); - if (**dst == 0 && **src != 0) return 0; - if (**dst == 0 && **src == 0) return 1; - else + + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { - if (impl::strcmpwild(*src,*dst)) - { - do - { - ++(*dst); - while(**src != **dst && **src != '[' && **dst != 0) - ++(*dst); - } - while ((**dst != 0) ? impl::strcmpwild(*src,*dst) : 0 != (find=0)); - } - if (**dst == 0 && **src == 0) find = 1; - return find; + impl::char32_t* dest = reinterpret_cast(result); + + // convert to native utf32 + impl::char32_t* end = impl::decode_utf8_block(reinterpret_cast(data), length, dest); + + // swap if necessary + encoding_t native_encoding = is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; + + if (native_encoding != encoding) impl::convert_utf_endian_swap(dest, dest, static_cast(end - dest)); + + return static_cast(end - dest) * sizeof(impl::char32_t); } + + // invalid encoding combination (this can't happen) + assert(false); + + return 0; } +#endif - // Output facilities - struct xml_buffered_writer + class xml_buffered_writer { xml_buffered_writer(const xml_buffered_writer&); xml_buffered_writer& operator=(const xml_buffered_writer&); - xml_buffered_writer(xml_writer& writer): writer(writer), bufsize(0) + public: + xml_buffered_writer(xml_writer& writer, encoding_t user_encoding): writer(writer), bufsize(0), encoding(get_write_encoding(user_encoding)) { } @@ -1429,56 +1933,188 @@ namespace void flush() { - if (bufsize > 0) writer.write(buffer, bufsize); + flush(buffer, bufsize); bufsize = 0; } - void write(const void* data, size_t size) + void flush(const char_t* data, size_t size) { - if (bufsize + size > sizeof(buffer)) + if (size == 0) return; + + // fast path, just write data + if (encoding == get_write_native_encoding()) + writer.write(data, size * sizeof(char_t)); + else + { + // convert chunk + size_t result = convert_buffer(scratch, data, size, encoding); + assert(result <= sizeof(scratch)); + + // write data + writer.write(scratch, result); + } + } + + void write(const char_t* data, size_t length) + { + if (bufsize + length > bufcapacity) { + // flush the remaining buffer contents flush(); - if (size > sizeof(buffer)) + // handle large chunks + if (length > bufcapacity) { - writer.write(data, size); - return; + if (encoding == get_write_native_encoding()) + { + // fast path, can just write data chunk + writer.write(data, length * sizeof(char_t)); + return; + } + + // need to convert in suitable chunks + while (length > bufcapacity) + { + // get chunk size by selecting such number of characters that are guaranteed to fit into scratch buffer + // and form a complete codepoint sequence (i.e. 
discard start of last codepoint if necessary) + size_t chunk_size = get_valid_length(data, bufcapacity); + + // convert chunk and write + flush(data, chunk_size); + + // iterate + data += chunk_size; + length -= chunk_size; + } + + // small tail is copied below + bufsize = 0; } } - memcpy(buffer + bufsize, data, size); - bufsize += size; + memcpy(buffer + bufsize, data, length * sizeof(char_t)); + bufsize += length; } - void write(const char* data) + void write(const char_t* data) { - write(data, strlen(data)); + write(data, impl::strlen(data)); } - void write(char data) + void write(char_t d0) { - if (bufsize + 1 > sizeof(buffer)) flush(); + if (bufsize + 1 > bufcapacity) flush(); - buffer[bufsize++] = data; + buffer[bufsize + 0] = d0; + bufsize += 1; } + void write(char_t d0, char_t d1) + { + if (bufsize + 2 > bufcapacity) flush(); + + buffer[bufsize + 0] = d0; + buffer[bufsize + 1] = d1; + bufsize += 2; + } + + void write(char_t d0, char_t d1, char_t d2) + { + if (bufsize + 3 > bufcapacity) flush(); + + buffer[bufsize + 0] = d0; + buffer[bufsize + 1] = d1; + buffer[bufsize + 2] = d2; + bufsize += 3; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3) + { + if (bufsize + 4 > bufcapacity) flush(); + + buffer[bufsize + 0] = d0; + buffer[bufsize + 1] = d1; + buffer[bufsize + 2] = d2; + buffer[bufsize + 3] = d3; + bufsize += 4; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4) + { + if (bufsize + 5 > bufcapacity) flush(); + + buffer[bufsize + 0] = d0; + buffer[bufsize + 1] = d1; + buffer[bufsize + 2] = d2; + buffer[bufsize + 3] = d3; + buffer[bufsize + 4] = d4; + bufsize += 5; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5) + { + if (bufsize + 6 > bufcapacity) flush(); + + buffer[bufsize + 0] = d0; + buffer[bufsize + 1] = d1; + buffer[bufsize + 2] = d2; + buffer[bufsize + 3] = d3; + buffer[bufsize + 4] = d4; + buffer[bufsize + 5] = d5; + bufsize += 6; + } + + // utf8 maximum expansion: x4 (-> 
utf32) + // utf16 maximum expansion: x2 (-> utf32) + // utf32 maximum expansion: x1 + enum { bufcapacity = 2048 }; + + char_t buffer[bufcapacity]; + char scratch[4 * bufcapacity]; + xml_writer& writer; - char buffer[8192]; size_t bufsize; + encoding_t encoding; }; - template void text_output_escaped(xml_buffered_writer& writer, const char* s, opt1) + void write_bom(xml_writer& writer, encoding_t encoding) { - const bool attribute = opt1::o1; + switch (encoding) + { + case encoding_utf8: + writer.write("\xef\xbb\xbf", 3); + break; + + case encoding_utf16_be: + writer.write("\xfe\xff", 2); + break; + + case encoding_utf16_le: + writer.write("\xff\xfe", 2); + break; + + case encoding_utf32_be: + writer.write("\x00\x00\xfe\xff", 4); + break; + + case encoding_utf32_le: + writer.write("\xff\xfe\x00\x00", 4); + break; + default: + // invalid encoding (this should not happen) + assert(false); + } + } + + void text_output_escaped(xml_buffered_writer& writer, const char_t* s, output_chartype_t type) + { while (*s) { - const char* prev = s; + const char_t* prev = s; // While *s is a usual symbol - while (*s && *s != '&' && *s != '<' && *s != '>' && (*s != '"' || !attribute) - && ((unsigned char)*s >= 32 || (*s == '\r' && !attribute) || (*s == '\n' && !attribute) || *s == '\t')) - ++s; + while (!is_output_chartype(*s, type)) ++s; writer.write(prev, static_cast(s - prev)); @@ -1486,43 +2122,33 @@ namespace { case 0: break; case '&': - writer.write("&"); + writer.write('&', 'a', 'm', 'p', ';'); ++s; break; case '<': - writer.write("<"); + writer.write('&', 'l', 't', ';'); ++s; break; case '>': - writer.write(">"); + writer.write('&', 'g', 't', ';'); ++s; break; case '"': - writer.write("""); - ++s; - break; - case '\r': - writer.write(" "); - ++s; - break; - case '\n': - writer.write(" "); + writer.write('&', 'q', 'u', 'o', 't', ';'); ++s; break; default: // s is not a usual symbol { - unsigned int ch = (unsigned char)*s++; - - char buf[8]; - sprintf(buf, "&#%u;", ch); + 
unsigned int ch = static_cast(*s++); + assert(ch < 32); - writer.write(buf); + writer.write('&', '#', static_cast((ch / 10) + '0'), static_cast((ch % 10) + '0'), ';'); } } } } - void node_output(xml_buffered_writer& writer, const xml_node& node, const char* indent, unsigned int flags, unsigned int depth) + void node_output(xml_buffered_writer& writer, const xml_node& node, const char_t* indent, unsigned int flags, unsigned int depth) { if ((flags & format_indent) != 0 && (flags & format_raw) == 0) for (unsigned int i = 0; i < depth; ++i) writer.write(indent); @@ -1545,10 +2171,9 @@ namespace { writer.write(' '); writer.write(a.name()); - writer.write('='); - writer.write('"'); + writer.write('=', '"'); - text_output_escaped(writer, a.value(), opt1_to_type<1>()); + text_output_escaped(writer, a.value(), oct_special_attr); writer.write('"'); } @@ -1556,7 +2181,7 @@ namespace if (flags & format_raw) { if (!node.first_child()) - writer.write(" />"); + writer.write(' ', '/', '>'); else { writer.write('>'); @@ -1564,30 +2189,26 @@ namespace for (xml_node n = node.first_child(); n; n = n.next_sibling()) node_output(writer, n, indent, flags, depth + 1); - writer.write('<'); - writer.write('/'); + writer.write('<', '/'); writer.write(node.name()); writer.write('>'); } } else if (!node.first_child()) - writer.write(" />\n"); + writer.write(' ', '/', '>', '\n'); else if (node.first_child() == node.last_child() && node.first_child().type() == node_pcdata) { writer.write('>'); - text_output_escaped(writer, node.first_child().value(), opt1_to_type<0>()); + text_output_escaped(writer, node.first_child().value(), oct_special_pcdata); - writer.write('<'); - writer.write('/'); + writer.write('<', '/'); writer.write(node.name()); - writer.write('>'); - writer.write('\n'); + writer.write('>', '\n'); } else { - writer.write('>'); - writer.write('\n'); + writer.write('>', '\n'); for (xml_node n = node.first_child(); n; n = n.next_sibling()) node_output(writer, n, indent, flags, depth + 
1); @@ -1595,65 +2216,63 @@ namespace if ((flags & format_indent) != 0 && (flags & format_raw) == 0) for (unsigned int i = 0; i < depth; ++i) writer.write(indent); - writer.write('<'); - writer.write('/'); + writer.write('<', '/'); writer.write(node.name()); - writer.write('>'); - writer.write('\n'); + writer.write('>', '\n'); } break; } case node_pcdata: - text_output_escaped(writer, node.value(), opt1_to_type<0>()); + text_output_escaped(writer, node.value(), oct_special_pcdata); if ((flags & format_raw) == 0) writer.write('\n'); break; case node_cdata: - writer.write(""); + writer.write(']', ']', '>'); if ((flags & format_raw) == 0) writer.write('\n'); break; case node_comment: - writer.write(""); + writer.write('-', '-', '>'); if ((flags & format_raw) == 0) writer.write('\n'); break; case node_pi: - writer.write(""); + writer.write('?', '>'); if ((flags & format_raw) == 0) writer.write('\n'); break; case node_declaration: { - writer.write("()); + text_output_escaped(writer, a.value(), oct_special_attr); writer.write('"'); } - writer.write("?>"); + writer.write('?', '>'); if ((flags & format_raw) == 0) writer.write('\n'); break; } @@ -1718,33 +2337,6 @@ namespace namespace pugi { - namespace impl - { - // Compare two strings - int PUGIXML_FUNCTION strcmp(const char* src, const char* dst) - { - return ::strcmp(src, dst); - } - - // Compare two strings, with globbing, and character sets. - int PUGIXML_FUNCTION strcmpwild(const char* src, const char* dst) - { - int find = 1; - for(; *src != 0 && find == 1 && *dst != 0; ++src) - { - switch (*src) - { - case '?': ++dst; break; - case '[': ++src; find = strcmpwild_cset(&src,&dst); break; - case '*': find = strcmpwild_astr(&src,&dst); --src; break; - default : find = (int) (*src == *dst); ++dst; - } - } - while (*src == '*' && find == 1) ++src; - return (find == 1 && *dst == 0 && *src == 0) ? 
0 : 1; - } - } - xml_writer_file::xml_writer_file(void* file): file(file) { } @@ -1755,13 +2347,28 @@ namespace pugi } #ifndef PUGIXML_NO_STL - xml_writer_stream::xml_writer_stream(std::ostream& stream): stream(&stream) + xml_writer_stream::xml_writer_stream(std::basic_ostream >& stream): narrow_stream(&stream), wide_stream(0) + { + } + + xml_writer_stream::xml_writer_stream(std::basic_ostream >& stream): narrow_stream(0), wide_stream(&stream) { } void xml_writer_stream::write(const void* data, size_t size) { - stream->write(reinterpret_cast(data), static_cast(size)); + if (narrow_stream) + { + assert(!wide_stream); + narrow_stream->write(reinterpret_cast(data), static_cast(size)); + } + else + { + assert(wide_stream); + assert(size % sizeof(wchar_t) == 0); + + wide_stream->write(reinterpret_cast(data), static_cast(size / sizeof(wchar_t))); + } } #endif @@ -1796,17 +2403,14 @@ namespace pugi { } -#ifdef __MWERKS__ xml_attribute::operator xml_attribute::unspecified_bool_type() const { +#ifdef __MWERKS__ return _attr ? &xml_attribute::empty : 0; - } #else - xml_attribute::operator xml_attribute::unspecified_bool_type() const - { return _attr ? &xml_attribute::_attr : 0; - } #endif + } bool xml_attribute::operator!() const { @@ -1855,29 +2459,56 @@ namespace pugi int xml_attribute::as_int() const { - return (_attr && _attr->value) ? atoi(_attr->value) : 0; + if (!_attr || !_attr->value) return 0; + + #ifdef PUGIXML_WCHAR_MODE + return (int)wcstol(_attr->value, 0, 10); + #else + return atoi(_attr->value); + #endif } unsigned int xml_attribute::as_uint() const { - int result = (_attr && _attr->value) ? atoi(_attr->value) : 0; + if (!_attr || !_attr->value) return 0; + + #ifdef PUGIXML_WCHAR_MODE + int result = (int)wcstol(_attr->value, 0, 10); + #else + int result = atoi(_attr->value); + #endif + return result < 0 ? 0 : static_cast(result); } double xml_attribute::as_double() const { - return (_attr && _attr->value) ? 
atof(_attr->value) : 0; + if (!_attr || !_attr->value) return 0; + + #ifdef PUGIXML_WCHAR_MODE + return wcstod(_attr->value, 0); + #else + return atof(_attr->value); + #endif } float xml_attribute::as_float() const { - return (_attr && _attr->value) ? (float)atof(_attr->value) : 0; + if (!_attr || !_attr->value) return 0; + + #ifdef PUGIXML_WCHAR_MODE + return (float)wcstod(_attr->value, 0); + #else + return (float)atof(_attr->value); + #endif } bool xml_attribute::as_bool() const { + if (!_attr || !_attr->value) return false; + // only look at first char - char first = (_attr && _attr->value) ? *_attr->value : '\0'; + char_t first = *_attr->value; // 1*, t* (true), T* (True), y* (yes), Y* (YES) return (first == '1' || first == 't' || first == 'T' || first == 'y' || first == 'Y'); @@ -1888,14 +2519,14 @@ namespace pugi return !_attr; } - const char* xml_attribute::name() const + const char_t* xml_attribute::name() const { - return (_attr && _attr->name) ? _attr->name : ""; + return (_attr && _attr->name) ? _attr->name : PUGIXML_TEXT(""); } - const char* xml_attribute::value() const + const char_t* xml_attribute::value() const { - return (_attr && _attr->value) ? _attr->value : ""; + return (_attr && _attr->value) ? _attr->value : PUGIXML_TEXT(""); } unsigned int xml_attribute::document_order() const @@ -1903,7 +2534,7 @@ namespace pugi return _attr ? 
_attr->document_order : 0; } - xml_attribute& xml_attribute::operator=(const char* rhs) + xml_attribute& xml_attribute::operator=(const char_t* rhs) { set_value(rhs); return *this; @@ -1933,7 +2564,7 @@ namespace pugi return *this; } - bool xml_attribute::set_name(const char* rhs) + bool xml_attribute::set_name(const char_t* rhs) { if (!_attr) return false; @@ -1944,7 +2575,7 @@ namespace pugi return res; } - bool xml_attribute::set_value(const char* rhs) + bool xml_attribute::set_value(const char_t* rhs) { if (!_attr) return false; @@ -1959,26 +2590,50 @@ namespace pugi { char buf[128]; sprintf(buf, "%d", rhs); + + #ifdef PUGIXML_WCHAR_MODE + char_t wbuf[128]; + impl::widen_ascii(wbuf, buf); + + return set_value(wbuf); + #else return set_value(buf); + #endif } bool xml_attribute::set_value(unsigned int rhs) { char buf[128]; sprintf(buf, "%u", rhs); + + #ifdef PUGIXML_WCHAR_MODE + char_t wbuf[128]; + impl::widen_ascii(wbuf, buf); + + return set_value(wbuf); + #else return set_value(buf); + #endif } bool xml_attribute::set_value(double rhs) { char buf[128]; sprintf(buf, "%g", rhs); + + #ifdef PUGIXML_WCHAR_MODE + char_t wbuf[128]; + impl::widen_ascii(wbuf, buf); + + return set_value(wbuf); + #else return set_value(buf); + #endif } bool xml_attribute::set_value(bool rhs) { - return set_value(rhs ? "true" : "false"); + return set_value(rhs ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false")); } #ifdef __BORLANDC__ @@ -2001,17 +2656,14 @@ namespace pugi { } -#ifdef __MWERKS__ xml_node::operator xml_node::unspecified_bool_type() const { +#ifdef __MWERKS__ return _root ? &xml_node::empty : 0; - } #else - xml_node::operator xml_node::unspecified_bool_type() const - { return _root ? &xml_node::_root : 0; - } #endif + } bool xml_node::operator!() const { @@ -2080,9 +2732,9 @@ namespace pugi return static_cast(r)->allocator; } - const char* xml_node::name() const + const char_t* xml_node::name() const { - return (_root && _root->name) ? 
_root->name : ""; + return (_root && _root->name) ? _root->name : PUGIXML_TEXT(""); } xml_node_type xml_node::type() const @@ -2090,69 +2742,69 @@ namespace pugi return _root ? static_cast(_root->type) : node_null; } - const char* xml_node::value() const + const char_t* xml_node::value() const { - return (_root && _root->value) ? _root->value : ""; + return (_root && _root->value) ? _root->value : PUGIXML_TEXT(""); } - xml_node xml_node::child(const char* name) const + xml_node xml_node::child(const char_t* name) const { if (!_root) return xml_node(); for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) - if (i->name && !strcmp(name, i->name)) return xml_node(i); + if (i->name && impl::strequal(name, i->name)) return xml_node(i); return xml_node(); } - xml_node xml_node::child_w(const char* name) const + xml_node xml_node::child_w(const char_t* name) const { if (!_root) return xml_node(); for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) - if (i->name && !impl::strcmpwild(name, i->name)) return xml_node(i); + if (i->name && impl::strequalwild(name, i->name)) return xml_node(i); return xml_node(); } - xml_attribute xml_node::attribute(const char* name) const + xml_attribute xml_node::attribute(const char_t* name) const { if (!_root) return xml_attribute(); for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute) - if (i->name && !strcmp(name, i->name)) + if (i->name && impl::strequal(name, i->name)) return xml_attribute(i); return xml_attribute(); } - xml_attribute xml_node::attribute_w(const char* name) const + xml_attribute xml_node::attribute_w(const char_t* name) const { if (!_root) return xml_attribute(); for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute) - if (i->name && !impl::strcmpwild(name, i->name)) + if (i->name && impl::strequalwild(name, i->name)) return xml_attribute(i); return xml_attribute(); } - xml_node xml_node::next_sibling(const char* name) const + 
xml_node xml_node::next_sibling(const char_t* name) const { if (!_root) return xml_node(); for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling) - if (i->name && !strcmp(name, i->name)) return xml_node(i); + if (i->name && impl::strequal(name, i->name)) return xml_node(i); return xml_node(); } - xml_node xml_node::next_sibling_w(const char* name) const + xml_node xml_node::next_sibling_w(const char_t* name) const { if (!_root) return xml_node(); for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling) - if (i->name && !impl::strcmpwild(name, i->name)) return xml_node(i); + if (i->name && impl::strequalwild(name, i->name)) return xml_node(i); return xml_node(); } @@ -2165,22 +2817,22 @@ namespace pugi else return xml_node(); } - xml_node xml_node::previous_sibling(const char* name) const + xml_node xml_node::previous_sibling(const char_t* name) const { if (!_root) return xml_node(); for (xml_node_struct* i = _root->prev_sibling; i; i = i->prev_sibling) - if (i->name && !strcmp(name, i->name)) return xml_node(i); + if (i->name && impl::strequal(name, i->name)) return xml_node(i); return xml_node(); } - xml_node xml_node::previous_sibling_w(const char* name) const + xml_node xml_node::previous_sibling_w(const char_t* name) const { if (!_root) return xml_node(); for (xml_node_struct* i = _root->prev_sibling; i; i = i->prev_sibling) - if (i->name && !impl::strcmpwild(name, i->name)) return xml_node(i); + if (i->name && impl::strequalwild(name, i->name)) return xml_node(i); return xml_node(); } @@ -2205,23 +2857,23 @@ namespace pugi return r; } - const char* xml_node::child_value() const + const char_t* xml_node::child_value() const { - if (!_root) return ""; + if (!_root) return PUGIXML_TEXT(""); for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) if ((static_cast(i->type) == node_pcdata || static_cast(i->type) == node_cdata) && i->value) return i->value; - return ""; + return PUGIXML_TEXT(""); } - const char* 
xml_node::child_value(const char* name) const + const char_t* xml_node::child_value(const char_t* name) const { return child(name).child_value(); } - const char* xml_node::child_value_w(const char* name) const + const char_t* xml_node::child_value_w(const char_t* name) const { return child_w(name).child_value(); } @@ -2246,7 +2898,7 @@ namespace pugi return _root ? xml_node(_root->last_child) : xml_node(); } - bool xml_node::set_name(const char* rhs) + bool xml_node::set_name(const char_t* rhs) { switch (type()) { @@ -2266,7 +2918,7 @@ namespace pugi } } - bool xml_node::set_value(const char* rhs) + bool xml_node::set_value(const char_t* rhs) { switch (type()) { @@ -2287,7 +2939,7 @@ namespace pugi } } - xml_attribute xml_node::append_attribute(const char* name) + xml_attribute xml_node::append_attribute(const char_t* name) { if (type() != node_element && type() != node_declaration) return xml_attribute(); @@ -2297,7 +2949,7 @@ namespace pugi return a; } - xml_attribute xml_node::insert_attribute_before(const char* name, const xml_attribute& attr) + xml_attribute xml_node::insert_attribute_before(const char_t* name, const xml_attribute& attr) { if ((type() != node_element && type() != node_declaration) || attr.empty()) return xml_attribute(); @@ -2323,7 +2975,7 @@ namespace pugi return a; } - xml_attribute xml_node::insert_attribute_after(const char* name, const xml_attribute& attr) + xml_attribute xml_node::insert_attribute_after(const char_t* name, const xml_attribute& attr) { if ((type() != node_element && type() != node_declaration) || attr.empty()) return xml_attribute(); @@ -2453,7 +3105,7 @@ namespace pugi return result; } - void xml_node::remove_attribute(const char* name) + void xml_node::remove_attribute(const char_t* name) { remove_attribute(attribute(name)); } @@ -2478,7 +3130,7 @@ namespace pugi a._attr->destroy(); } - void xml_node::remove_child(const char* name) + void xml_node::remove_child(const char_t* name) { remove_child(child(name)); } @@ 
-2496,64 +3148,64 @@ namespace pugi n._root->destroy(); } - xml_node xml_node::find_child_by_attribute(const char* name, const char* attr_name, const char* attr_value) const + xml_node xml_node::find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const { if (!_root) return xml_node(); for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) - if (i->name && !strcmp(name, i->name)) + if (i->name && impl::strequal(name, i->name)) { for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute) - if (!strcmp(attr_name, a->name) && !strcmp(attr_value, a->value)) + if (impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value)) return xml_node(i); } return xml_node(); } - xml_node xml_node::find_child_by_attribute_w(const char* name, const char* attr_name, const char* attr_value) const + xml_node xml_node::find_child_by_attribute_w(const char_t* name, const char_t* attr_name, const char_t* attr_value) const { if (!_root) return xml_node(); for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) - if (i->name && !impl::strcmpwild(name, i->name)) + if (i->name && impl::strequalwild(name, i->name)) { for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute) - if (!impl::strcmpwild(attr_name, a->name) && !impl::strcmpwild(attr_value, a->value)) + if (impl::strequalwild(attr_name, a->name) && impl::strequalwild(attr_value, a->value)) return xml_node(i); } return xml_node(); } - xml_node xml_node::find_child_by_attribute(const char* attr_name, const char* attr_value) const + xml_node xml_node::find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const { if (!_root) return xml_node(); for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute) - if (!strcmp(attr_name, a->name) && !strcmp(attr_value, a->value)) + if (impl::strequal(attr_name, a->name) && 
impl::strequal(attr_value, a->value)) return xml_node(i); return xml_node(); } - xml_node xml_node::find_child_by_attribute_w(const char* attr_name, const char* attr_value) const + xml_node xml_node::find_child_by_attribute_w(const char_t* attr_name, const char_t* attr_value) const { if (!_root) return xml_node(); for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute) - if (!impl::strcmpwild(attr_name, a->name) && !impl::strcmpwild(attr_value, a->value)) + if (impl::strequalwild(attr_name, a->name) && impl::strequalwild(attr_value, a->value)) return xml_node(i); return xml_node(); } #ifndef PUGIXML_NO_STL - std::string xml_node::path(char delimiter) const + string_t xml_node::path(char_t delimiter) const { - std::string path; + string_t path; xml_node cursor = *this; // Make a copy. @@ -2563,7 +3215,7 @@ namespace pugi { cursor = cursor.parent(); - std::string temp = cursor.name(); + string_t temp = cursor.name(); temp += delimiter; temp += path; path.swap(temp); @@ -2573,7 +3225,7 @@ namespace pugi } #endif - xml_node xml_node::first_element_by_path(const char* path, char delimiter) const + xml_node xml_node::first_element_by_path(const char_t* path, char_t delimiter) const { xml_node found = *this; // Current search context. 
@@ -2586,17 +3238,17 @@ namespace pugi ++path; } - const char* path_segment = path; + const char_t* path_segment = path; while (*path_segment == delimiter) ++path_segment; - const char* path_segment_end = path_segment; + const char_t* path_segment_end = path_segment; while (*path_segment_end && *path_segment_end != delimiter) ++path_segment_end; if (path_segment == path_segment_end) return found; - const char* next_segment = path_segment_end; + const char_t* next_segment = path_segment_end; while (*next_segment == delimiter) ++next_segment; @@ -2608,7 +3260,7 @@ namespace pugi { for (xml_node_struct* j = found._root->first_child; j; j = j->next_sibling) { - if (j->name && !strcmprange(j->name, path_segment, path_segment_end)) + if (j->name && impl::strequalrange(j->name, path_segment, static_cast(path_segment_end - path_segment))) { xml_node subsearch = xml_node(j).first_element_by_path(next_segment, delimiter); @@ -2698,23 +3350,32 @@ namespace pugi } } - void xml_node::print(xml_writer& writer, const char* indent, unsigned int flags, unsigned int depth) const + void xml_node::print(xml_writer& writer, const char_t* indent, unsigned int flags, encoding_t encoding, unsigned int depth) const { if (!_root) return; - xml_buffered_writer buffered_writer(writer); + xml_buffered_writer buffered_writer(writer, encoding); node_output(buffered_writer, *this, indent, flags, depth); } #ifndef PUGIXML_NO_STL - void xml_node::print(std::ostream& stream, const char* indent, unsigned int flags, unsigned int depth) const + void xml_node::print(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, encoding_t encoding, unsigned int depth) const + { + if (!_root) return; + + xml_writer_stream writer(stream); + + print(writer, indent, flags, encoding, depth); + } + + void xml_node::print(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const { if (!_root) return; xml_writer_stream writer(stream); - print(writer, indent, 
flags, depth); + print(writer, indent, flags, encoding_wchar, depth); } #endif @@ -2724,7 +3385,7 @@ namespace pugi if (!r) return -1; - const char* buffer = static_cast(r)->buffer; + const char_t* buffer = static_cast(r)->buffer; if (!buffer) return -1; @@ -2964,7 +3625,7 @@ namespace pugi } #ifndef PUGIXML_NO_STL - xml_parse_result xml_document::load(std::istream& stream, unsigned int options) + xml_parse_result xml_document::load(std::basic_istream >& stream, unsigned int options, encoding_t encoding) { destroy(); @@ -2977,36 +3638,63 @@ namespace pugi if (!stream.good()) return MAKE_PARSE_RESULT(status_io_error); - char* s = static_cast(global_allocate(length + 1)); + char* s = static_cast(global_allocate(length > 0 ? length : 1)); if (!s) return MAKE_PARSE_RESULT(status_out_of_memory); stream.read(s, length); - if (stream.gcount() > length || stream.gcount() == 0) + if (stream.gcount() > length || (length > 0 && stream.gcount() == 0)) { global_deallocate(s); return MAKE_PARSE_RESULT(status_io_error); } - s[stream.gcount()] = 0; - - return parse(transfer_ownership_tag(), s, options); // Parse the input string. + return load_buffer_inplace_own(s, stream.gcount(), options, encoding); // Parse the input string. } -#endif - xml_parse_result xml_document::load(const char* contents, unsigned int options) + xml_parse_result xml_document::load(std::basic_istream >& stream, unsigned int options) { destroy(); - char* s = static_cast(global_allocate(strlen(contents) + 1)); + if (!stream.good()) return MAKE_PARSE_RESULT(status_io_error); + + std::streamoff length, pos = stream.tellg(); + stream.seekg(0, std::ios::end); + length = stream.tellg(); + stream.seekg(pos, std::ios::beg); + + if (!stream.good()) return MAKE_PARSE_RESULT(status_io_error); + + wchar_t* s = static_cast(global_allocate((length > 0 ? 
length : 1) * sizeof(wchar_t))); if (!s) return MAKE_PARSE_RESULT(status_out_of_memory); - strcpy(s, contents); + stream.read(s, length); + + if (stream.gcount() > length || (length > 0 && stream.gcount() == 0)) + { + global_deallocate(s); + return MAKE_PARSE_RESULT(status_io_error); + } + + return load_buffer_inplace_own(s, stream.gcount() * sizeof(wchar_t), options, encoding_wchar); // Parse the input string. + } +#endif + + xml_parse_result xml_document::load(const char_t* contents, unsigned int options) + { + destroy(); + + // Force native encoding (skip autodetection) + #ifdef PUGIXML_WCHAR_MODE + encoding_t encoding = encoding_wchar; + #else + encoding_t encoding = encoding_utf8; + #endif - return parse(transfer_ownership_tag(), s, options); // Parse the input string. + return load_buffer(contents, impl::strlen(contents) * sizeof(char_t), options, encoding); } - xml_parse_result xml_document::load_file(const char* name, unsigned int options) + xml_parse_result xml_document::load_file(const char* name, unsigned int options, encoding_t encoding) { destroy(); @@ -3023,7 +3711,7 @@ namespace pugi return MAKE_PARSE_RESULT(status_io_error); } - char* s = static_cast(global_allocate(length + 1)); + char* s = static_cast(global_allocate(length > 0 ? length : 1)); if (!s) { @@ -3031,69 +3719,134 @@ namespace pugi return MAKE_PARSE_RESULT(status_out_of_memory); } - size_t read = fread(s, (size_t)length, 1, file); + size_t read = fread(s, 1, (size_t)length, file); fclose(file); - if (read != 1) + if (read != (size_t)length) { global_deallocate(s); return MAKE_PARSE_RESULT(status_io_error); } - - s[length] = 0; - return parse(transfer_ownership_tag(), s, options); // Parse the input string. 
+ return load_buffer_inplace_own(s, length, options, encoding); } - xml_parse_result xml_document::parse(char* xmlstr, unsigned int options) + xml_parse_result xml_document::load_buffer(const void* contents, size_t size, unsigned int options, encoding_t encoding) { destroy(); - // for offset_debug - static_cast(_root)->buffer = xmlstr; + // get actual encoding + encoding_t buffer_encoding = get_buffer_encoding(encoding, contents, size); + + // get private buffer + char_t* buffer; + size_t length; - xml_allocator& alloc = static_cast(_root)->allocator; + if (!convert_buffer(buffer, length, buffer_encoding, contents, size, false)) return MAKE_PARSE_RESULT(status_out_of_memory); - xml_parser parser(alloc); + // parse + xml_parse_result res = xml_parser::parse(buffer, length, _root, options); + + // remember encoding + res.encoding = buffer_encoding; + + // grab onto buffer + _buffer = buffer; + + return res; + } + + xml_parse_result xml_document::load_buffer_inplace(void* contents, size_t size, unsigned int options, encoding_t encoding) + { + destroy(); + + // get actual encoding + encoding_t buffer_encoding = get_buffer_encoding(encoding, contents, size); + + // get private buffer + char_t* buffer; + size_t length; + + if (!convert_buffer(buffer, length, buffer_encoding, contents, size, true)) return MAKE_PARSE_RESULT(status_out_of_memory); - return parser.parse(xmlstr, _root, options); // Parse the input string. 
+ // parse + xml_parse_result res = xml_parser::parse(buffer, length, _root, options); + + // remember encoding + res.encoding = buffer_encoding; + + // grab onto buffer if it's our buffer, user is responsible for deallocating contens himself + if (buffer != contents) _buffer = buffer; + + return res; } - xml_parse_result xml_document::parse(const transfer_ownership_tag&, char* xmlstr, unsigned int options) + xml_parse_result xml_document::load_buffer_inplace_own(void* contents, size_t size, unsigned int options, encoding_t encoding) { - xml_parse_result res = parse(xmlstr, options); + destroy(); + + // get actual encoding + encoding_t buffer_encoding = get_buffer_encoding(encoding, contents, size); + + // get private buffer + char_t* buffer; + size_t length; + + if (!convert_buffer(buffer, length, buffer_encoding, contents, size, true)) return MAKE_PARSE_RESULT(status_out_of_memory); + + // delete original buffer if we performed a conversion + if (buffer != contents) global_deallocate(contents); + + // parse + xml_parse_result res = xml_parser::parse(buffer, length, _root, options); + + // remember encoding + res.encoding = buffer_encoding; - _buffer = xmlstr; + // grab onto buffer + _buffer = buffer; return res; } - void xml_document::save(xml_writer& writer, const char* indent, unsigned int flags) const + void xml_document::save(xml_writer& writer, const char_t* indent, unsigned int flags, encoding_t encoding) const { - xml_buffered_writer buffered_writer(writer); + if (flags & format_write_bom) write_bom(writer, get_write_encoding(encoding)); - if (flags & format_write_bom_utf8) - { - static const unsigned char utf8_bom[] = {0xEF, 0xBB, 0xBF}; - buffered_writer.write(utf8_bom, 3); - } + xml_buffered_writer buffered_writer(writer, encoding); if (!(flags & format_no_declaration)) { - buffered_writer.write(""); - if (!(flags & format_raw)) buffered_writer.write("\n"); + buffered_writer.write(PUGIXML_TEXT("")); + if (!(flags & format_raw)) 
buffered_writer.write('\n'); } node_output(buffered_writer, *this, indent, flags, 0); } - bool xml_document::save_file(const char* name, const char* indent, unsigned int flags) const +#ifndef PUGIXML_NO_STL + void xml_document::save(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, encoding_t encoding) const + { + xml_writer_stream writer(stream); + + save(writer, indent, flags, encoding); + } + + void xml_document::save(std::basic_ostream >& stream, const char_t* indent, unsigned int flags) const + { + xml_writer_stream writer(stream); + + save(writer, indent, flags, encoding_wchar); + } +#endif + + bool xml_document::save_file(const char* name, const char_t* indent, unsigned int flags, encoding_t encoding) const { FILE* file = fopen(name, "wb"); if (!file) return false; xml_writer_file writer(file); - save(writer, indent, flags); + save(writer, indent, flags, encoding); fclose(file); @@ -3108,29 +3861,56 @@ namespace pugi #ifndef PUGIXML_NO_STL std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str) { + STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); + + size_t length = wcslen(str); + + // first pass: get length in utf8 characters + size_t size = sizeof(wchar_t) == 2 ? + impl::decode_utf16_block(reinterpret_cast(str), length, 0, opt1_to_type()) : + impl::decode_utf32_block(reinterpret_cast(str), length, 0, opt1_to_type()); + + // allocate resulting string std::string result; - result.reserve(strutf16_utf8_size(str)); - - for (; *str; ++str) + result.resize(size); + + // second pass: convert to utf8 + if (size > 0) { - char buffer[6]; + impl::char8_t* begin = reinterpret_cast(&result[0]); + impl::char8_t* end = sizeof(wchar_t) == 2 ? 
+ impl::decode_utf16_block(reinterpret_cast(str), length, begin, opt1_to_type()) : + impl::decode_utf32_block(reinterpret_cast(str), length, begin, opt1_to_type()); - result.append(buffer, strutf16_utf8(buffer, *str)); + // truncate invalid output + assert(begin <= end && static_cast(end - begin) <= result.size()); + result.resize(static_cast(end - begin)); } - + return result; } - std::wstring PUGIXML_FUNCTION as_utf16(const char* str) + std::wstring PUGIXML_FUNCTION as_wide(const char* str) { + const impl::char8_t* data = reinterpret_cast(str); + size_t size = strlen(str); + + // first pass: get length in wchar_t + size_t length = impl::decode_utf8_block(data, size, 0); + + // allocate resulting string std::wstring result; - result.reserve(strutf8_utf16_size(str)); + result.resize(length); - for (; *str;) + // second pass: convert to wchar_t + if (length > 0) { - unsigned int ch = 0; - str = strutf8_utf16(str, ch); - result += (wchar_t)ch; + impl::wchar_writer::value_type begin = reinterpret_cast(&result[0]); + impl::wchar_writer::value_type end = impl::decode_utf8_block(data, size, begin); + + // truncate invalid output + assert(begin <= end && static_cast(end - begin) <= result.size()); + result.resize(static_cast(end - begin)); } return result; @@ -3154,6 +3934,22 @@ namespace pugi } } +#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC)) +namespace std +{ + // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier) + std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_node_iterator&) + { + return std::bidirectional_iterator_tag(); + } + + std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_attribute_iterator&) + { + return std::bidirectional_iterator_tag(); + } +} +#endif + /** * Copyright (c) 2006-2009 Arseny Kapoulkine * diff --git a/src/pugixml.hpp b/src/pugixml.hpp index 0a4d9fd..f2cb3d1 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -17,8 +17,25 @@ #include "pugiconfig.hpp" 
#ifndef PUGIXML_NO_STL -# include -# include +#include + +namespace std +{ + struct bidirectional_iterator_tag; + + // Borland C++ compiler has a bug which forces template argument names in forward declarations to be the same as in actual definitions + template class allocator; + template struct char_traits; + template class basic_istream; + template class basic_ostream; + template class basic_string; + + // Digital Mars compiler has a bug which requires a forward declaration for explicit instantiation (otherwise type selection is messed up later, producing link errors) + // Also note that we have to declare char_traits as a class here, since it's defined that way +#ifdef __DMC__ + template <> class char_traits; +#endif +} #endif // No XPath without STL @@ -45,13 +62,39 @@ #include +// Character interface macros +#ifdef PUGIXML_WCHAR_MODE +# define PUGIXML_TEXT(t) L ## t + +namespace pugi +{ + typedef wchar_t char_t; + +#ifndef PUGIXML_NO_STL + typedef std::basic_string, std::allocator > string_t; +#endif +} +#else +# define PUGIXML_TEXT(t) t + +namespace pugi +{ + typedef char char_t; + +# ifndef PUGIXML_NO_STL + // gcc3.4 has a bug which prevents string_t instantiation using char_t, so we have to use char type explicitly + typedef std::basic_string, std::allocator > string_t; +# endif +} +#endif + // Helpers for inline implementation namespace pugi { namespace impl { - int PUGIXML_FUNCTION strcmp(const char*, const char*); - int PUGIXML_FUNCTION strcmpwild(const char*, const char*); + bool PUGIXML_FUNCTION strequal(const char_t*, const char_t*); + bool PUGIXML_FUNCTION strequalwild(const char_t*, const char_t*); } } @@ -199,6 +242,24 @@ namespace pugi */ const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol; + /** + * These flags determine the encoding of input data for XML document. 
Default mode is encoding_auto, + * which means that document encoding is autodetected from BOM and necessary encoding conversions are + * applied. You can override this mode by using any of the specific encodings. + */ + enum encoding_t + { + encoding_auto, //!< Auto-detect input encoding using BOM or >& stream); + + /** + * Construct writer instance + * + * \param stream - output stream object + */ + xml_writer_stream(std::basic_ostream >& stream); virtual void write(const void* data, size_t size); private: - std::ostream* stream; + std::basic_ostream >* narrow_stream; + std::basic_ostream >* wide_stream; }; #endif @@ -559,7 +628,7 @@ namespace pugi * \param rhs - new attribute value * \return self */ - xml_attribute& operator=(const char* rhs); + xml_attribute& operator=(const char_t* rhs); /** * Set attribute value to \a rhs. @@ -599,7 +668,7 @@ namespace pugi * \param rhs - new attribute name * \return success flag (call fails if attribute is empty or there is not enough memory) */ - bool set_name(const char* rhs); + bool set_name(const char_t* rhs); /** * Set attribute value to \a rhs. @@ -607,7 +676,7 @@ namespace pugi * \param rhs - new attribute value * \return success flag (call fails if attribute is empty or there is not enough memory) */ - bool set_value(const char* rhs); + bool set_value(const char_t* rhs); /** * Set attribute value to \a rhs. @@ -655,14 +724,14 @@ namespace pugi * * \return attribute name, or "" if attribute is empty */ - const char* name() const; + const char_t* name() const; /** * Get attribute value. 
* * \return attribute value, or "" if attribute is empty */ - const char* value() const; + const char_t* value() const; }; #ifdef __BORLANDC__ @@ -825,14 +894,14 @@ namespace pugi * * \return node name, if any; "" otherwise */ - const char* name() const; + const char_t* name() const; /** * Get node value (comment/PI/PCDATA/CDATA contents, depending on node type) * * \return node value, if any; "" otherwise */ - const char* value() const; + const char_t* value() const; /** * Get child with the specified name @@ -840,7 +909,7 @@ namespace pugi * \param name - child name * \return child with the specified name, if any; empty node otherwise */ - xml_node child(const char* name) const; + xml_node child(const char_t* name) const; /** * Get child with the name that matches specified pattern @@ -848,7 +917,7 @@ namespace pugi * \param name - child name pattern * \return child with the name that matches pattern, if any; empty node otherwise */ - xml_node child_w(const char* name) const; + xml_node child_w(const char_t* name) const; /** * Get attribute with the specified name @@ -856,7 +925,7 @@ namespace pugi * \param name - attribute name * \return attribute with the specified name, if any; empty attribute otherwise */ - xml_attribute attribute(const char* name) const; + xml_attribute attribute(const char_t* name) const; /** * Get attribute with the name that matches specified pattern @@ -864,7 +933,7 @@ namespace pugi * \param name - attribute name pattern * \return attribute with the name that matches pattern, if any; empty attribute otherwise */ - xml_attribute attribute_w(const char* name) const; + xml_attribute attribute_w(const char_t* name) const; /** * Get first of following sibling nodes with the specified name @@ -872,7 +941,7 @@ namespace pugi * \param name - sibling name * \return node with the specified name, if any; empty node otherwise */ - xml_node next_sibling(const char* name) const; + xml_node next_sibling(const char_t* name) const; /** * Get first of 
the following sibling nodes with the name that matches specified pattern @@ -880,7 +949,7 @@ namespace pugi * \param name - sibling name pattern * \return node with the name that matches pattern, if any; empty node otherwise */ - xml_node next_sibling_w(const char* name) const; + xml_node next_sibling_w(const char_t* name) const; /** * Get following sibling @@ -895,7 +964,7 @@ namespace pugi * \param name - sibling name * \return node with the specified name, if any; empty node otherwise */ - xml_node previous_sibling(const char* name) const; + xml_node previous_sibling(const char_t* name) const; /** * Get first of the preceding sibling nodes with the name that matches specified pattern @@ -903,7 +972,7 @@ namespace pugi * \param name - sibling name pattern * \return node with the name that matches pattern, if any; empty node otherwise */ - xml_node previous_sibling_w(const char* name) const; + xml_node previous_sibling_w(const char_t* name) const; /** * Get preceding sibling @@ -931,7 +1000,7 @@ namespace pugi * * \return child value of current node, if any; "" otherwise */ - const char* child_value() const; + const char_t* child_value() const; /** * Get child value of child with specified name. \see child_value @@ -940,7 +1009,7 @@ namespace pugi * \param name - child name * \return child value of specified child node, if any; "" otherwise */ - const char* child_value(const char* name) const; + const char_t* child_value(const char_t* name) const; /** * Get child value of child with name that matches the specified pattern. 
\see child_value @@ -949,7 +1018,7 @@ namespace pugi * \param name - child name pattern * \return child value of specified child node, if any; "" otherwise */ - const char* child_value_w(const char* name) const; + const char_t* child_value_w(const char_t* name) const; public: /** @@ -958,7 +1027,7 @@ namespace pugi * \param rhs - new node name * \return success flag (call fails if node is of the wrong type or there is not enough memory) */ - bool set_name(const char* rhs); + bool set_name(const char_t* rhs); /** * Set node value to \a rhs (for PI/PCDATA/CDATA/comment nodes). \see value @@ -966,7 +1035,7 @@ namespace pugi * \param rhs - new node value * \return success flag (call fails if node is of the wrong type or there is not enough memory) */ - bool set_value(const char* rhs); + bool set_value(const char_t* rhs); /** * Add attribute with specified name (for element nodes) @@ -974,7 +1043,7 @@ namespace pugi * \param name - attribute name * \return added attribute, or empty attribute if there was an error (wrong node type) */ - xml_attribute append_attribute(const char* name); + xml_attribute append_attribute(const char_t* name); /** * Insert attribute with specified name after \a attr (for element nodes) @@ -983,7 +1052,7 @@ namespace pugi * \param attr - attribute to insert a new one after * \return inserted attribute, or empty attribute if there was an error (wrong node type, or attr does not belong to node) */ - xml_attribute insert_attribute_after(const char* name, const xml_attribute& attr); + xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr); /** * Insert attribute with specified name before \a attr (for element nodes) @@ -992,7 +1061,7 @@ namespace pugi * \param attr - attribute to insert a new one before * \return inserted attribute, or empty attribute if there was an error (wrong node type, or attr does not belong to node) */ - xml_attribute insert_attribute_before(const char* name, const xml_attribute& attr); + 
xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr); /** * Add a copy of the specified attribute (for element nodes) @@ -1084,7 +1153,7 @@ namespace pugi * * \param name - attribute name */ - void remove_attribute(const char* name); + void remove_attribute(const char_t* name); /** * Remove specified child @@ -1098,7 +1167,7 @@ namespace pugi * * \param name - child name */ - void remove_child(const char* name); + void remove_child(const char_t* name); public: /** @@ -1121,7 +1190,7 @@ namespace pugi * \param name - node name * \param it - output iterator (for example, std::back_insert_iterator (result of std::back_inserter)) */ - template void all_elements_by_name(const char* name, OutputIterator it) const + template void all_elements_by_name(const char_t* name, OutputIterator it) const { if (!_root) return; @@ -1129,7 +1198,7 @@ namespace pugi { if (node.type() == node_element) { - if (!impl::strcmp(name, node.name())) + if (impl::strequal(name, node.name())) { *it = node; ++it; @@ -1146,7 +1215,7 @@ namespace pugi * \param name - node name pattern * \param it - output iterator (for example, std::back_insert_iterator (result of std::back_inserter)) */ - template void all_elements_by_name_w(const char* name, OutputIterator it) const + template void all_elements_by_name_w(const char_t* name, OutputIterator it) const { if (!_root) return; @@ -1154,7 +1223,7 @@ namespace pugi { if (node.type() == node_element) { - if (!impl::strcmpwild(name, node.name())) + if (impl::strequalwild(name, node.name())) { *it = node; ++it; @@ -1246,7 +1315,7 @@ namespace pugi * \param attr_value - attribute value of child node * \return first matching child node, or empty node */ - xml_node find_child_by_attribute(const char* name, const char* attr_name, const char* attr_value) const; + xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const; /** * Find child node with the specified name that has 
specified attribute (use pattern matching for node name and attribute name/value) @@ -1256,7 +1325,7 @@ namespace pugi * \param attr_value - pattern for attribute value of child node * \return first matching child node, or empty node */ - xml_node find_child_by_attribute_w(const char* name, const char* attr_name, const char* attr_value) const; + xml_node find_child_by_attribute_w(const char_t* name, const char_t* attr_name, const char_t* attr_value) const; /** * Find child node that has specified attribute @@ -1265,7 +1334,7 @@ namespace pugi * \param attr_value - attribute value of child node * \return first matching child node, or empty node */ - xml_node find_child_by_attribute(const char* attr_name, const char* attr_value) const; + xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const; /** * Find child node that has specified attribute (use pattern matching for attribute name/value) @@ -1274,7 +1343,7 @@ namespace pugi * \param attr_value - pattern for attribute value of child node * \return first matching child node, or empty node */ - xml_node find_child_by_attribute_w(const char* attr_name, const char* attr_value) const; + xml_node find_child_by_attribute_w(const char_t* attr_name, const char_t* attr_value) const; #ifndef PUGIXML_NO_STL /** @@ -1283,7 +1352,7 @@ namespace pugi * \param delimiter - delimiter character to insert between element names * \return path string (e.g. '/bookstore/book/author'). 
*/ - std::string path(char delimiter = '/') const; + string_t path(char_t delimiter = '/') const; #endif /** @@ -1293,7 +1362,7 @@ namespace pugi * \param delimiter - delimiter character to use while tokenizing path * \return matching node, if any; empty node otherwise */ - xml_node first_element_by_path(const char* path, char delimiter = '/') const; + xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const; /** * Recursively traverse subtree with xml_tree_walker @@ -1313,7 +1382,7 @@ namespace pugi * \param query - query string * \return first node from the resulting node set by document order, or empty node if none found */ - xpath_node select_single_node(const char* query) const; + xpath_node select_single_node(const char_t* query) const; /** * Select single node by evaluating XPath query @@ -1329,7 +1398,7 @@ namespace pugi * \param query - query string * \return resulting node set */ - xpath_node_set select_nodes(const char* query) const; + xpath_node_set select_nodes(const char_t* query) const; /** * Select node set by evaluating XPath query @@ -1351,7 +1420,7 @@ namespace pugi * \param flags - formatting flags * \param depth - starting depth (used for indentation) */ - void print(xml_writer& writer, const char* indent = "\t", unsigned int flags = format_default, unsigned int depth = 0) const; + void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, encoding_t encoding = encoding_auto, unsigned int depth = 0) const; #ifndef PUGIXML_NO_STL /** @@ -1363,7 +1432,18 @@ namespace pugi * \param depth - starting depth (used for indentation) * \deprecated Use print() with xml_writer_stream instead */ - void print(std::ostream& os, const char* indent = "\t", unsigned int flags = format_default, unsigned int depth = 0) const; + void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, encoding_t encoding = encoding_auto, unsigned 
int depth = 0) const; + + /** + * Print subtree to stream + * + * \param os - output stream + * \param indent - indentation string + * \param flags - formatting flags + * \param depth - starting depth (used for indentation) + * \deprecated Use print() with xml_writer_stream instead + */ + void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const; #endif /** @@ -1388,17 +1468,7 @@ namespace pugi * It's a bidirectional iterator with value type 'xml_node'. */ class PUGIXML_CLASS xml_node_iterator -#ifndef PUGIXML_NO_STL - : public std::iterator -#endif -#ifdef _MSC_VER -# pragma warning(push) -# pragma warning(disable: 4251 4275) // C4251 and C4275 can be ignored for _Container_base, as per MSDN -#endif { -#ifdef _MSC_VER -# pragma warning(pop) -#endif friend class xml_node; private: @@ -1409,6 +1479,18 @@ namespace pugi explicit xml_node_iterator(xml_node_struct* ref); public: + /** + * Iterator traits + */ + typedef ptrdiff_t difference_type; + typedef xml_node value_type; + typedef xml_node* pointer; + typedef xml_node& reference; + + #ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; + #endif + /** * Default ctor */ @@ -1493,17 +1575,7 @@ namespace pugi * It's a bidirectional iterator with value type 'xml_attribute'. 
*/ class PUGIXML_CLASS xml_attribute_iterator -#ifndef PUGIXML_NO_STL - : public std::iterator -#endif -#ifdef _MSC_VER -# pragma warning(push) -# pragma warning(disable: 4251 4275) // C4251 and C4275 can be ignored for _Container_base, as per MSDN -#endif { -#ifdef _MSC_VER -# pragma warning(pop) -#endif friend class xml_node; private: @@ -1514,6 +1586,18 @@ namespace pugi explicit xml_attribute_iterator(xml_attribute_struct* ref); public: + /** + * Iterator traits + */ + typedef ptrdiff_t difference_type; + typedef xml_attribute value_type; + typedef xml_attribute* pointer; + typedef xml_attribute& reference; + + #ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; + #endif + /** * Default ctor */ @@ -1657,12 +1741,6 @@ namespace pugi char data[memory_block_size]; }; - /** - * Struct used to distinguish parsing with ownership transfer from parsing without it. - * \see xml_document::parse - */ - struct transfer_ownership_tag {}; - /** * Parsing status enumeration, returned as part of xml_parse_result struct */ @@ -1702,6 +1780,9 @@ namespace pugi /// Line in parser source which reported this unsigned int line; + /// Source document encoding + encoding_t encoding; + /// Cast to bool operator operator bool() const { @@ -1719,7 +1800,7 @@ namespace pugi class PUGIXML_CLASS xml_document: public xml_node { private: - char* _buffer; + char_t* _buffer; xml_memory_block _memory; @@ -1749,17 +1830,26 @@ namespace pugi * \param options - parsing options * \return parsing result */ - xml_parse_result load(std::istream& stream, unsigned int options = parse_default); + + xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default, encoding_t encoding = encoding_auto); + /** + * Load document from stream. 
+ * + * \param stream - stream with xml data + * \param options - parsing options + * \return parsing result + */ + xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default); #endif /** - * Load document from string. + * Load document from string. String has to be zero-terminated. No encoding conversions are applied. * * \param contents - input string * \param options - parsing options * \return parsing result */ - xml_parse_result load(const char* contents, unsigned int options = parse_default); + xml_parse_result load(const char_t* contents, unsigned int options = parse_default); /** * Load document from file @@ -1768,32 +1858,43 @@ namespace pugi * \param options - parsing options * \return parsing result */ - xml_parse_result load_file(const char* name, unsigned int options = parse_default); + xml_parse_result load_file(const char* name, unsigned int options = parse_default, encoding_t encoding = encoding_auto); /** - * Parse the given XML string in-situ. - * The string is modified; you should ensure that string data will persist throughout the - * document's lifetime. Although, document does not gain ownership over the string, so you - * should free the memory occupied by it manually. + * Load document from buffer * - * \param xmlstr - readwrite string with xml data + * \param contents - buffer contents + * \param size - buffer size in bytes * \param options - parsing options * \return parsing result */ - xml_parse_result parse(char* xmlstr, unsigned int options = parse_default); - + xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, encoding_t encoding = encoding_auto); + /** - * Parse the given XML string in-situ (gains ownership). - * The string is modified; document gains ownership over the string, so you don't have to worry - * about it's lifetime. - * Call example: doc.parse(transfer_ownership_tag(), string, options); + * Load document from buffer in-situ. 
+ * The buffer is modified; you should ensure that buffer data will persist throughout the document's + * lifetime. Document does not gain ownership over the buffer, so you should free the buffer memory manually. * - * \param xmlstr - readwrite string with xml data + * \param contents - buffer contents + * \param size - buffer size in bytes * \param options - parsing options * \return parsing result */ - xml_parse_result parse(const transfer_ownership_tag&, char* xmlstr, unsigned int options = parse_default); - + xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, encoding_t encoding = encoding_auto); + + /** + * Load document from buffer in-situ (gains buffer ownership). + * The buffer is modified; document gains ownership over the buffer, so you don't have to worry about its + * lifetime or free it manually. You should allocate the buffer with pugixml + * allocation function. + * + * \param contents - buffer contents + * \param size - buffer size in bytes + * \param options - parsing options + * \return parsing result + */ + xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, encoding_t encoding = encoding_auto); + /** * Save XML to writer * @@ -1801,7 +1902,27 @@ namespace pugi * \param indent - indentation string * \param flags - formatting flags */ - void save(xml_writer& writer, const char* indent = "\t", unsigned int flags = format_default) const; + void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, encoding_t encoding = encoding_auto) const; + + #ifndef PUGIXML_NO_STL + /** + * Save XML to stream + * + * \param stream - output stream + * \param indent - indentation string + * \param flags - formatting flags + */ + void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, encoding_t encoding = encoding_auto) 
const; + + /** + * Save XML to stream + * + * \param stream - output stream + * \param indent - indentation string + * \param flags - formatting flags + */ + void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const; + #endif /** * Save XML to file @@ -1811,7 +1932,7 @@ namespace pugi * \param flags - formatting flags * \return success flag */ - bool save_file(const char* name, const char* indent = "\t", unsigned int flags = format_default) const; + bool save_file(const char* name, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, encoding_t encoding = encoding_auto) const; /** * Compute document order for the whole tree @@ -2063,20 +2184,20 @@ namespace pugi #ifndef PUGIXML_NO_STL /** - * Convert utf16 to utf8 + * Convert wide string to utf8 * - * \param str - input UTF16 string + * \param str - input wide string * \return output UTF8 string */ - std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str); + std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const wchar_t* str); /** - * Convert utf8 to utf16 + * Convert utf8 to wide string * * \param str - input UTF8 string - * \return output UTF16 string + * \return output wide string */ - std::wstring PUGIXML_FUNCTION as_utf16(const char* str); + std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const char* str); #endif /** @@ -2104,7 +2225,7 @@ namespace pugi * \param allocate - allocation function * \param deallocate - deallocation function * - * \note XPath-related allocations, as well as allocations in functions that return std::string (xml_node::path, as_utf8, as_utf16) + * \note XPath-related allocations, as well as allocations in functions that return std::string (xml_node::path, as_utf8, as_wide) * are not performed via these functions. 
* \note If you're using parse() with ownership transfer, you have to allocate the buffer you pass to parse() with allocation * function you set via this function. @@ -2128,6 +2249,15 @@ namespace pugi deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function(); } +#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC)) +namespace std +{ + // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier) + std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_node_iterator&); + std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_attribute_iterator&); +} +#endif + #endif /** diff --git a/src/pugixpath.cpp b/src/pugixpath.cpp index fce7459..0dc66e6 100644 --- a/src/pugixpath.cpp +++ b/src/pugixpath.cpp @@ -16,21 +16,39 @@ #ifndef PUGIXML_NO_XPATH #include +#include #include #include +#include #include #include #include #include +#ifdef PUGIXML_WCHAR_MODE +# include +#endif + #if defined(_MSC_VER) # pragma warning(disable: 4127) // conditional expression is constant # pragma warning(disable: 4702) // unreachable code # pragma warning(disable: 4996) // this function or variable may be unsafe #endif +// String utilities prototypes +namespace pugi +{ + namespace impl + { + size_t strlen(const char_t* s); + void strcpy(char_t* dst, const char_t* src); + bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count); + void widen_ascii(wchar_t* dest, const char* source); + } +} + namespace { using namespace pugi; @@ -64,17 +82,32 @@ namespace 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 }; - inline bool is_chartypex(char c, chartypex ct) + inline bool is_chartypex(char_t c, chartypex ct) { + #ifdef PUGIXML_WCHAR_MODE + unsigned int ch = static_cast(c); + + return !!((ch < 128 ? 
chartypex_table[ch] : chartypex_table[128]) & ct); + #else return !!(chartypex_table[static_cast(c)] & ct); + #endif } - bool starts_with(const std::string& s, const char* pattern) + bool starts_with(const string_t& s, const char_t* pattern) { - return s.compare(0, strlen(pattern), pattern) == 0; + return s.compare(0, impl::strlen(pattern), pattern) == 0; } - std::string string_value(const xpath_node& na) + const char_t* find_char(const char_t* s, char_t c) + { + #ifdef PUGIXML_WCHAR_MODE + return wcschr(s, c); + #else + return ::strchr(s, c); + #endif + } + + string_t string_value(const xpath_node& na) { if (na.attribute()) return na.attribute().value(); @@ -93,7 +126,7 @@ namespace case node_document: case node_element: { - std::string result; + string_t result; xml_node cur = n.first_child(); @@ -125,7 +158,7 @@ namespace } default: - return ""; + return string_t(); } } } @@ -334,78 +367,78 @@ namespace return (value != 0 && !is_nan(value)); } - std::string convert_number_to_string(double value) + string_t convert_number_to_string(double value) { - if (is_nan(value)) return "NaN"; - else if (is_inf(value)) return value < 0 ? "-Infinity" : "Infinity"; - - char buf[100]; + if (is_nan(value)) return PUGIXML_TEXT("NaN"); + else if (is_inf(value)) return value < 0 ? 
PUGIXML_TEXT("-Infinity") : PUGIXML_TEXT("Infinity"); + else if (value == 0) return PUGIXML_TEXT("0"); - if (value == (int)value) sprintf(buf, "%d", (int)value); - else - { - sprintf(buf, "%f", value); + char buf[512]; + sprintf(buf, "%f", value); - // trim trailing zeros after decimal point - if (strchr(buf, '.')) - { - char* ptr = buf + strlen(buf) - 1; - for (; *ptr == '0'; --ptr) ; - *(ptr+1) = 0; - } + // trim trailing zeros after decimal point + if (strchr(buf, '.')) + { + char* ptr = buf + strlen(buf) - 1; + for (; *ptr == '0'; --ptr) ; + + // trim leftover decimal point (for integer numbers) + if (*ptr == '.') --ptr; + + *(ptr+1) = 0; } + + #ifdef PUGIXML_WCHAR_MODE + wchar_t wbuf[512]; + impl::widen_ascii(wbuf, buf); - return std::string(buf); + return string_t(wbuf); + #else + return string_t(buf); + #endif } - double convert_string_to_number(const char* string) + bool check_string_to_number_format(const char_t* string) { + // parse leading whitespace while (is_chartypex(*string, ctx_space)) ++string; - - double sign = 1; - - if (*string == '-') - { - sign = -1; - ++string; - } - - double r = 0; - - if (!*string) return gen_nan(); - - while (is_chartypex(*string, ctx_digit)) - { - r = r * 10 + (*string - '0'); - ++string; - } - - if (*string) + + // parse sign + if (*string == '-') ++string; + + if (!*string) return false; + + // if there is no integer part, there should be a decimal part with at least one digit + if (!is_chartypex(string[0], ctx_digit) && (string[0] != '.' || !is_chartypex(string[1], ctx_digit))) return false; + + // parse integer part + while (is_chartypex(*string, ctx_digit)) ++string; + + // parse decimal part + if (*string == '.') { - if (is_chartypex(*string, ctx_space)) - { - while (is_chartypex(*string, ctx_space)) ++string; - return *string ? 
gen_nan() : r; - } - - if (*string != '.') return gen_nan(); - ++string; - - double power = 0.1; - - while (is_chartypex(*string, ctx_digit)) - { - r += power * (*string - '0'); - power /= 10; - ++string; - } - - while (is_chartypex(*string, ctx_space)) ++string; - if (*string) return gen_nan(); + + while (is_chartypex(*string, ctx_digit)) ++string; } - - return r * sign; + + // parse trailing whitespace + while (is_chartypex(*string, ctx_space)) ++string; + + return *string == 0; + } + + double convert_string_to_number(const char_t* string) + { + // check string format + if (!check_string_to_number_format(string)) return gen_nan(); + + // parse string + #ifdef PUGIXML_WCHAR_MODE + return wcstod(string, 0); + #else + return atof(string); + #endif } double ieee754_round(double value) @@ -413,18 +446,18 @@ namespace return is_nan(value) ? value : floor(value + 0.5); } - const char* local_name(const char* name) + const char_t* local_name(const char_t* name) { - const char* p = strchr(name, ':'); + const char_t* p = find_char(name, ':'); return p ? 
p + 1 : name; } - const char* namespace_uri(const xml_node& node) + const char_t* namespace_uri(const xml_node& node) { - const char* pos = strchr(node.name(), ':'); + const char_t* pos = find_char(node.name(), ':'); - std::string ns = "xmlns"; + string_t ns = PUGIXML_TEXT("xmlns"); if (pos) { @@ -443,17 +476,17 @@ namespace p = p.parent(); } - return ""; + return PUGIXML_TEXT(""); } - const char* namespace_uri(const xml_attribute& attr, const xml_node& parent) + const char_t* namespace_uri(const xml_attribute& attr, const xml_node& parent) { - const char* pos = strchr(attr.name(), ':'); + const char_t* pos = find_char(attr.name(), ':'); // Default namespace does not apply to attributes - if (!pos) return ""; + if (!pos) return PUGIXML_TEXT(""); - std::string ns = "xmlns:"; + string_t ns = PUGIXML_TEXT("xmlns:"); ns.append(attr.name(), pos); xml_node p = parent; @@ -467,7 +500,7 @@ namespace p = p.parent(); } - return ""; + return PUGIXML_TEXT(""); } template struct equal_to @@ -850,9 +883,9 @@ namespace pugi xpath_lexer& operator=(const xpath_lexer&); private: - const char* m_cur; + const char_t* m_cur; - char* m_cur_lexeme_contents; + char_t* m_cur_lexeme_contents; size_t m_clc_size; size_t m_clc_capacity; @@ -864,15 +897,15 @@ namespace pugi m_clc_size = 0; } - void contents_push(char c) + void contents_push(char_t c) { if (m_clc_size == m_clc_capacity) { if (!m_clc_capacity) m_clc_capacity = 16; else m_clc_capacity *= 2; - char* s = new char[m_clc_capacity + 1]; - if (m_cur_lexeme_contents) strcpy(s, m_cur_lexeme_contents); + char_t* s = new char_t[m_clc_capacity + 1]; + if (m_cur_lexeme_contents) impl::strcpy(s, m_cur_lexeme_contents); delete[] m_cur_lexeme_contents; m_cur_lexeme_contents = s; @@ -883,7 +916,7 @@ namespace pugi } public: - explicit xpath_lexer(const char* query): m_cur(query) + explicit xpath_lexer(const char_t* query): m_cur(query) { m_clc_capacity = m_clc_size = 0; m_cur_lexeme_contents = 0; @@ -896,12 +929,12 @@ namespace pugi delete[] 
m_cur_lexeme_contents; } - const char* state() const + const char_t* state() const { return m_cur; } - void reset(const char* state) + void reset(const char_t* state) { m_cur = state; next(); @@ -1070,7 +1103,7 @@ namespace pugi case '"': case '\'': { - char terminator = *m_cur; + char_t terminator = *m_cur; ++m_cur; @@ -1153,9 +1186,9 @@ namespace pugi return m_cur_lexeme; } - const char* contents() const + const char_t* contents() const { - return m_cur_lexeme_contents ? m_cur_lexeme_contents : ""; + return m_cur_lexeme_contents ? m_cur_lexeme_contents : PUGIXML_TEXT(""); } }; @@ -1275,7 +1308,7 @@ namespace pugi // variable name for ast_variable // string value for ast_constant // node test for ast_step (node name/namespace/node type/pi target) - const char* m_contents; + const char_t* m_contents; // for t_step / t_predicate axis_t m_axis; @@ -1330,7 +1363,7 @@ namespace pugi } else if (lhs->rettype() == xpath_type_string) { - std::string l = lhs->eval_string(c); + string_t l = lhs->eval_string(c); xpath_node_set rs = rhs->eval_node_set(c); for (xpath_node_set::const_iterator ri = rs.begin(); ri != rs.end(); ++ri) @@ -1362,7 +1395,7 @@ namespace pugi else if (rhs->rettype() == xpath_type_string) { xpath_node_set ls = lhs->eval_node_set(c); - std::string r = rhs->eval_string(c); + string_t r = rhs->eval_string(c); for (xpath_node_set::const_iterator li = ls.begin(); li != ls.end(); ++li) { @@ -1482,12 +1515,12 @@ namespace pugi // There are no attribute nodes corresponding to attributes that declare namespaces // That is, "xmlns:..." 
or "xmlns" - if (!strncmp(a.name(), "xmlns", 5) && (a.name()[5] == 0 || a.name()[5] == ':')) return; + if (impl::strequalrange(a.name(), PUGIXML_TEXT("xmlns"), 5) && (a.name()[5] == 0 || a.name()[5] == ':')) return; switch (m_test) { case nodetest_name: - if (!strcmp(a.name(), m_contents)) ns.push_back(xpath_node(a, parent)); + if (impl::strequal(a.name(), m_contents)) ns.push_back(xpath_node(a, parent)); break; case nodetest_type_node: @@ -1496,7 +1529,7 @@ namespace pugi break; case nodetest_all_in_namespace: - if (!strncmp(a.name(), m_contents, strlen(m_contents))) + if (impl::strequalrange(a.name(), m_contents, impl::strlen(m_contents))) ns.push_back(xpath_node(a, parent)); break; @@ -1512,7 +1545,7 @@ namespace pugi switch (m_test) { case nodetest_name: - if (n.type() == node_element && !strcmp(n.name(), m_contents)) ns.push_back(n); + if (n.type() == node_element && impl::strequal(n.name(), m_contents)) ns.push_back(n); break; case nodetest_type_node: @@ -1535,7 +1568,7 @@ namespace pugi break; case nodetest_pi: - if (n.type() == node_pi && !strcmp(n.name(), m_contents)) + if (n.type() == node_pi && impl::strequal(n.name(), m_contents)) ns.push_back(n); break; @@ -1545,7 +1578,7 @@ namespace pugi break; case nodetest_all_in_namespace: - if (n.type() == node_element && !strncmp(n.name(), m_contents, strlen(m_contents))) + if (n.type() == node_element && impl::strequalrange(n.name(), m_contents, impl::strlen(m_contents))) ns.push_back(n); break; @@ -1898,18 +1931,18 @@ namespace pugi } } - void set_contents(const char* value, xpath_allocator& a) + void set_contents(const char_t* value, xpath_allocator& a) { if (value) { - char* c = static_cast(a.alloc(strlen(value) + 1)); - strcpy(c, value); + char_t* c = static_cast(a.alloc((impl::strlen(value) + 1) * sizeof(char_t))); + impl::strcpy(c, value); m_contents = c; } else m_contents = 0; } public: - xpath_ast_node(ast_type_t type, const char* contents, xpath_allocator& a): m_type(type), + xpath_ast_node(ast_type_t 
type, const char_t* contents, xpath_allocator& a): m_type(type), m_rettype(xpath_type_none), m_left(0), m_right(0), m_third(0), m_next(0), m_contents(0), m_axis(axis_self), m_test(nodetest_none) { @@ -1928,7 +1961,7 @@ namespace pugi { } - xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char* contents, xpath_allocator& a): + xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents, xpath_allocator& a): m_type(type), m_rettype(xpath_type_none), m_left(left), m_right(0), m_third(0), m_next(0), m_contents(0), m_axis(axis), m_test(test) { @@ -1958,10 +1991,10 @@ namespace pugi else return m_right->eval_boolean(c); case ast_op_equal: - return compare_eq, equal_to, equal_to >::run(m_left, m_right, c); + return compare_eq, equal_to, equal_to >::run(m_left, m_right, c); case ast_op_not_equal: - return compare_eq, not_equal_to, not_equal_to >::run(m_left, m_right, c); + return compare_eq, not_equal_to, not_equal_to >::run(m_left, m_right, c); case ast_op_less: return compare_rel >::run(m_left, m_right, c); @@ -1980,10 +2013,10 @@ namespace pugi case ast_func_contains: { - std::string lr = m_left->eval_string(c); - std::string rr = m_right->eval_string(c); + string_t lr = m_left->eval_string(c); + string_t rr = m_right->eval_string(c); - return rr.empty() || lr.find(rr) != std::string::npos; + return rr.empty() || lr.find(rr) != string_t::npos; } case ast_func_boolean: @@ -2002,18 +2035,18 @@ namespace pugi { if (c.n.attribute()) return false; - std::string lang = m_left->eval_string(c); + string_t lang = m_left->eval_string(c); for (xml_node n = c.n.node(); n; n = n.parent()) { - xml_attribute a = n.attribute("xml:lang"); + xml_attribute a = n.attribute(PUGIXML_TEXT("xml:lang")); if (a) { - const char* value = a.value(); + const char_t* value = a.value(); // strnicmp / strncasecmp is not portable - for (const char* lit = lang.c_str(); *lit; ++lit) + for (const char_t* lit = 
lang.c_str(); *lit; ++lit) { if (tolower(*lit) != tolower(*value)) return false; ++value; @@ -2145,7 +2178,7 @@ namespace pugi } } - std::string eval_string(xpath_context& c) + string_t eval_string(xpath_context& c) { switch (m_type) { @@ -2163,7 +2196,7 @@ namespace pugi case ast_func_local_name_1: { xpath_node_set ns = m_left->eval_node_set(c); - if (ns.empty()) return ""; + if (ns.empty()) return string_t(); xpath_node na = ns.first(); @@ -2182,7 +2215,7 @@ namespace pugi case ast_func_name_1: { xpath_node_set ns = m_left->eval_node_set(c); - if (ns.empty()) return ""; + if (ns.empty()) return string_t(); xpath_node na = ns.first(); @@ -2201,7 +2234,7 @@ namespace pugi case ast_func_namespace_uri_1: { xpath_node_set ns = m_left->eval_node_set(c); - if (ns.empty()) return ""; + if (ns.empty()) return string_t(); xpath_node na = ns.first(); @@ -2217,7 +2250,7 @@ namespace pugi case ast_func_concat: { - std::string r = m_left->eval_string(c); + string_t r = m_left->eval_string(c); for (xpath_ast_node* n = m_right; n; n = n->m_next) r += n->eval_string(c); @@ -2227,31 +2260,31 @@ namespace pugi case ast_func_substring_before: { - std::string s = m_left->eval_string(c); - std::string::size_type pos = s.find(m_right->eval_string(c)); + string_t s = m_left->eval_string(c); + string_t::size_type pos = s.find(m_right->eval_string(c)); - if (pos == std::string::npos) return ""; - else return std::string(s.begin(), s.begin() + pos); + if (pos == string_t::npos) return string_t(); + else return string_t(s.begin(), s.begin() + pos); } case ast_func_substring_after: { - std::string s = m_left->eval_string(c); - std::string p = m_right->eval_string(c); + string_t s = m_left->eval_string(c); + string_t p = m_right->eval_string(c); - std::string::size_type pos = s.find(p); + string_t::size_type pos = s.find(p); - if (pos == std::string::npos) return ""; - else return std::string(s.begin() + pos + p.length(), s.end()); + if (pos == string_t::npos) return string_t(); + else return 
string_t(s.begin() + pos + p.length(), s.end()); } case ast_func_substring_2: { - std::string s = m_left->eval_string(c); + string_t s = m_left->eval_string(c); double first = ieee754_round(m_right->eval_number(c)); - if (is_nan(first)) return ""; // NaN - else if (first >= s.length() + 1) return ""; + if (is_nan(first)) return string_t(); // NaN + else if (first >= s.length() + 1) return string_t(); size_t pos = first < 1 ? 1 : (size_t)first; @@ -2260,13 +2293,13 @@ namespace pugi case ast_func_substring_3: { - std::string s = m_left->eval_string(c); + string_t s = m_left->eval_string(c); double first = ieee754_round(m_right->eval_number(c)); double last = first + ieee754_round(m_third->eval_number(c)); - if (is_nan(first) || is_nan(last)) return ""; - else if (first >= s.length() + 1) return ""; - else if (first >= last) return ""; + if (is_nan(first) || is_nan(last)) return string_t(); + else if (first >= s.length() + 1) return string_t(); + else if (first >= last) return string_t(); size_t pos = first < 1 ? 1 : (size_t)first; size_t end = last >= s.length() + 1 ? s.length() + 1 : (size_t)last; @@ -2280,12 +2313,12 @@ namespace pugi case ast_func_normalize_space_0: case ast_func_normalize_space_1: { - std::string s = m_type == ast_func_normalize_space_0 ? string_value(c.n) : m_left->eval_string(c); + string_t s = m_type == ast_func_normalize_space_0 ? 
string_value(c.n) : m_left->eval_string(c); - std::string r; + string_t r; r.reserve(s.size()); - for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) + for (string_t::const_iterator it = s.begin(); it != s.end(); ++it) { if (is_chartypex(*it, ctx_space)) { @@ -2295,8 +2328,8 @@ namespace pugi else r += *it; } - std::string::size_type pos = r.find_last_not_of(' '); - if (pos == std::string::npos) r = ""; + string_t::size_type pos = r.find_last_not_of(' '); + if (pos == string_t::npos) r = string_t(); else r.erase(r.begin() + pos + 1, r.end()); return r; @@ -2304,15 +2337,15 @@ namespace pugi case ast_func_translate: { - std::string s = m_left->eval_string(c); - std::string from = m_right->eval_string(c); - std::string to = m_third->eval_string(c); + string_t s = m_left->eval_string(c); + string_t from = m_right->eval_string(c); + string_t to = m_third->eval_string(c); - for (std::string::iterator it = s.begin(); it != s.end(); ) + for (string_t::iterator it = s.begin(); it != s.end(); ) { - std::string::size_type pos = from.find(*it); + string_t::size_type pos = from.find(*it); - if (pos == std::string::npos) + if (pos == string_t::npos) ++it; else if (pos >= to.length()) it = s.erase(it); @@ -2328,7 +2361,7 @@ namespace pugi switch (m_rettype) { case xpath_type_boolean: - return eval_boolean(c) ? "true" : "false"; + return eval_boolean(c) ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"); case xpath_type_number: return convert_number_to_string(eval_number(c)); @@ -2336,12 +2369,12 @@ namespace pugi case xpath_type_node_set: { xpath_node_set ns = eval_node_set(c); - return ns.empty() ? std::string("") : string_value(ns.first()); + return ns.empty() ? 
string_t() : string_value(ns.first()); } default: assert(!"Wrong expression for ret type string"); - return ""; + return string_t(); } } } @@ -2835,100 +2868,100 @@ namespace pugi xpath_parser(const xpath_parser&); xpath_parser& operator=(const xpath_parser&); - ast_type_t parse_function_name(const std::string& name, size_t argc) + ast_type_t parse_function_name(const string_t& name, size_t argc) { switch (name[0]) { case 'b': - if (name == "boolean" && argc == 1) + if (name == PUGIXML_TEXT("boolean") && argc == 1) return ast_func_boolean; break; case 'c': - if (name == "count" && argc == 1) + if (name == PUGIXML_TEXT("count") && argc == 1) return ast_func_count; - else if (name == "contains" && argc == 2) + else if (name == PUGIXML_TEXT("contains") && argc == 2) return ast_func_contains; - else if (name == "concat" && argc == 2) + else if (name == PUGIXML_TEXT("concat") && argc == 2) return ast_func_concat; - else if (name == "ceiling" && argc == 1) + else if (name == PUGIXML_TEXT("ceiling") && argc == 1) return ast_func_ceiling; break; case 'f': - if (name == "false" && argc == 0) + if (name == PUGIXML_TEXT("false") && argc == 0) return ast_func_false; - else if (name == "floor" && argc == 1) + else if (name == PUGIXML_TEXT("floor") && argc == 1) return ast_func_floor; break; case 'i': - if (name == "id" && argc == 1) + if (name == PUGIXML_TEXT("id") && argc == 1) return ast_func_id; break; case 'l': - if (name == "last" && argc == 0) + if (name == PUGIXML_TEXT("last") && argc == 0) return ast_func_last; - else if (name == "lang" && argc == 1) + else if (name == PUGIXML_TEXT("lang") && argc == 1) return ast_func_lang; - else if (name == "local-name" && argc <= 1) + else if (name == PUGIXML_TEXT("local-name") && argc <= 1) return argc == 0 ? ast_func_local_name_0 : ast_func_local_name_1; break; case 'n': - if (name == "name" && argc <= 1) + if (name == PUGIXML_TEXT("name") && argc <= 1) return argc == 0 ? 
ast_func_name_0 : ast_func_name_1; - else if (name == "namespace-uri" && argc <= 1) + else if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1) return argc == 0 ? ast_func_namespace_uri_0 : ast_func_namespace_uri_1; - else if (name == "normalize-space" && argc <= 1) + else if (name == PUGIXML_TEXT("normalize-space") && argc <= 1) return argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1; - else if (name == "not" && argc == 1) + else if (name == PUGIXML_TEXT("not") && argc == 1) return ast_func_not; - else if (name == "number" && argc <= 1) + else if (name == PUGIXML_TEXT("number") && argc <= 1) return argc == 0 ? ast_func_number_0 : ast_func_number_1; break; case 'p': - if (name == "position" && argc == 0) + if (name == PUGIXML_TEXT("position") && argc == 0) return ast_func_position; break; case 'r': - if (name == "round" && argc == 1) + if (name == PUGIXML_TEXT("round") && argc == 1) return ast_func_round; break; case 's': - if (name == "string" && argc <= 1) + if (name == PUGIXML_TEXT("string") && argc <= 1) return argc == 0 ? ast_func_string_0 : ast_func_string_1; - else if (name == "string-length" && argc <= 1) + else if (name == PUGIXML_TEXT("string-length") && argc <= 1) return argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1; - else if (name == "starts-with" && argc == 2) + else if (name == PUGIXML_TEXT("starts-with") && argc == 2) return ast_func_starts_with; - else if (name == "substring-before" && argc == 2) + else if (name == PUGIXML_TEXT("substring-before") && argc == 2) return ast_func_substring_before; - else if (name == "substring-after" && argc == 2) + else if (name == PUGIXML_TEXT("substring-after") && argc == 2) return ast_func_substring_after; - else if (name == "substring" && (argc == 2 || argc == 3)) + else if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3)) return argc == 2 ? 
ast_func_substring_2 : ast_func_substring_3; - else if (name == "sum" && argc == 1) + else if (name == PUGIXML_TEXT("sum") && argc == 1) return ast_func_sum; break; case 't': - if (name == "translate" && argc == 3) + if (name == PUGIXML_TEXT("translate") && argc == 3) return ast_func_translate; - else if (name == "true" && argc == 0) + else if (name == PUGIXML_TEXT("true") && argc == 0) return ast_func_true; break; @@ -2937,62 +2970,62 @@ namespace pugi return ast_none; } - axis_t parse_axis_name(const std::string& name, bool& specified) + axis_t parse_axis_name(const string_t& name, bool& specified) { specified = true; switch (name[0]) { case 'a': - if (name == "ancestor") + if (name == PUGIXML_TEXT("ancestor")) return axis_ancestor; - else if (name == "ancestor-or-self") + else if (name == PUGIXML_TEXT("ancestor-or-self")) return axis_ancestor_or_self; - else if (name == "attribute") + else if (name == PUGIXML_TEXT("attribute")) return axis_attribute; break; case 'c': - if (name == "child") + if (name == PUGIXML_TEXT("child")) return axis_child; break; case 'd': - if (name == "descendant") + if (name == PUGIXML_TEXT("descendant")) return axis_descendant; - else if (name == "descendant-or-self") + else if (name == PUGIXML_TEXT("descendant-or-self")) return axis_descendant_or_self; break; case 'f': - if (name == "following") + if (name == PUGIXML_TEXT("following")) return axis_following; - else if (name == "following-sibling") + else if (name == PUGIXML_TEXT("following-sibling")) return axis_following_sibling; break; case 'n': - if (name == "namespace") + if (name == PUGIXML_TEXT("namespace")) return axis_namespace; break; case 'p': - if (name == "parent") + if (name == PUGIXML_TEXT("parent")) return axis_parent; - else if (name == "preceding") + else if (name == PUGIXML_TEXT("preceding")) return axis_preceding; - else if (name == "preceding-sibling") + else if (name == PUGIXML_TEXT("preceding-sibling")) return axis_preceding_sibling; break; case 's': - if (name == 
"self") + if (name == PUGIXML_TEXT("self")) return axis_self; break; @@ -3002,30 +3035,30 @@ namespace pugi return axis_child; } - nodetest_t parse_node_test_type(const char* name) + nodetest_t parse_node_test_type(const char_t* name) { switch (name[0]) { case 'c': - if (!strcmp(name, "comment")) + if (impl::strequal(name, PUGIXML_TEXT("comment"))) return nodetest_type_comment; break; case 'n': - if (!strcmp(name, "node")) + if (impl::strequal(name, PUGIXML_TEXT("node"))) return nodetest_type_node; break; case 'p': - if (!strcmp(name, "processing-instruction")) + if (impl::strequal(name, PUGIXML_TEXT("processing-instruction"))) return nodetest_type_pi; break; case 't': - if (!strcmp(name, "text")) + if (impl::strequal(name, PUGIXML_TEXT("text"))) return nodetest_type_text; break; @@ -3087,10 +3120,10 @@ namespace pugi xpath_ast_node* args[4]; size_t argc = 0; - std::string function = m_lexer.contents(); + string_t function = m_lexer.contents(); m_lexer.next(); - bool func_concat = (function == "concat"); + bool func_concat = (function == PUGIXML_TEXT("concat")); xpath_ast_node* last_concat = 0; if (m_lexer.current() != lex_open_brace) @@ -3160,7 +3193,9 @@ namespace pugi { m_lexer.next(); - n = new (m_alloc.node()) xpath_ast_node(ast_filter, n, parse_expression(), axis_child); + xpath_ast_node* expr = parse_expression(); + + n = new (m_alloc.node()) xpath_ast_node(ast_filter, n, expr, axis_child); if (m_lexer.current() != lex_close_square_brace) throw xpath_exception("Unmatched square brace"); @@ -3202,7 +3237,7 @@ namespace pugi } nodetest_t nt_type = nodetest_none; - std::string nt_name; + string_t nt_name; if (m_lexer.current() == lex_string) { @@ -3251,9 +3286,9 @@ namespace pugi if (nt_type == nodetest_none) throw xpath_exception("Unrecognized node type"); - nt_name = ""; + nt_name = string_t(); } - else if (nt_name == "processing-instruction") + else if (nt_name == PUGIXML_TEXT("processing-instruction")) { if (m_lexer.current() != lex_quoted_string) throw 
xpath_exception("Only literals are allowed as arguments to processing-instruction()"); @@ -3273,7 +3308,7 @@ namespace pugi // QName or NCName:* else { - std::string::size_type colon_pos = nt_name.find(':'); + string_t::size_type colon_pos = nt_name.find(':'); if (nt_name.size() > 2 && colon_pos == nt_name.size() - 2 && nt_name[nt_name.size() - 1] == '*') // NCName:* { @@ -3300,7 +3335,9 @@ namespace pugi { m_lexer.next(); - xpath_ast_node* pred = new (m_alloc.node()) xpath_ast_node(ast_predicate, parse_expression(), 0, axis); + xpath_ast_node* expr = parse_expression(); + + xpath_ast_node* pred = new (m_alloc.node()) xpath_ast_node(ast_predicate, expr, 0, axis); if (m_lexer.current() != lex_close_square_brace) throw xpath_exception("unmatched square brace"); @@ -3341,7 +3378,7 @@ namespace pugi if (m_lexer.current() == lex_slash) { // Save state for next lexeme - that is, whatever follows '/' - const char* state = 0; // gcc3 "variable might be used uninitialized in this function" bug workaround + const char_t* state = 0; // gcc3 "variable might be used uninitialized in this function" bug workaround state = m_lexer.state(); m_lexer.next(); @@ -3395,7 +3432,7 @@ namespace pugi if (m_lexer.current() == lex_string) { // This is either a function call, or not - if not, we shall proceed with location path - const char* state = m_lexer.state(); + const char_t* state = m_lexer.state(); while (is_chartypex(*state, ctx_space)) ++state; @@ -3433,7 +3470,9 @@ namespace pugi { m_lexer.next(); - n = new (m_alloc.node()) xpath_ast_node(ast_op_union, n, parse_union_expression()); + xpath_ast_node* expr = parse_union_expression(); + + n = new (m_alloc.node()) xpath_ast_node(ast_op_union, n, expr); } return n; @@ -3446,7 +3485,9 @@ namespace pugi { m_lexer.next(); - return new (m_alloc.node()) xpath_ast_node(ast_op_negate, parse_unary_expression()); + xpath_ast_node* expr = parse_unary_expression(); + + return new (m_alloc.node()) xpath_ast_node(ast_op_negate, expr); } else return 
parse_union_expression(); } @@ -3460,13 +3501,15 @@ namespace pugi xpath_ast_node* n = parse_unary_expression(); while (m_lexer.current() == lex_multiply || (m_lexer.current() == lex_string && - (!strcmp(m_lexer.contents(), "mod") || !strcmp(m_lexer.contents(), "div")))) + (impl::strequal(m_lexer.contents(), PUGIXML_TEXT("mod")) || impl::strequal(m_lexer.contents(), PUGIXML_TEXT("div"))))) { ast_type_t op = m_lexer.current() == lex_multiply ? ast_op_multiply : - !strcmp(m_lexer.contents(), "div") ? ast_op_divide : ast_op_mod; + impl::strequal(m_lexer.contents(), PUGIXML_TEXT("div")) ? ast_op_divide : ast_op_mod; m_lexer.next(); - n = new (m_alloc.node()) xpath_ast_node(op, n, parse_unary_expression()); + xpath_ast_node* expr = parse_unary_expression(); + + n = new (m_alloc.node()) xpath_ast_node(op, n, expr); } return n; @@ -3485,7 +3528,9 @@ namespace pugi m_lexer.next(); - n = new (m_alloc.node()) xpath_ast_node(l == lex_plus ? ast_op_add : ast_op_subtract, n, parse_multiplicative_expression()); + xpath_ast_node* expr = parse_multiplicative_expression(); + + n = new (m_alloc.node()) xpath_ast_node(l == lex_plus ? ast_op_add : ast_op_subtract, n, expr); } return n; @@ -3506,9 +3551,10 @@ namespace pugi lexeme_t l = m_lexer.current(); m_lexer.next(); + xpath_ast_node* expr = parse_additive_expression(); + n = new (m_alloc.node()) xpath_ast_node(l == lex_less ? ast_op_less : l == lex_greater ? ast_op_greater : - l == lex_less_or_equal ? ast_op_less_or_equal : ast_op_greater_or_equal, - n, parse_additive_expression()); + l == lex_less_or_equal ? ast_op_less_or_equal : ast_op_greater_or_equal, n, expr); } return n; @@ -3527,7 +3573,9 @@ namespace pugi m_lexer.next(); - n = new (m_alloc.node()) xpath_ast_node(l == lex_equal ? ast_op_equal : ast_op_not_equal, n, parse_relational_expression()); + xpath_ast_node* expr = parse_relational_expression(); + + n = new (m_alloc.node()) xpath_ast_node(l == lex_equal ? 
ast_op_equal : ast_op_not_equal, n, expr); } return n; @@ -3538,11 +3586,13 @@ namespace pugi { xpath_ast_node* n = parse_equality_expression(); - while (m_lexer.current() == lex_string && !strcmp(m_lexer.contents(), "and")) + while (m_lexer.current() == lex_string && impl::strequal(m_lexer.contents(), PUGIXML_TEXT("and"))) { m_lexer.next(); - n = new (m_alloc.node()) xpath_ast_node(ast_op_and, n, parse_equality_expression()); + xpath_ast_node* expr = parse_equality_expression(); + + n = new (m_alloc.node()) xpath_ast_node(ast_op_and, n, expr); } return n; @@ -3553,11 +3603,13 @@ namespace pugi { xpath_ast_node* n = parse_and_expression(); - while (m_lexer.current() == lex_string && !strcmp(m_lexer.contents(), "or")) + while (m_lexer.current() == lex_string && impl::strequal(m_lexer.contents(), PUGIXML_TEXT("or"))) { m_lexer.next(); - n = new (m_alloc.node()) xpath_ast_node(ast_op_or, n, parse_and_expression()); + xpath_ast_node* expr = parse_and_expression(); + + n = new (m_alloc.node()) xpath_ast_node(ast_op_or, n, expr); } return n; @@ -3570,7 +3622,7 @@ namespace pugi } public: - explicit xpath_parser(const char* query, xpath_allocator& alloc): m_alloc(alloc), m_lexer(query) + explicit xpath_parser(const char_t* query, xpath_allocator& alloc): m_alloc(alloc), m_lexer(query) { } @@ -3588,7 +3640,7 @@ namespace pugi } }; - xpath_query::xpath_query(const char* query): m_alloc(0), m_root(0) + xpath_query::xpath_query(const char_t* query): m_alloc(0), m_root(0) { compile(query); } @@ -3598,7 +3650,7 @@ namespace pugi delete m_alloc; } - void xpath_query::compile(const char* query) + void xpath_query::compile(const char_t* query) { delete m_alloc; m_alloc = new xpath_allocator; @@ -3644,9 +3696,9 @@ namespace pugi return m_root->eval_number(c); } - std::string xpath_query::evaluate_string(const xml_node& n) const + string_t xpath_query::evaluate_string(const xml_node& n) const { - if (!m_root) return std::string(); + if (!m_root) return string_t(); xpath_context c; 
@@ -3673,7 +3725,7 @@ namespace pugi return m_root->eval_node_set(c); } - xpath_node xml_node::select_single_node(const char* query) const + xpath_node xml_node::select_single_node(const char_t* query) const { xpath_query q(query); return select_single_node(q); @@ -3685,7 +3737,7 @@ namespace pugi return s.empty() ? xpath_node() : s.first(); } - xpath_node_set xml_node::select_nodes(const char* query) const + xpath_node_set xml_node::select_nodes(const char_t* query) const { xpath_query q(query); return select_nodes(q); -- cgit v1.2.3