From 837ced350c5123c21c32154f1f2dc483238f7629 Mon Sep 17 00:00:00 2001 From: mloy Date: Thu, 30 Oct 2014 14:30:05 +0100 Subject: load_buffer_impl always checks if buffer is valid pointer and size > 0 added some tests to force invalid buffer and size = 0 --- src/pugixml.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index b39aad0..47aba28 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -4292,7 +4292,12 @@ PUGI__NS_BEGIN PUGI__FN xml_parse_result load_buffer_impl(xml_document_struct* doc, xml_node_struct* root, void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own, char_t** out_buffer) { // check input buffer - assert(contents || size == 0); + if ((contents==NULL) && (size!=0)) { + xml_parse_result result; + result.status = status_no_document_element; + return result; + } + // get actual encoding xml_encoding buffer_encoding = impl::get_buffer_encoding(encoding, contents, size); -- cgit v1.2.3 From e94552c9ca883f8c4f2cead24355a60ecba0efb2 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Thu, 12 Feb 2015 08:12:12 -0800 Subject: DOCTYPE parsing is now stackless This prevents malformed input XML with very deeply recursive DOCTYPE sections from crashing the parser. Fixes #29. --- src/pugixml.cpp | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'src') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 265337a..0f696ab 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -2357,23 +2357,28 @@ PUGI__NS_BEGIN char_t* parse_doctype_ignore(char_t* s) { + size_t depth = 0; + assert(s[0] == '<' && s[1] == '!' && s[2] == '['); - s++; + s += 3; while (*s) { if (s[0] == '<' && s[1] == '!' && s[2] == '[') { // nested ignore section - s = parse_doctype_ignore(s); - if (!s) return s; + s += 3; + depth++; } else if (s[0] == ']' && s[1] == ']' && s[2] == '>') { // ignore section end s += 3; - return s; + if (depth == 0) + return s; + + depth--; } else s++; } @@ -2381,10 +2386,12 @@ PUGI__NS_BEGIN PUGI__THROW_ERROR(status_bad_doctype, s); } - char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel) + char_t* parse_doctype_group(char_t* s, char_t endch) { + size_t depth = 0; + assert((s[0] == '<' || s[0] == 0) && s[1] == '!'); - s++; + s += 2; while (*s) { @@ -2399,12 +2406,8 @@ PUGI__NS_BEGIN else { // some control group - s = parse_doctype_group(s, endch, false); - if (!s) return s; - - // skip > - assert(*s == '>'); - s++; + s += 2; + depth++; } } else if (s[0] == '<' || s[0] == '"' || s[0] == '\'') @@ -2415,12 +2418,16 @@ PUGI__NS_BEGIN } else if (*s == '>') { - return s; + if (depth == 0) + return s; + + depth--; + s++; } else s++; } - if (!toplevel || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s); + if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s); return s; } @@ -2512,7 +2519,7 @@ PUGI__NS_BEGIN char_t* mark = s + 9; - s = parse_doctype_group(s, endch, true); + s = parse_doctype_group(s, endch); if (!s) return s; assert((*s == 0 && endch == '>') || *s == '>'); -- cgit v1.2.3 From cb04ab2700611f68f8690e73b21c34024a13acc6 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Wed, 4 Mar 2015 10:38:42 -0800 Subject: Fix string length for translate and normalize-space The implementations generated a string with an internal null terminator; this went unnoticed since unit test string verification did not perform string equality check properly (it compared XPath string result as a C-string, thus stopping at the first null terminator). Fixes #36. --- src/pugixml.cpp | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 0f696ab..d8a6888 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -7447,7 +7447,7 @@ PUGI__NS_BEGIN return node.attribute() ? namespace_uri(node.attribute(), node.parent()) : namespace_uri(node.node()); } - PUGI__FN void normalize_space(char_t* buffer) + PUGI__FN char_t* normalize_space(char_t* buffer) { char_t* write = buffer; @@ -7471,9 +7471,11 @@ PUGI__NS_BEGIN // zero-terminate *write = 0; + + return write; } - PUGI__FN void translate(char_t* buffer, const char_t* from, const char_t* to, size_t to_length) + PUGI__FN char_t* translate(char_t* buffer, const char_t* from, const char_t* to, size_t to_length) { char_t* write = buffer; @@ -7491,6 +7493,8 @@ PUGI__NS_BEGIN // zero-terminate *write = 0; + + return write; } PUGI__FN unsigned char* translate_table_generate(xpath_allocator* alloc, const char_t* from, const char_t* to) @@ -7527,7 +7531,7 @@ PUGI__NS_BEGIN return static_cast(result); } - PUGI__FN void translate_table(char_t* buffer, const unsigned char* table) + PUGI__FN char_t* translate_table(char_t* buffer, const unsigned char* table) { char_t* write = buffer; @@ -7553,6 +7557,8 @@ PUGI__NS_BEGIN // zero-terminate *write = 0; + + return write; } inline bool is_xpath_attribute(const char_t* name) @@ -9659,18 +9665,20 @@ PUGI__NS_BEGIN { xpath_string s = string_value(c.n, stack.result); - normalize_space(s.data(stack.result)); + char_t* begin = s.data(stack.result); + char_t* end = normalize_space(begin); - return s; + return xpath_string::from_heap_preallocated(begin, end); } case ast_func_normalize_space_1: { xpath_string s = _left->eval_string(c, stack); - normalize_space(s.data(stack.result)); + char_t* begin = s.data(stack.result); + char_t* end = normalize_space(begin); - return s; + return xpath_string::from_heap_preallocated(begin, end); } case ast_func_translate: @@ -9683,18 +9691,20 @@ PUGI__NS_BEGIN xpath_string from = _right->eval_string(c, swapped_stack); xpath_string to = _right->_next->eval_string(c, swapped_stack); - translate(s.data(stack.result), from.c_str(), to.c_str(), to.length()); + char_t* begin = s.data(stack.result); + char_t* end = translate(begin, from.c_str(), to.c_str(), to.length()); - return s; + return xpath_string::from_heap_preallocated(begin, end); } case ast_opt_translate_table: { xpath_string s = _left->eval_string(c, stack); - translate_table(s.data(stack.result), _data.table); + char_t* begin = s.data(stack.result); + char_t* end = translate_table(begin, _data.table); - return s; + return xpath_string::from_heap_preallocated(begin, end); } case ast_variable: -- cgit v1.2.3 From 9749920c8204930f868fed7fcf38ea2cc2b5a2ec Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Thu, 5 Mar 2015 11:35:39 -0800 Subject: Refactor contents=0 behavior Also change the error code to status_io_error --- src/pugixml.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'src') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 787f693..fa41058 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -4316,12 +4316,7 @@ PUGI__NS_BEGIN PUGI__FN xml_parse_result load_buffer_impl(xml_document_struct* doc, xml_node_struct* root, void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own, char_t** out_buffer) { // check input buffer - if ((contents==NULL) && (size!=0)) { - xml_parse_result result; - result.status = status_no_document_element; - return result; - } - + if (!contents && size) return make_parse_result(status_io_error); // get actual encoding xml_encoding buffer_encoding = impl::get_buffer_encoding(encoding, contents, size); -- cgit v1.2.3 From 23060d095447ca7c47a9c0698ec731197cebc80b Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Thu, 5 Mar 2015 12:50:29 -0800 Subject: Use more efficient encoding for string headers Since all string allocations are pointer-aligned to avoid aligning more frequent node allocations, we can rely on that in string encoding. Encoding page offset and block size in sizeof(void*) units increases the maximum memory page size from 64k to 256k on 32-bit and 512k on 64-bit platforms. Fixes #35. --- src/pugixml.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index fa41058..6c88d55 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -400,7 +400,9 @@ PUGI__NS_BEGIN char_t* allocate_string(size_t length) { - PUGI__STATIC_ASSERT(xml_memory_page_size <= (1 << 16)); + static const size_t max_encoded_offset = (1 << 16) * sizeof(void*); + + PUGI__STATIC_ASSERT(xml_memory_page_size <= max_encoded_offset); // allocate memory for string and header block size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t); @@ -416,12 +418,14 @@ PUGI__NS_BEGIN // setup header ptrdiff_t page_offset = reinterpret_cast(header) - reinterpret_cast(page) - sizeof(xml_memory_page); - assert(page_offset >= 0 && page_offset < (1 << 16)); - header->page_offset = static_cast(page_offset); + assert(page_offset % sizeof(void*) == 0); + assert(page_offset >= 0 && static_cast(page_offset) < max_encoded_offset); + header->page_offset = static_cast(static_cast(page_offset) / sizeof(void*)); // full_size == 0 for large strings that occupy the whole page - assert(full_size < (1 << 16) || (page->busy_size == full_size && page_offset == 0)); - header->full_size = static_cast(full_size < (1 << 16) ? full_size : 0); + assert(full_size % sizeof(void*) == 0); + assert(full_size < max_encoded_offset || (page->busy_size == full_size && page_offset == 0)); + header->full_size = static_cast(full_size < max_encoded_offset ? full_size / sizeof(void*) : 0); // round-trip through void* to avoid 'cast increases required alignment of target type' warning // header is guaranteed a pointer-sized alignment, which should be enough for char_t @@ -438,11 +442,11 @@ PUGI__NS_BEGIN assert(header); // deallocate - size_t page_offset = sizeof(xml_memory_page) + header->page_offset; + size_t page_offset = sizeof(xml_memory_page) + header->page_offset * sizeof(void*); xml_memory_page* page = reinterpret_cast(static_cast(reinterpret_cast(header) - page_offset)); // if full_size == 0 then this string occupies the whole page - size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size; + size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size * sizeof(void*); deallocate_memory(header, full_size, page); } -- cgit v1.2.3 From 604861e520d2d6579674a1c2bd5e59cb10f7ecd2 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 10 Mar 2015 09:03:22 -0700 Subject: Escape ?> sequence in PI value during printing This prevents malformed PI value from breaking the document structure. --- src/pugixml.cpp | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 6c88d55..ce8a79f 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -3462,6 +3462,27 @@ PUGI__NS_BEGIN writer.write('-', '-', '>'); } + PUGI__FN void node_output_pi_value(xml_buffered_writer& writer, const char_t* s) + { + while (*s) + { + const char_t* prev = s; + + // look for ?> sequence - we can't output it since ?> terminates PI + while (*s && !(s[0] == '?' && s[1] == '>')) ++s; + + writer.write_buffer(prev, static_cast(s - prev)); + + if (*s) + { + assert(s[0] == '?' && s[1] == '>'); + + writer.write('?', ' ', '>'); + s += 2; + } + } + } + PUGI__FN void node_output_attributes(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags) { const char_t* default_name = PUGIXML_TEXT(":anonymous"); @@ -3575,7 +3596,7 @@ PUGI__NS_BEGIN if (node->value) { writer.write(' '); - writer.write_string(node->value); + node_output_pi_value(writer, node->value); } writer.write('?', '>'); -- cgit v1.2.3