From 678d2f2369a58d6db9a7bb9e732ce2589086961a Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Wed, 30 Jun 2010 14:29:14 +0000 Subject: docs: Added error handling, parsing options and encoding sections, minor spelling fix git-svn-id: http://pugixml.googlecode.com/svn/trunk@553 99668b35-9821-0410-8761-19e4c4f06640 --- docs/manual.qbk | 274 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 216 insertions(+), 58 deletions(-) diff --git a/docs/manual.qbk b/docs/manual.qbk index 416fc0d..97ccde0 100644 --- a/docs/manual.qbk +++ b/docs/manual.qbk @@ -540,7 +540,7 @@ Sometimes XML data should be loaded from some other source than file, i.e. HTTP All functions accept the buffer which is represented by a pointer to XML data, `contents`, and data size in bytes. Also there are two optional arguments, which specify parsing options (see [sref manual.loading.options]) and input data encoding (see [sref manual.loading.encoding]). The buffer does not have to be zero-terminated. -`load_buffer` function works with immutable buffer - it does not ever modify the buffer. Because of this restriction it has to create a private buffer and copy XML data to it before parsing (applying encoding conversions if necessary). This copy operation carries a performance penalty, so inplace functions are provided - `load_buffer_inplace` and `load_buffer_inplace_own` store the document data in the buffer, modifying it in the process. In order for the document to stay valid, you have to make sure that the buffers lifetime exceeds that of the tree if you're using inplace functions. In addition to that, `load_buffer_inplace` does not assume ownership of the buffer, so you'll have to destroy it yourself; `load_buffer_inplace_own` assumes ownership of the buffer and destroys it once it is not needed. 
This means that if you're using `load_buffer_inplace_own`, you have to allocate memory with pugixml allocation function (you can get it via [link get_memory_allocation_function]). +`load_buffer` function works with immutable buffer - it does not ever modify the buffer. Because of this restriction it has to create a private buffer and copy XML data to it before parsing (applying encoding conversions if necessary). This copy operation carries a performance penalty, so inplace functions are provided - `load_buffer_inplace` and `load_buffer_inplace_own` store the document data in the buffer, modifying it in the process. In order for the document to stay valid, you have to make sure that the buffer's lifetime exceeds that of the tree if you're using inplace functions. In addition to that, `load_buffer_inplace` does not assume ownership of the buffer, so you'll have to destroy it yourself; `load_buffer_inplace_own` assumes ownership of the buffer and destroys it once it is not needed. This means that if you're using `load_buffer_inplace_own`, you have to allocate memory with pugixml allocation function (you can get it via [link get_memory_allocation_function]). The best way from the performance/memory point of view is to load document using `load_buffer_inplace_own`; this function has maximum control of the buffer with XML data so it is able to avoid redundant copies and reduce peak memory usage while parsing. This is the recommended function if you have to load the document from memory and performance is critical. @@ -584,19 +584,173 @@ Stream loading requires working seek/tell functions and therefore may fail when [endsect] [/stream] [section:errors Handling parsing errors] -foo -concise syntax (if (!doc.load(...)) ...) + +[#xml_parse_result] +All document loading functions return the parsing result via `xml_parse_result` object. 
It contains parsing status, the offset of last successfully parsed character from the beginning of the source stream, and the encoding of the source stream: + + struct xml_parse_result + { + xml_parse_status status; + ptrdiff_t offset; + xml_encoding encoding; + + operator bool() const; + const char* description() const; + }; + +[#xml_parse_status] +[#xml_parse_result::status] +Parsing status is represented as the `xml_parse_status` enumeration and can be one of the following: + +* [#status_ok] +`status_ok` means that no error was encountered during parsing; the source stream represents the valid XML document which was fully parsed and converted to a tree. +[lbr] + +* [#status_file_not_found] +`status_file_not_found` is only returned by `load_file` function and means that file could not be opened. +* [#status_io_error] +`status_io_error` is returned by `load_file` function and by `load` functions with `std::istream`/`std::wistream` arguments; it means that some I/O error has occurred during reading the file/stream. +* [#status_out_of_memory] +`status_out_of_memory` means that there was not enough memory during some allocation; any allocation failure during parsing results in this error. +* [#status_internal_error] +`status_internal_error` means that something went horribly wrong; currently this error does not occur. +[lbr] + +* [#status_unrecognized_tag] +`status_unrecognized_tag` means that parsing stopped due to a tag with either an empty name or a name which starts with incorrect character, such as [^#].
+* [#status_bad_pi] +`status_bad_pi` means that parsing stopped due to incorrect document declaration/processing instruction +* [#status_bad_comment][#status_bad_cdata][#status_bad_doctype][#status_bad_pcdata] +`status_bad_comment`, `status_bad_cdata`, `status_bad_doctype` and `status_bad_pcdata` mean that parsing stopped due to the invalid construct of the respective type +* [#status_bad_start_element] +`status_bad_start_element` means that parsing stopped because starting tag either had no closing `>` symbol or contained some incorrect symbol +* [#status_bad_attribute] +`status_bad_attribute` means that parsing stopped because there was an incorrect attribute, such as an attribute without value or with value that is not quoted (note that `<node attr=1>` is incorrect in XML) +* [#status_bad_end_element] +`status_bad_end_element` means that parsing stopped because ending tag had incorrect syntax (i.e. extra non-whitespace symbols between tag name and `>`) +* [#status_end_element_mismatch] +`status_end_element_mismatch` means that parsing stopped because the closing tag did not match the opening one (i.e. `<node></nedo>`) or because some tag was not closed at all + +[#xml_parse_result::description] +`description()` member function can be used to convert parsing status to a string; the returned message is always in English, so you'll have to write your own function if you need a localized string. However please note that the exact messages returned by `description()` function may change from version to version, so any complex status handling should be based on `status` value. + +If parsing failed because the source data was not a valid XML, the resulting tree is not destroyed - despite the fact that load function returns error, you can use the part of the tree that was successfully parsed.
Obviously, the last element may have an unexpected name/value; for example, if the attribute value does not end with the necessary quotation mark, like in [^` (document declaration) is not considered to be a PI. This flag is *off* by default. +[lbr] + +* [#parse_comments] +`parse_comments` determines if comments (nodes with type [link node_comment]) are to be put in DOM tree. If this flag is off, they are not put in the tree, but are still parsed and checked for correctness. This flag is *off* by default. +[lbr] + +* [#parse_cdata] +`parse_cdata` determines if CDATA sections (nodes with type [link node_cdata]) are to be put in DOM tree. If this flag is off, they are not put in the tree, but are still parsed and checked for correctness. This flag is *on* by default. +[lbr] + +* [#parse_ws_pcdata] +`parse_ws_pcdata` determines if PCDATA nodes (nodes with type [link node_pcdata]) that consist only of whitespace characters are to be put in DOM tree. Often whitespace-only data is not significant for the application, and the cost of allocating and storing such nodes (both memory and speed-wise) can be significant. For example, after parsing XML string `<node> <a/> </node>`, `<node>` element will have three children when `parse_ws_pcdata` is set (child with type `node_pcdata` and value `" "`, child with type `node_element` and name `"a"`, and another child with type `node_pcdata` and value `" "`), and only one child when `parse_ws_pcdata` is not set. This flag is *off* by default. + +These flags control the transformation of tree element contents: + +* [#parse_escapes] +`parse_escapes` determines if character and entity references are to be expanded during the parsing process. Character references have the form [^&#...;] or [^&#x...;] ([^...] is Unicode numeric representation of character in either decimal ([^&#...;]) or hexadecimal ([^&#x...;]) form), entity references are [^&lt;], [^&gt;], [^&amp;], [^&apos;] and [^&quot;] (note that as pugixml does not handle DTD, the only allowed entities are predefined ones).
If character/entity reference cannot be expanded, it is left as is, so you can do additional processing later. Reference expansion is performed in attribute values and PCDATA content. This flag is *on* by default. +[lbr] + +* [#parse_eol] +`parse_eol` determines if EOL handling (that is, replacing sequences `0x0d 0x0a` by a single `0x0a` character, and replacing all standalone `0x0d` characters by `0x0a`) is to be performed on input data (that is, comment contents, PCDATA/CDATA contents and attribute values). This flag is *on* by default. +[lbr] + +* [#parse_wconv_attribute] +`parse_wconv_attribute` determines if attribute value normalization should be performed for all attributes. This means that whitespace characters (new line, tab and space) are replaced with space (`' '`). New line characters are always treated as if `parse_eol` is set, i.e. `\r\n` is converted to a single space. This flag is *on* by default. + +Additionally there are two predefined option masks: + +* [#parse_minimal] +`parse_minimal` has all options turned off. This option mask means that pugixml does not add declaration nodes, PI nodes, CDATA sections and comments to the resulting tree and does not perform any conversion for input data, so theoretically it is the fastest mode. However, as discussed above, in practice `parse_default` is usually equally fast. +[lbr] + +* [#parse_default] +`parse_default` is the default set of flags, i.e. it has all options set to their default values. It includes parsing CDATA sections (comments/PIs are not parsed), performing character and entity reference expansion, replacing whitespace characters with spaces in attribute values and performing EOL handling. Note that PCDATA sections consisting only of whitespace characters are not parsed (by default) for performance reasons.
+ +This is a simple example of using different parsing options ([@samples/load_options.cpp]): + +[import samples/load_options.cpp] +[code_load_options] + [endsect] [/options] [section:encoding Encodings] -foo + +[#xml_encoding] +pugixml supports all popular Unicode encodings (UTF-8, UTF-16 (big and little endian), UTF-32 (big and little endian); UCS-2 is naturally supported since it's a strict subset of UTF-16) and handles all encoding conversions. Most loading functions accept the optional parameter `encoding`. This is a value of enumeration type `xml_encoding` that can have the following values: + +* [#encoding_auto] +`encoding_auto` means that pugixml will try to guess the encoding based on source XML data. The algorithm is a modified version of the one presented in Appendix F.1 of XML recommendation; it tries to match the first few bytes of input data with the following patterns in strict order: +[lbr] + * If first four bytes match UTF-32 BOM (Byte Order Mark), encoding is assumed to be UTF-32 with the endianness equal to that of BOM; + * If first two bytes match UTF-16 BOM, encoding is assumed to be UTF-16 with the endianness equal to that of BOM; + * If first three bytes match UTF-8 BOM, encoding is assumed to be UTF-8; + * If first four bytes match UTF-32 representation of [^<], encoding is assumed to be UTF-32 with the corresponding endianness; + * If first four bytes match UTF-16 representation of [^ >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; - * void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const; + * void print(std::ostream& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; + * void print(std::wostream& os, const char_t* indent =
PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const; * typedef xml_node_iterator iterator; * typedef xml_attribute_iterator attribute_iterator; @@ -1018,13 +1174,22 @@ Classes: * void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; [lbr] - * void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; - * void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const; + * void save(std::ostream& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + * void save(std::wostream& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const; [lbr] * bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; [lbr] +* `struct `[link xml_parse_result] + * `xml_parse_status `[link xml_parse_result::status status]`;` + * `ptrdiff_t `[link xml_parse_result::offset offset]`;` + * `xml_encoding `[link xml_parse_result::encoding encoding]`;` + [lbr] + + * `operator `[link xml_parse_result::bool bool]`() const;` + * `const char* `[link xml_parse_result::description description]`() const;` + * xpath_query * explicit xpath_query(const char_t* query); * ~xpath_query(); @@ -1043,8 +1208,8 @@ Classes: * virtual void write(const void* data, size_t size); * xml_writer_stream - * xml_writer_stream(std::basic_ostream >& stream); - * xml_writer_stream(std::basic_ostream >& stream); + * xml_writer_stream(std::ostream& stream); + * xml_writer_stream(std::wostream& stream); * virtual void write(const void* data, size_t size); * xml_node_iterator @@ -1059,13 +1224,6 @@ Classes: * 
virtual bool for_each(xml_node&) = 0; * virtual bool end(xml_node&); -* xml_parse_result - * xml_parse_status status; - * ptrdiff_t offset; - * xml_encoding encoding; - * operator bool() const - * const char* description() const; - * xpath_exception * explicit xpath_exception(const char* message); * virtual const char* what() const throw(); -- cgit v1.2.3