1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
|
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<title>pugixml documentation</title>
</head>
<body link="#0000ff" vlink="#800080">
<table border="0" cellpadding="4" cellspacing="0" width="100%" summary="header">
<tr>
<td valign="top" bgcolor="#eeeeee">
<h2 align="left">pugixml documentation</h2>
</td>
</tr>
</table>
<hr>
<h2>Contents</h2>
<dl class="index">
<dt><a href="#Introduction">Introduction</a></dt>
<dt><a href="#DOM">Document Object Model</a></dt>
<dt><a href="#Documentation">Documentation</a></dt>
<dd><a href="#Doc_Introduction">Introduction</a></dd>
<dd><a href="#Doc_Parser">xml_parser class</a></dd>
<dd><a href="#Doc_Node">xml_node class</a></dd>
<dd><a href="#Doc_Attribute">xml_attribute class</a></dd>
<dd><a href="#Doc_Iterators">Iterators</a></dd>
<dd><a href="#Doc_Misc">Miscellaneous</a></dd>
<dd><a href="#Doc_Lifetime">Lifetime issues and memory management</a></dd>
<dt><a href="#Parsing">Parsing process</a></dt>
<dt><a href="#Compliance">W3C compliance</a></dt>
<dt><a href="#ComparisonTable">Comparison with existing parsers</a></dt>
<dt><a href="#FAQ">FAQ</a></dt>
<dt><a href="#Bugs">Bugs</a></dt>
<dt><a href="#Future_work">Future work</a></dt>
<dt><a href="#Changelog">Changelog</a></dt>
<dt><a href="#Acknowledgements">Acknowledgements</a></dt>
<dt><a href="#License">License</a></dt>
</dl>
<hr>
<a name="Introduction"></a>
<h2>Introduction</h2>
<p><i>pugixml</i> is just another XML parser. This is a successor to
<a href="http://www.codeproject.com/soap/pugxml.asp">pugxml</a> (well, to be honest, the only part
that is left as is is wildcard matching code, the rest was either heavily refactored or rewritten
from scratch). The main features (call it USP) are:</p>
<ul>
<li>low memory consumption and fragmentation (the win over <i>pugxml</i> is ~1.3 times, <i>TinyXML</i>
- ~2.5 times, <i>Xerces (DOM)</i> - ~4.3 times <a href="#annot-1"><sup>1</sup></a>). Exact numbers can
be seen in <a href="#ComparisonTable">Comparison with existing parsers</a> section.</li>
<li>extremely high parsing speed (the win over <i>pugxml</i> is ~6 times, <i>TinyXML</i> - ~10
times, <i>Xerces-DOM</i> - ~17.6 times <a href="#annot-1"><sup>1</sup></a>)</li>
<li>extremely high parsing speed (well, I'm repeating myself, but it's so fast, that it outperforms
<i>Expat</i> by <b>2.8 times</b> on test XML) <a href="#annot-2"><sup>2</sup></a></li>
<li>more or less standard-conformant (it will parse any standard-compliant file correctly in w3c-compliance
mode, with the exception of DTD related issues and XML namespaces)</li>
<li>pretty much error-ignorant (it will not choke on something like &lt;text&gt;You &amp; Me&lt;/text&gt;,
like <i>expat</i> will; it will try to recover the state even if meeting an error (like finding matching
tags for closing ones); it will parse files with data in wrong encoding; and so on)</li>
<li>clean interface (a heavily refactored pugxml's one)</li>
<li>more or less unicode-aware (actually, it assumes UTF-8 encoding of the input data, though
it will readily work with ANSI - no UTF-16 for now (see <a href="#Future_work">Future work</a>), with
helper conversion functions (UTF-8 &lt;-&gt; UTF-16/32 (whatever is the default for std::wstring &amp; wchar_t)))</li>
<li>fully standard compliant code (approved by <a href="http://www.comeaucomputing.com/tryitout/">Comeau</a>
strict mode), multiplatform (tested on win32 only ^_^)</li>
<li>high flexibility. You can control many aspects of file parsing and DOM tree building via parsing
options.</li>
</ul>
<p>Okay, you might ask - what's the catch? Everything is so cute - it's small, fast, robust, clean solution
for parsing XML. What is missing? Ok, we are fair developers - so here is a misfeature list:</p>
<ul>
<li>memory consumption. It beats every DOM-based parser that I know of - but when a SAX parser comes along,
there is no chance. You can't process a 2 GB XML file with less than 4 GB of memory - and do it fast.
Though <i>pugixml</i> behaves better than all other DOM-based parsers, so if you're stuck with DOM,
it's not a problem.</li>
<li>memory consumption. Ok, I'm repeating myself. Again. When other parsers will allow you to provide
XML file in a constant storage (or even as a memory mapped area), <i>pugixml</i> will not. So you'll
have to copy the entire data into a non-constant storage. Moreover, it should persist during the
parser's lifetime (the reasons for that and more about lifetimes are written below). Again, if you're
ok with DOM - it should not be a problem, because the overall memory consumption is less (well, though
you'll need a contiguous chunk of memory, which can be a problem).</li>
<li>lack of validation, DTD processing, XML namespaces, proper handling of encoding. If you need those -
go take MSXML or XercesC or anything like that.</li>
<li>lack of XPath &amp; UTF-16/32 parsing. These are not implemented for now, but they are the features
for the next release.</li>
<li>immutability of DOM tree. It's constant. You can't change it. There are good reasons for prohibiting
that, though it is a thing that will likely be in the next release.</li>
</ul>
<hr>
<a name="annot-1"></a><sup>1</sup><small> The tests were done on a 1 MB XML file with a 4 levels deep tree
with a small amount of text. The times are those of building the DOM tree. <i>pugixml</i> was run in default
parsing mode, so differences in speed are even bigger with minimal settings.</small> <br>
<a name="annot-2"></a><sup>2</sup><small> Obviously, you can't estimate the time of building a DOM tree for a
SAX parser, so the times of reading the data into storage that closely represented the structure of
an XML file were measured.</small>
<hr>
<a name="DOM"></a>
<h2>Document Object Model</h2>
<p><i>pugixml</i> is a DOM-based parser. This means, that the XML document is converted to a tree.
Each XML tag is converted to a node in DOM tree. If a tag is contained in some other tag, its node
is a child to the outer tag's one. Comments, CDATA sections and PIs (Processing Instructions) also are
transformed into tree nodes, as is the standalone text. Each node has its type.</p>
<p>Here is an example of an XML document:
<pre>
<span style='color:#004a43; '>&lt;?</span><span style='color:#004a43; '>xml</span> <span style='color:#004a43; '>version</span><span style='color:#808030; '>=</span><span style='color:#008c00; '>"1.0"</span><span style='color:#004a43; '>?&gt;</span>
<span style='color:#a65700; '>&lt;</span><span style='color:#5f5035; '>mesh</span> <span style='color:#274796; '>name</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"</span><span style='color:#0000e6; '>mesh_root</span><span style='color:#0000e6; '>"</span><span style='color:#a65700; '>&gt;</span>
<span style='color:#696969; '>&lt;!--</span><span style='color:#696969; '> here is a mesh node </span><span style='color:#696969; '>--&gt;</span>
some text
<span style='color:#606060; '>&lt;![CDATA[</span>[someothertext<span style='color:#606060; '>]]&gt;</span>
some more text
<span style='color:#a65700; '>&lt;</span><span style='color:#5f5035; '>node</span> <span style='color:#274796; '>attr1</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"</span><span style='color:#0000e6; '>value1</span><span style='color:#0000e6; '>"</span> <span style='color:#a65700; '>/&gt;</span>
<span style='color:#a65700; '>&lt;</span><span style='color:#5f5035; '>node</span> <span style='color:#274796; '>attr1</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"</span><span style='color:#0000e6; '>value2</span><span style='color:#0000e6; '>"</span><span style='color:#a65700; '>&gt;</span>
<span style='color:#004a43; '>&lt;?</span><span style='color:#004a43; '>TARGET</span><span style='color:#004a43; '> somedata</span><span style='color:#004a43; '>?&gt;</span>
<span style='color:#a65700; '>&lt;</span><span style='color:#5f5035; '>innernode</span><span style='color:#a65700; '>/&gt;</span>
<span style='color:#a65700; '>&lt;/</span><span style='color:#5f5035; '>node</span><span style='color:#a65700; '>&gt;</span>
<span style='color:#a65700; '>&lt;/</span><span style='color:#5f5035; '>mesh</span><span style='color:#a65700; '>&gt;</span>
</pre>
It gets converted to the following tree (note, that with some parsing options comments, PIs and CDATA
sections are not stored in the tree, and with some options there are also nodes with whitespaces
and the contents of PCDATA sections is a bit different (with trailing/leading whitespaces). So generally
the resulting DOM tree depends on the parsing options):</p>
<p><img src="tree.png"></p>
<p>The parent-children relations are shown with lines. Some nodes have previous and next siblings
(for example, the next sibling for node_comment node is node_pcdata with value "some text", and the
previous sibling for node_element with name "mesh" is node_pi with target "xml" (target for PI nodes
is stored in the node name)).</p>
<hr>
<a name="Documentation"></a>
<h2>Documentation</h2>
<a name="Doc_Introduction"></a>
<h3>Introduction</h3>
<p><i>pugixml</i> is a library for parsing XML files, which means that you give it XML data some way,
and it gives you the DOM tree and the ways to traverse it and to get some useful information from it.
The library source consists of two files, the header <b>pugixml.hpp</b>, and the source code <b>pugixml.cpp</b>.
You can either compile cpp file in your project, or build a static library (or perhaps even a DLL),
or make the whole code use inline linkage and make one big file (as it was done in <i>pugxml</i>).
All library classes reside in namespace <b>pugi</b>, so you can either use fully qualified
names (<b>pugi::xml_node</b>) or write a using declaration (<b>using namespace pugi;</b>, <b>using
pugi::xml_node</b>) and use plain names. All classes have the <b>xml_</b> prefix.</p>
<p>By default it's supposed that you compile the source file with your project (add it into your
project, or add relevant entry in your Makefile, or do whatever you need to do with your compilation
environment). The library is written in standard-conformant C++ and was tested on win32 platform
(MSVC 7.1 (2003), MSVC 8.0 (2005)).</p>
<a name="Doc_Parser"></a>
<h3>xml_parser class</h3>
<p><b>xml_parser</b> class is the core of parsing process; you initiate parsing with it, you get DOM
tree from it, the nodes and attributes are stored in it. You have two ways to load a file: either
provide a string with XML-data (it has to be null-terminated, and it will be modified during parsing
process, so it can not be a piece of read-only memory), or with an <b>std::istream</b> object (any input
stream, like <b>std::ifstream</b>, <b>std::istringstream</b>, etc.) - in this case the parser will allocate
the necessary amount of memory (equivalent to stream's size) and read everything from the stream.</p>
<p>The functions for parsing are:
<dl>
<dt>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >void</font></b> <font color="#000000" >parse(std::istream&</font> <font color="#000000" >stream,</font> <b><font color="#0000ff" >unsigned</font></b> <b><font color="#0000ff" >int</font></b> <font color="#000000" >optmsk</font> <font color="#000000" >=</font> <font color="#000000" >parse_noset);</font></font></pre></td></tr></table>
<dd>This function will create a buffer with the size equal to that of provided <code>stream</code>,
read the chunk of data from the stream and parse it with provided options (<code>optmsk</code>).
The stream does not have to persist after the call to the function, the lifetime of internal buffer
with stream's data is managed by <i>pugixml</i>.
</dd>
</dt>
<dt> </dt>
<dt>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >parse(</font><b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >xmlstr,</font> <b><font color="#0000ff" >unsigned</font></b> <b><font color="#0000ff" >int</font></b> <font color="#000000" >optmsk</font> <font color="#000000" >=</font> <font color="#000000" >parse_noset);</font>
</font></pre></td></tr></table>
<dd>This function parses the provided string with provided options, and returns the position where the
parsing stopped (do not expect, that parsing will stop on every error, or on most of them - as I've
said, <i>pugixml</i> is error ignorant). The input string is modified. The string must persist for the
lifetime of the parser.</dd>
</dt>
<dt> </dt>
<dt>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >parse(</font><b><font color="#0000ff" >const</font></b> <font color="#000000">ownership_transfer_tag&,</font> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >xmlstr,</font> <b><font color="#0000ff" >unsigned</font></b> <b><font color="#0000ff" >int</font></b> <font color="#000000" >optmsk</font> <font color="#000000" >=</font> <font color="#000000" >parse_noset);</font>
</font></pre></td></tr></table>
<dd>This function parses the provided string with provided options, and returns the position where the
parsing stopped (do not expect, that parsing will stop on every error, or on most of them - as I've
said, <i>pugixml</i> is error ignorant). The input string is modified. The string's ownership is
managed by parser (string's memory is freed automatically when parser's destructor is called).</dd></dt>
<dt> </dt>
<dt>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_parser(std::istream&</font> <font color="#000000" >stream,</font> <b><font color="#0000ff" >unsigned</font></b> <b><font color="#0000ff" >int</font></b> <font color="#000000" >optmsk</font> <font color="#000000" >=</font> <font color="#000000" >parse_default);</font></font></pre></td></tr></table>
<dd>Just a convenience ctor, that calls the corresponding parse() function.</dd>
</dt>
<dt> </dt>
<dt>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_parser(</font><b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >xmlstr,</font> <b><font color="#0000ff" >unsigned</font></b> <b><font color="#0000ff" >int</font></b> <font color="#000000" >optmsk</font> <font color="#000000" >=</font> <font color="#000000" >parse_default);</font></font></pre></td></tr></table>
<dd>Just a convenience ctor, that calls the corresponding parse() function.</dd>
</dt>
<dt> </dt>
<dt>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_parser(</font><b><font color="#0000ff" >const</font></b> <font color="#000000">ownership_transfer_tag&,</font> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >xmlstr,</font> <b><font color="#0000ff" >unsigned</font></b> <b><font color="#0000ff" >int</font></b> <font color="#000000" >optmsk</font> <font color="#000000" >=</font> <font color="#000000" >parse_default);</font></font></pre></td></tr></table>
<dd>Just a convenience ctor, that calls the corresponding parse() function.</dd>
</dt>
</dl>
<p>If you want to provide XML data after the creation of the parser, use the default ctor. Otherwise
you are free to use either parsing ctors or default ctor and later - parsing function.</p>
<p>After parsing an XML file, you'll get a DOM tree. To get access to it (or, more precisely, to its
root), call either <b>document()</b> function or cast <b>xml_parser</b> object to <b>xml_node</b> by
using the following functions:</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >operator</font></b> <font color="#000000" >xml_node()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >document()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Ok, easy part is behind - now let's dive into parsing options. There is a variety of them, and you
must choose them wisely to get the needed results and the best speed/least memory overhead. At first,
there are flags that determine which parts of the document will be put into DOM tree, and which will
be just skipped:</p>
<ul>
<li>If <b>parse_pi</b> is on, then processing instructions (<b>&lt;? ... ?&gt;</b>) are put into DOM
tree (with node type <b>node_pi</b>), otherwise they are discarded. Note that for now the prolog
(&lt;?xml ... ?&gt;) is parsed as a processing instruction.
<br>Default value: on
<br>In W3C mode: on</li>
<li>If <b>parse_comments</b> is on, then comments (<b>&lt;!-- ... --&gt;</b>) are put into DOM
tree (with node type <b>node_comment</b>), otherwise they are discarded.
<br>Default value: on
<br>In W3C mode: on</li>
<li>If <b>parse_cdata</b> is on, then the content of CDATA section (<b>&lt;![CDATA[ ... ]]&gt;</b>)
is put into DOM tree (with node type <b>node_cdata</b>), otherwise it is discarded.
<br>Default value: on
<br>In W3C mode: on</li>
<li>If <b>parse_ws_pcdata</b> is off, then the content of PCDATA section (it's the plain text
in the node, like in <b>&lt;some_tag&gt;Hello!&lt;/some_tag&gt;</b>) is discarded if it consists only
of space-like characters (spaces, tabs and newlines).
<br>Default value: off
<br>In W3C mode: on</li>
<li>If <b>parse_ext_pcdata</b> is off, then the content of PCDATA section is discarded if it belongs
to root (document) node, that is it does not have a parent tag.
<br>Default value: on
<br>In W3C mode: off</li>
</ul>
<p>Then there are flags that determine how the processing of the retrieved data is done. There are
several reasons for these flags, mainly:
<ul>
<li>parsing speed. The less processing - the more speed.</li>
<li>data fetching comfort. Sometimes you're ok with messed linefeeds, sometimes you're not. Sometimes
you want your PCDATA trimmed, sometimes you do not. Sometimes you want your attribute values normalized,
sometimes you do not. Some of these are normally specified in DOCTYPE, though...</li>
<li>...parser is not DOCTYPE aware (and will never be), so you need a way to set those properties -
if not on per-node basis, then on per-document</li>
</ul>
So, these are the processing flags:
</p>
<ul>
<li>If <b>parse_trim_pcdata</b> is on, then the trimming of leading/trailing space-like characters
is performed for PCDATA content
<br>Default value: on
<br>In W3C mode: off</li>
<li>If <b>parse_trim_attribute</b> is on, then the trimming of leading/trailing space-like characters
is performed for attribute values (this is non-standard behavior and is here only for compatibility
reasons (PugXML had this flag)).
<br>Default value: off
<br>In W3C mode: off</li>
<li>If <b>parse_escapes_pcdata</b> is on, then the character reference expansion is done for PCDATA
content (replacing &amp;lt; with &lt;, &amp;#x4c; with L, etc.).
<br>Default value: on
<br>In W3C mode: on</li>
<li>If <b>parse_escapes_attribute</b> is on, then the character reference expansion is done for
attribute values (replacing &amp;lt; with &lt;, &amp;#x4c; with L, etc.).
<br>Default value: on
<br>In W3C mode: on</li>
<li>If <b>parse_wnorm_pcdata</b> is on, then the whitespace normalisation is done for PCDATA content
(this includes replacing any space-like character by a space character and converting sequences of
spaces into a single space)
<br>Default value: on
<br>In W3C mode: off</li>
<li>If <b>parse_wnorm_attribute</b> is on, then the whitespace normalisation is done for attribute
values
<br>Default value: on
<br>In W3C mode: off</li>
<li>If <b>parse_wconv_attribute</b> is on, then the whitespace conversion is done for attribute
values (this is a subset of whitespace normalization, and includes only replacing space-like characters
with spaces). If <b>parse_wnorm_attribute</b> is on, this flag has no effect.
<br>Default value: on
<br>In W3C mode: on</li>
<li>If <b>parse_eol_pcdata</b> is on, then the end-of-line handling is done for PCDATA content (this
includes converting any pair of 0x0d 0x0a characters to a single 0x0a and converting any standalone
0x0d to 0x0a).
<br>Default value: on
<br>In W3C mode: on</li>
<li>If <b>parse_eol_attribute</b> is on, then the end-of-line handling is done for attribute values.
<br>Default value: on
<br>In W3C mode: on</li>
<li>If <b>parse_eol_cdata</b> is on, then the end-of-line handling is done for CDATA content.
<br>Default value: on
<br>In W3C mode: on</li>
</ul>
<p>Finally, there are two more flags, that indicate closing tag parsing. When <i>pugixml</i> meets a
closing tag, there are three ways to handle it:
<ul>
<li>check that the tag name matches the opening tag, return an error if it does not. This is a
standard-compliant way, is controlled by <b>parse_check_end_tags</b> flag, which is on in W3C mode</li>
<li>try to find the corresponding tag name (so that <b>&lt;foo&gt; &lt;bar&gt; &lt;/foo&gt;</b> will be parsed
correctly). This is controlled by <b>parse_match_end_tags</b>, which is on by default</li>
<li>just treat the tag as a closing tag for the node (so that <b>&lt;foo&gt; ... &lt;/bar&gt;</b> will
be parsed as <b>&lt;foo&gt; ... &lt;/foo&gt;</b>). This is the fastest way, and this is what <i>pugxml</i>
is doing, but it can corrupt your DOM tree. This way is chosen if both <b>parse_check_end_tags</b> and
<b>parse_match_end_tags</b> are off.</li>
</ul>
Note, that these 2 flags are mutually exclusive.
</p>
<p>Did I say finally? Ok, so <b>finally</b> there are some helper flags, or better groups of flags.
These are:
<ul>
<li><b>parse_minimal</b> - no flag is set (this also means the fastest parsing)</li>
<li><b>parse_default</b> - default set of flags</li>
<li><b>parse_noset</b> - use the current parser options (see below)</li>
<li><b>parse_w3c</b> - use the W3C compliance mode</li>
</ul>
</p>
<p>A couple of words on flag usage. The parsing options are just a set of bits, with each bit corresponding
to one flag. You can turn the flag on by OR-ing the options value with this flag's constant:
<pre>
parse_w3c | parse_wnorm_pcdata
</pre>
or turn the flag off by AND-ing the options value with the NEGation of this flag's constant:
<pre>
parse_w3c & ~parse_comments
</pre>
You can access the current options of parser by <b>options()</b> method:
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >unsigned</font></b> <b><font color="#0000ff" >int</font></b> <font color="#000000" >options()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >unsigned</font></b> <b><font color="#0000ff" >int</font></b> <font color="#000000" >options(</font><b><font color="#0000ff" >unsigned</font></b> <b><font color="#0000ff" >int</font></b> <font color="#000000" >optmsk);</font>
</font></pre></td></tr></table>
(the latter one returns previous options). These options are used when <b>parse_noset</b> flag set is
passed to <b>parse()</b> functions (which is the default value of corresponding parameter).
</p>
<a name="Doc_Node"></a>
<h3>xml_node class</h3>
<p>If <b>xml_parser</b> is a heart of constructing a DOM tree from file, <b>xml_node</b> is a heart
of processing the tree. This is a simple wrapper, so it's small (4/8 bytes, depending on the size of
pointer), you're free to copy it and it does not own anything. I'll continue with a list of methods
with their description, with one note in advance. Some functions, that do something according to a
string-like parameter, have a pair with a suffix <b>_w</b>. The <b>_w</b> suffix tells, that this
function is doing a wildcard matching, instead of simple string comparison. You're free to use wildcards
<b>*</b> (that is equal to any sequence of characters (possibly empty)), <b>?</b> (that is equal to
any character) and character sets (<b>[Abc]</b> means 'any symbol of A, b and c', <b>[A-Z4]</b> means
'any symbol from A to Z, or 4', <b>[!0-9]</b> means 'any symbol, that is not a digit'). So the wildcard
<b>?ell_[0-9][0-9]_*</b> will match strings like 'cell_23_xref', 'hell_00_', but will not match the
strings like 'ell_23_xref', 'cell_0_x' or 'cell_0a_x'.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<i><font color="#808080" >/// Access iterators for this node's collection of child nodes.</font></i>
<font color="#000000" >iterator</font> <font color="#000000" >begin()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >iterator</font> <font color="#000000" >end()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<i><font color="#808080" >/// Access iterators for this node's collection of child nodes (same as begin/end).</font></i>
<font color="#000000" >iterator</font> <font color="#000000" >children_begin()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >iterator</font> <font color="#000000" >children_end()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<i><font color="#808080" >/// Access iterators for this node's collection of attributes.</font></i>
<font color="#000000" >attribute_iterator</font> <font color="#000000" >attributes_begin()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >attribute_iterator</font> <font color="#000000" >attributes_end()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<i><font color="#808080" >/// Access iterators for this node's collection of siblings.</font></i>
<font color="#000000" >iterator</font> <font color="#000000" >siblings_begin()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >iterator</font> <font color="#000000" >siblings_end()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Functions, returning the iterators to walk through children/siblings/attributes. More on that in
<a href="#Doc_Iterators">Iterators</a> section.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >operator</font></b> <font color="#000000" >unspecified_bool_type()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>This is a safe bool-like conversion operator. You can check node's validity (<b>if (xml_node)</b>,
<b>if (!xml_node)</b>, <b>if (node1 &amp;&amp; node2 &amp;&amp; !node3 &amp;&amp; cond1 &amp;&amp; ...)</b> - you get the idea) with
it.
</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" >==(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_node&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" >!=(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_node&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" ><(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_node&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" >>(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_node&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" ><=(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_node&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" >>=(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_node&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Comparison operators</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >bool</font></b> <font color="#000000" >empty()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p><code>if (node.empty())</code> is equivalent to <code>if (!node)</code></p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_node_type</font> <font color="#000000" >type()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >value()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Access node's properties (type, name and value). If there is no name/value, the corresponding functions
return <b>""</b> - they <u>never</u> return NULL.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_node</font> <font color="#000000" >child(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >child_w(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Get a child node with specified name, or <b>xml_node()</b> (this is an invalid node) if nothing is
found</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_attribute</font> <font color="#000000" >attribute(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_attribute</font> <font color="#000000" >attribute_w(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Get an attribute with specified name, or <b>xml_attribute()</b> (this is an invalid attribute) if
nothing is found</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_node</font> <font color="#000000" >sibling(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >sibling_w(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Get a node's sibling with specified name, or <b>xml_node()</b> if nothing is found.<br>
<code>node.sibling(name)</code> is equivalent to <code>node.parent().child(name)</code>.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_node</font> <font color="#000000" >next_sibling(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >next_sibling_w(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >next_sibling()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>These functions get the next sibling, that is, one of the siblings of that node, that is to the
right. <code>next_sibling()</code> just returns the right brother of the node (or <b>xml_node()</b>),
the two other functions are searching for the sibling with the given name</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_node</font> <font color="#000000" >previous_sibling(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >previous_sibling_w(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >previous_sibling()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>These functions do exactly the same as <code>next_sibling</code> ones, with the exception that they
search for the left siblings.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_node</font> <font color="#000000" >parent()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Get a parent node. The parent node for the root one (the document) is considered to be the document
itself.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >child_value()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Look for the first node of type <b>node_pcdata</b> or <b>node_cdata</b> among the
children of the current node and return its contents (or <b>""</b> if nothing is found)</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >child_value(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>This is the convenient way of looking into child's child value - that is, node.child_value(name) is equivalent to node.child(name).child_value().</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >child_value_w(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>This is the convenient way of looking into child's child value - that is, node.child_value_w(name) is equivalent to node.child_w(name).child_value().</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_attribute</font> <font color="#000000" >first_attribute()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_attribute</font> <font color="#000000" >last_attribute()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>These functions get the first and last attributes of the node (or <b>xml_attribute()</b> if the node
has no attributes).</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_node</font> <font color="#000000" >first_child()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >last_child()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>These functions get the first and last children of the node (or <b>xml_node()</b> if the node has
no children).</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >template</font></b> <font color="#000000" ><</font><b><font color="#0000ff" >typename</font></b> <font color="#000000" >OutputIterator></font> <b><font color="#0000ff" >void</font></b> <font color="#000000" >all_elements_by_name(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name,</font> <font color="#000000" >OutputIterator</font> <font color="#000000" >it)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >template</font></b> <font color="#000000" ><</font><b><font color="#0000ff" >typename</font></b> <font color="#000000" >OutputIterator></font> <b><font color="#0000ff" >void</font></b> <font color="#000000" >all_elements_by_name_w(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name,</font> <font color="#000000" >OutputIterator</font> <font color="#000000" >it)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Get all elements with the specified name in the subtree (depth-first search) and return them with
the help of an output iterator (e.g. std::back_inserter).</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >template</font></b> <font color="#000000" ><</font><b><font color="#0000ff" >typename</font></b> <font color="#000000" >Predicate></font> <font color="#000000" >xml_attribute</font> <font color="#000000" >find_attribute(Predicate</font> <font color="#000000" >pred)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >template</font></b> <font color="#000000" ><</font><b><font color="#0000ff" >typename</font></b> <font color="#000000" >Predicate></font> <font color="#000000" >xml_node</font> <font color="#000000" >find_child(Predicate</font> <font color="#000000" >pred)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >template</font></b> <font color="#000000" ><</font><b><font color="#0000ff" >typename</font></b> <font color="#000000" >Predicate></font> <font color="#000000" >xml_node</font> <font color="#000000" >find_element(Predicate</font> <font color="#000000" >pred)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Find attribute, child or a node in the subtree (find_element - depth-first search) with the help
of the given predicate. Predicate should behave like a function which accepts a <b>xml_node</b> or
<b>xml_attribute</b> (for find_attribute) parameter and returns <b>bool</b>. The first entity for which
the predicate returned true is returned. If predicate returned false for all entities, <b>xml_node()</b>
or <b>xml_attribute()</b> is returned.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_node</font> <font color="#000000" >first_element(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >first_element_w(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >first_element_by_value(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name,</font> <b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >value)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >first_element_by_value_w(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name,</font> <b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >value)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >first_element_by_attribute(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name,</font> <b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >attr_name,</font> <b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >attr_value)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >first_element_by_attribute_w(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name,</font> <b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >attr_name,</font> <b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >attr_value)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >first_element_by_attribute(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >attr_name,</font> <b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >attr_value)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_node</font> <font color="#000000" >first_element_by_attribute_w(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >attr_name,</font> <b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >attr_value)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Find the first node (depth-first search), which corresponds to the given criteria (i.e. either has
a matching name, or a matching value, or has an attribute with given name/value, or has an attribute
and has a matching name). Note that <b>_w</b> versions treat all parameters as wildcards.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_node</font> <font color="#000000" >first_node(xml_node_type</font> <font color="#000000" >type)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Return a first node (depth-first search) with a given type, or <b>xml_node()</b>.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >std::string</font> <font color="#000000" >path(</font><b><font color="#0000ff" >char</font></b> <font color="#000000" >delimiter</font> <font color="#000000" >=</font> <font color="#ff0000" >'/'</font><font color="#000000" >)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Get a path of the node (i.e. the string of names of the nodes on the path from the DOM tree root
to the node, separated with the delimiter (/ by default)).</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_node</font> <font color="#000000" >first_element_by_path(</font><b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >path,</font> <b><font color="#0000ff" >char</font></b> <font color="#000000" >delimiter</font> <font color="#000000" >=</font> <font color="#ff0000" >'/'</font><font color="#000000" >)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Get the first element that has the following path. The path can be absolute (beginning with delimiter) or
relative, '..' means 'up-level' (so if we are at the path <b>mesh/fragment/geometry/stream</b>, <b>../..</b>
will lead us to <b>mesh/fragment</b>, and <b>/mesh</b> will lead us to <b>mesh</b>).</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >bool</font></b> <font color="#000000" >traverse(xml_tree_walker&</font> <font color="#000000" >walker)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Traverse the subtree (beginning with current node) with the walker, return the result. See
<a href="#Doc_Misc">Miscellaneous</a> section for details.</p>
<a name="Doc_Attribute"></a>
<h3>xml_attribute class</h3>
<p>Like <b>xml_node</b>, <b>xml_attribute</b> is a simple wrapper of the node's attribute.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" >==(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_attribute&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" >!=(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_attribute&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" ><(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_attribute&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" >>(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_attribute&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" ><=(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_attribute&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >bool</font></b> <b><font color="#0000ff" >operator</font></b><font color="#000000" >>=(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_attribute&</font> <font color="#000000" >r)</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Comparison operators.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >operator</font></b> <font color="#000000" >unspecified_bool_type()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Safe bool conversion - like in <b>xml_node</b>, use this to check for validity.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >bool</font></b> <font color="#000000" >empty()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Like with <b>xml_node</b>, <code>if (attr.empty())</code> is equivalent to <code>if (!attr)</code>.
</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#000000" >xml_attribute</font> <font color="#000000" >next_attribute()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<font color="#000000" >xml_attribute</font> <font color="#000000" >previous_attribute()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Get the next/previous attribute of the node, that owns the current attribute. Return <b>xml_attribute()</b>
if no such attribute is found.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >name()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >const</font></b> <b><font color="#0000ff" >char</font></b><font color="#000000" >*</font> <font color="#000000" >value()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Get the name and value of the attribute. These methods never return NULL - they return <b>""</b> instead.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >int</font></b> <font color="#000000" >as_int()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >double</font></b> <font color="#000000" >as_double()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >float</font></b> <font color="#000000" >as_float()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Convert the value of an attribute to the desired type. If the conversion is not successful, return
default value (0 for int, 0.0 for double, 0.0f for float). These functions rely on CRT functions ato*.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >bool</font></b> <font color="#000000" >as_bool()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Convert the value of an attribute to bool. This method returns true if the first character of the
value is '1', 't', 'T', 'y' or 'Y'. Otherwise it returns false.</p>
<a name="Doc_Iterators"></a>
<h3>Iterators</h3>
<p>Sometimes you have to cycle through the children or the attributes of the node. You can do it either
by using <b>next_sibling</b>, <b>previous_sibling</b>, <b>next_attribute</b> and <b>previous_attribute</b>
(along with <b>first_child</b>, <b>last_child</b>, <b>first_attribute</b> and <b>last_attribute</b>),
or you can use an iterator-like interface. There are two iterator types, <b>xml_node_iterator</b> and
<b>xml_attribute_iterator</b>. They are bidirectional constant iterators, which means that you can
either increment or decrement them, and use dereferencing and member access operators to get constant
access to node/attribute (the constness of iterators may change with the introduction of mutable trees).</p>
<p>In order to get the iterators, use corresponding functions of <b>xml_node</b>. Note that <b>_end()</b>
functions return past-the-end iterator, that is, in order to get the last attribute, you'll have to
do something like:
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >if</font></b> <font color="#000000" >(node.attributes_begin()</font> <font color="#000000" >!=</font> <font color="#000000" >node.attributes_end())</font> <i><font color="#808080" >// we have at least one attribute</font></i>
<font color="#000000" >{</font>
<font color="#000000" >xml_attribute</font> <font color="#000000" >last_attrib</font> <font color="#000000" >=</font> <font color="#000000" >*(--node.attributes_end());</font>
<font color="#000000" >...</font>
<font color="#000000" >}</font>
</font></pre></td></tr></table>
</p>
<a name="Doc_Misc"></a>
<h3>Miscellaneous</h3>
<p>If you want to traverse a subtree, you can use <b>traverse</b> function. There is a class
<b>xml_tree_walker</b>, which has some functions that you can override in order to get custom traversing
(the default one just does nothing).
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >virtual</font></b> <b><font color="#0000ff" >bool</font></b> <font color="#000000" >begin(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_node&);</font>
<b><font color="#0000ff" >virtual</font></b> <b><font color="#0000ff" >bool</font></b> <font color="#000000" >end(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_node&);</font>
</font></pre></td></tr></table>
<p>These functions are called when the processing of the node starts/ends. First <b>begin()</b>
is called, then all children of the node are processed recursively, then <b>end()</b> is called. If
any of these functions returns false, the traversing is stopped and the <b>traverse()</b> function
returns false.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >virtual</font></b> <b><font color="#0000ff" >void</font></b> <font color="#000000" >push();</font>
<b><font color="#0000ff" >virtual</font></b> <b><font color="#0000ff" >void</font></b> <font color="#000000" >pop();</font>
</font></pre></td></tr></table>
<p>These functions are called before and after the processing of node's children. If node has no children,
none of these is called. The default behavior is to increment/decrement current node depth.</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >virtual</font></b> <b><font color="#0000ff" >int</font></b> <font color="#000000" >depth()</font> <b><font color="#0000ff" >const</font></b><font color="#000000" >;</font>
</font></pre></td></tr></table>
<p>Get the current depth. You can use this function to do your own indentation, for example.</p>
<p>Let's get to some minor notes. You can safely write something like:
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<b><font color="#0000ff" >bool</font></b> <font color="#000000" >value</font> <font color="#000000" >=</font> <font color="#000000" >node.child(</font><font color="#ff0000" >"stream"</font><font color="#000000" >).attribute(</font><font color="#ff0000" >"compress"</font><font color="#000000" >).as_bool();</font>
</font></pre></td></tr></table>
If node has a child with the name 'stream', and this child has an attribute 'compress', then everything
is ok. If node has a child with the name 'stream' with no attribute 'compress', then attribute("compress")
will return xml_attribute(), and the corresponding call to as_bool() will return default value (false).
If there is no child node 'stream', the child(...) call will return xml_node(), the subsequent call
to attribute(...) will return xml_attribute() (because there are no attributes belonging to invalid
node), and as_bool() will again return false, so this call sequence is perfectly safe.</p>
<a name="Doc_Lifetime">
<h3>Lifetime issues and memory management</h3>
<p>As parsing is done in-situ, the XML data is to persist during the lifetime of <b>xml_parser</b>. If
the parsing is called via a function of <b>xml_parser</b>, that accepts <b>char*</b>, you have to ensure
yourself, that the string will outlive the <b>xml_parser</b> object.</p>
<p>The memory for nodes and attributes is allocated in blocks of data (the blocks form a linked list;
the default size of the block is 32 kb, though you can change it via changing a <b>memory_block_size</b>
constant in <b>pugixml.hpp</b> file. Remember that the first block is allocated on stack (it resides
inside <b>xml_parser</b> object), and all subsequent blocks are allocated on heap, so expect a stack overflow
when setting too large memory block size), so the <b>xml_parser</b> object (which contains the blocks)
should outlive all <b>xml_node</b> and <b>xml_attribute</b> objects (as well as iterators), which belong
to the parser's tree. Again, you should ensure it yourself.</p>
<hr>
<a name="Example">
<h2>Example</h2>
<p>Ok, so you are not much of a documentation reader, are you? Neither am I. Let's assume that you're going
to parse an xml file... something like this:
<pre>
<span style='color:#004a43; '>&lt;?</span><span style='color:#004a43; '>xml</span> <span style='color:#004a43; '>version</span><span style='color:#808030; '>=</span><span style='color:#008c00; '>"1.0"</span> <span style='color:#004a43; '>encoding</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"UTF-8"</span><span style='color:#004a43; '>?&gt;</span>
<span style='color:#a65700; '>&lt;</span><span style='color:#5f5035; '>mesh</span> <span style='color:#274796; '>name</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"</span><span style='color:#0000e6; '>Cathedral</span><span style='color:#0000e6; '>"</span><span style='color:#a65700; '>&gt;</span>
<span style='color:#a65700; '>&lt;</span><span style='color:#5f5035; '>fragment</span> <span style='color:#274796; '>name</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"</span><span style='color:#0000e6; '>Cathedral</span><span style='color:#0000e6; '>"</span><span style='color:#a65700; '>&gt;</span>
<span style='color:#a65700; '>&lt;</span><span style='color:#5f5035; '>geometry</span><span style='color:#a65700; '>&gt;</span>
<span style='color:#a65700; '>&lt;</span><span style='color:#5f5035; '>stream</span> <span style='color:#274796; '>usage</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"</span><span style='color:#0000e6; '>main</span><span style='color:#0000e6; '>"</span> <span style='color:#274796; '>source</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"</span><span style='color:#0000e6; '>StAnna.dmesh</span><span style='color:#0000e6; '>"</span> <span style='color:#274796; '>compress</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"</span><span style='color:#0000e6; '>true</span><span style='color:#0000e6; '>"</span> <span style='color:#a65700; '>/&gt;</span>
<span style='color:#a65700; '>&lt;</span><span style='color:#5f5035; '>stream</span> <span style='color:#274796; '>usage</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"</span><span style='color:#0000e6; '>ao</span><span style='color:#0000e6; '>"</span> <span style='color:#274796; '>source</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"</span><span style='color:#0000e6; '>StAnna.ao</span><span style='color:#0000e6; '>"</span> <span style='color:#a65700; '>/&gt;</span>
<span style='color:#a65700; '>&lt;/</span><span style='color:#5f5035; '>geometry</span><span style='color:#a65700; '>&gt;</span>
<span style='color:#a65700; '>&lt;/</span><span style='color:#5f5035; '>fragment</span><span style='color:#a65700; '>&gt;</span>
<span style='color:#a65700; '>&lt;</span><span style='color:#5f5035; '>fragment</span> <span style='color:#274796; '>name</span><span style='color:#808030; '>=</span><span style='color:#0000e6; '>"</span><span style='color:#0000e6; '>Cathedral</span><span style='color:#0000e6; '>"</span><span style='color:#a65700; '>&gt;</span>
...
<span style='color:#a65700; '>&lt;/</span><span style='color:#5f5035; '>fragment</span><span style='color:#a65700; '>&gt;</span>
...
<span style='color:#a65700; '>&lt;/</span><span style='color:#5f5035; '>mesh</span><span style='color:#a65700; '>&gt;</span>
</pre>
<p>&lt;mesh&gt; is a root node, it has 0 or more &lt;fragment&gt;s, each of them has a &lt;geometry&gt;
node, and there are &lt;stream&gt; nodes with the shown attributes. We'd like to parse the file and...
well, and do something with its contents. There are several methods of doing that; I'll show 2 of them
(the remaining one is using iterators).</p>
<p>Here we exploit the knowledge of the strict hierarchy of our XML document and read the nodes from
DOM tree accordingly. When we have an <b>xml_node</b> object, we can get the desired information from
it (name, value, attributes list, nearby nodes in a tree - siblings, parent and children).</p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#008000" >#include</font> <font color="#ff0000" >&lt;fstream&gt;</font>
<font color="#008000" >#include</font> <font color="#ff0000" >&lt;vector&gt;</font>
<font color="#008000" >#include</font> <font color="#ff0000" >&lt;algorithm&gt;</font>
<font color="#008000" >#include</font> <font color="#ff0000" >&lt;iterator&gt;</font>
<font color="#008000" >#include</font> <font color="#ff0000" >"pugixml.hpp"</font>
<b><font color="#0000ff" >using</font></b> <b><font color="#0000ff" >namespace</font></b> <font color="#000000" >pugi;</font>
<b><font color="#0000ff" >int</font></b> <font color="#000000" >main()</font>
<font color="#000000" >{</font>
<font color="#000000" >std::ifstream</font> <font color="#000000" >in(</font><font color="#ff0000" >"mesh.xml"</font><font color="#000000" >);</font>
<font color="#000000" >in.unsetf(std::ios::skipws);</font>
<font color="#000000" >std::vector<</font><b><font color="#0000ff" >char</font></b><font color="#000000" >></font> <font color="#000000" >buf;</font>
<font color="#000000" >std::copy(std::istream_iterator<</font><b><font color="#0000ff" >char</font></b><font color="#000000" >>(in),</font> <font color="#000000" >std::istream_iterator<</font><b><font color="#0000ff" >char</font></b><font color="#000000" >>(),</font> <font color="#000000" >std::back_inserter(buf));</font>
<font color="#000000" >buf.push_back(</font><b><font color="#40b440" >0</font></b><font color="#000000" >);</font> <i><font color="#808080" >// zero-terminate</font></i>
<font color="#000000" >xml_parser</font> <font color="#000000" >parser(&buf[</font><b><font color="#40b440" >0</font></b><font color="#000000" >],</font> <font color="#000000" >pugi::parse_w3c);</font>
<font color="#000000" >xml_node</font> <font color="#000000" >doc</font> <font color="#000000" >=</font> <font color="#000000" >parser.document();</font>
<b><font color="#0000ff" >if</font></b> <font color="#000000" >(xml_node</font> <font color="#000000" >mesh</font> <font color="#000000" >=</font> <font color="#000000" >doc.first_element(</font><font color="#ff0000" >"mesh"</font><font color="#000000" >))</font>
<font color="#000000" >{</font>
<i><font color="#808080" >// store mesh.attribute("name").value()</font></i>
<b><font color="#0000ff" >for</font></b> <font color="#000000" >(xml_node</font> <font color="#000000" >fragment</font> <font color="#000000" >=</font> <font color="#000000" >mesh.first_element(</font><font color="#ff0000" >"fragment"</font><font color="#000000" >);</font> <font color="#000000" >fragment;</font> <font color="#000000" >fragment</font> <font color="#000000" >=</font> <font color="#000000" >fragment.next_sibling())</font>
<font color="#000000" >{</font>
<i><font color="#808080" >// store fragment.attribute("name").value()</font></i>
<b><font color="#0000ff" >if</font></b> <font color="#000000" >(xml_node</font> <font color="#000000" >geometry</font> <font color="#000000" >=</font> <font color="#000000" >fragment.first_element(</font><font color="#ff0000" >"geometry"</font><font color="#000000" >))</font>
<b><font color="#0000ff" >for</font></b> <font color="#000000" >(xml_node</font> <font color="#000000" >stream</font> <font color="#000000" >=</font> <font color="#000000" >geometry.first_element(</font><font color="#ff0000" >"stream"</font><font color="#000000" >);</font> <font color="#000000" >stream;</font> <font color="#000000" >stream</font> <font color="#000000" >=</font> <font color="#000000" >stream.next_sibling())</font>
<font color="#000000" >{</font>
<i><font color="#808080" >// store stream.attribute("usage").value()</font></i>
<i><font color="#808080" >// store stream.attribute("source").value()</font></i>
<b><font color="#0000ff" >if</font></b> <font color="#000000" >(stream.attribute(</font><font color="#ff0000" >"compress"</font><font color="#000000" >))</font>
<i><font color="#808080" >// store stream.attribute("compress").as_bool()</font></i>
<font color="#000000" >}</font>
<font color="#000000" >}</font>
<font color="#000000" >}</font>
<font color="#000000" >}</font>
</font></pre></td></tr></table>
<p>We can also write a class that will traverse the DOM tree and store the information from nodes based
on their names, depths, attributes, etc. This way is well known by the users of SAX parsers. To do that,
we have to write an implementation of <b>xml_tree_walker</b> interface </p>
<table width = "100%" bgcolor="#e6e6e6"><tr><td><pre><font color="white">
<font color="#008000" >#include</font> <font color="#ff0000" >&lt;fstream&gt;</font>
<font color="#008000" >#include</font> <font color="#ff0000" >&lt;vector&gt;</font>
<font color="#008000" >#include</font> <font color="#ff0000" >&lt;algorithm&gt;</font>
<font color="#008000" >#include</font> <font color="#ff0000" >&lt;iterator&gt;</font>
<font color="#008000" >#include</font> <font color="#ff0000" >"pugixml.hpp"</font>
<b><font color="#0000ff" >using</font></b> <b><font color="#0000ff" >namespace</font></b> <font color="#000000" >pugi;</font>
<b><font color="#0000ff" >struct</font></b> <font color="#000000" >mesh_parser:</font> <b><font color="#0000ff" >public</font></b> <font color="#000000" >xml_tree_walker</font>
<font color="#000000" >{</font>
<b><font color="#0000ff" >virtual</font></b> <b><font color="#0000ff" >bool</font></b> <font color="#000000" >begin(</font><b><font color="#0000ff" >const</font></b> <font color="#000000" >xml_node&</font> <font color="#000000" >node)</font>
<font color="#000000" >{</font>
<b><font color="#0000ff" >if</font></b> <font color="#000000" >(strcmp(node.name(),</font> <font color="#ff0000" >"mesh"</font><font color="#000000" >)</font> <font color="#000000" >==</font> <b><font color="#40b440" >0</font></b><font color="#000000" >)</font>
<font color="#000000" >{</font>
<i><font color="#808080" >// store node.attribute("name").value()</font></i>
<font color="#000000" >}</font>
<b><font color="#0000ff" >else</font></b> <b><font color="#0000ff" >if</font></b> <font color="#000000" >(strcmp(node.name(),</font> <font color="#ff0000" >"fragment"</font><font color="#000000" >)</font> <font color="#000000" >==</font> <b><font color="#40b440" >0</font></b><font color="#000000" >)</font>
<font color="#000000" >{</font>
<i><font color="#808080" >// store node.attribute("name").value()</font></i>
<font color="#000000" >}</font>
<b><font color="#0000ff" >else</font></b> <b><font color="#0000ff" >if</font></b> <font color="#000000" >(strcmp(node.name(),</font> <font color="#ff0000" >"geometry"</font><font color="#000000" >)</font> <font color="#000000" >==</font> <b><font color="#40b440" >0</font></b><font color="#000000" >)</font>
<font color="#000000" >{</font>
<i><font color="#808080" >// ...</font></i>
<font color="#000000" >}</font>
<b><font color="#0000ff" >else</font></b> <b><font color="#0000ff" >if</font></b> <font color="#000000" >(strcmp(node.name(),</font> <font color="#ff0000" >"stream"</font><font color="#000000" >)</font> <font color="#000000" >==</font> <b><font color="#40b440" >0</font></b><font color="#000000" >)</font>
<font color="#000000" >{</font>
<i><font color="#808080" >// store node.attribute("usage").value()</font></i>
<i><font color="#808080" >// store node.attribute("source").value()</font></i>
<b><font color="#0000ff" >if</font></b> <font color="#000000" >(node.attribute(</font><font color="#ff0000" >"compress"</font><font color="#000000" >))</font>
<i><font color="#808080" >// store node.attribute("compress").as_bool()</font></i>
<font color="#000000" >}</font>
<b><font color="#0000ff" >else</font></b> <b><font color="#0000ff" >return</font></b> <b><font color="#0000ff" >false</font></b><font color="#000000" >;</font>
<b><font color="#0000ff" >return</font></b> <b><font color="#0000ff" >true</font></b><font color="#000000" >;</font>
<font color="#000000" >}</font>
<font color="#000000" >};</font>
<b><font color="#0000ff" >int</font></b> <font color="#000000" >main()</font>
<font color="#000000" >{</font>
<font color="#000000" >std::ifstream</font> <font color="#000000" >in(</font><font color="#ff0000" >"mesh.xml"</font><font color="#000000" >);</font>
<font color="#000000" >in.unsetf(std::ios::skipws);</font>
<font color="#000000" >std::vector<</font><b><font color="#0000ff" >char</font></b><font color="#000000" >></font> <font color="#000000" >buf;</font>
<font color="#000000" >std::copy(std::istream_iterator<</font><b><font color="#0000ff" >char</font></b><font color="#000000" >>(in),</font> <font color="#000000" >std::istream_iterator<</font><b><font color="#0000ff" >char</font></b><font color="#000000" >>(),</font> <font color="#000000" >std::back_inserter(buf));</font>
<font color="#000000" >buf.push_back(</font><b><font color="#40b440" >0</font></b><font color="#000000" >);</font> <i><font color="#808080" >// zero-terminate</font></i>
<font color="#000000" >xml_parser</font> <font color="#000000" >parser(&buf[</font><b><font color="#40b440" >0</font></b><font color="#000000" >],</font> <font color="#000000" >pugi::parse_w3c);</font>
<font color="#000000" >mesh_parser</font> <font color="#000000" >mp;</font>
<b><font color="#0000ff" >if</font></b> <font color="#000000" >(!parser.document().traverse(mp))</font>
<i><font color="#808080" >// generate an error</font></i>
<font color="#000000" >}</font>
</font></pre></td></tr></table>
<hr>
<a name="Parsing">
<h2>Parsing process</h2>
<p>So, let's talk a bit about parsing process, and about the reason for providing XML data as a contiguous
writeable block of memory. Parsing is done in-situ. This means, that the strings, representing the
parts of DOM tree (node names, attribute names and values, CDATA content, etc.) are not separately
allocated on heap, but instead are parts of the original data. This is the keypoint to parsing speed,
because it helps achieve the minimal amount of memory allocations (more on that below) and minimal
amount of copying data.</p>
<p>In-situ parsing can be done in two ways, with zero-segmenting the string (that is, set the past-the-end
character for the part of XML string to 0, see <a href="http://www.codeproject.com/soap/pugxml/pugxml_steps.gif">
this image</a> for further details), and storing pointer + size of the string instead of pointer to
the beginning of ASCIIZ string.</p>
<p>Originally, <i>pugxml</i> had only the first way, but then authors added the second method, 'non-segmenting'
or non-destructive parsing. The advantages of this method are: you no longer need non-constant storage;
you can even read data from memory-mapped files directly. Well, there are disadvantages.
For one thing, you can not do any of the transformations in-situ. The transformations that are required
by XML standard are:
<ul>
<li>End of line handling (replacing 0x0d 0x0a with 0x0a and any standalone 0x0d with 0x0a) (for the whole
document)</li>
<li>White space normalization for attribute values (converting space-like characters to spaces (0x20),
sometimes trimming leading/trailing spaces and converting sequences of spaces to a single space)</li>
<li>Character reference expansion (&amp;lt; and alike, &amp;#x0a; and alike, &amp;#40; and alike)</li>
<li>Entity reference expansion (&amp;entityname;)</li>
</ul>
None of these can be done in-situ. <i>pugxml</i> did neither character nor entity reference expansion,
and allocated new memory when normalizing white spaces when in non-destructive mode. I chose complete
in-situ parsing (the good thing about it is that any transformation, except entity reference, can be
done in-situ because it does not increase the amount of characters - even converting a character
reference to UTF-8). There is no entity reference expansion because of this and because I do not want
to parse DOCTYPE and, moreover, use DOCTYPE in following parsing (performing selective whitespace
normalization in attributes and CDATA sections and so on).</p>
<p>In order to be able to modify the tree (change attribute/node names &amp; values) with in-situ parsing,
one needs to implement two ways of storing data (both in-situ and not). The DOM tree is now immutable,
but it will change in the future releases (without introducing speed/memory overhead, except on the
cleanup stage).</p>
<p>The parsing process itself is more or less straightforward, when you see it - but the impression
is fake, because the explicit jumps are made (i.e. we know, that if we come to a closing brace (>),
we should expect CDATA after it (or a new tag), so let's just jump to the corresponding code), and,
well, there can be bugs (see <a href="#Bugs">Bugs</a> section).</p>
<p>And, to make things worse, memory allocation (which is done only for node and attribute structures)
is done in pools. The pools are single-linked lists with predefined block size (32 kb by default), and
well, it increases speed a lot (allocations are slow, and the memory gets fragmented when allocating
a bunch of 16-byte (attribute) or 40-byte (node) structures).</p>
<hr>
<a name="Compliance">
<h2>W3C compliance</h2>
<p><i>pugixml</i> is not a compliant XML parser. The main reason for that is that it does not reject
most malformed XML files. The more or less complete list of incompatibilities follows (I will be talking
of ones when using <b>parse_w3c</b> mode):
<ul>
<li>The parser is completely DOCTYPE-ignorant, that is, it does not even skip all possible DOCTYPEs
correctly, let alone use them for parsing
<li>It accepts multiple attributes with the same name in one node
<li>It is charset-ignorant
<li>It accepts invalid names of tags
<li>It accepts invalid attribute values (those with &lt; in them) and does not reject invalid entity
references or character references (in fact, it does not do DOCTYPE parsing, so it does not perform
entity reference expansion)
<li>It does not reject comments with -- inside
<li>It does not reject PI with the names of 'xml' and alike; in fact, it parses prolog as a PI, which
is not conformant
<li>All characters from #x1 to #x20 are considered to be whitespaces
<li>And some other things that I forgot to mention
</ul>
In short, it accepts most malformed XML files and does not do anything that is related to DOCTYPE.
This is because the main goal was developing a fast, easy-to-use and error-ignorant parser (so you can
always get something even from a malformed document); there are some good validating and conformant
parsers already.</p>
<hr>
<a name="ComparisonTable">
<h2>Comparison with existing parsers</h2>
<p>This table summarizes the comparison in terms of time and memory consumption between pugixml and
other parsers. For DOM parsers (all, except Expat, irrXML and SAX parser of XercesC), the process is
as follows:</p>
<ul>
<li>construct DOM tree from file, which is preloaded in memory (all parsers take const char* and size
as an input). 'parse time' means number of CPU clocks which is spent, 'parse allocs' - number of allocations,
'parse memory' - peak memory consumption
<li>traverse DOM tree to fill information from it into some structure (which is the same for all parsers,
of course). 'walk time' means number of CPU clocks which is spent, 'walk allocs' - number of allocations
</ul>
<p>For SAX parsers, the parse step is skipped (hence the N/A in relevant table cells), structure is
filled during 'walk' step.</p>
<p>For all parsers, 'total time' column means total time spent on the whole process, 'total allocs' -
total allocation count, 'total memory' - peak memory consumption for the whole process.</p>
<p>The tests were performed on a 1 Mb XML file with a small amount of text. They were compiled with
Microsoft Visual C++ 8.0 (2005) compiler in Release mode, with checked iterators/secure STL turned
off. The test system is AMD Sempron 2500+, 512 Mb RAM.</p>
<table cellspacing=0 cellpadding=2 border=1>
<tr><th>parser</th>
<th>parse time</th><th>parse allocs</th><th>parse memory</th>
<th>walk time</th><th>walk allocs</th>
<th>total time</th><th>total allocs</th><th>total memory</th></tr>
<tr><td><a href="http://xml.irrlicht3d.org/">irrXML</a></td>
<td>N/A</td><td>N/A</td><td>N/A</td>
<td>352 Mclocks</td><td>697 245</td>
<td>356 Mclocks</td><td>697 284</td><td>906 kb</td></tr>
<tr><td><a href="http://expat.sourceforge.net/">Expat</a></td>
<td>N/A</td><td>N/A</td><td>N/A</td>
<td>97 Mclocks</td><td>19</td>
<td>97 Mclocks</td><td>23</td><td>1028 kb</td></tr>
<tr><td><a href="http://tinyxml.sourceforge.net/">TinyXML</a></td>
<td>168 Mclocks</td><td>50 163</td><td>5447 kb</td>
<td>37 Mclocks</td><td>0</td>
<td>242 Mclocks</td><td>50 163</td><td>5447 kb</td></tr>
<tr><td><a href="http://www.codeproject.com/soap/pugxml.asp">PugXML</a></td>
<td>100 Mclocks</td><td>106 597</td><td>2747 kb</td>
<td>38 Mclocks</td><td>0</td>
<td>206 Mclocks</td><td>131 677</td><td>2855 kb</td></tr>
<tr><td><a href="http://xml.apache.org/xerces-c/">XercesC</a> SAX</td>
<td>N/A</td><td>N/A</td><td>N/A</td>
<td>411 Mclocks</td><td>70 380</td>
<td>411 Mclocks</td><td>70 495</td><td>243 kb</td></tr>
<tr><td><a href="http://xml.apache.org/xerces-c/">XercesC</a> DOM</td>
<td>300 Mclocks</td><td>30 491</td><td>9251 kb</td>
<td>65 Mclocks</td><td>1</td>
<td>367 Mclocks</td><td>30 492</td><td>9251 kb</td></tr>
<tr><td>pugixml</td>
<td>17 Mclocks</td><td>40</td><td>2154 kb</td>
<td>14 Mclocks</td><td>0</td>
<td>32 Mclocks</td><td>40</td><td>2154 kb</td></tr>
<tr><td>pugixml (test of non-destructive parsing)</td>
<td>12 Mclocks</td><td>51</td><td>1632 kb</td>
<td>21 Mclocks</td><td>0</td>
<td>34 Mclocks</td><td>51</td><td>1632 kb</td></tr>
</table>
<p>Note, that non-destructive parsing mode was just a test and is not yet in <i>pugixml</i>.</p>
<hr>
<a name="FAQ">
<h2>FAQ</h2>
<p><b>Q:</b> I do not have/want STL support. How can I compile <i>pugixml</i> without STL?</p>
<p><b>A:</b> There is an undocumented define PUGIXML_NO_STL. If you uncomment the relevant line
in <i>pugixml</i> header file, it will compile without any STL classes. The reason it is undocumented
is that it will make some documented functions not available (specifically, xml_parser() ctor and
parse() function that operate on std::istream, xml_node::path function, utf16 and utf8 conversion
functions). Otherwise, it will work fine.</p>
<p><b>Q:</b> Do paths that are accepted by <b>first_element_by_path</b> have to end with delimiter?</p>
<p><b>A:</b> Either way will work, both /path/to/node/ and /path/to/node are fine.</p>
<p>I'm always open for questions; feel free to write them to <a href="mailto:arseny.kapoulkine@gmail.com">arseny.kapoulkine@gmail.com</a>.
</p>
<hr>
<a name="Bugs">
<h2>Bugs</h2>
<p>I'm always open for bug reports; feel free to write them to <a href="mailto:arseny.kapoulkine@gmail.com">arseny.kapoulkine@gmail.com</a>.
Please provide as much information as possible - version of <i>pugixml</i>, compiling and OS environment
(compiler and its version, STL version, OS version, etc.), the description of the situation in which
the bug arises, the code and data files that show the bug, etc. - the more, the better. Though, please,
do not send executable files.</p>
<p>Note, that you can also submit bug reports/suggestions at
<a href="http://code.google.com/p/pugixml/issues/list">project page</a>.
<hr>
<a name="Future_work">
<h2>Future work</h2>
<p>Here are some improvements that will be done in future versions (they are sorted by priority, the
upper ones will get there sooner).</p>
<ul>
<li>Support for altering the tree (both changing nodes'/attributes' names and values and adding/deleting
attributes/nodes) and writing the tree to stream
<li>Support for UTF-16 files (parsing BOM to get file's type and converting UTF-16 file to UTF-8 buffer
if necessary)
<li>Improved API (I'm going to look at SelectNode from MS XML and perhaps there will be some other
changes)
<li>Externally provided entity reference table (or perhaps even taken from DOCTYPE?)
<li>More intelligent parsing of DOCTYPE (it does not always skip DOCTYPE for now)
<li>XML 1.1 changes (changed EOL handling, normalization issues, etc.)
<li>XPath support
<li>Name your own?
</ul>
<hr>
<a name="Changelog">
<h2>Changelog</h2>
<dl>
<dt>15.07.2006 - v0.1
<dd>First private release for testing purposes
</dd>
<dt>6.11.2006 - v0.2
<dd>First public release. Changes: <ul>
<li>Introduced child_value(name) and child_value_w(name)
<li>Fixed child_value() (for empty nodes)
<li>Fixed xml_parser_impl warning at W4
<li>parse_eol_pcdata and parse_eol_attribute flags + parse_minimal optimizations
<li>Optimizations of strconv_t
</ul>
</dd>
</dl>
<hr>
<a name="Acknowledgements">
<h2>Acknowledgements</h2>
<ul>
<li><a href="mailto:kristen@tima.net">Kristen Wegner</a> for <i>pugxml</i> parser
<li><a href="mailto:readonly@getsoft.com">Neville Franks</a> for contributions to <i>pugxml</i> parser
</ul>
<hr>
<a name="License">
<h2>License</h2>
<p>The <i>pugixml</i> parser is distributed under the MIT license:</p>
<pre>
Copyright (c) 2006 Arseny Kapoulkine
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
</pre>
<hr>
<p>Revised 8 December, 2006</p>
<p><i>© Copyright <a href="mailto:arseny.kapoulkine@gmail.com">Arseny Kapoulkine</a> 2006. All Rights Reserved.</i></p>
</body>
</html>
|