Teuchos - Trilinos Tools Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Teuchos_XMLParser.cpp
1 // @HEADER
2 // *****************************************************************************
3 // Teuchos: Common Tools Package
4 //
5 // Copyright 2004 NTESS and the Teuchos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 // BUGS: There is a bug in Teuchos_XMLObjectImplem.cpp, line 82
11 // when printing attribute values, one must check if the value contains quote
12 // or apost;
13 // a quot'd attval cannot contain literal quot
14 // a apos'd attval cannot contain literal apos
15 // either they have to be matched appropriately or (easier) all quot and apos must
16 // be replaced by &quot; and &apos;
17 
18 #include "Teuchos_XMLParser.hpp"
20 #include "Teuchos_Assert.hpp"
21 #include <stack>
22 
23 using namespace Teuchos;
24 
25 // this parser currently does not support:
26 // * processing instructions
27 // * XML schemas
28 // * CDATA sections...see http://www.w3.org/TR/2004/REC-xml-20040204/#dt-cdsection
29 // * full Unicode support (we read unsigned bytes, so we get only 0x00 through 0xFF)
30 //
31 // it tolerates (read: ignores) xml declarations, at any point in the file where a tag would be valid
32 //
33 // it currently does support:
34 // * comments
35 // * empty element tags, e.g. <hello />
36 // * entity references: &amp; &lt; &gt; &apos; &quot;
37 // * numeric character references: &#32;
38 // * exception/error handling on parse errors
39 
40 
41 /* From the W3C XML 1.0 Third Edition
42  http://www.w3.org/TR/2004/REC-xml-20040204/
43 
44  The following productions specify well-formed XML documents.
45  These have been reduced to the subset that this parser is anticipated to support.
46 
47  element ::= EmptyElemTag
48  | STag content ETag
49  STag ::= '<' Name (S Attribute)* S? '>'
50  Attribute ::= Name Eq AttValue
51  ETag ::= '</' Name S? '>'
52  content ::= CharData? ((element | Reference | CDSect | Comment) CharData?)*
53  EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
54 
55  AttValue ::= '"' ([^<&"] | Reference)* '"'
56  | "'" ([^<&'] | Reference)* "'"
57 
58  CharRef ::= '&#' [0-9]+ ';'
59  EntityRef ::= '&' Name ';'
60  Reference ::= EntityRef | CharRef
61 
62  #x20 (space)
63  #x9 (horizontal tab)
64  #xD (carriage return)
65  #xA (new line, new line line feed)
66 
67  S ::= (#x20 | #x9 | #xD | #xA)+
68  Eq ::= S? '=' S?
69  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
70  Name ::= (Letter | '_' | ':') (NameChar)*
71 
72  Letter ::= [#x0041-#x005A] | [#x0061-#x007A]
73  | [#x00C0-#x00D6] | [#x00D8-#x00F6]
74  | [#x00F8-#x00FF]
75  Digit ::= [#x0030-#x0039]
76 
77  Char ::= #x9 | #xA | #xD | [#x20-#xFF]
78  CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
79  that is, some string of characters not containing '<' or '&' or ']]>'
80  Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
81  that is, '<!--' txt '-->', where txt does not contain '--'
82 
83  CDSect ::= CDStart CData CDEnd
84  CDStart ::= '<![CDATA['
85  CData ::= (Char* - (Char* ']]>' Char*))
86  CDEnd ::= ']]>'
87 
88  document ::= prolog element Misc*
89  prolog ::= XMLDecl? Misc*
90  XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
91  Misc ::= Comment | S
92 
93  VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
94  Eq ::= S? '=' S?
95  VersionNum ::= '1.' [0-9]+
96  Misc ::= Comment | S
97 
98 
99 
100 */
101 
// Convenience wrapper around TEUCHOS_TEST_FOR_EXCEPTION for parse errors:
// T is the test expression and S is streamed (operator<<) into the message
// after the prefix "XML parse error at line <_lineNo>: ". The exception type
// passed on is std::runtime_error. Callers in this file use a true T to
// signal an error, e.g. XMLPARSER_TFE(1, "...") for an unconditional failure.
// The macro reads the member _lineNo, so it is only usable inside XMLParser
// member functions.
102 #define XMLPARSER_TFE( T , S ) \
103  TEUCHOS_TEST_FOR_EXCEPTION( T, std::runtime_error, "XML parse error at line " << _lineNo << ": " << S )
104 
106 {
107 
109 
110  _entities.clear();
111  _entities["apos"] = "'";
112  _entities["quot"] = "\"";
113  _entities["lt"] = "<";
114  _entities["gt"] = ">";
115  _entities["amp"] = "&";
116 
117  bool done = false;
118  int curopen = 0; // number of currently open tags, or "do we process character data?"
119  bool gotRoot = false;
120  std::stack<long> tagLineStarts;
121  std::stack<string> tags;
122 
123  while (!done) {
124 
125  std::string tag, cdata;
126  unsigned char c1, c2;
128 
129  // Consume any whitespace
130  if (curopen == 0) {
131  // this will leave a lookahead in c1
132  c1 = '\0';
133  if ( getSpace(c1) ) {
134  done = true;
135  break;
136  }
137  }
138  else {
139  // need to manually lookahead
140  if (_is->readBytes(&c1,1) < 1) {
141  done = true;
142  break;
143  }
144  if (c1 == '\n') ++_lineNo; // a newline while processing character data; not an error
145  }
146 
147  if (c1 == '<') {
148  // determine if it is a STag/EmptyElemTag or ETag or Comment
149  // get lookahead
150  XMLPARSER_TFE( _is->readBytes(&c2,1) < 1 , "stream ended in tag begin/end");
151 
152  if (c2 == '/') {
153  // we have: </
154  // try to get an ETag
155  getETag(tag);
156  // have to check whether we have an enclosing, otherwise tags and tagLineStarts have no top()
157  XMLPARSER_TFE( curopen == 0, "document not well-formed: encountered end element '" << tag << "' while not enclosed." );
158  XMLPARSER_TFE( handler->endElement(tag)!=0, "document not well-formed: end element tag = '" << tag << "'"
159  << " did not match start element '" << tags.top()
160  << "' from line " << tagLineStarts.top() );
161  curopen--;
162  tagLineStarts.pop();
163  tags.pop();
164  }
165  else if (isLetter(c2) || c2==':' || c2=='_') {
166  // it looks like a STag or an EmptyElemTag
167  bool emptytag;
168  tagLineStarts.push(_lineNo);
169  getSTag(c2, tag, attrs, emptytag);
170  tags.push(tag);
171  handler->startElement(tag,attrs);
172  if (curopen == 0) {
173  XMLPARSER_TFE(gotRoot == true, "document not well-formed: more than one root element specified" );
174  gotRoot = true;
175  }
176  curopen++;
177  if (emptytag) {
178  // we just open this tag, so we should have any trouble closing it
179  XMLPARSER_TFE( handler->endElement(tag)!=0, "unknown failure from handler while processing tag '" << tag << "'" );
180  curopen--;
181  tagLineStarts.pop();
182  tags.pop();
183  }
184  }
185  else if (c2 == '?') {
186  // it is starting to look like an xml declaration
187  XMLPARSER_TFE( assertChar('x') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
188  XMLPARSER_TFE( assertChar('m') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
189  XMLPARSER_TFE( assertChar('l') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
190  ignoreXMLDeclaration();
191  }
192  else if (c2 == '!') {
193  // it is starting to look like a comment; we need '--'
194  // if we don't get this, it means
195  // * the document is not well-formed
196  // * the document employs a feature not supported by this parser,
197  // e.g. <!ELEMENT... <!ATTLIST... <!DOCTYPE... <![CDATA[...
198  XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
199  XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
200  getComment(_lineNo);
201  }
202  else {
203  XMLPARSER_TFE(true, "element not well-formed or exploits unsupported feature" );
204  }
205  }
206  else if ( (curopen > 0) && (c1 == '&') ) {
207  std::string chars = "";
208  getReference(chars);
209  handler->characters(chars);
210  }
211  else if ( (curopen > 0) ) {
212  std::string chars = "";
213  chars.push_back(c1);
214  handler->characters(chars);
215  }
216  else {
217  XMLPARSER_TFE(1 , "document not well-formed: character data outside of an enclosing tag");
218  }
219  }
220 
221  XMLPARSER_TFE( curopen != 0 , "file ended before closing element '" << tags.top() << "' from line " << tagLineStarts.top() );
222 
223  return handler->getObject();
224 
225 }
226 
227 
228 void XMLParser::getETag(std::string &tag)
229 {
230  /* Recall from the specification:
231  ETag ::= '</' Name S? '>'
232  Name ::= (Letter | '_' | ':') (NameChar)*
233 
234  We have already consumed: </
235  */
236 
237  bool tagover = false;
238  unsigned char c;
239  // clear tag
240  tag = "";
241  XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
242  XMLPARSER_TFE( !isLetter(c) && c!='_' && c!=':' , "tag not well-formed");
243  tag.push_back(c);
244  while (1) {
245  XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
246  if ( isNameChar(c) ) {
247  if (tagover) {
248  XMLPARSER_TFE(1, "end element not well-formed: expected '>'");
249  }
250  tag.push_back(c);
251  }
252  else if (isSpace(c)) {
253  // mark the end of the tag and consume the whitespace
254  // if it is ia newline, it isn't an error
255  if (c == '\n') ++_lineNo;
256  tagover = true;
257  }
258  else if (c == '>') {
259  break;
260  }
261  else {
262  XMLPARSER_TFE(1, "end element not well-formed");
263  }
264  }
265 }
266 
267 
268 void XMLParser::getSTag(unsigned char lookahead, std::string &tag, Teuchos::map<std::string,string> &attrs, bool &emptytag)
269 {
270 
271  /* Recall from the specification:
272 
273  STag ::= '<' Name (S Attribute)* S? '>'
274  EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
275  Name ::= (Letter | '_' | ':') (NameChar)*
276  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
277 
278  S ::= (#x20 | #x9 | #xD | #xA)+
279  Attribute ::= Name Eq AttValue
280  Eq ::= S? '=' S?
281  AttValue ::= '"' ([^<&"] | Reference)* '"'
282  | "'" ([^<&'] | Reference)* "'"
283  Reference ::= EntityRef | CharRef
284  CharRef ::= '&#' [0-9]+ ';'
285  EntityRef ::= '&' Name ';'
286 
287  We have already consumed: <lookahead
288  */
289 
290  unsigned char c;
291  attrs.clear();
292 
293  tag = lookahead;
294  // get the rest of the tag: (NameChar)*
295  while (1) {
296  XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before start element was terminated");
297  if (isNameChar(c)) {
298  tag.push_back(c);
299  }
300  else {
301  break;
302  }
303  }
304 
305  // after the name: should be one of the following
306  // (S Attribute) | S? '>' | S? '/>'
307  do {
308 
309  bool hadspace = false;
310 
311  // if space, consume the whitespace
312  if ( isSpace(c) ) {
313  hadspace = true;
314  XMLPARSER_TFE( getSpace(c)!=0, "EOF before start element was terminated");
315  }
316 
317  // now, either Attribute | '>' | '/>'
318  if ( (isLetter(c) || c=='_' || c==':') && hadspace ) {
319 
320  // Attribute
321  // get attribute name, starting with contents of c
322  std::string attname, attval;
323  attname = c;
324  do {
325  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
326  if ( isNameChar(c) ) {
327  attname.push_back(c);
328  }
329  else if ( isSpace(c) || c=='=' ) {
330  break;
331  }
332  else {
333  XMLPARSER_TFE(1, "attribute not well-formed: expected whitespace or '='");
334  }
335  } while (1);
336 
337  // if whitespace, consume it
338  if (isSpace(c)) {
339  getSpace(c);
340  }
341  // should be on '='
342  if (c != '=') {
343  XMLPARSER_TFE(1, "attribute not well-formed: expected '='");
344  }
345 
346  // get any whitespace following the '='
347  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
348  if (isSpace(c)) {
349  getSpace(c);
350  }
351 
352  // now get the quoted attribute value
353  bool apost;
354  attval = "";
355  if (c == '\'') {
356  apost = true;
357  }
358  else if (c == '\"') {
359  apost = false;
360  }
361  else {
362  XMLPARSER_TFE(1, "attribute value must be quoted with either ''' or '\"'");
363  }
364  do {
365  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
366  if (apost && c=='\'') {
367  // end of attval
368  break;
369  }
370  else if (!apost && c=='\"') {
371  // end of attval
372  break;
373  }
374  else if ( c == '&' ) {
375  // finish: need to add support for Reference
376  std::string refstr;
377  getReference(refstr);
378  attval += refstr;
379  }
380  else if ( c!='<' ) {
381  // valid character for attval
382  attval.push_back(c);
383  }
384  else {
385  XMLPARSER_TFE(1, "invalid character in attribute value");
386  }
387  } while(1);
388 
389  // add attribute to list
390  XMLPARSER_TFE( attrs.find(attname) != attrs.end() , "cannot have two attributes with the same name");
391  attrs[attname] = attval;
392  }
393  else if (c == '>') {
394  emptytag = false;
395  break;
396  }
397  else if (c == '/') {
398  XMLPARSER_TFE(assertChar('>')!=0, "empty element tag not well-formed: expected '>'");
399  emptytag = true;
400  break;
401  }
402  else {
403  XMLPARSER_TFE(1, "start element not well-formed: invalid character");
404  }
405 
406  // get next char
407  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
408 
409  } while(1);
410 }
411 
412 
413 void XMLParser::getComment(long /* startLine */)
414 {
415  /* Recall from the specification:
416  Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
417  that is, '<!--' txt '-->', where txt does not contain '--'
418  We have already consumed: <!--
419 
420  Be wary here of the fact that c=='-' implies isChar(c)
421  */
422  unsigned char c;
423  while (1) {
424  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
425  if (c == '\n') ++_lineNo;
426  // if we have a -
427  if (c=='-') {
428  // then it must be the end of the comment or be a Char
429  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
430  if (c == '\n') ++_lineNo;
431  if (c=='-') {
432  // this had better be leading to the end of the comment
433  XMLPARSER_TFE( assertChar('>')!=0, "comment not well-formed: missing expected '>' at line " << _lineNo );
434  break;
435  }
436  else if (!isChar(c)) {
437  XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
438  }
439  }
440  else if (!isChar(c)) {
441  XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
442  }
443  }
444 }
445 
446 
447 void XMLParser::getReference(std::string &refstr) {
448  // finish: does CharRef support only dec, or hex as well?
449  unsigned char c;
450  unsigned int num, base;
451  refstr = "";
452  // none of these bytes read are allowed to be a newline, so don't do any incrementing of _lineNo
453  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
454  if (c == '#') {
455  // get a CharRef
456  // CharRef ::= '&#' [0-9]+ ';'
457  // | '&#x' [0-9]+ ';'
458  // get first number
459  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
460  if (c == 'x') {
461  base = 16;
462  num = 0;
463  }
464  else if ('0' <= c && c <= '9') {
465  base = 10;
466  num = c - '0';
467  }
468  else {
469  XMLPARSER_TFE(1, "invalid character in character reference: expected 'x' or [0-9]");
470  }
471 
472  do {
473  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
474  XMLPARSER_TFE( c != ';' && !('0' <= c && c <= '9') , "invalid character in character reference: expected [0-9] or ';'");
475  if (c == ';') {
476  break;
477  }
478  num = num*base + (c-'0');
479  } while (1);
480  XMLPARSER_TFE(num > 0xFF, "character reference value out of range");
481  refstr.push_back( (unsigned char)num );
482  }
483  else if (isLetter(c) || c=='_' || c==':') {
484  // get an EntityRef
485  // EntityRef ::= '&' Name ';'
486  std::string entname = "";
487  entname.push_back(c);
488  do {
489  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
490  if (c==';') {
491  break;
492  }
493  else if ( isLetter(c) || ('0' <= c && c <= '9')
494  || c=='.' || c=='-' || c=='_' || c==':'
495  || c==0xB7 ) {
496  entname.push_back(c);
497  }
498  else {
499  XMLPARSER_TFE(1, "entity reference not well-formed: invalid character");
500  }
501  } while (1);
502  XMLPARSER_TFE( _entities.find(entname) == _entities.end(), "entity reference not well-formed: undefined entity");
503  refstr = _entities[entname];
504  }
505  else {
506  XMLPARSER_TFE(1, "reference not well-formed: expected name or '#'");
507  }
508 }
509 
510 
511 int XMLParser::getSpace(unsigned char &lookahead) {
512  // if space, consume the whitespace
513  do {
514  if (lookahead == '\n') ++_lineNo;
515  if (_is->readBytes(&lookahead,1) < 1) {
516  return 1; // inform caller that we reached the end
517  }
518  }
519  while (isSpace(lookahead));
520  return 0;
521 }
522 
523 
524 bool XMLParser::isLetter(unsigned char c) {
525  if ( (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) ||
526  (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) ||
527  (0xF8 <= c) /* unsigned char must be <= 0xFF */ )
528  {
529  return true;
530  }
531  return false;
532 }
533 
534 
535 bool XMLParser::isNameChar(unsigned char c) {
536  if ( isLetter(c) || ('0' <= c && c <= '9') ||
537  c=='.' || c=='-' || c=='_' || c==':' || c==0xB7 )
538  {
539  return true;
540  }
541  return false;
542 }
543 
544 
545 bool XMLParser::isSpace(unsigned char c) {
546  if ( c==0x20 || c==0x9 || c==0xD || c==0xA )
547  {
548  return true;
549  }
550  return false;
551 }
552 
553 
554 bool XMLParser::isChar(unsigned char c) {
555  if ( c==0x9 || c==0xA || c==0xD || 0x20 <= c) { // unsigned char must be <= 0xFF
556  return true;
557  }
558  return false;
559 }
560 
561 
562 int XMLParser::assertChar(unsigned char cexp)
563 {
564  // pull the next character off the stream and verify that it is what is expected
565  // if not, return an error to the caller
566  unsigned char c;
567  // don't worry about newlines; assertChar is always wrapped in TEST_FOR_EXCEPTION, so we don't want to advance the line counter
568  if (_is->readBytes(&c,1) < 1) {
569  return 1;
570  }
571  if (c != cexp) {
572  return 2;
573  }
574  return 0;
575 }
576 
577 void XMLParser::ignoreXMLDeclaration()
578 {
579  /* Be a little lax on the spec here; read until we get to '?', then assert '>'
580  We have already consumed: <xml
581  */
582  unsigned char c;
583  while (1) {
584  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating XML declaration begun at line " << _lineNo );
585  if (c == '\n') ++_lineNo;
586  // if we have a -
587  if (c=='?') {
588  // this had better be leading to the end of the declaration
589  XMLPARSER_TFE( assertChar('>')!=0, "XML declaration not well-formed: missing expected '>' at line " << _lineNo );
590  break;
591  }
592  }
593 }
void characters(const std::string &chars)
Process character data.
Defines a class for assembling an XMLObject from XML input.
int endElement(const std::string &tag)
Receive notification of the end of an element.
XMLObject parse()
Consume the XMLInputStream to build an XMLObject.
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Deprecated.
Representation of an XML data tree. XMLObject is a ref-counted handle to a XMLObjectImplem object...
void startElement(const std::string &tag, const Map &attributes)
Receive notification of the start of an element.
const XMLObject & getObject() const
Retrieve the entire XML tree.
TreeBuildingXMLHandler assembles a XMLObject from your XML input.
Smart reference counting pointer class for automatic garbage collection.
A class providing a simple XML parser. Methods can be overloaded to exploit external XML parsing libr...