Teuchos - Trilinos Tools Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Teuchos_XMLParser.cpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Teuchos: Common Tools Package
5 // Copyright (2004) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 // BUGS: There is a bug in Teuchos_XMLObjectImplem.cpp, line 82
43 // when printing attribute values, one must check if the value contains quote
44 // or apost;
45 // a quot'd attval cannot contain literal quot
46 // an apos'd attval cannot contain literal apos
47 // either they have to be matched appropriately or (easier) all quot and apos must
48 // be replaced by &quot; and &apos;
49 
50 #include "Teuchos_XMLParser.hpp"
52 #include "Teuchos_Assert.hpp"
53 #include <stack>
54 
55 using namespace Teuchos;
56 
57 // this parser currently does not support:
58 // * processing instructions
59 // * XML schemas
60 // * CDATA sections...see http://www.w3.org/TR/2004/REC-xml-20040204/#dt-cdsection
61 // * full Unicode support (we read unsigned bytes, so we get only 0x00 through 0xFF)
62 //
63 // it tolerates (read: ignores) xml declarations, at any point in the file where a tag would be valid
64 //
65 // it currently does support:
66 // * comments
67 // * empty element tags, e.g. <hello />
68 // * entity references: &amp; &lt; &gt; &apos; &quot;
69 // * numeric character references: &#32;
70 // * std::exception/error handling on parse errors
71 
72 
73 /* From the W3C XML 1.0 Third Edition
74  http://www.w3.org/TR/2004/REC-xml-20040204/
75 
76  The following productions specify well-formed XML documents.
77 These have been reduced to the subset that this parser is expected to support.
78 
79  element ::= EmptyElemTag
80  | STag content ETag
81  STag ::= '<' Name (S Attribute)* S? '>'
82  Attribute ::= Name Eq AttValue
83  ETag ::= '</' Name S? '>'
84  content ::= CharData? ((element | Reference | CDSect | Comment) CharData?)*
85  EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
86 
87  AttValue ::= '"' ([^<&"] | Reference)* '"'
88  | "'" ([^<&'] | Reference)* "'"
89 
90  CharRef ::= '&#' [0-9]+ ';'
91  EntityRef ::= '&' Name ';'
92  Reference ::= EntityRef | CharRef
93 
94  #x20 (space)
95  #x9 (horizontal tab)
96  #xD (carriage return)
97 #xA (new line / line feed)
98 
99  S ::= (#x20 | #x9 | #xD | #xA)+
100  Eq ::= S? '=' S?
101  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
102  Name ::= (Letter | '_' | ':') (NameChar)*
103 
104  Letter ::= [#x0041-#x005A] | [#x0061-#x007A]
105  | [#x00C0-#x00D6] | [#x00D8-#x00F6]
106  | [#x00F8-#x00FF]
107  Digit ::= [#x0030-#x0039]
108 
109  Char ::= #x9 | #xA | #xD | [#x20-#xFF]
110  CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
111  that is, some std::string of characters not containing '<' or '&' or ']]>'
112  Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
113  that is, '<!--' txt '-->', where txt does not contain '--'
114 
115  CDSect ::= CDStart CData CDEnd
116  CDStart ::= '<![CDATA['
117  CData ::= (Char* - (Char* ']]>' Char*))
118  CDEnd ::= ']]>'
119 
120  document ::= prolog element Misc*
121  prolog ::= XMLDecl? Misc*
122  XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
123  Misc ::= Comment | S
124 
125  VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
126  Eq ::= S? '=' S?
127  VersionNum ::= '1.' [0-9]+
128  Misc ::= Comment | S
129 
130 
131 
132 */
133 
// Parse-error helper: forwards the test T to TEUCHOS_TEST_FOR_EXCEPTION with
// std::runtime_error as the exception type, and prefixes the streamed message S
// with "XML parse error at line <_lineNo>: ".
// The expansion references _lineNo, so the macro can only be used where that
// name is in scope.
134 #define XMLPARSER_TFE( T , S ) \
135 TEUCHOS_TEST_FOR_EXCEPTION( T, std::runtime_error, "XML parse error at line " << _lineNo << ": " << S )
136 
// NOTE(review): the function signature (listing line 137) and the declarations of
// `handler` (listing line 140) and `attrs` (listing line 159) are missing from this
// listing. Presumably this is the body of XMLParser::parse() -- confirm against the
// real source file before editing.
//
// Overview: reads the input one byte at a time, dispatches on '<' to the tag /
// comment / declaration helpers, forwards start/end/character events to `handler`,
// and finally returns handler->getObject(). Parse errors are reported through
// XMLPARSER_TFE.
138 {
139 
141 
// Register the five entity names that getReference() is able to resolve.
142 _entities.clear();
143 _entities["apos"] = "'";
144 _entities["quot"] = "\"";
145 _entities["lt"] = "<";
146 _entities["gt"] = ">";
147 _entities["amp"] = "&";
148 
149 bool done = false;
150 int curopen = 0; // number of currently open tags, or "do we process character data?"
151 bool gotRoot = false;
// tagLineStarts and tags are pushed/popped together: one entry per open element,
// used only to build error messages.
152 std::stack<long> tagLineStarts;
153 std::stack<string> tags;
154 
// Main loop: obtain one byte of lookahead in c1, then dispatch on it.
155 while (!done) {
156 
157 std::string tag, cdata;
158 unsigned char c1, c2;
160 
161 // Consume any whitespace
// Outside of any element whitespace is skipped; inside an element every byte
// (including whitespace) is character data and is read individually.
162 if (curopen == 0) {
163 // this will leave a lookahead in c1
164 c1 = '\0';
165 if ( getSpace(c1) ) {
166 done = true;
167 break;
168 }
169 }
170 else {
171 // need to manually lookahead
172 if (_is->readBytes(&c1,1) < 1) {
173 done = true;
174 break;
175 }
176 if (c1 == '\n') ++_lineNo; // a newline while processing character data; not an error
177 }
178 
179 if (c1 == '<') {
180 // determine if it is a STag/EmptyElemTag or ETag or Comment
181 // get lookahead
182 XMLPARSER_TFE( _is->readBytes(&c2,1) < 1 , "stream ended in tag begin/end");
183 
184 if (c2 == '/') {
185 // we have: </
186 // try to get an ETag
187 getETag(tag);
188 // have to check whether we have an enclosing, otherwise tags and tagLineStarts have no top()
189 XMLPARSER_TFE( curopen == 0, "document not well-formed: encountered end element '" << tag << "' while not enclosed." );
190 XMLPARSER_TFE( handler->endElement(tag)!=0, "document not well-formed: end element tag = '" << tag << "'"
191 << " did not match start element '" << tags.top()
192 << "' from line " << tagLineStarts.top() );
193 curopen--;
194 tagLineStarts.pop();
195 tags.pop();
196 }
197 else if (isLetter(c2) || c2==':' || c2=='_') {
198 // it looks like a STag or an EmptyElemTag
199 bool emptytag;
200 tagLineStarts.push(_lineNo);
201 getSTag(c2, tag, attrs, emptytag);
202 tags.push(tag);
203 handler->startElement(tag,attrs);
// A start tag seen while nothing is open is a root element; only one is allowed.
204 if (curopen == 0) {
205 XMLPARSER_TFE(gotRoot == true, "document not well-formed: more than one root element specified" );
206 gotRoot = true;
207 }
208 curopen++;
209 if (emptytag) {
210 // we just opened this tag, so we should not have any trouble closing it
211 XMLPARSER_TFE( handler->endElement(tag)!=0, "unknown failure from handler while processing tag '" << tag << "'" );
212 curopen--;
213 tagLineStarts.pop();
214 tags.pop();
215 }
216 }
217 else if (c2 == '?') {
218 // it is starting to look like an xml declaration
219 XMLPARSER_TFE( assertChar('x') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
220 XMLPARSER_TFE( assertChar('m') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
221 XMLPARSER_TFE( assertChar('l') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
222 ignoreXMLDeclaration();
223 }
224 else if (c2 == '!') {
225 // it is starting to look like a comment; we need '--'
226 // if we don't get this, it means
227 // * the document is not well-formed
228 // * the document employs a feature not supported by this parser,
229 // e.g. <!ELEMENT... <!ATTLIST... <!DOCTYPE... <![CDATA[...
230 XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
231 XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
232 getComment(_lineNo);
233 }
234 else {
235 XMLPARSER_TFE(true, "element not well-formed or exploits unsupported feature" );
236 }
237 }
// Inside an element: '&' starts a reference, whose expansion is passed on as
// character data.
238 else if ( (curopen > 0) && (c1 == '&') ) {
239 std::string chars = "";
240 getReference(chars);
241 handler->characters(chars);
242 }
// Inside an element: any other byte is passed on as a one-character string.
243 else if ( (curopen > 0) ) {
244 std::string chars = "";
245 chars.push_back(c1);
246 handler->characters(chars);
247 }
248 else {
249 XMLPARSER_TFE(1 , "document not well-formed: character data outside of an enclosing tag");
250 }
251 }
252 
// Input is exhausted; any element still open at this point was never closed.
253 XMLPARSER_TFE( curopen != 0 , "file ended before closing element '" << tags.top() << "' from line " << tagLineStarts.top() );
254 
255 return handler->getObject();
256 
257 }
258 
259 
260 void XMLParser::getETag(std::string &tag)
261 {
262  /* Recall from the specification:
263  ETag ::= '</' Name S? '>'
264  Name ::= (Letter | '_' | ':') (NameChar)*
265 
266  We have already consumed: </
267  */
268 
269  bool tagover = false;
270  unsigned char c;
271  // clear tag
272  tag = "";
273  XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
274  XMLPARSER_TFE( !isLetter(c) && c!='_' && c!=':' , "tag not well-formed");
275  tag.push_back(c);
276  while (1) {
277  XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
278  if ( isNameChar(c) ) {
279  if (tagover) {
280  XMLPARSER_TFE(1, "end element not well-formed: expected '>'");
281  }
282  tag.push_back(c);
283  }
284  else if (isSpace(c)) {
285  // mark the end of the tag and consume the whitespace
286  // if it is ia newline, it isn't an error
287  if (c == '\n') ++_lineNo;
288  tagover = true;
289  }
290  else if (c == '>') {
291  break;
292  }
293  else {
294  XMLPARSER_TFE(1, "end element not well-formed");
295  }
296  }
297 }
298 
299 
300 void XMLParser::getSTag(unsigned char lookahead, std::string &tag, Teuchos::map<std::string,string> &attrs, bool &emptytag)
301 {
302 
303  /* Recall from the specification:
304 
305  STag ::= '<' Name (S Attribute)* S? '>'
306  EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
307  Name ::= (Letter | '_' | ':') (NameChar)*
308  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
309 
310  S ::= (#x20 | #x9 | #xD | #xA)+
311  Attribute ::= Name Eq AttValue
312  Eq ::= S? '=' S?
313  AttValue ::= '"' ([^<&"] | Reference)* '"'
314  | "'" ([^<&'] | Reference)* "'"
315  Reference ::= EntityRef | CharRef
316  CharRef ::= '&#' [0-9]+ ';'
317  EntityRef ::= '&' Name ';'
318 
319  We have already consumed: <lookahead
320  */
321 
322  unsigned char c;
323  attrs.clear();
324 
325  tag = lookahead;
326  // get the rest of the tag: (NameChar)*
327  while (1) {
328  XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before start element was terminated");
329  if (isNameChar(c)) {
330  tag.push_back(c);
331  }
332  else {
333  break;
334  }
335  }
336 
337  // after the name: should be one of the following
338  // (S Attribute) | S? '>' | S? '/>'
339  do {
340 
341  bool hadspace = false;
342 
343  // if space, consume the whitespace
344  if ( isSpace(c) ) {
345  hadspace = true;
346  XMLPARSER_TFE( getSpace(c)!=0, "EOF before start element was terminated");
347  }
348 
349  // now, either Attribute | '>' | '/>'
350  if ( (isLetter(c) || c=='_' || c==':') && hadspace ) {
351 
352  // Attribute
353  // get attribute name, starting with contents of c
354  std::string attname, attval;
355  attname = c;
356  do {
357  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
358  if ( isNameChar(c) ) {
359  attname.push_back(c);
360  }
361  else if ( isSpace(c) || c=='=' ) {
362  break;
363  }
364  else {
365  XMLPARSER_TFE(1, "attribute not well-formed: expected whitespace or '='");
366  }
367  } while (1);
368 
369  // if whitespace, consume it
370  if (isSpace(c)) {
371  getSpace(c);
372  }
373  // should be on '='
374  if (c != '=') {
375  XMLPARSER_TFE(1, "attribute not well-formed: expected '='");
376  }
377 
378  // get any whitespace following the '='
379  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
380  if (isSpace(c)) {
381  getSpace(c);
382  }
383 
384  // now get the quoted attribute value
385  bool apost;
386  attval = "";
387  if (c == '\'') {
388  apost = true;
389  }
390  else if (c == '\"') {
391  apost = false;
392  }
393  else {
394  XMLPARSER_TFE(1, "attribute value must be quoted with either ''' or '\"'");
395  }
396  do {
397  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
398  if (apost && c=='\'') {
399  // end of attval
400  break;
401  }
402  else if (!apost && c=='\"') {
403  // end of attval
404  break;
405  }
406  else if ( c == '&' ) {
407  // finish: need to add support for Reference
408  std::string refstr;
409  getReference(refstr);
410  attval += refstr;
411  }
412  else if ( c!='<' ) {
413  // valid character for attval
414  attval.push_back(c);
415  }
416  else {
417  XMLPARSER_TFE(1, "invalid character in attribute value");
418  }
419  } while(1);
420 
421  // add attribute to list
422  XMLPARSER_TFE( attrs.find(attname) != attrs.end() , "cannot have two attributes with the same name");
423  attrs[attname] = attval;
424  }
425  else if (c == '>') {
426  emptytag = false;
427  break;
428  }
429  else if (c == '/') {
430  XMLPARSER_TFE(assertChar('>')!=0, "empty element tag not well-formed: expected '>'");
431  emptytag = true;
432  break;
433  }
434  else {
435  XMLPARSER_TFE(1, "start element not well-formed: invalid character");
436  }
437 
438  // get next char
439  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
440 
441  } while(1);
442 }
443 
444 
445 void XMLParser::getComment(long /* startLine */)
446 {
447  /* Recall from the specification:
448  Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
449  that is, '<!--' txt '-->', where txt does not contain '--'
450  We have already consumed: <!--
451 
452  Be wary here of the fact that c=='-' implies isChar(c)
453  */
454  unsigned char c;
455  while (1) {
456  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
457  if (c == '\n') ++_lineNo;
458  // if we have a -
459  if (c=='-') {
460  // then it must be the end of the comment or be a Char
461  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
462  if (c == '\n') ++_lineNo;
463  if (c=='-') {
464  // this had better be leading to the end of the comment
465  XMLPARSER_TFE( assertChar('>')!=0, "comment not well-formed: missing expected '>' at line " << _lineNo );
466  break;
467  }
468  else if (!isChar(c)) {
469  XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
470  }
471  }
472  else if (!isChar(c)) {
473  XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
474  }
475  }
476 }
477 
478 
479 void XMLParser::getReference(std::string &refstr) {
480  // finish: does CharRef support only dec, or hex as well?
481  unsigned char c;
482  unsigned int num, base;
483  refstr = "";
484  // none of these bytes read are allowed to be a newline, so don't do any incrementing of _lineNo
485  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
486  if (c == '#') {
487  // get a CharRef
488  // CharRef ::= '&#' [0-9]+ ';'
489  // | '&#x' [0-9]+ ';'
490  // get first number
491  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
492  if (c == 'x') {
493  base = 16;
494  num = 0;
495  }
496  else if ('0' <= c && c <= '9') {
497  base = 10;
498  num = c - '0';
499  }
500  else {
501  XMLPARSER_TFE(1, "invalid character in character reference: expected 'x' or [0-9]");
502  }
503 
504  do {
505  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
506  XMLPARSER_TFE( c != ';' && !('0' <= c && c <= '9') , "invalid character in character reference: expected [0-9] or ';'");
507  if (c == ';') {
508  break;
509  }
510  num = num*base + (c-'0');
511  } while (1);
512  XMLPARSER_TFE(num > 0xFF, "character reference value out of range");
513  refstr.push_back( (unsigned char)num );
514  }
515  else if (isLetter(c) || c=='_' || c==':') {
516  // get an EntityRef
517  // EntityRef ::= '&' Name ';'
518  std::string entname = "";
519  entname.push_back(c);
520  do {
521  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
522  if (c==';') {
523  break;
524  }
525  else if ( isLetter(c) || ('0' <= c && c <= '9')
526  || c=='.' || c=='-' || c=='_' || c==':'
527  || c==0xB7 ) {
528  entname.push_back(c);
529  }
530  else {
531  XMLPARSER_TFE(1, "entity reference not well-formed: invalid character");
532  }
533  } while (1);
534  XMLPARSER_TFE( _entities.find(entname) == _entities.end(), "entity reference not well-formed: undefined entity");
535  refstr = _entities[entname];
536  }
537  else {
538  XMLPARSER_TFE(1, "reference not well-formed: expected name or '#'");
539  }
540 }
541 
542 
543 int XMLParser::getSpace(unsigned char &lookahead) {
544  // if space, consume the whitespace
545  do {
546  if (lookahead == '\n') ++_lineNo;
547  if (_is->readBytes(&lookahead,1) < 1) {
548  return 1; // inform caller that we reached the end
549  }
550  }
551  while (isSpace(lookahead));
552  return 0;
553 }
554 
555 
556 bool XMLParser::isLetter(unsigned char c) {
557  if ( (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) ||
558  (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) ||
559  (0xF8 <= c) /* unsigned char must be <= 0xFF */ )
560  {
561  return true;
562  }
563  return false;
564 }
565 
566 
567 bool XMLParser::isNameChar(unsigned char c) {
568  if ( isLetter(c) || ('0' <= c && c <= '9') ||
569  c=='.' || c=='-' || c=='_' || c==':' || c==0xB7 )
570  {
571  return true;
572  }
573  return false;
574 }
575 
576 
577 bool XMLParser::isSpace(unsigned char c) {
578  if ( c==0x20 || c==0x9 || c==0xD || c==0xA )
579  {
580  return true;
581  }
582  return false;
583 }
584 
585 
586 bool XMLParser::isChar(unsigned char c) {
587  if ( c==0x9 || c==0xA || c==0xD || 0x20 <= c) { // unsigned char must be <= 0xFF
588  return true;
589  }
590  return false;
591 }
592 
593 
594 int XMLParser::assertChar(unsigned char cexp)
595 {
596  // pull the next character off the stream and verify that it is what is expected
597  // if not, return an error to the caller
598  unsigned char c;
599  // don't worry about newlines; assertChar is always wrapped in TEST_FOR_EXCEPTION, so we don't want to advance the line counter
600  if (_is->readBytes(&c,1) < 1) {
601  return 1;
602  }
603  if (c != cexp) {
604  return 2;
605  }
606  return 0;
607 }
608 
609 void XMLParser::ignoreXMLDeclaration()
610 {
611  /* Be a little lax on the spec here; read until we get to '?', then assert '>'
612  We have already consumed: <xml
613  */
614  unsigned char c;
615  while (1) {
616  XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating XML declaration begun at line " << _lineNo );
617  if (c == '\n') ++_lineNo;
618  // if we have a -
619  if (c=='?') {
620  // this had better be leading to the end of the declaration
621  XMLPARSER_TFE( assertChar('>')!=0, "XML declaration not well-formed: missing expected '>' at line " << _lineNo );
622  break;
623  }
624  }
625 }
void characters(const std::string &chars)
Process character data.
Defines a class for assembling an XMLObject from XML input.
int endElement(const std::string &tag)
Receive notification of the end of an element.
XMLObject parse()
Consume the XMLInputStream to build an XMLObject.
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Deprecated.
Representation of an XML data tree. XMLObject is a ref-counted handle to a XMLObjectImplem object...
void startElement(const std::string &tag, const Map &attributes)
Receive notification of the start of an element.
const XMLObject & getObject() const
Retrieve the entire XML tree.
TreeBuildingXMLHandler assembles a XMLObject from your XML input.
Smart reference counting pointer class for automatic garbage collection.
A class providing a simple XML parser. Methods can be overloaded to exploit external XML parsing libr...