Teuchos - Trilinos Tools Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Teuchos_regex.cpp
1 #include "Teuchos_set.hpp"
2 
3 #include "Teuchos_regex.hpp"
4 
5 #include <iostream>
6 #include <sstream>
7 
8 #include "Teuchos_Assert.hpp"
9 #include "Teuchos_Parser.hpp"
10 #include "Teuchos_vector.hpp"
11 #include "Teuchos_string.hpp"
12 #include "Teuchos_chartab.hpp"
13 #include "Teuchos_Reader.hpp"
14 #include "Teuchos_chartab.hpp"
15 
16 namespace Teuchos {
17 namespace regex {
18 
19 Language make_language() {
20  /* The top produtions were from the "grep.y" YACC grammar in the source
21  code for Plan 9's grep utility, see here:
22 https://github.com/wangeguo/plan9/blob/master/sys/src/cmd/grep/grep.y
23  The "set" related productions
24  are from a grammar intended to be used by ProLog to parse Perl's regular
25  expressions, see here:
26 http://www.cs.sfu.ca/~cameron/Teaching/384/99-3/regexp-plg.html */
27  Language out;
28  Language::Productions& prods = out.productions;
29  prods.resize(NPRODS);
30  prods[PROD_REGEX]("regex") >> "union";
31  prods[PROD_UNION_DECAY]("union") >> "concat";
32  prods[PROD_UNION]("union") >> "union", "|", "concat"; // union
33  prods[PROD_CONCAT_DECAY]("concat") >> "qualified";
34  prods[PROD_CONCAT]("concat") >> "concat", "qualified"; // concatenation
35  prods[PROD_QUAL_DECAY]("qualified") >> "single";
36  prods[PROD_STAR]("qualified") >> "qualified", "*";
37  prods[PROD_PLUS]("qualified") >> "qualified", "+";
38  prods[PROD_MAYBE]("qualified") >> "qualified", "?";
39  prods[PROD_SINGLE_CHAR]("single") >> "char";
40  prods[PROD_ANY]("single") >> "."; // any
41  prods[PROD_SINGLE_SET]("single") >> "set";
42  prods[PROD_PARENS_UNION]("single") >> "(", "union", ")";
43  prods[PROD_SET_POSITIVE]("set") >> "positive-set";
44  prods[PROD_SET_NEGATIVE]("set") >> "negative-set";
45  prods[PROD_POSITIVE_SET]("positive-set") >> "[", "set-items", "]";
46  prods[PROD_NEGATIVE_SET]("negative-set") >> "[", "^", "set-items", "]";
47  prods[PROD_SET_ITEMS_DECAY]("set-items") >> "set-item";
48  prods[PROD_SET_ITEMS_ADD]("set-items") >> "set-items", "set-item";
49  prods[PROD_SET_ITEM_CHAR]("set-item") >> "char";
50  prods[PROD_SET_ITEM_RANGE]("set-item") >> "range";
51  prods[PROD_RANGE]("range") >> "char", "-", "char";
52  out.tokens.resize(NTOKS);
53  /* either one of the non-meta characters, or anything preceded by the escape slash */
54  out.tokens[TOK_CHAR]("char", "[^\\\\\\.\\[\\]\\(\\)\\|\\-\\^\\*\\+\\?]|\\\\.");
55  out.tokens[TOK_DOT](".", "\\.");
56  out.tokens[TOK_LRANGE]("[", "\\]");
57  out.tokens[TOK_RRANGE]("]", "\\]");
58  out.tokens[TOK_LPAREN]("(", "\\(");
59  out.tokens[TOK_RPAREN](")", "\\)");
60  out.tokens[TOK_UNION]("|", "\\|");
61  out.tokens[TOK_RANGE]("-", "\\-");
62  out.tokens[TOK_NEGATE]("^", "\\^");
63  out.tokens[TOK_STAR]("*", "\\*");
64  out.tokens[TOK_PLUS]("+", "\\+");
65  out.tokens[TOK_MAYBE]("?", "\\?");
66  return out;
67 }
68 
69 /* bootstrap ! This lexer is used to build the ReaderTables that read
70  regular expressions themselves, so it can't depend on that reader ! */
71 void make_lexer(FiniteAutomaton& result) {
72  std::string meta_chars_str = ".[]()|-^*+?";
73  std::set<int> all_chars;
74  for (int i = 0; i < NCHARS; ++i) all_chars.insert(i);
75  std::set<int> nonmeta_chars = all_chars;
76  for (int i = 0; i < size(meta_chars_str); ++i) {
77  int meta_char = at(meta_chars_str, i);
78  std::set<int>::iterator it = nonmeta_chars.find(get_symbol(meta_char));
79  nonmeta_chars.erase(it);
80  }
81  FiniteAutomaton lex_nonmeta;
82  make_set_nfa(lex_nonmeta, NCHARS, nonmeta_chars, TOK_CHAR);
83  FiniteAutomaton lex_slash;
84  make_char_single_nfa(lex_slash, '\\');
85  FiniteAutomaton lex_any;
86  make_set_nfa(lex_any, NCHARS, all_chars);
87  FiniteAutomaton lex_escaped;
88  concat(lex_escaped, lex_slash, lex_any, TOK_CHAR);
89  FiniteAutomaton lex_char;
90  unite(lex_char, lex_nonmeta, lex_escaped);
91  FiniteAutomaton lex_metachars;
92  for (int i = 0; i < size(meta_chars_str); ++i) {
93  int token = TOK_CHAR + i + 1;
94  if (i) {
95  FiniteAutomaton lex_metachar;
96  make_char_single_nfa(lex_metachar, at(meta_chars_str, i), token);
97  unite(lex_metachars, lex_metachars, lex_metachar);
98  } else {
99  make_char_single_nfa(lex_metachars, at(meta_chars_str, i), token);
100  }
101  }
102  unite(result, lex_metachars, lex_char);
103  make_deterministic(result, result);
104  simplify(result, result);
105 }
106 
107 ReaderTablesPtr ask_reader_tables() {
108  static ReaderTablesPtr ptr;
109  if (ptr.strong_count() == 0) {
110  RCP<ReaderTables> newptr(new ReaderTables());
111  LanguagePtr lang = regex::ask_language();
112  GrammarPtr grammar = make_grammar(*lang);
113  newptr->parser = make_lalr1_parser(grammar);
114  regex::make_lexer(newptr->lexer);
115  newptr->indent_info.is_sensitive = false;
116  newptr->indent_info.indent_token = -1;
117  newptr->indent_info.dedent_token = -1;
118  ptr = newptr;
119  }
120  return ptr;
121 }
122 
123 LanguagePtr ask_language() {
124  static LanguagePtr ptr;
125  if (ptr.strong_count() == 0) {
126  ptr.reset(new Language(make_language()));
127  }
128  return ptr;
129 }
130 
131 void make_dfa(FiniteAutomaton& result, std::string const& name, std::string const& regex, int token) {
132  using std::swap;
133  regex::Reader reader(token);
134  any result_any;
135  try {
136  reader.read_string(result_any, regex, name);
137  } catch (const Teuchos::ParserFail& e) {
138  std::stringstream ss;
139  ss << e.what() << '\n';
140  ss << "error: couldn't build DFA for token \"" << name << "\" regex \"" << regex << "\"\n";
141  ss << "repeating with DebugReader:\n";
142  DebugReader debug_reader(regex::ask_reader_tables(), ss);
143  debug_reader.read_string(result_any, regex, name);
144  throw ParserFail(ss.str());
145  }
146  swap(any_ref_cast<FiniteAutomaton>(result_any), result);
147 }
148 
149 regex::Reader::Reader(int result_token_in):
150  Teuchos::Reader(regex::ask_reader_tables()),
151  result_token(result_token_in) {
152 }
153 
154 void regex::Reader::at_shift(any& result, int token, std::string& text) {
155  if (token != TOK_CHAR) return;
156  if (size(text) == 1) {
157  result = text[0];
158  } else if (size(text) == 2) {
159  TEUCHOS_ASSERT(text[0] == '\\');
160  result = text[1];
161  } else {
162  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
163  "BUG: regex char text is \"" << text << "\"\n");
164  }
165 }
166 
167 void regex::Reader::at_reduce(any& result_any, int production, std::vector<any>& rhs) {
168  using std::swap;
169  switch (production) {
170  case PROD_REGEX: {
171  swap(result_any, at(rhs, 0));
172  FiniteAutomaton& result = any_ref_cast<FiniteAutomaton>(result_any);
173  make_deterministic(result, result);
174  simplify(result, result);
175  return;
176  }
177  case PROD_UNION_DECAY:
178  case PROD_CONCAT_DECAY:
179  case PROD_QUAL_DECAY:
180  case PROD_SET_ITEMS_DECAY:
181  case PROD_SET_ITEM_RANGE: {
182  swap(result_any, at(rhs, 0));
183  return;
184  }
185  case PROD_UNION: {
186  FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
187  FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
188  FiniteAutomaton& b = any_ref_cast<FiniteAutomaton>(at(rhs, 2));
189  unite(result, a, b);
190  return;
191  }
192  case PROD_CONCAT: {
193  FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
194  FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
195  FiniteAutomaton& b = any_ref_cast<FiniteAutomaton>(at(rhs, 1));
196  concat(result, a, b, result_token);
197  return;
198  }
199  case PROD_STAR: {
200  FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
201  FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
202  star(result, a, result_token);
203  return;
204  }
205  case PROD_PLUS: {
206  FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
207  FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
208  plus(result, a, result_token);
209  return;
210  }
211  case PROD_MAYBE: {
212  FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
213  FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
214  maybe(result, a, result_token);
215  return;
216  }
217  case PROD_SINGLE_CHAR: {
218  FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
219  char c = any_cast<char>(at(rhs, 0));
220  make_char_single_nfa(result, c, result_token);
221  return;
222  }
223  case PROD_ANY: {
224  FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
225  make_range_nfa(result, NCHARS, 0, NCHARS - 1, result_token);
226  return;
227  }
228  case PROD_SINGLE_SET: {
229  FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
230  std::set<char>& charset = any_ref_cast<std::set<char> >(at(rhs, 0));
231  make_char_set_nfa(result, charset, result_token);
232  return;
233  }
234  case PROD_PARENS_UNION: {
235  swap(result_any, at(rhs, 1));
236  return;
237  }
238  case PROD_SET_POSITIVE: {
239  swap(result_any, at(rhs, 0));
240  return;
241  }
242  case PROD_SET_NEGATIVE: {
243  std::set<char>& result = make_any_ref<std::set<char> >(result_any);
244  std::set<char> const& charset = any_ref_cast<std::set<char> >(at(rhs, 0));
245  negate_set(result, charset);
246  return;
247  }
248  case PROD_POSITIVE_SET: {
249  swap(result_any, at(rhs, 1));
250  return;
251  }
252  case PROD_NEGATIVE_SET: {
253  swap(result_any, at(rhs, 2));
254  return;
255  }
256  case PROD_SET_ITEMS_ADD: {
257  std::set<char>& result = make_any_ref<std::set<char> >(result_any);
258  std::set<char>& a = any_ref_cast<std::set<char> >(at(rhs, 0));
259  std::set<char> const& b = any_ref_cast<std::set<char> >(at(rhs, 1));
260  swap(result, a);
261  unite_with(result, b);
262  return;
263  }
264  case PROD_SET_ITEM_CHAR: {
265  std::set<char>& result = make_any_ref<std::set<char> >(result_any);
266  char c = any_cast<char>(at(rhs, 0));
267  result.insert(c);
268  return;
269  }
270  case PROD_RANGE: {
271  std::set<char>& result = make_any_ref<std::set<char> >(result_any);
272  char a = any_cast<char>(at(rhs, 0));
273  char b = any_cast<char>(at(rhs, 2));
274  for (char c = a; c <= b; ++c) {
275  result.insert(c);
276  }
277  return;
278  }
279  }
280  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
281  "BUG: unexpected production " << production << '\n');
282 }
283 
284 } // namespace regex
285 } // namespace Teuchos
Parser make_lalr1_parser(GrammarPtr grammar, bool verbose)
Tries to create LALR(1) parser tables for a given grammar.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Macro for throwing an exception with breakpointing to ease debugging.
Tries to create LALR(1) parser tables for a given grammar.
Productions productions
vector of productions
Declares Teuchos::Parser, ParserFail and make_lalr1_parser.
void make_lexer(FiniteAutomaton &result, Language const &language)
construct a lexer for the Language tokens.
RCP< const ReaderTables > ReaderTablesPtr
an RCP to a const ReaderTables
RCP< const Language > LanguagePtr
an RCP to a const Language
#define TEUCHOS_ASSERT(assertion_test)
This macro is throws when an assert fails.
Declares Teuchos::Reader.