Teuchos_regex.cpp
// @HEADER
// *****************************************************************************
// Teuchos: Common Tools Package
//
// Copyright 2004 NTESS and the Teuchos contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#include "Teuchos_set.hpp"

#include "Teuchos_regex.hpp"

#include <iostream>
#include <sstream>

#include "Teuchos_Assert.hpp"
#include "Teuchos_Parser.hpp"
#include "Teuchos_vector.hpp"
#include "Teuchos_string.hpp"
#include "Teuchos_chartab.hpp"
#include "Teuchos_Reader.hpp"

namespace Teuchos {
namespace regex {

Language make_language() {
  /* The top productions were from the "grep.y" YACC grammar in the source
     code for Plan 9's grep utility, see here:
     https://github.com/wangeguo/plan9/blob/master/sys/src/cmd/grep/grep.y
     The "set" related productions are from a grammar intended to be used by
     Prolog to parse Perl's regular expressions, see here:
     http://www.cs.sfu.ca/~cameron/Teaching/384/99-3/regexp-plg.html */
  Language out;
  Language::Productions& prods = out.productions;
  prods.resize(NPRODS);
  prods[PROD_REGEX]("regex") >> "union";
  prods[PROD_UNION_DECAY]("union") >> "concat";
  prods[PROD_UNION]("union") >> "union", "|", "concat"; // union
  prods[PROD_CONCAT_DECAY]("concat") >> "qualified";
  prods[PROD_CONCAT]("concat") >> "concat", "qualified"; // concatenation
  prods[PROD_QUAL_DECAY]("qualified") >> "single";
  prods[PROD_STAR]("qualified") >> "qualified", "*";
  prods[PROD_PLUS]("qualified") >> "qualified", "+";
  prods[PROD_MAYBE]("qualified") >> "qualified", "?";
  prods[PROD_SINGLE_CHAR]("single") >> "char";
  prods[PROD_ANY]("single") >> "."; // any
  prods[PROD_SINGLE_SET]("single") >> "set";
  prods[PROD_PARENS_UNION]("single") >> "(", "union", ")";
  prods[PROD_SET_POSITIVE]("set") >> "positive-set";
  prods[PROD_SET_NEGATIVE]("set") >> "negative-set";
  prods[PROD_POSITIVE_SET]("positive-set") >> "[", "set-items", "]";
  prods[PROD_NEGATIVE_SET]("negative-set") >> "[", "^", "set-items", "]";
  prods[PROD_SET_ITEMS_DECAY]("set-items") >> "set-item";
  prods[PROD_SET_ITEMS_ADD]("set-items") >> "set-items", "set-item";
  prods[PROD_SET_ITEM_CHAR]("set-item") >> "char";
  prods[PROD_SET_ITEM_RANGE]("set-item") >> "range";
  prods[PROD_RANGE]("range") >> "char", "-", "char";
  out.tokens.resize(NTOKS);
  /* either one of the non-meta characters, or anything preceded by the escape slash */
  out.tokens[TOK_CHAR]("char", "[^\\\\\\.\\[\\]\\(\\)\\|\\-\\^\\*\\+\\?]|\\\\.");
  out.tokens[TOK_DOT](".", "\\.");
  out.tokens[TOK_LRANGE]("[", "\\[");
  out.tokens[TOK_RRANGE]("]", "\\]");
  out.tokens[TOK_LPAREN]("(", "\\(");
  out.tokens[TOK_RPAREN](")", "\\)");
  out.tokens[TOK_UNION]("|", "\\|");
  out.tokens[TOK_RANGE]("-", "\\-");
  out.tokens[TOK_NEGATE]("^", "\\^");
  out.tokens[TOK_STAR]("*", "\\*");
  out.tokens[TOK_PLUS]("+", "\\+");
  out.tokens[TOK_MAYBE]("?", "\\?");
  return out;
}
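
/* A minimal usage sketch of the Language built above (this mirrors what
   ask_reader_tables() below does): the productions become an LALR(1) parser,
   while the matching lexer is hand-built by regex::make_lexer() because the
   regex reader cannot be used to compile its own token regexes.

     Language lang = make_language();
     GrammarPtr grammar = make_grammar(lang);
     Parser parser = make_lalr1_parser(grammar);
     FiniteAutomaton lexer;
     regex::make_lexer(lexer);
*/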

/* bootstrap ! This lexer is used to build the ReaderTables that read
   regular expressions themselves, so it can't depend on that reader ! */
void make_lexer(FiniteAutomaton& result) {
  std::string meta_chars_str = ".[]()|-^*+?";
  std::set<int> all_chars;
  for (int i = 0; i < NCHARS; ++i) all_chars.insert(i);
  std::set<int> nonmeta_chars = all_chars;
  for (int i = 0; i < Teuchos::size(meta_chars_str); ++i) {
    int meta_char = at(meta_chars_str, i);
    std::set<int>::iterator it = nonmeta_chars.find(get_symbol(meta_char));
    nonmeta_chars.erase(it);
  }
  FiniteAutomaton lex_nonmeta;
  make_set_nfa(lex_nonmeta, NCHARS, nonmeta_chars, TOK_CHAR);
  FiniteAutomaton lex_slash;
  make_char_single_nfa(lex_slash, '\\');
  FiniteAutomaton lex_any;
  make_set_nfa(lex_any, NCHARS, all_chars);
  FiniteAutomaton lex_escaped;
  concat(lex_escaped, lex_slash, lex_any, TOK_CHAR);
  FiniteAutomaton lex_char;
  unite(lex_char, lex_nonmeta, lex_escaped);
  FiniteAutomaton lex_metachars;
  for (int i = 0; i < Teuchos::size(meta_chars_str); ++i) {
    int token = TOK_CHAR + i + 1;
    if (i) {
      FiniteAutomaton lex_metachar;
      make_char_single_nfa(lex_metachar, at(meta_chars_str, i), token);
      unite(lex_metachars, lex_metachars, lex_metachar);
    } else {
      make_char_single_nfa(lex_metachars, at(meta_chars_str, i), token);
    }
  }
  unite(result, lex_metachars, lex_char);
  make_deterministic(result, result);
  simplify(result, result);
}
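
/* Worked example: the lexer built above tokenizes the input text "a\.b*" as
   'a' via the non-meta branch (TOK_CHAR), the escaped "\." via the
   slash-plus-anything branch (also TOK_CHAR), 'b' (TOK_CHAR), and '*' as its
   own meta-character token. Each meta character at index i of meta_chars_str
   receives the token TOK_CHAR + i + 1, which assumes the token enumeration
   lists the meta-character tokens in that same order. */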

ReaderTablesPtr ask_reader_tables() {
  static ReaderTablesPtr ptr;
  if (ptr.strong_count() == 0) {
    RCP<ReaderTables> newptr(new ReaderTables());
    LanguagePtr lang = regex::ask_language();
    GrammarPtr grammar = make_grammar(*lang);
    newptr->parser = make_lalr1_parser(grammar);
    regex::make_lexer(newptr->lexer);
    newptr->indent_info.is_sensitive = false;
    newptr->indent_info.indent_token = -1;
    newptr->indent_info.dedent_token = -1;
    ptr = newptr;
  }
  return ptr;
}

LanguagePtr ask_language() {
  static LanguagePtr ptr;
  if (ptr.strong_count() == 0) {
    ptr.reset(new Language(make_language()));
  }
  return ptr;
}

void make_dfa(FiniteAutomaton& result, std::string const& name, std::string const& regex, int token) {
  using std::swap;
  regex::Reader reader(token);
  any result_any;
  try {
    reader.read_string(result_any, regex, name);
  } catch (const Teuchos::ParserFail& e) {
    std::stringstream ss;
    ss << e.what() << '\n';
    ss << "error: couldn't build DFA for token \"" << name << "\" regex \"" << regex << "\"\n";
    ss << "repeating with DebugReader:\n";
    DebugReader debug_reader(regex::ask_reader_tables(), ss);
    debug_reader.read_string(result_any, regex, name);
    throw ParserFail(ss.str());
  }
  swap(any_ref_cast<FiniteAutomaton>(result_any), result);
}
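
/* A minimal usage sketch for make_dfa (the token name "identifier", the
   regex, and the token value 42 are illustrative assumptions):

     FiniteAutomaton dfa;
     regex::make_dfa(dfa, "identifier", "[_a-zA-Z][_a-zA-Z0-9]*", 42);

   On success, dfa is the simplified deterministic automaton for the regex,
   with accepting states labeled by token 42; on failure a ParserFail is
   thrown, and the error path re-reads the regex with a DebugReader to
   produce a more detailed message. */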
157 
158 regex::Reader::Reader(int result_token_in):
159  Teuchos::Reader(regex::ask_reader_tables()),
160  result_token(result_token_in) {
161 }
162 
163 void regex::Reader::at_shift(any& result, int token, std::string& text) {
164  if (token != TOK_CHAR) return;
165  if (Teuchos::size(text) == 1) {
166  result = text[0];
167  } else if (Teuchos::size(text) == 2) {
168  TEUCHOS_ASSERT(text[0] == '\\');
169  result = text[1];
170  } else {
171  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
172  "BUG: regex char text is \"" << text << "\"\n");
173  }
174 }
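
/* For example, at_shift turns the single-character token text "a" into the
   char value 'a', and the two-character escaped text "\*" into '*'; tokens
   other than TOK_CHAR are left with an empty value at shift time. */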

void regex::Reader::at_reduce(any& result_any, int production, std::vector<any>& rhs) {
  using std::swap;
  switch (production) {
    case PROD_REGEX: {
      swap(result_any, at(rhs, 0));
      FiniteAutomaton& result = any_ref_cast<FiniteAutomaton>(result_any);
      make_deterministic(result, result);
      simplify(result, result);
      return;
    }
    case PROD_UNION_DECAY:
    case PROD_CONCAT_DECAY:
    case PROD_QUAL_DECAY:
    case PROD_SET_ITEMS_DECAY:
    case PROD_SET_ITEM_RANGE: {
      swap(result_any, at(rhs, 0));
      return;
    }
    case PROD_UNION: {
      FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
      FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
      FiniteAutomaton& b = any_ref_cast<FiniteAutomaton>(at(rhs, 2));
      unite(result, a, b);
      return;
    }
    case PROD_CONCAT: {
      FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
      FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
      FiniteAutomaton& b = any_ref_cast<FiniteAutomaton>(at(rhs, 1));
      concat(result, a, b, result_token);
      return;
    }
    case PROD_STAR: {
      FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
      FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
      star(result, a, result_token);
      return;
    }
    case PROD_PLUS: {
      FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
      FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
      plus(result, a, result_token);
      return;
    }
    case PROD_MAYBE: {
      FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
      FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
      maybe(result, a, result_token);
      return;
    }
    case PROD_SINGLE_CHAR: {
      FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
      char c = any_cast<char>(at(rhs, 0));
      make_char_single_nfa(result, c, result_token);
      return;
    }
    case PROD_ANY: {
      FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
      make_range_nfa(result, NCHARS, 0, NCHARS - 1, result_token);
      return;
    }
    case PROD_SINGLE_SET: {
      FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
      std::set<char>& charset = any_ref_cast<std::set<char> >(at(rhs, 0));
      make_char_set_nfa(result, charset, result_token);
      return;
    }
    case PROD_PARENS_UNION: {
      swap(result_any, at(rhs, 1));
      return;
    }
    case PROD_SET_POSITIVE: {
      swap(result_any, at(rhs, 0));
      return;
    }
    case PROD_SET_NEGATIVE: {
      std::set<char>& result = make_any_ref<std::set<char> >(result_any);
      std::set<char> const& charset = any_ref_cast<std::set<char> >(at(rhs, 0));
      negate_set(result, charset);
      return;
    }
    case PROD_POSITIVE_SET: {
      swap(result_any, at(rhs, 1));
      return;
    }
    case PROD_NEGATIVE_SET: {
      swap(result_any, at(rhs, 2));
      return;
    }
    case PROD_SET_ITEMS_ADD: {
      std::set<char>& result = make_any_ref<std::set<char> >(result_any);
      std::set<char>& a = any_ref_cast<std::set<char> >(at(rhs, 0));
      std::set<char> const& b = any_ref_cast<std::set<char> >(at(rhs, 1));
      swap(result, a);
      unite_with(result, b);
      return;
    }
    case PROD_SET_ITEM_CHAR: {
      std::set<char>& result = make_any_ref<std::set<char> >(result_any);
      char c = any_cast<char>(at(rhs, 0));
      result.insert(c);
      return;
    }
    case PROD_RANGE: {
      std::set<char>& result = make_any_ref<std::set<char> >(result_any);
      char a = any_cast<char>(at(rhs, 0));
      char b = any_cast<char>(at(rhs, 2));
      for (char c = a; c <= b; ++c) {
        result.insert(c);
      }
      return;
    }
  }
  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
    "BUG: unexpected production " << production << '\n');
}
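
/* Worked example of the reductions above, for the regex "a|b*":
   PROD_SINGLE_CHAR builds single-character NFAs for 'a' and 'b';
   PROD_STAR wraps the 'b' automaton with star(); the decay productions
   simply pass their sole right-hand-side value through; PROD_UNION unites
   the two branches; and the top-level PROD_REGEX makes the result
   deterministic and simplifies it, with accepting states labeled by
   result_token. */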

} // namespace regex
} // namespace Teuchos