Teuchos - Trilinos Tools Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Teuchos_FiniteAutomaton.cpp
1 #include "Teuchos_FiniteAutomaton.hpp"
2 
3 #include <set>
4 #include <map>
5 #include <queue>
6 #include <utility>
7 #include <memory>
8 #include <limits>
9 
10 #include "Teuchos_chartab.hpp"
11 #include "Teuchos_RCP.hpp"
12 #include "Teuchos_Table.hpp"
13 
14 namespace Teuchos {
15 
16 template struct Table<int>;
17 
18 FiniteAutomaton::FiniteAutomaton(int nsymbols_init, bool is_deterministic_init,
19  int nstates_reserve):
20  table(nsymbols_init + (is_deterministic_init ? 0 : 2), nstates_reserve),
21  is_deterministic(is_deterministic_init)
22 {
23  reserve(accepted_tokens, nstates_reserve);
24 }
25 
26 void FiniteAutomaton::swap(FiniteAutomaton& other) {
27  using std::swap;
28  swap(table, other.table);
29  swap(accepted_tokens, other.accepted_tokens);
30  swap(is_deterministic, other.is_deterministic);
31 }
32 
33 int get_nstates(FiniteAutomaton const& fa) {
34  return get_nrows(fa.table);
35 }
36 
37 int get_nsymbols(FiniteAutomaton const& fa) {
38  return get_ncols(fa.table) - (fa.is_deterministic ? 0 : 2);
39 }
40 
41 bool get_determinism(FiniteAutomaton const& fa) {
42  return fa.is_deterministic;
43 }
44 
45 int get_epsilon0(FiniteAutomaton const& fa) {
46  TEUCHOS_ASSERT(!fa.is_deterministic);
47  return get_ncols(fa.table) - 2;
48 }
49 
50 int get_epsilon1(FiniteAutomaton const& fa) {
51  TEUCHOS_ASSERT(!fa.is_deterministic);
52  return get_ncols(fa.table) - 1;
53 }
54 
55 int add_state(FiniteAutomaton& fa) {
56  int state = get_nstates(fa);
57  resize(fa.table, state + 1, get_ncols(fa.table));
58  for (int j = 0; j < get_ncols(fa.table); ++j) {
59  at(fa.table, state, j) = -1;
60  }
61  fa.accepted_tokens.push_back(-1);
62  return state;
63 }
64 
65 void add_transition(FiniteAutomaton& fa, int from_state, int at_symbol, int to_state) {
66  TEUCHOS_ASSERT(0 <= to_state);
67  TEUCHOS_ASSERT(to_state < get_nstates(fa));
68  TEUCHOS_ASSERT(0 <= at_symbol);
69  TEUCHOS_ASSERT(at_symbol < get_ncols(fa.table)); // allow setting epsilon transitions
70  TEUCHOS_ASSERT(at(fa.table, from_state, at_symbol) == -1);
71  at(fa.table, from_state, at_symbol) = to_state;
72 }
73 
74 void add_accept(FiniteAutomaton& fa, int state, int token) {
75  TEUCHOS_ASSERT(0 <= token);
76  at(fa.accepted_tokens, state) = token;
77 }
78 
79 void remove_accept(FiniteAutomaton& fa, int state) {
80  at(fa.accepted_tokens, state) = -1;
81 }
82 
83 int step(FiniteAutomaton const& fa, int state, int symbol) {
84  TEUCHOS_ASSERT(0 <= state);
85  TEUCHOS_ASSERT(state < get_nstates(fa));
86  TEUCHOS_ASSERT(0 <= symbol);
87  TEUCHOS_ASSERT(symbol < get_ncols(fa.table)); // allow getting epsilon transitions
88  return at(fa.table, state, symbol);
89 }
90 
91 int accepts(FiniteAutomaton const& fa, int state) {
92  return at(fa.accepted_tokens, state);
93 }
94 
95 int get_nsymbols_eps(FiniteAutomaton const& fa) {
96  return get_ncols(fa.table);
97 }
98 
99 void append_states(FiniteAutomaton& fa, FiniteAutomaton const& other) {
100  TEUCHOS_ASSERT(get_nsymbols(other) == get_nsymbols(fa));
101  bool other_determ = get_determinism(other);
102  if (!other_determ) TEUCHOS_ASSERT(!fa.is_deterministic);
103  int offset = get_nstates(fa);
104  for (int other_state = 0; other_state < get_nstates(other); ++other_state) {
105  int my_state = add_state(fa);
106  int token = accepts(other, other_state);
107  if (0 <= token) add_accept(fa, my_state, token);
108  }
109  for (int other_state = 0; other_state < get_nstates(other); ++other_state) {
110  int my_state = other_state + offset;
111  for (int symbol = 0; symbol < get_nsymbols_eps(other); ++symbol) {
112  int other_next = step(other, other_state, symbol);
113  if (other_next < 0) continue;
114  int my_next = other_next + offset;
115  add_transition(fa, my_state, symbol, my_next);
116  }
117  }
118 }
119 
120 void make_single_nfa(FiniteAutomaton& result, int nsymbols, int symbol, int token) {
121  make_range_nfa(result, nsymbols, symbol, symbol, token);
122 }
123 
124 void make_set_nfa(FiniteAutomaton& result, int nsymbols, std::set<int> const& accepted, int token) {
125  using std::swap;
126  FiniteAutomaton out(nsymbols, true, 2);
127  int start_state = add_state(out);
128  int accept_state = add_state(out);
129  for (std::set<int>::const_iterator it = accepted.begin(); it != accepted.end(); ++it) {
130  int i = *it;
131  add_transition(out, start_state, i, accept_state);
132  }
133  add_accept(out, accept_state, token);
134  swap(result, out);
135 }
136 
137 void make_range_nfa(FiniteAutomaton& result, int nsymbols, int range_start, int range_end, int token) {
138  using std::swap;
139  TEUCHOS_ASSERT(0 <= range_start);
140  TEUCHOS_ASSERT(range_start <= range_end);
141  TEUCHOS_ASSERT(range_end <= nsymbols);
142  FiniteAutomaton out(nsymbols, true, 2);
143  int start_state = add_state(out);
144  int accept_state = add_state(out);
145  for (int i = range_start; i <= range_end; ++i) {
146  add_transition(out, start_state, i, accept_state);
147  }
148  add_accept(out, accept_state, token);
149  swap(result, out);
150 }
151 
152 void unite(FiniteAutomaton& result, FiniteAutomaton const& a, FiniteAutomaton const& b) {
153  using std::swap;
154  int nsymbols = get_nsymbols(a);
155  FiniteAutomaton out(nsymbols, false, 1 + get_nstates(a) + get_nstates(b));
156  int start_state = add_state(out);
157  int a_offset = get_nstates(out);
158  append_states(out, a);
159  int b_offset = get_nstates(out);
160  append_states(out, b);
161  int epsilon0 = get_epsilon0(out);
162  int epsilon1 = get_epsilon1(out);
163  add_transition(out, start_state, epsilon0, a_offset);
164  add_transition(out, start_state, epsilon1, b_offset);
165  using std::swap;
166  swap(out, result);
167 }
168 
169 void concat(FiniteAutomaton& result, FiniteAutomaton const& a, FiniteAutomaton const& b, int token) {
170  int nsymbols = get_nsymbols(a);
171  FiniteAutomaton out(nsymbols, false, get_nstates(a) + get_nstates(b));
172  append_states(out, a);
173  int b_offset = get_nstates(out);
174  append_states(out, b);
175  int epsilon0 = get_epsilon0(out);
176  for (int i = 0; i < get_nstates(a); ++i) {
177  if (accepts(a, i) != -1) {
178  add_transition(out, i, epsilon0, b_offset);
179  remove_accept(out, i);
180  }
181  }
182  for (int i = 0; i < get_nstates(b); ++i) {
183  if (accepts(b, i) != -1) {
184  add_accept(out, i + b_offset, token);
185  }
186  }
187  swap(result, out);
188 }
189 
190 void plus(FiniteAutomaton& result, FiniteAutomaton const& a, int token) {
191  using std::swap;
192  FiniteAutomaton out(get_nsymbols(a), false, get_nstates(a) + 1);
193  append_states(out, a);
194  int new_accept_state = add_state(out);
195  add_accept(out, new_accept_state, token);
196  int epsilon0 = get_epsilon0(out);
197  int epsilon1 = get_epsilon1(out);
198  for (int i = 0; i < get_nstates(a); ++i) {
199  if (accepts(a, i) != -1) {
200  add_transition(out, i, epsilon0, new_accept_state);
201  /* we follow a convention that accepting
202  states should not have epsilon transitions */
203  add_transition(out, i, epsilon1, 0);
204  remove_accept(out, i);
205  }
206  }
207  swap(result, out);
208 }
209 
210 void maybe(FiniteAutomaton& result, FiniteAutomaton const& a, int token) {
211  using std::swap;
212  FiniteAutomaton out(get_nsymbols(a), false, get_nstates(a) + 2);
213  int new_start_state = add_state(out);
214  int offset = get_nstates(out);
215  append_states(out, a);
216  int new_accept_state = add_state(out);
217  int epsilon0 = get_epsilon0(out);
218  int epsilon1 = get_epsilon1(out);
219  add_transition(out, new_start_state, epsilon1, offset);
220  /* form an epsilon0 linked list of new start state,
221  all old accepting states, and new accepting state */
222  int last = new_start_state;
223  for (int i = 0; i < get_nstates(a); ++i) {
224  if (accepts(a, i) != -1) {
225  add_transition(out, last, epsilon0, i + offset);
226  remove_accept(out, i + offset);
227  last = i + offset;
228  }
229  }
230  add_transition(out, last, epsilon0, new_accept_state);
231  add_accept(out, new_accept_state, token);
232  swap(result, out);
233 }
234 
235 void star(FiniteAutomaton& result, FiniteAutomaton const& a, int token) {
236  using std::swap;
237  plus(result, a, token);
238  maybe(result, result, token);
239 }
240 
241 typedef std::set<int> StateSet;
242 
243 static StateSet step(StateSet const& ss, int symbol, FiniteAutomaton const& fa) {
244  StateSet next_ss;
245  for (StateSet::const_iterator it = ss.begin(); it != ss.end(); ++it) {
246  int state = *it;
247  int next_state = step(fa, state, symbol);
248  if (next_state != -1) next_ss.insert(next_state);
249  }
250  return next_ss;
251 }
252 
253 typedef std::queue<int> StateQueue;
254 
255 static StateSet get_epsilon_closure(StateSet ss, FiniteAutomaton const& fa) {
256  StateQueue q;
257  for (StateSet::const_iterator it = ss.begin(); it != ss.end(); ++it) {
258  int state = *it;
259  q.push(state);
260  }
261  int epsilon0 = get_epsilon0(fa);
262  int epsilon1 = get_epsilon1(fa);
263  while (!q.empty()) {
264  int state = q.front(); q.pop();
265  for (int epsilon = epsilon0; epsilon <= epsilon1; ++epsilon) {
266  int next_state = step(fa, state, epsilon);
267  if (next_state == -1) continue;
268  if (!ss.count(next_state)) {
269  ss.insert(next_state);
270  q.push(next_state);
271  }
272  }
273  }
274  return ss;
275 }
276 
277 struct StateSetPtrLess {
278  bool operator()(StateSet* a, StateSet* b) const {
279  return *a < *b;
280  }
281 };
282 
283 typedef std::map<StateSet*,int,StateSetPtrLess> StateSetPtr2State;
284 typedef RCP<StateSet> StateSetPtr;
285 typedef std::vector<StateSetPtr> StateSetUniqPtrVector;
286 
287 static void add_back(StateSetUniqPtrVector& ssupv, StateSet& ss) {
288  using std::swap;
289  StateSetPtr ptr(new StateSet());
290  swap(*ptr, ss);
291  ssupv.push_back(ptr);
292 }
293 
294 /* powerset construction, NFA -> DFA */
295 void make_deterministic(FiniteAutomaton& result, FiniteAutomaton& nfa) {
296  using std::swap;
297  if (get_determinism(nfa)) {
298  swap(result, nfa);
299  return;
300  }
301  StateSetPtr2State ssp2s;
302  StateSetUniqPtrVector ssupv;
303  FiniteAutomaton out(get_nsymbols(nfa), true, 0);
304  StateSet start_ss;
305  start_ss.insert(0);
306  start_ss = get_epsilon_closure(start_ss, nfa);
307  add_back(ssupv, start_ss);
308  ssp2s[ssupv.back().get()] = add_state(out);
309  int front = 0;
310  while (front < int(ssupv.size())) {
311  int state = front;
312  StateSet& ss = *at(ssupv, front);
313  ++front;
314  for (int symbol = 0; symbol < get_nsymbols(nfa); ++symbol) {
315  StateSet next_ss = Teuchos::step(ss, symbol, nfa);
316  if (next_ss.empty()) continue;
317  next_ss = get_epsilon_closure(next_ss, nfa);
318  int next_state;
319  StateSetPtr2State::iterator it = ssp2s.find(&next_ss);
320  if (it == ssp2s.end()) {
321  next_state = add_state(out);
322  add_back(ssupv, next_ss);
323  ssp2s[ssupv.back().get()] = next_state;
324  } else {
325  next_state = it->second;
326  }
327  add_transition(out, state, symbol, next_state);
328  }
329  int min_accepted = -1;
330  for (StateSet::const_iterator it = ss.begin(); it != ss.end(); ++it) {
331  int nfa_state = *it;
332  int nfa_token = accepts(nfa, nfa_state);
333  if (nfa_token == -1) continue;
334  if (min_accepted == -1 || nfa_token < min_accepted) {
335  min_accepted = nfa_token;
336  }
337  }
338  if (min_accepted != -1) add_accept(out, state, min_accepted);
339  }
340  swap(result, out);
341 }
342 
343 struct StateRowLess {
344  Table<int> const& table;
345  std::vector<int> const& accepted;
346  bool operator()(int const& a, int const& b) const {
347  int aa = at(accepted, a);
348  int ab = at(accepted, b);
349  if (aa != ab) return aa < ab;
350  for (int symbol = 0, ncols = get_ncols(table); symbol < ncols; ++symbol) {
351  int na = at(table, a, symbol);
352  int nb = at(table, b, symbol);
353  if (na != nb) return na < nb;
354  }
355  return false;
356  }
357  StateRowLess(Table<int> const& t, std::vector<int> const& a):
358  table(t),accepted(a) {
359  }
360 };
361 
362 typedef std::map<int, int, StateRowLess> StateRow2SimpleState;
363 
364 void simplify_once(FiniteAutomaton& result, FiniteAutomaton const& fa) {
365  using std::swap;
366  StateRow2SimpleState sr2ss(StateRowLess(fa.table, fa.accepted_tokens));
367  int nsimple = 0;
368  for (int state = 0; state < get_nstates(fa); ++state) {
369  std::pair<StateRow2SimpleState::iterator, bool> res =
370  sr2ss.insert(std::make_pair(state, nsimple));
371  if (res.second) {
372  ++nsimple;
373  }
374  }
375  FiniteAutomaton out(get_nsymbols(fa), get_determinism(fa), nsimple);
376  for (int simple = 0; simple < nsimple; ++simple) {
377  add_state(out);
378  }
379  std::vector<bool> did_simple(size_t(nsimple), false);
380  for (int state = 0; state < get_nstates(fa); ++state) {
381  TEUCHOS_ASSERT(sr2ss.count(state));
382  int simple = sr2ss[state];
383  if (at(did_simple, simple)) continue;
384  for (int symbol = 0; symbol < get_nsymbols_eps(fa); ++symbol) {
385  int next_state = step(fa, state, symbol);
386  if (next_state == -1) continue;
387  TEUCHOS_ASSERT(sr2ss.count(next_state));
388  int next_simple = sr2ss[next_state];
389  add_transition(out, simple, symbol, next_simple);
390  }
391  int token = accepts(fa, state);
392  if (token != -1) {
393  add_accept(out, simple, token);
394  }
395  at(did_simple, simple) = true;
396  }
397  swap(result, out);
398 }
399 
400 void simplify(FiniteAutomaton& result, FiniteAutomaton const& fa) {
401  using std::swap;
402  FiniteAutomaton prev;
403  FiniteAutomaton next = fa;
404  int nstates_next = get_nstates(next);
405  int nstates;
406  int i = 0;
407  do {
408  swap(prev, next);
409  nstates = nstates_next;
410  simplify_once(next, prev);
411  ++i;
412  nstates_next = get_nstates(next);
413  } while (nstates_next < nstates);
414  swap(result, next);
415 }
416 
417 void make_char_nfa(FiniteAutomaton& result, bool is_deterministic_init, int nstates_reserve) {
418  using std::swap;
419  FiniteAutomaton tmp(Teuchos::NCHARS, is_deterministic_init, nstates_reserve);
420  swap(result, tmp);
421 }
422 
423 void add_char_transition(FiniteAutomaton& fa, int from_state, char at_char, int to_state) {
424  add_transition(fa, from_state, get_symbol(at_char), to_state);
425 }
426 
427 template <typename T, bool is_signed = std::numeric_limits<T>::is_signed>
428 struct IsSymbol;
429 
430 template <typename T>
431 struct IsSymbol<T, true> {
432  static bool eval(T c) {
433  if (c < 0) return false;
434  return 0 <= Teuchos::chartab[int(c)];
435  }
436 };
437 
438 template <typename T>
439 struct IsSymbol<T, false> {
440  static bool eval(T c) {
441  if (c >= TEUCHOS_CHARTAB_SIZE) return false;
442  return 0 <= Teuchos::chartab[int(c)];
443  }
444 };
445 
446 bool is_symbol(char c) {
447  return IsSymbol<char>::eval(c);
448 }
449 
450 template <typename T, bool is_signed = std::numeric_limits<T>::is_signed>
451 struct GetSymbol;
452 
453 template <typename T>
454 struct GetSymbol<T, true> {
455  static int eval(T c) {
456  TEUCHOS_ASSERT(0 <= c);
457  int symbol = Teuchos::chartab[int(c)];
458  TEUCHOS_ASSERT(0 <= symbol);
459  return symbol;
460  }
461 };
462 
463 template <typename T>
464 struct GetSymbol<T, false> {
465  static int eval(T c) {
466  int symbol = Teuchos::chartab[int(c)];
467  TEUCHOS_ASSERT(0 <= symbol);
468  return symbol;
469  }
470 };
471 
472 int get_symbol(char c) {
473  return GetSymbol<char>::eval(c);
474 }
475 
476 char get_char(int symbol) {
477  TEUCHOS_ASSERT(0 <= symbol);
478  TEUCHOS_ASSERT(symbol < Teuchos::NCHARS);
479  return inv_chartab[symbol];
480 }
481 
482 void make_char_set_nfa(FiniteAutomaton& result, std::set<char> const& accepted, int token) {
483  std::set<int> symbol_set;
484  for (std::set<char>::const_iterator it = accepted.begin(); it != accepted.end(); ++it) {
485  char c = *it;
486  symbol_set.insert(get_symbol(c));
487  }
488  return make_set_nfa(result, Teuchos::NCHARS, symbol_set, token);
489 }
490 
491 void make_char_range_nfa(FiniteAutomaton& result, char range_start, char range_end, int token) {
492  return make_range_nfa(result, Teuchos::NCHARS, get_symbol(range_start), get_symbol(range_end), token);
493 }
494 
495 void make_char_single_nfa(FiniteAutomaton& result, char symbol_char, int token) {
496  return make_range_nfa(result, Teuchos::NCHARS, get_symbol(symbol_char), get_symbol(symbol_char), token);
497 }
498 
499 void negate_set(std::set<char>& result, std::set<char> const& s) {
500  using std::swap;
501  std::set<char> out;
502  for (int symbol = 0; symbol < NCHARS; ++symbol) {
503  char c = inv_chartab[symbol];
504  if (!s.count(c)) out.insert(c);
505  }
506  swap(result, out);
507 }
508 
509 std::ostream& operator<<(std::ostream& os, FiniteAutomaton const& fa) {
510  if (get_determinism(fa)) os << "dfa ";
511  else os << "nfa ";
512  os << get_nstates(fa) << " states " << get_nsymbols(fa) << " symbols\n";
513  for (int state = 0; state < get_nstates(fa); ++state) {
514  for (int symbol = 0; symbol < get_nsymbols(fa); ++symbol) {
515  int next_state = step(fa, state, symbol);
516  if (next_state != -1) os << "(" << state << ", " << symbol << ") -> " << next_state << '\n';
517  }
518  if (!get_determinism(fa)) {
519  for (int symbol = get_epsilon0(fa); symbol <= get_epsilon1(fa); ++symbol) {
520  int next_state = step(fa, state, symbol);
521  if (next_state != -1) os << "(" << state << ", eps" << (symbol - get_epsilon0(fa)) << ") -> " << next_state << '\n';
522  }
523  }
524  int token = accepts(fa, state);
525  if (token != -1) os << state << " accepts " << token << '\n';
526  }
527  return os;
528 }
529 
530 } // end namespace Teuchos
#define TEUCHOS_ASSERT(assertion_test)
This macro is throws when an assert fails.
Reference-counted pointer class and non-member templated function implementations.