Teuchos - Trilinos Tools Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Teuchos_FiniteAutomaton.cpp
1 // @HEADER
2 // *****************************************************************************
3 // Teuchos: Common Tools Package
4 //
5 // Copyright 2004 NTESS and the Teuchos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #include "Teuchos_FiniteAutomaton.hpp"
11 
12 #include <set>
13 #include <map>
14 #include <queue>
15 #include <utility>
16 #include <memory>
17 #include <limits>
18 
19 #include "Teuchos_chartab.hpp"
20 #include "Teuchos_RCP.hpp"
21 #include "Teuchos_Table.hpp"
22 
23 namespace Teuchos {
24 
25 template struct Table<int>;
26 
27 FiniteAutomaton::FiniteAutomaton(int nsymbols_init, bool is_deterministic_init,
28  int nstates_reserve):
29  table(nsymbols_init + (is_deterministic_init ? 0 : 2), nstates_reserve),
30  is_deterministic(is_deterministic_init)
31 {
32  reserve(accepted_tokens, nstates_reserve);
33 }
34 
35 void FiniteAutomaton::swap(FiniteAutomaton& other) {
36  using std::swap;
37  swap(table, other.table);
38  swap(accepted_tokens, other.accepted_tokens);
39  swap(is_deterministic, other.is_deterministic);
40 }
41 
42 int get_nstates(FiniteAutomaton const& fa) {
43  return get_nrows(fa.table);
44 }
45 
46 int get_nsymbols(FiniteAutomaton const& fa) {
47  return get_ncols(fa.table) - (fa.is_deterministic ? 0 : 2);
48 }
49 
50 bool get_determinism(FiniteAutomaton const& fa) {
51  return fa.is_deterministic;
52 }
53 
54 int get_epsilon0(FiniteAutomaton const& fa) {
55  TEUCHOS_ASSERT(!fa.is_deterministic);
56  return get_ncols(fa.table) - 2;
57 }
58 
59 int get_epsilon1(FiniteAutomaton const& fa) {
60  TEUCHOS_ASSERT(!fa.is_deterministic);
61  return get_ncols(fa.table) - 1;
62 }
63 
64 int add_state(FiniteAutomaton& fa) {
65  int state = get_nstates(fa);
66  resize(fa.table, state + 1, get_ncols(fa.table));
67  for (int j = 0; j < get_ncols(fa.table); ++j) {
68  at(fa.table, state, j) = -1;
69  }
70  fa.accepted_tokens.push_back(-1);
71  return state;
72 }
73 
74 void add_transition(FiniteAutomaton& fa, int from_state, int at_symbol, int to_state) {
75  TEUCHOS_ASSERT(0 <= to_state);
76  TEUCHOS_ASSERT(to_state < get_nstates(fa));
77  TEUCHOS_ASSERT(0 <= at_symbol);
78  TEUCHOS_ASSERT(at_symbol < get_ncols(fa.table)); // allow setting epsilon transitions
79  TEUCHOS_ASSERT(at(fa.table, from_state, at_symbol) == -1);
80  at(fa.table, from_state, at_symbol) = to_state;
81 }
82 
83 void add_accept(FiniteAutomaton& fa, int state, int token) {
84  TEUCHOS_ASSERT(0 <= token);
85  at(fa.accepted_tokens, state) = token;
86 }
87 
88 void remove_accept(FiniteAutomaton& fa, int state) {
89  at(fa.accepted_tokens, state) = -1;
90 }
91 
92 int step(FiniteAutomaton const& fa, int state, int symbol) {
93  TEUCHOS_ASSERT(0 <= state);
94  TEUCHOS_ASSERT(state < get_nstates(fa));
95  TEUCHOS_ASSERT(0 <= symbol);
96  TEUCHOS_ASSERT(symbol < get_ncols(fa.table)); // allow getting epsilon transitions
97  return at(fa.table, state, symbol);
98 }
99 
100 int accepts(FiniteAutomaton const& fa, int state) {
101  return at(fa.accepted_tokens, state);
102 }
103 
104 int get_nsymbols_eps(FiniteAutomaton const& fa) {
105  return get_ncols(fa.table);
106 }
107 
108 void append_states(FiniteAutomaton& fa, FiniteAutomaton const& other) {
109  TEUCHOS_ASSERT(get_nsymbols(other) == get_nsymbols(fa));
110  bool other_determ = get_determinism(other);
111  if (!other_determ) TEUCHOS_ASSERT(!fa.is_deterministic);
112  int offset = get_nstates(fa);
113  for (int other_state = 0; other_state < get_nstates(other); ++other_state) {
114  int my_state = add_state(fa);
115  int token = accepts(other, other_state);
116  if (0 <= token) add_accept(fa, my_state, token);
117  }
118  for (int other_state = 0; other_state < get_nstates(other); ++other_state) {
119  int my_state = other_state + offset;
120  for (int symbol = 0; symbol < get_nsymbols_eps(other); ++symbol) {
121  int other_next = step(other, other_state, symbol);
122  if (other_next < 0) continue;
123  int my_next = other_next + offset;
124  add_transition(fa, my_state, symbol, my_next);
125  }
126  }
127 }
128 
129 void make_single_nfa(FiniteAutomaton& result, int nsymbols, int symbol, int token) {
130  make_range_nfa(result, nsymbols, symbol, symbol, token);
131 }
132 
133 void make_set_nfa(FiniteAutomaton& result, int nsymbols, std::set<int> const& accepted, int token) {
134  using std::swap;
135  FiniteAutomaton out(nsymbols, true, 2);
136  int start_state = add_state(out);
137  int accept_state = add_state(out);
138  for (std::set<int>::const_iterator it = accepted.begin(); it != accepted.end(); ++it) {
139  int i = *it;
140  add_transition(out, start_state, i, accept_state);
141  }
142  add_accept(out, accept_state, token);
143  swap(result, out);
144 }
145 
146 void make_range_nfa(FiniteAutomaton& result, int nsymbols, int range_start, int range_end, int token) {
147  using std::swap;
148  TEUCHOS_ASSERT(0 <= range_start);
149  TEUCHOS_ASSERT(range_start <= range_end);
150  TEUCHOS_ASSERT(range_end <= nsymbols);
151  FiniteAutomaton out(nsymbols, true, 2);
152  int start_state = add_state(out);
153  int accept_state = add_state(out);
154  for (int i = range_start; i <= range_end; ++i) {
155  add_transition(out, start_state, i, accept_state);
156  }
157  add_accept(out, accept_state, token);
158  swap(result, out);
159 }
160 
161 void unite(FiniteAutomaton& result, FiniteAutomaton const& a, FiniteAutomaton const& b) {
162  using std::swap;
163  int nsymbols = get_nsymbols(a);
164  FiniteAutomaton out(nsymbols, false, 1 + get_nstates(a) + get_nstates(b));
165  int start_state = add_state(out);
166  int a_offset = get_nstates(out);
167  append_states(out, a);
168  int b_offset = get_nstates(out);
169  append_states(out, b);
170  int epsilon0 = get_epsilon0(out);
171  int epsilon1 = get_epsilon1(out);
172  add_transition(out, start_state, epsilon0, a_offset);
173  add_transition(out, start_state, epsilon1, b_offset);
174  using std::swap;
175  swap(out, result);
176 }
177 
178 void concat(FiniteAutomaton& result, FiniteAutomaton const& a, FiniteAutomaton const& b, int token) {
179  int nsymbols = get_nsymbols(a);
180  FiniteAutomaton out(nsymbols, false, get_nstates(a) + get_nstates(b));
181  append_states(out, a);
182  int b_offset = get_nstates(out);
183  append_states(out, b);
184  int epsilon0 = get_epsilon0(out);
185  for (int i = 0; i < get_nstates(a); ++i) {
186  if (accepts(a, i) != -1) {
187  add_transition(out, i, epsilon0, b_offset);
188  remove_accept(out, i);
189  }
190  }
191  for (int i = 0; i < get_nstates(b); ++i) {
192  if (accepts(b, i) != -1) {
193  add_accept(out, i + b_offset, token);
194  }
195  }
196  swap(result, out);
197 }
198 
199 void plus(FiniteAutomaton& result, FiniteAutomaton const& a, int token) {
200  using std::swap;
201  FiniteAutomaton out(get_nsymbols(a), false, get_nstates(a) + 1);
202  append_states(out, a);
203  int new_accept_state = add_state(out);
204  add_accept(out, new_accept_state, token);
205  int epsilon0 = get_epsilon0(out);
206  int epsilon1 = get_epsilon1(out);
207  for (int i = 0; i < get_nstates(a); ++i) {
208  if (accepts(a, i) != -1) {
209  add_transition(out, i, epsilon0, new_accept_state);
210  /* we follow a convention that accepting
211  states should not have epsilon transitions */
212  add_transition(out, i, epsilon1, 0);
213  remove_accept(out, i);
214  }
215  }
216  swap(result, out);
217 }
218 
219 void maybe(FiniteAutomaton& result, FiniteAutomaton const& a, int token) {
220  using std::swap;
221  FiniteAutomaton out(get_nsymbols(a), false, get_nstates(a) + 2);
222  int new_start_state = add_state(out);
223  int offset = get_nstates(out);
224  append_states(out, a);
225  int new_accept_state = add_state(out);
226  int epsilon0 = get_epsilon0(out);
227  int epsilon1 = get_epsilon1(out);
228  add_transition(out, new_start_state, epsilon1, offset);
229  /* form an epsilon0 linked list of new start state,
230  all old accepting states, and new accepting state */
231  int last = new_start_state;
232  for (int i = 0; i < get_nstates(a); ++i) {
233  if (accepts(a, i) != -1) {
234  add_transition(out, last, epsilon0, i + offset);
235  remove_accept(out, i + offset);
236  last = i + offset;
237  }
238  }
239  add_transition(out, last, epsilon0, new_accept_state);
240  add_accept(out, new_accept_state, token);
241  swap(result, out);
242 }
243 
244 void star(FiniteAutomaton& result, FiniteAutomaton const& a, int token) {
245  using std::swap;
246  plus(result, a, token);
247  maybe(result, result, token);
248 }
249 
250 typedef std::set<int> StateSet;
251 
252 static StateSet step(StateSet const& ss, int symbol, FiniteAutomaton const& fa) {
253  StateSet next_ss;
254  for (StateSet::const_iterator it = ss.begin(); it != ss.end(); ++it) {
255  int state = *it;
256  int next_state = step(fa, state, symbol);
257  if (next_state != -1) next_ss.insert(next_state);
258  }
259  return next_ss;
260 }
261 
262 typedef std::queue<int> StateQueue;
263 
264 static StateSet get_epsilon_closure(StateSet ss, FiniteAutomaton const& fa) {
265  StateQueue q;
266  for (StateSet::const_iterator it = ss.begin(); it != ss.end(); ++it) {
267  int state = *it;
268  q.push(state);
269  }
270  int epsilon0 = get_epsilon0(fa);
271  int epsilon1 = get_epsilon1(fa);
272  while (!q.empty()) {
273  int state = q.front(); q.pop();
274  for (int epsilon = epsilon0; epsilon <= epsilon1; ++epsilon) {
275  int next_state = step(fa, state, epsilon);
276  if (next_state == -1) continue;
277  if (!ss.count(next_state)) {
278  ss.insert(next_state);
279  q.push(next_state);
280  }
281  }
282  }
283  return ss;
284 }
285 
286 struct StateSetPtrLess {
287  bool operator()(StateSet* a, StateSet* b) const {
288  return *a < *b;
289  }
290 };
291 
292 typedef std::map<StateSet*,int,StateSetPtrLess> StateSetPtr2State;
293 typedef RCP<StateSet> StateSetPtr;
294 typedef std::vector<StateSetPtr> StateSetUniqPtrVector;
295 
296 static void add_back(StateSetUniqPtrVector& ssupv, StateSet& ss) {
297  using std::swap;
298  StateSetPtr ptr(new StateSet());
299  swap(*ptr, ss);
300  ssupv.push_back(ptr);
301 }
302 
303 /* powerset construction, NFA -> DFA */
304 void make_deterministic(FiniteAutomaton& result, FiniteAutomaton& nfa) {
305  using std::swap;
306  if (get_determinism(nfa)) {
307  swap(result, nfa);
308  return;
309  }
310  StateSetPtr2State ssp2s;
311  StateSetUniqPtrVector ssupv;
312  FiniteAutomaton out(get_nsymbols(nfa), true, 0);
313  StateSet start_ss;
314  start_ss.insert(0);
315  start_ss = get_epsilon_closure(start_ss, nfa);
316  add_back(ssupv, start_ss);
317  ssp2s[ssupv.back().get()] = add_state(out);
318  int front = 0;
319  while (front < int(ssupv.size())) {
320  int state = front;
321  StateSet& ss = *at(ssupv, front);
322  ++front;
323  for (int symbol = 0; symbol < get_nsymbols(nfa); ++symbol) {
324  StateSet next_ss = Teuchos::step(ss, symbol, nfa);
325  if (next_ss.empty()) continue;
326  next_ss = get_epsilon_closure(next_ss, nfa);
327  int next_state;
328  StateSetPtr2State::iterator it = ssp2s.find(&next_ss);
329  if (it == ssp2s.end()) {
330  next_state = add_state(out);
331  add_back(ssupv, next_ss);
332  ssp2s[ssupv.back().get()] = next_state;
333  } else {
334  next_state = it->second;
335  }
336  add_transition(out, state, symbol, next_state);
337  }
338  int min_accepted = -1;
339  for (StateSet::const_iterator it = ss.begin(); it != ss.end(); ++it) {
340  int nfa_state = *it;
341  int nfa_token = accepts(nfa, nfa_state);
342  if (nfa_token == -1) continue;
343  if (min_accepted == -1 || nfa_token < min_accepted) {
344  min_accepted = nfa_token;
345  }
346  }
347  if (min_accepted != -1) add_accept(out, state, min_accepted);
348  }
349  swap(result, out);
350 }
351 
352 struct StateRowLess {
353  Table<int> const& table;
354  std::vector<int> const& accepted;
355  bool operator()(int const& a, int const& b) const {
356  int aa = at(accepted, a);
357  int ab = at(accepted, b);
358  if (aa != ab) return aa < ab;
359  for (int symbol = 0, ncols = get_ncols(table); symbol < ncols; ++symbol) {
360  int na = at(table, a, symbol);
361  int nb = at(table, b, symbol);
362  if (na != nb) return na < nb;
363  }
364  return false;
365  }
366  StateRowLess(Table<int> const& t, std::vector<int> const& a):
367  table(t),accepted(a) {
368  }
369 };
370 
371 typedef std::map<int, int, StateRowLess> StateRow2SimpleState;
372 
373 void simplify_once(FiniteAutomaton& result, FiniteAutomaton const& fa) {
374  using std::swap;
375  StateRow2SimpleState sr2ss(StateRowLess(fa.table, fa.accepted_tokens));
376  int nsimple = 0;
377  for (int state = 0; state < get_nstates(fa); ++state) {
378  std::pair<StateRow2SimpleState::iterator, bool> res =
379  sr2ss.insert(std::make_pair(state, nsimple));
380  if (res.second) {
381  ++nsimple;
382  }
383  }
384  FiniteAutomaton out(get_nsymbols(fa), get_determinism(fa), nsimple);
385  for (int simple = 0; simple < nsimple; ++simple) {
386  add_state(out);
387  }
388  std::vector<bool> did_simple(size_t(nsimple), false);
389  for (int state = 0; state < get_nstates(fa); ++state) {
390  TEUCHOS_ASSERT(sr2ss.count(state));
391  int simple = sr2ss[state];
392  if (at(did_simple, simple)) continue;
393  for (int symbol = 0; symbol < get_nsymbols_eps(fa); ++symbol) {
394  int next_state = step(fa, state, symbol);
395  if (next_state == -1) continue;
396  TEUCHOS_ASSERT(sr2ss.count(next_state));
397  int next_simple = sr2ss[next_state];
398  add_transition(out, simple, symbol, next_simple);
399  }
400  int token = accepts(fa, state);
401  if (token != -1) {
402  add_accept(out, simple, token);
403  }
404  at(did_simple, simple) = true;
405  }
406  swap(result, out);
407 }
408 
409 void simplify(FiniteAutomaton& result, FiniteAutomaton const& fa) {
410  using std::swap;
411  FiniteAutomaton prev;
412  FiniteAutomaton next = fa;
413  int nstates_next = get_nstates(next);
414  int nstates;
415  int i = 0;
416  do {
417  swap(prev, next);
418  nstates = nstates_next;
419  simplify_once(next, prev);
420  ++i;
421  nstates_next = get_nstates(next);
422  } while (nstates_next < nstates);
423  swap(result, next);
424 }
425 
426 void add_char_transition(FiniteAutomaton& fa, int from_state, char at_char, int to_state) {
427  add_transition(fa, from_state, get_symbol(at_char), to_state);
428 }
429 
430 template <typename T, bool is_signed = std::numeric_limits<T>::is_signed>
431 struct IsSymbol;
432 
433 template <typename T>
434 struct IsSymbol<T, true> {
435  static bool eval(T c) {
436  if (c < 0) return false;
437  return 0 <= Teuchos::chartab[int(c)];
438  }
439 };
440 
441 template <typename T>
442 struct IsSymbol<T, false> {
443  static bool eval(T c) {
444  if (c >= TEUCHOS_CHARTAB_SIZE) return false;
445  return 0 <= Teuchos::chartab[int(c)];
446  }
447 };
448 
449 bool is_symbol(char c) {
450  return IsSymbol<char>::eval(c);
451 }
452 
453 template <typename T, bool is_signed = std::numeric_limits<T>::is_signed>
454 struct GetSymbol;
455 
456 template <typename T>
457 struct GetSymbol<T, true> {
458  static int eval(T c) {
459  TEUCHOS_ASSERT(0 <= c);
460  int symbol = Teuchos::chartab[int(c)];
461  TEUCHOS_ASSERT(0 <= symbol);
462  return symbol;
463  }
464 };
465 
466 template <typename T>
467 struct GetSymbol<T, false> {
468  static int eval(T c) {
469  int symbol = Teuchos::chartab[int(c)];
470  TEUCHOS_ASSERT(0 <= symbol);
471  return symbol;
472  }
473 };
474 
475 int get_symbol(char c) {
476  return GetSymbol<char>::eval(c);
477 }
478 
479 char get_char(int symbol) {
480  TEUCHOS_ASSERT(0 <= symbol);
481  TEUCHOS_ASSERT(symbol < Teuchos::NCHARS);
482  return inv_chartab[symbol];
483 }
484 
485 void make_char_set_nfa(FiniteAutomaton& result, std::set<char> const& accepted, int token) {
486  std::set<int> symbol_set;
487  for (std::set<char>::const_iterator it = accepted.begin(); it != accepted.end(); ++it) {
488  char c = *it;
489  symbol_set.insert(get_symbol(c));
490  }
491  return make_set_nfa(result, Teuchos::NCHARS, symbol_set, token);
492 }
493 
494 void make_char_range_nfa(FiniteAutomaton& result, char range_start, char range_end, int token) {
495  return make_range_nfa(result, Teuchos::NCHARS, get_symbol(range_start), get_symbol(range_end), token);
496 }
497 
498 void make_char_single_nfa(FiniteAutomaton& result, char symbol_char, int token) {
499  return make_range_nfa(result, Teuchos::NCHARS, get_symbol(symbol_char), get_symbol(symbol_char), token);
500 }
501 
502 void negate_set(std::set<char>& result, std::set<char> const& s) {
503  using std::swap;
504  std::set<char> out;
505  for (int symbol = 0; symbol < NCHARS; ++symbol) {
506  char c = inv_chartab[symbol];
507  if (!s.count(c)) out.insert(c);
508  }
509  swap(result, out);
510 }
511 
512 std::ostream& operator<<(std::ostream& os, FiniteAutomaton const& fa) {
513  if (get_determinism(fa)) os << "dfa ";
514  else os << "nfa ";
515  os << get_nstates(fa) << " states " << get_nsymbols(fa) << " symbols\n";
516  for (int state = 0; state < get_nstates(fa); ++state) {
517  for (int symbol = 0; symbol < get_nsymbols(fa); ++symbol) {
518  int next_state = step(fa, state, symbol);
519  if (next_state != -1) os << "(" << state << ", " << symbol << ") -> " << next_state << '\n';
520  }
521  if (!get_determinism(fa)) {
522  for (int symbol = get_epsilon0(fa); symbol <= get_epsilon1(fa); ++symbol) {
523  int next_state = step(fa, state, symbol);
524  if (next_state != -1) os << "(" << state << ", eps" << (symbol - get_epsilon0(fa)) << ") -> " << next_state << '\n';
525  }
526  }
527  int token = accepts(fa, state);
528  if (token != -1) os << state << " accepts " << token << '\n';
529  }
530  return os;
531 }
532 
533 } // end namespace Teuchos
#define TEUCHOS_ASSERT(assertion_test)
This macro is throws when an assert fails.
Reference-counted pointer class and non-member templated function implementations.