-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathre_parser.py
205 lines (157 loc) · 6.67 KB
/
re_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
from automaton import Automaton, Node
from nested_re import NestedRE
class REParser():
"""
Extract a regular expression from a 0-reversible automaton. This is an
implementation of the State-Elimination Algorithm (as described in Getir,
e.a. 2007).
On each iteration a node is eliminated and its edges are merged to the other
nodes in the Automaton. At the end we have only an initial and a final node,
and the label of their edge is the regular expression.
This is work in progress and I am still looking for a away to simplify the
final expression (hence for the moment this is only an extractor and not
a parser).
"""
def __init__(self, automaton):
"""
:args:
automaton - an deterministic FSA, instance of Automaton
"""
self.A = automaton
self.verbose = False
def parse(self, verbose=False):
"""
Apply State-Elimination to extract regular expression from the automaton.
:args:
verbose - print a lot of info (useful for debugging)
:returns:
regex - string
"""
self.verbose = verbose
# Requirement of the SE algorithm, automaton should be uniform
if not self.is_uniform():
self.make_uniform()
if verbose:
print('Converted to uniform Automaton')
self.A.show('Uniform 0-Automaton')
# List nodes to be eliminated, ignore initial and final states
nodes = [n for n in self.A.nodes if not (n.is_initial or n.is_final) ]
i = 0
while nodes:
# Choose node to be eliminated
k = nodes.pop(0)
new_edges = [ [ [] for i in range(self.A.esize)] for j in range(self.A.esize)]
# DEBUG
i+=1
if verbose:
print(f'Loop={i}, N={len(nodes)}, Eliminating Node k={k.index}')
for n1 in list(self.A.nodes):
for n2 in list(self.A.nodes):
if not (n1==k or n2==k):
new_label = self.derive_pattern(n1, n2, k)
if new_label:
new_edges[n1.index][n2.index].append( new_label )
self.A.delete_node(k)
self.A.edges = new_edges
if verbose:
self.A.show(f'i={i}')
final_edge = self.A.edges[self.A.root.index][self.A.accepting_nodes[0].index]
return final_edge
def is_uniform(self):
"""
Check if automaton is uniform, i.e.
- has exactly one initial state with no incoming edges
- has exactly one final states with no outgoing edges
"""
# Class Automaton only allows one initial state, so only check incoming edges
if any(e[self.A.root.index] for e in self.A.edges):
return False
# There should be only one final state
if len(self.A.accepting_nodes) > 1:
return False
# Final state should not have outgoing edges
f = self.A.accepting_nodes[0]
if any( self.A.edges[f.index] ):
return False
return True
def make_uniform(self):
"""
Convert automaton to a uniform automaton. This is done by adding a new
initial state with a ϵ-transition to the previous initial state.
Similarly, we create a new final state and add a ϵ-transition from the
previous final state(s).
"""
# If inital state is final state or if inital state has incoming edges
# create new initial state with ϵ-transition the old one
if self.A.root in self.A.accepting_nodes or any(e[self.A.root.index] for e in self.A.edges):
new_root = self.A.add_node()
self.A.add_edge(new_root, self.A.root, 'ϵ')
self.A.root.is_initial = False
new_root.is_initial = True
self.A.root = new_root
# If there are more than one final states or a final state has outgoing edges
# create new final state with ϵ-transition(s) to the old final state(s)
if len(self.A.accepting_nodes) > 1 or any(self.A.edges[self.A.accepting_nodes[0].index]):
new_final = self.A.add_node()
for f in self.A.accepting_nodes:
self.A.add_edge(f, new_final, 'ϵ')
f.is_final = False
new_final.is_final = True
self.A.accepting_nodes = [ new_final ]
def derive_pattern(self, s, t, k):
"""
Find a label for the edge between the nodes s (source) and t (target) after
node k is eliminated. The basic idea is to take the untion of two possible
paths:
L: direct path between s and t (if there is such)
R: indirect path between s and t through k (if exists)
So the derived pattern is (L|R).
args:
s source, Node instance
t target, Node instance
k eliminated node, Node instance
returns:
P string, if new edge between s and t is possible,
None otherwise.
"""
s2t = NestedRE( '|'.join(self.A.edges[s.index][t.index]) )
s2k = NestedRE( '|'.join(self.A.edges[s.index][k.index]) )
k2k = NestedRE( '|'.join(self.A.edges[k.index][k.index]), '*' )
k2t = NestedRE( '|'.join(self.A.edges[k.index][t.index]) )
if self.verbose:
print('-' * 40)
print(f'Deriving pattern for pair: [s={s.index}, t={t.index}, k={k.index}]')
print('s2t; ', s2t)
print('s2k; ', s2k)
print('k2k; ', k2k)
print('k2t; ', k2t)
# Left-hand side of the pattern, which is just s2t or None if it's empty
L = s2t if s2t else None
# Right-hand side of the pattern, take the intersection of the three patterns
R = None
if s2k and k2t:
#if s2k and s2k != ['ϵ']:
# right_pattern.append(s2k)
R = s2k
if self.verbose:
print('[R = s2k]: ', str(R))
if k2k and str(k2k) != 'ϵ':
R = R.join_intersection(k2k)
if self.verbose:
print('[R += k2k]: ', str(R))
R = R.join_intersection(k2t)
if self.verbose:
print('[R += k2t]: ', str(R))
# Full pattern, union of L and R
P = None
if L and R:
P = L.join_union(R)
elif L and not R:
P = L
elif R and not L:
P = R
if self.verbose:
print('[P=L|R]: ', P)
return str(P) if P else None
def get_automaton(self):
return self.A