#!/usr/bin/env python3
"""Lexical analysis for the *.feature source files.
"""
import re
# The .feature files contain human-readable sentences. The format is line
# oriented in the sense that each line has a specific meaning. The first
# words on the line determine the kind of the line. In this sense, each
# line can be viewed as a lexical token (in terms of compiler theory).
# The first words of the line determine the symbol of the lexical token;
# the rest of the line captures the value (lexeme) of the token.
#
# The following list contains tuples where the first element is a regular
# expression and the second element is the lexical symbol identifier.
#
# The first words may take several forms (think of multiple human languages,
# or of various free-form expressions where the alternatives improve the
# readability of the text). Only the important subpatterns are described
# below; the related full patterns are constructed in
# buildRegexMatchFunctions() below.
#
# NOTE: The language-independent elements (like 'emptyline' or 'ccommentstart')
# are not captured here.
#
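# For example (an illustrative input line, not taken from any real .feature
# file), the line
#
#     Scenario: vector addition [smoke][math]
#
# is matched by the 'scenario' rule; the full pattern built below extracts
# text='vector addition' and tags='[smoke][math]', and the Iterator then
# reports the token ('scenario', 'vector addition', <the whole line>, '[smoke][math]').
#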
humanLanguageDependentRules = [
# Labels that identify portions via free text (inside comments
# of the Catch test sources).
(r'(User\s+)?Story:', 'story'),
(r'Feature:', 'feature'),
(r'Scenario:', 'scenario'),
(r'Given:', 'given'),
(r'When:', 'when'),
(r'Then:', 'then'),
(r'And:', 'and'),
(r'But:', 'but'),
(r'Test:', 'test_case'),
(r'Sec(tion)?:', 'section'),
# English free form labels.
(r'Given\s+', 'given'),
(r'When\s+', 'when'),
(r'Then\s+', 'then'),
(r'and\s+', 'and'),
(r'but\s+', 'but'),
# Czech equivalents.
(r'(Uživatelský\s+)?Požadavek:', 'story'),
(r'Rys:', 'feature'),
(r'Scénář:', 'scenario'),
(r'Dáno:', 'given'),
(r'Když:', 'when'),
(r'Pak:', 'then'),
(r'a:', 'and'),
(r'ale:', 'but'),
# Czech free form labels.
(r'Je dán[ao]?\s+', 'given'),
(r'Když\s+', 'when'),
(r'Pokud\s+', 'when'),
(r'Pak\s+', 'then'),
(r'a\s+', 'and'),
(r'ale\s+', 'but'),
]
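
# Support for another human language amounts to appending more
# (pattern, lexsym) pairs to the table above, e.g. (a hypothetical German
# rule, not part of the shipped table):
#
#   (r'Szenario:', 'scenario'),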
#-----------------------------------------------------------------------
def build_rex_closures(pattern, lexsym):
    """Builds the pair of (match, result) closures for the regex pattern.
    """
    # Compile the pattern only once; both closures share the compiled regex.
    rex = re.compile(pattern, re.IGNORECASE)

    def match_rex(line):
        # Returns a match object that can be interpreted in a boolean
        # context as True/False (matches/does not match).
        return rex.match(line)

    def result_rex(line):
        m = match_rex(line)         # see match_rex() above
        text = ''
        if 'text' in m.groupdict():
            text = m.group('text')  # the matched text
        tags = None
        if lexsym in ('scenario', 'test_case'):
            tags = m.group('tags')
        return lexsym, text, tags

    return match_rex, result_rex
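
# A minimal usage sketch of the closure pair (hypothetical pattern and input,
# shown only for illustration):
#
#   match_fn, result_fn = build_rex_closures(
#       r'^\s*Feature:\s*(?P<text>.*?)\s*$', 'feature')
#   if match_fn('Feature: login'):
#       print(result_fn('Feature: login'))      # ('feature', 'login', None)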
#-----------------------------------------------------------------------
def buildRegexMatchFunctions():
    """Builds the list of (match_fn, result_fn) closures for the regular expressions.
    """
    # A container can be iterated by several iterators at the same time, so
    # no iterator state may be captured inside these closures. The rules are
    # defined by the global humanLanguageDependentRules list and can
    # therefore be captured safely.
functions = []
    # Human-language independent patterns first. The order is important:
    # e.g. the one-liner C comment must be tried before the C comment start.
functions.append(build_rex_closures(r'^\s*$', 'emptyline'))
functions.append(build_rex_closures(r'^\s*/\*(?P<text>.*?)\*/\s*$', 'ccommentoneliner'))
functions.append(build_rex_closures(r'^\s*/\*(?P<text>.*?)$', 'ccommentstart'))
functions.append(build_rex_closures(r'^(?P<text>.*?)\*/\s*$', 'ccommentend'))
functions.append(build_rex_closures(r'^\s*//(?P<text>.*?)$', 'cppcomment'))
# Human-language dependent patterns.
for pat, lexsym in humanLanguageDependentRules:
if lexsym in ('scenario', 'test_case'):
pattern = r'^\s*' + pat + \
r'\s*(?P<text>.*?)\s*(?P<tags>(\[\w+\])*)\s*$'
else:
pattern = r'^\s*' + pat + r'\s*(?P<text>.*?)\s*$'
functions.append(build_rex_closures(pattern, lexsym))
return functions
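
# The returned list is consumed in Iterator.__next__() below: the first
# match_fn that accepts the line determines the token. Illustration
# (hypothetical input line):
#
#   for match_fn, result_fn in buildRegexMatchFunctions():
#       if match_fn('Then: the result is shown'):
#           print(result_fn('Then: the result is shown'))
#           break       # prints ('then', 'the result is shown', None)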
#-----------------------------------------------------------------------
class Iterator:
"""Iterates over the Container and returns lexical elements.
"""
def __init__(self, container, startlineno):
self.container = container
self.lineno = startlineno
self.lines = self.container.lines
self.len = len(self.lines)
self.source_name = self.container.source_name
        self.status = 0         # state of the finite automaton
self.symbol = None
self.value = None
self.lexem = None
self.tags = None # for scenario/test
self.regex_match_fns = buildRegexMatchFunctions()
def __iter__(self):
return self
def notImplemented(self, msg=''):
raise NotImplementedError('status={}: {!r}'.format(self.status, msg))
def lextoken(self):
"""Forms lexical token from the member variables.
"""
# Form the lexical token.
tags = self.tags if self.tags else None
token = (self.symbol, self.value, self.lexem, tags)
# Warn if symbol was not recognized.
if self.symbol is None:
print('Warning: symbol not set for', token)
# Reset the variables.
self.symbol = None
self.value = None
self.lexem = None
self.tags = None
# Return the result.
return token
def __next__(self):
"""Returns lexical tokens (symbol, value, lexem, tags).
"""
# Loop until the end of data.
while self.status != 1000:
            # Get the next line, or set the status signalling the end of data.
if self.lineno < self.len:
line = self.lines[self.lineno]
                self.lexem = line   # the whole line is the lexeme
                self.lineno += 1    # advance to the next line
else:
# End of data.
self.status = 800
#============================ initial state, nothing known
if self.status == 0:
assert self.symbol is None
for match_fn, result_fn in self.regex_match_fns:
if match_fn(line):
self.symbol, self.value, self.tags = result_fn(line)
return self.lextoken()
# Other lines are considered just 'line'.
self.symbol = 'line'
self.value = line.rstrip()
return self.lextoken()
#---------------------------- end of data
elif self.status == 800:
self.symbol = '$'
self.status = 1000
return self.lextoken()
#---------------------------- unknown status
else:
raise NotImplementedError('Unknown status: {}'.format(self.status))
raise StopIteration
#-----------------------------------------------------------------------
class Container:
"""Iterable container for lexical parsing of the *.feature source.
    The source is passed either as a multiline string or as an open file;
    it is processed line by line.
"""
def __init__(self, source):
if hasattr(source, 'readlines'):
# It is a file object opened for reading lines in text mode.
self.lines = source.readlines()
self.source_name = source.name # filename
elif source == '':
# It is an empty string.
self.lines = []
self.source_name = '<str>'
else:
            # It is a multiline string.
            lines = source.split('\n')      # split into a list of lines
            self.lines = [line + '\n' for line in lines[:-1]]   # add the newlines back
            self.lines.append(lines[-1])    # the last chunk has no trailing newline
self.source_name = '<str>'
def __iter__(self):
return Iterator(self, 0)
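
# Typical use with a file object (hypothetical path, for illustration; the
# __main__ block below demonstrates the multiline-string variant):
#
#   with open('example.feature', encoding='utf-8') as f:
#       for symbol, value, lexem, tags in Container(f):
#           ...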
#-----------------------------------------------------------------------
if __name__ == '__main__':
import textwrap
source = textwrap.dedent("""\
Story: story identifier
As a user
I want the feature
        so that my life is easier.
Scenario: scenario identifier
Given: given identifier
When: when identifier
Then: then identifier
""")
for e in Container(source):
print(e)