-
Notifications
You must be signed in to change notification settings - Fork 39
/
Copy pathswig_debug_parser.py
441 lines (347 loc) · 14.7 KB
/
swig_debug_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
# Copyright 2016 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Uses SWIG output to convert C++ code snippets to C# code snippets.
Basic Usage: swig_debug_parser.py -d <swig_debug_output> -s <cs_sources>
When running SWIG, use `-debug-top 4` and pipe the output to a file to collect
the debug information about the SWIG parse state. This information can be used
by this script to generate a mapping from C++ identifiers to C# identifiers.
Then, the script looks for code snippets in the comments of the C# source
(denoted by areas in comments wrapped in backticks) and replaces all C++
identifiers with the appropriate C# identifiers.
"""
import re
from absl import app
from absl import flags
from absl import logging
# Command-line interface. `srcs` and `debug_top` are marked as required in the
# `__main__` guard at the bottom of the file; `namespace` is optional.
FLAGS = flags.FLAGS

flags.DEFINE_spaceseplist(
    name='srcs',
    default=None,
    help='The C# source files to process in-place.',
    short_name='s')

flags.DEFINE_string(
    name='debug_top',
    default=None,
    help=('The SWIG output file generated by running SWIG with the '
          'argument `debug_top 4`.'),
    short_name='d')

flags.DEFINE_string(
    name='namespace',
    default=None,
    help='Place all C# identifiers in the given namespace.',
    short_name='n')
# Matches node headers of the form '+++ SomeToken --------------------------'.
# Group 1 is the node type token.
NODE_HEADER_REGEX = re.compile(r'\+\+\+ ([a-zA-Z0-9_:]+) -+')

# Matches string fields of the form ' | Key - "StringValue"'.
# Group 1 is the key, group 2 is the text between the quotes.
NODE_FIELD_REGEX = re.compile(r' *\| ([a-zA-Z0-9_:]+) *- "(.+)"')

# Matches hexadecimal fields of the form ' | Key - 0xdeadbeef'.
# Group 1 is the key, group 2 is the hex literal including its '0x' prefix.
NODE_HEX_FIELD_REGEX = re.compile(r' *\| ([a-zA-Z0-9_:]+) *- (0x[0-9a-f]+)')

# Matches strings of the form '... `some::function::name(int, std::string)` ...'
# Group 1 is the (possibly qualified) identifier that immediately follows the
# opening backtick. The identifier must start with a letter, '_' or ':' but may
# contain digits afterwards, so that names such as `firebase::Vector3` are
# captured whole instead of being truncated at the first digit.
FUNCTION_NAME_REGEX = re.compile(r'`([a-zA-Z_:][a-zA-Z0-9_:]*)[^`]*`')

# When parsing nodes it is useful to know what kind of node is being parsed, so
# a special key is added to each node so that it is aware of what list it came
# from.
NODE_TYPE_KEY = '__type__'

# Children are stored in a dedicated sub-dict because some of the child node
# identifiers can clash with field keys.
NODE_CHILDREN_KEY = '__children__'
class PeekableIter(object):
  """A simple iterator wrapper that supplies a peek function.

  This is useful when doing things like parsing text, where you often want to
  see the next token before consuming it.

  The wrapper supports both the Python 3 iterator protocol (`next(it)`, `for`
  loops) and the explicit `it.next()` spelling used throughout this file.

  Attributes:
    iterator: The underlying iterator being wrapped.
    next_value: The value buffered by the last `peek()`, or None when nothing
      is buffered.
  """

  def __init__(self, iterable):
    """Wraps `iterable`; nothing is consumed until `peek()` or `next()`."""
    self.iterator = iter(iterable)
    self.next_value = None
    # Tracked separately from `next_value` so that a buffered value of None is
    # not mistaken for "nothing buffered".
    self._has_peeked = False

  def __iter__(self):
    return self

  def __next__(self):
    """Returns the next value being iterated over.

    Raises:
      StopIteration: The underlying iterator is exhausted.
    """
    if self._has_peeked:
      value = self.next_value
      self.next_value = None
      self._has_peeked = False
      return value
    return next(self.iterator)

  # Explicit-call spelling (`it.next()`) kept for the existing callers.
  next = __next__

  def peek(self):
    """Returns the next value being iterated over without consuming it.

    Raises:
      StopIteration: The underlying iterator is exhausted.
    """
    if not self._has_peeked:
      self.next_value = next(self.iterator)
      self._has_peeked = True
    return self.next_value
def indentation(line):
  """Counts the whitespace characters at the start of `line`.

  Args:
    line: The string to measure.

  Returns:
    The length of the leading whitespace run, as removed by `str.lstrip()`.
  """
  without_leading_space = line.lstrip()
  total_length = len(line)
  return total_length - len(without_leading_space)
def parse_children(node, it, indent):
  """Parses one child node and attaches it to `node`.

  The next line of `it` must be a node header: three '+', the node name, then a
  run of '-'. SWIG debug output looks something like the following:

    +++ somenode ----------------------------------------
    | field - "Value"
    | another_field - "Some other value"

        +++ childnode ----------------------------------------
        | child_field - "blah blah blah"

  Each node has a header declaring what kind of node it is, followed by zero or
  more key/value pairs, followed by zero or more child nodes that follow the
  same pattern. Sometimes a blank line, or a line holding only a vertical pipe,
  sits between the final field and the child nodes.

  The parsed child is tagged with its node kind under NODE_TYPE_KEY and
  appended to the list stored at node[NODE_CHILDREN_KEY][<node kind>].

  Args:
    node: The node (dict) that receives the parsed child.
    it: A PeekableIter to iterate over the lines of the swig debug output.
    indent: The current indentation level. Used to determine when to recurse
      into children or return.
  """
  header_line = it.next()
  node_kind = NODE_HEADER_REGEX.search(header_line).group(1)

  # Register the bucket before parsing so children of the same kind accumulate
  # in a single list.
  same_kind_children = node.setdefault(NODE_CHILDREN_KEY, {}).setdefault(
      node_kind, [])

  parsed_child = parse_node(it, indent)
  parsed_child[NODE_TYPE_KEY] = node_kind
  same_kind_children.append(parsed_child)
def consume_empty_lines(it):
  """Advances `it` past filler lines that follow a node.

  A filler line is either the empty string or a line that contains nothing but
  a vertical pipe once surrounding whitespace is stripped.

  Args:
    it: A PeekableIter to iterate over the lines of the swig debug output.

  Raises:
    StopIteration: The iterator reached the end.
  """
  while True:
    upcoming = it.peek()
    is_filler = upcoming == '' or upcoming.strip() == '|'
    if not is_filler:
      return
    it.next()
def parse_node(it, indent):
  """Reads one node body: its fields first, then its nested child nodes.

  Child nodes start with three '+', followed by the node name, followed by a
  series of -'s.

  Args:
    it: A PeekableIter to iterate over the lines of the swig debug output.
    indent: The current indentation level. Used to determine when to recurse
      into children or return.

  Returns:
    A dict containing the parsed fields. Quoted values are stored as strings,
    hexadecimal values as ints, and any children under NODE_CHILDREN_KEY.
  """
  parsed = {}
  try:
    # Phase 1: field lines sitting at exactly this node's indentation.
    while indentation(it.peek()) == indent:
      current_line = it.peek()
      if '+++' in current_line:
        # A header at our own depth starts a new node; leave it unconsumed.
        break

      quoted_field = NODE_FIELD_REGEX.search(current_line)
      if quoted_field:
        parsed[quoted_field.group(1)] = quoted_field.group(2)
      else:
        hex_field = NODE_HEX_FIELD_REGEX.search(current_line)
        if hex_field:
          parsed[hex_field.group(1)] = int(hex_field.group(2), 16)
      # Lines matching neither pattern are skipped.
      it.next()
    consume_empty_lines(it)

    # Phase 2: anything indented deeper than this node is one of its children.
    while True:
      child_indent = indentation(it.peek())
      if child_indent <= indent:
        break
      parse_children(parsed, it, child_indent)
      consume_empty_lines(it)
  except StopIteration:
    # Running out of input simply ends the node; keep what was collected.
    pass
  return parsed
def parse_swig_debug_top(it):
  """Parses the output of `swig --debug-top 4`.

  Parses the debug output into a series of nested dictionaries, which we can
  use to generate a map of C++ classes and enums to C# classes and enums.

  Args:
    it: A PeekableIter to iterate over the lines of the swig debug output.

  Returns:
    A dictionary representing the root node of the parse tree.
  """
  # The debug output begins with a preamble we don't care about. Discard lines
  # until one starts with '+++', which marks the root node of the tree.
  while not it.peek().startswith('+++'):
    it.next()

  tree = {}
  parse_children(tree, it, 0)
  return tree
def gather_subtitution_data(node,
                            identifier_metadata_map,
                            file_module_map,
                            current_class=None,
                            includes=None):
  """Builds metadata needed to perform identifier substitution on the C# sources.

  This is basically doing the first phase of two-phase parsing. We need to build
  a mapping from C++ identifiers to C# identifiers. However, the C# class
  names are not necessarily known until after the nodes are parsed. To solve
  that, a second mapping between filenames and the modules they represent is
  used. The identifier map stores some metadata, including the chain of files
  the identifier was reached through. Those together can later be used to
  evaluate what class a given identifier should belong to.

  Both maps are filled in place; nothing is returned.

  Args:
    node: The current node being scanned for metadata.
    identifier_metadata_map: A map between C++ identifiers and the metadata
      needed to determine their C# identifier.
    file_module_map: The map between files and which module they represent.
    current_class: The class that the data in the current node refers to.
    includes: The list of includes that we had to parse to get to this object.
      May be None (the default) at the root of the tree.
  """
  local_includes = includes or []
  node_type = node.get(NODE_TYPE_KEY)

  # Check if we've recursed into an included file.
  if node_type in ('include', 'import'):
    name = node.get('name')
    if name:
      # Copy before appending so sibling subtrees don't see this include. The
      # copy is taken from `local_includes` rather than `includes`, because
      # `includes` is None on the initial call.
      local_includes = list(local_includes)
      local_includes.append(name)
      # If this include file has declared that its contents is part of a
      # module, record the mapping between the file name and the module name.
      module = node.get('module')
      if module and isinstance(name, str):
        file_module_map[name] = module

  # Add classes and structs to the identifier map.
  elif node_type in ('class', 'struct'):
    current_class = node.get('name')
    metadata = {'includes': local_includes}
    if FLAGS.namespace:
      metadata['namespace'] = FLAGS.namespace
    identifier_metadata_map[current_class] = metadata

  # Add C++ function declarations to the identifier map.
  elif node_type == 'cdecl':
    symname = node.get('sym:name')
    name = node.get('name')
    # Only qualify and record declarations that actually have a name;
    # otherwise a nameless declaration inside a class would be stored under
    # the bogus key '<class>::None'.
    if name and symname:
      if current_class:
        name = '%s::%s' % (current_class, name)
      metadata = {
          'includes': local_includes,
          'symname': symname,
      }
      if FLAGS.namespace:
        metadata['namespace'] = FLAGS.namespace
      identifier_metadata_map[name] = metadata

  # Recurse into all children nodes and repeat.
  child_nodes = node.get(NODE_CHILDREN_KEY, {})
  for children in child_nodes.values():
    for child_node in children:
      gather_subtitution_data(child_node, identifier_metadata_map,
                              file_module_map, current_class, local_includes)
def resolve_module(includes, file_module_map):
  """Finds the module declared by the most recently included file.

  The files that SWIG operates on can recursively include other files. Only the
  innermost module declaration matters, so the include chain is walked from its
  last entry to its first and the first non-empty module found wins.

  Args:
    includes: The list of includes that we had to parse to get to this object.
    file_module_map: The map between files and which module they represent.

  Returns:
    The current module, if any, for the given list of includes; otherwise None.
  """
  modules_innermost_first = (
      file_module_map.get(included_file)
      for included_file in reversed(includes))
  return next(
      (candidate for candidate in modules_innermost_first if candidate), None)
def generated_substitution_map(identifier_metadata_map, file_module_map):
  """Turns the collected metadata into a C++ -> C# identifier lookup table.

  Entries whose metadata has no (or an empty) 'includes' list are skipped. For
  the rest, the C# name is the dot-joined sequence of namespace, module and
  symname, leaving out any part that is missing or empty.

  Args:
    identifier_metadata_map: A map between C++ identifiers and the metadata
      needed to determine their C# identifier.
    file_module_map: The map between files and which module they represent.

  Returns:
    A dict of fully qualified C++ identifiers and the C# identifiers they map
    to.
  """
  result = {}
  for cpp_identifier, info in identifier_metadata_map.items():
    include_chain = info.get('includes')
    if not include_chain:
      continue

    ordered_parts = (
        info.get('namespace'),
        resolve_module(include_chain, file_module_map),
        info.get('symname'),
    )
    result[cpp_identifier] = '.'.join(
        part for part in ordered_parts if part)
  return result
def perform_substitution(match, substitution_map):
  """Substitutes C++ identifiers with C# identifiers.

  We want to perform substitutions on function names, but not accidentally hit
  anything else in the string. For example, if the line looks like this:

    /// Returns true if `firebase::crash::Initialize()` has been called.

  Then we want the final string to be:

    /// Returns true if `Firebase.Crash.Initialize()` has been called.

  The regex looks for identifiers enclosed within backticks ignoring things
  like parentheses. If we did the substitution directly, the backticks and
  parentheses would be lost. Instead, we take the whole match
  ('`firebase::crash::Initialize()`') and splice the replacement in at exactly
  the position of the captured identifier ('firebase::crash::Initialize'), so
  the surrounding characters are preserved untouched.

  Args:
    match: The re.Match object representing the match. Group 1 must be the
      captured C++ identifier.
    substitution_map: The dict of potential substitutions.

  Returns:
    The new C# code resulting from performing the substitutions, or the
    matched text unchanged when the identifier has no (non-empty) mapping.
  """
  full_match = match.group(0)
  cs = substitution_map.get(match.group(1))
  if not cs:
    return full_match

  # Splice by position instead of str.replace(): replace() would also rewrite
  # any later occurrence of the identifier inside the backticks, including
  # where it is merely a prefix of a longer name (`Init` in `InitOptions`).
  match_start = match.start(0)
  ident_start = match.start(1) - match_start
  ident_end = match.end(1) - match_start
  return full_match[:ident_start] + cs + full_match[ident_end:]
def apply_substitution(file_content, substitution_map):
  """Rewrites every backtick-quoted snippet in a file's text.

  Each match of FUNCTION_NAME_REGEX in `file_content` is handed to
  perform_substitution together with `substitution_map`.

  Args:
    file_content: The text on which to perform the substitutions.
    substitution_map: The dict of potential substitutions.

  Returns:
    A new string with all substitutions performed.
  """

  def _rewrite_snippet(snippet_match):
    return perform_substitution(snippet_match, substitution_map)

  return FUNCTION_NAME_REGEX.sub(_rewrite_snippet, file_content)
def main(unused_argv):
  """Converts references to C++ identifiers into C# identifiers.

  Given the output of running SWIG with the argument `-debug-top 4`, convert all
  references to C++ identifiers into C# identifiers in the given files,
  optionally prepending all C# identifiers with the namespace given by -n.

  Each file in --srcs is read, rewritten in memory and written back in place.
  A file that cannot be written is reported with a warning and skipped.

  Args:
    unused_argv: Positional command-line arguments passed by app.run; ignored.
  """
  with open(FLAGS.debug_top, 'r') as debug_file:
    debug_file_content = debug_file.read()

  # Parse the debug output into a format we can work with.
  debug_line_iter = PeekableIter(debug_file_content.splitlines())
  debug_data = parse_swig_debug_top(debug_line_iter)

  # Gather the data we need to perform the substitutions.
  identifier_metadata_map = {}
  file_module_map = {}
  gather_subtitution_data(debug_data, identifier_metadata_map, file_module_map)
  substitution_map = generated_substitution_map(identifier_metadata_map,
                                                file_module_map)

  for src in FLAGS.srcs:
    with open(src, 'r') as cs_file:
      file_content = cs_file.read()

    # Apply the maps to the C# code.
    file_content = apply_substitution(file_content, substitution_map)

    try:
      with open(src, 'w') as cs_file:
        cs_file.write(file_content)
    except IOError as e:
      # Report the path, not the file object: if open() itself failed,
      # `cs_file` still refers to the closed read handle from above.
      logging.warning('Unable to patch file %s (%s)', src, str(e))
if __name__ == '__main__':
  # Both inputs are mandatory: the SWIG debug dump and the C# files to patch.
  for required_flag in ('debug_top', 'srcs'):
    flags.mark_flag_as_required(required_flag)
  app.run(main)