-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvertWord.py
589 lines (496 loc) · 18.6 KB
/
convertWord.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
# -*- coding: utf-8 -*-
#!/usr/bin/env python3
from __future__ import absolute_import, division, print_function
import os
import re
import sys
# Read and process MS Word (.docx) documents, converting text written in old
# font encodings (for example legacy Cherokee or Adlam fonts) into Unicode
# characters.
# https://openpyxl.readthedocs.io/en/default/tutorial.html
from docx import Document
import adlamConversion
import convertUtil
import inspect
# Names of old (non-Unicode) fonts.
# NOTE(review): this list is not referenced by any function visible in this
# file; presumably it names the fonts whose text should be converted - confirm.
FONTS_TO_CONVERT = ['Fulfulde - Pulaar', 'MT Extra', 'Cherokee OLD',
'Cherokee;Cherokee2',
'Cherokee2']
# Flag for handling all characters in an Old font.
# NOTE(review): not read by any function visible in this file.
convertAllInOldFontRange = True
# Passed as debugInfo to convertDoc() by convertOneDoc().
debugFlag = True # False
# Set to True to get lower case conversion
# (main() hands this value to the converter's setLowerMode()).
toLowerCase = True # False
# Check for text and convert the old font encoded parts of the strings.
# It assumes that the font has been detected.
def checkAndConvertText(textIn, converter, fontIndex):
    """Convert one piece of old-font text with the given converter.

    Args:
        textIn: text of a run whose font has already been recognised as one
            of the converter's old fonts.
        converter: object providing ``convertText(text, fontIndex=...)``.
        fontIndex: index of the run's font, passed through to convertText.

    Returns:
        ``textIn`` unchanged when it is empty/None or starts with '=';
        otherwise the value returned by ``converter.convertText``.
    """
    if not textIn or textIn[0] == '=':
        # Ignore function calls
        return textIn
    return converter.convertText(textIn, fontIndex=fontIndex)
class paragraphData():
    """Collects the runs of one paragraph together with their text offsets."""

    def __init__(self):
        # Parallel lists: entry i of each list describes the i-th run added.
        self.runs, self.formatList, self.textList = [], [], []
        # Running total of the text lengths added so far.
        self.textsize = 0
        # startPoint[i] is the value of textsize just before run i was added.
        self.startPoint = []

    def adddata(self, run, textNode, formatNode):
        """Record a run, its text node and its format node.

        The node's starting offset is the total text length seen before it;
        the total then grows by ``len(textNode.text)``.
        """
        offset = self.textsize
        additions = (
            (self.runs, run),
            (self.formatList, formatNode),
            (self.textList, textNode),
            (self.startPoint, offset),
        )
        for bucket, item in additions:
            bucket.append(item)
        self.textsize = offset + len(textNode.text)
class runInfo():
    """Holds one run plus the offsets of sentence marks found in its text."""

    def __init__(self, run):
        # The run being described and (once filled in by the caller) its text.
        self.run, self.text = run, ''
        # Offsets into the text; all start empty and are filled by the caller.
        self.sentence_start_offsets = []
        self.exclamation_mark_offsets = []
        self.question_mark_offsets = []
        self.stop_mark_offsets = []
# Creates a new document with converted text, adding in pieces as needed
# and flexibly handling text runs.
class copyConverter():
    """Builds a new .docx holding the converted text of an input .docx.

    The input document is opened on construction. convertDoc() copies section
    geometry, paragraph formatting and runs into a fresh document, converts
    every copied run with ``self.converter`` and saves to ``self.path_out``.
    ``self.converter`` starts as None and must be assigned by the caller
    before convertDoc() is used.
    """

    # Font attributes compared by compareRunFonts(), in reporting order.
    _COMPARED_FONT_ATTRS = (
        'all_caps', 'bold', 'complex_script', 'cs_bold', 'cs_italic',
        'double_strike', 'emboss', 'hidden', 'highlight_color', 'imprint',
        'italic', 'math', 'name', 'outline', 'rtl', 'shadow', 'size',
        'small_caps', 'snap_to_grid', 'spec_vanish', 'strike', 'subscript',
        'superscript', 'underline', 'web_hidden',
    )

    # Plain font attributes copied by copyFontData(). Colour is handled
    # separately because it lives on the nested ``font.color`` object.
    _COPIED_FONT_ATTRS = (
        'all_caps', 'bold', 'complex_script', 'cs_bold', 'cs_italic',
        'double_strike', 'emboss', 'hidden', 'highlight_color', 'imprint',
        'italic', 'math', 'name', 'no_proof', 'outline', 'rtl', 'shadow',
        'size', 'small_caps', 'snap_to_grid', 'spec_vanish', 'strike',
        'subscript', 'superscript', 'underline', 'web_hidden',
    )

    # A sentence-final mark followed by end of text or a single space.
    _END_SENTENCE_RE = re.compile(r'([\.\?\!\u061F])($|\u0020)')

    def __init__(self, path_in, path_out=None):
        """Open ``path_in`` and choose the output path.

        Args:
            path_in: path of the .docx file to read.
            path_out: where convertDoc() saves; defaults to the input path
                with its extension replaced by '.u2.docx'.
        """
        self.path_in = path_in
        self.doc_in = Document(self.path_in)

        # Diagnostic listing of the styles found in the input document.
        styles = self.doc_in.styles
        print('styles (%d)= %s' % (len(styles), styles))
        for style in styles:
            print(' style = %s' % (style))

        if not path_out:
            path_out = os.path.splitext(self.path_in)[0] + '.u2.docx'
        self.path_out = path_out

        self.converter = None
        # Fallback output font, used when convertDoc() is given no font name.
        self.unicode_font_out = 'Noto Sans Adlam New'

    def deleteParagraph(self, paragraph):
        """Detach ``paragraph`` from its document."""
        element = paragraph._element
        element.getparent().remove(element)
        # Drop the proxy's references so the removed XML element is not
        # reused through this paragraph object by accident.
        paragraph._p = paragraph._element = None

    def convertParagraph(self, p):
        """Placeholder; does nothing yet."""
        # TODO Fill this in.
        return

    def convertDoc(self, unicodeFont, debugInfo=None):
        """Copy the input document into a new one, converting every run.

        Args:
            unicodeFont: font name given to every converted run; when empty,
                ``self.unicode_font_out`` is used instead.
            debugInfo: accepted for interface compatibility; not used.

        The result is saved to ``self.path_out``. ``self.converter`` must
        provide ``convertText(text)``, ``isRtl()`` and
        ``checkContentsForMerge(text)``.
        """
        font_out = unicodeFont or self.unicode_font_out

        # Creating a new document
        newdoc = Document()

        # TODO: set up style with new Unicode font
        # NOTE(review): add_style() may reject a name the new document
        # already contains, and assigning a non-paragraph style to a
        # paragraph may be rejected as well - confirm against python-docx.
        for style in self.doc_in.styles:
            newdoc.styles.add_style(style.name, style.type)
            new_paragraph = newdoc.add_paragraph()
            new_paragraph.style = style
            print('Added style %s' % style.name)
            self.deleteParagraph(new_paragraph)

        for old_section in self.doc_in.sections:
            self._copySection(newdoc, old_section)

        for p_num, old_p in enumerate(self.doc_in.paragraphs):
            if not old_p.text:
                print('P %d is empty' % p_num)

            new_p = newdoc.add_paragraph()
            self._copyParagraphFormat(new_p, old_p)

            # Now get the runs of text, with compatibilities
            print('RUNS %2d old %2d new' % (len(old_p.runs), len(new_p.runs)))
            new_runs = self.extractRuns(old_p, new_p)

            # Convert each run
            for run in new_p.runs:
                run.text = self.converter.convertText(run.text)
                run.font.name = font_out
                # Text direction is a font property of the run.
                run.font.rtl = self.converter.isRtl()

            # Now consider special punctuation and capitalization
            # TODO: process capitalization and punctuation adjustments.
            self.analyzeRuns(new_runs)

        newdoc.save(self.path_out)

    def _copySection(self, newdoc, old_section):
        """Append a section to ``newdoc`` mirroring ``old_section``."""
        new_section = newdoc.add_section(start_type=old_section.start_type)
        for attr in ('page_height', 'page_width', 'orientation', 'top_margin',
                     'gutter', 'header_distance', 'footer_distance',
                     'left_margin', 'right_margin'):
            setattr(new_section, attr, getattr(old_section, attr))
        # Only the text of the first header paragraph is carried over.
        hparagraph = new_section.header.paragraphs[0]
        hparagraph.text = old_section.header.paragraphs[0].text

    def _copyParagraphFormat(self, new_p, old_p):
        """Copy the style and basic layout properties of ``old_p``."""
        new_p.style = old_p.style
        old_format = old_p.paragraph_format
        new_format = new_p.paragraph_format
        for attr in ('line_spacing_rule', 'alignment', 'left_indent',
                     'right_indent', 'first_line_indent'):
            setattr(new_format, attr, getattr(old_format, attr))

    def compareRunFonts(self, f1, f2):
        """Return the names of the font attributes on which f1 and f2 differ.

        Each differing attribute is listed once, in the order given by
        ``_COMPARED_FONT_ATTRS``. An empty list means no compared attribute
        differs.
        """
        return [attr for attr in self._COMPARED_FONT_ATTRS
                if getattr(f1, attr) != getattr(f2, attr)]

    def runsCompatible(self, prev, current):
        """Tell whether ``current`` may be merged into ``prev``.

        Runs are merged when the converter reports the current text as
        mergeable, or when none of the compared font attributes differ.
        """
        if not prev:
            return False
        if self.converter.checkContentsForMerge(current.text):
            return True
        # Here's where we check for important differences
        return not self.compareRunFonts(prev.font, current.font)

    def copyFontData(self, new_run, old_run):
        """Copy the font attributes and colour of ``old_run`` to ``new_run``."""
        old_font = old_run.font
        new_font = new_run.font
        for attr in self._COPIED_FONT_ATTRS:
            setattr(new_font, attr, getattr(old_font, attr))

        new_font.color.rgb = old_font.color.rgb
        theme_color = old_font.color.theme_color
        # NOTE(review): assigning None as the theme colour appears to clear
        # the whole colour element in python-docx, which would discard the
        # RGB value copied just above - confirm. Hence only a real theme
        # colour is copied.
        if theme_color is not None:
            new_font.color.theme_color = theme_color

    def extractRuns(self, old_p, new_p):
        """Copy the runs of ``old_p`` into ``new_p``, merging where possible.

        A run compatible with the previously copied run (see
        runsCompatible) has its text appended to that run. Other runs are
        copied as new runs unless their text is empty, in which case they
        are dropped.

        Returns:
            The list of runs created in ``new_p``.
        """
        runs = []
        prev_run = None
        empty_run_count = 0
        for run in old_p.runs:
            if prev_run and self.runsCompatible(prev_run, run):
                print('Combining runs %s with %s' % (prev_run.text, run.text))
                prev_run.text += run.text
                continue
            if not run.text:
                empty_run_count += 1
                continue
            # Otherwise create a new run carrying the old run's style.
            new_run = new_p.add_run(run.text)
            new_run.style = run.style
            new_run.bold = run.bold
            new_run.italic = run.italic
            new_run.underline = run.underline
            self.copyFontData(new_run, run)
            runs.append(new_run)
            prev_run = new_run
        print('%d runs found. %d empty found' % (len(runs), empty_run_count))
        return runs

    def analyzeRuns(self, run_list):
        """Locate sentence starts and sentence-final marks in each run.

        One runInfo is built per run. Offsets of '!', '.' and any other
        matched mark (the question marks of the pattern) are recorded in
        the corresponding lists. Offset 0 is recorded as a sentence start
        for the first run and for a run following one whose last match is a
        mark plus a space reaching the end of the text.

        Returns:
            The list of runInfo objects, one per run. The result is not yet
            used by convertDoc().
        """
        # TODO: Complete
        analyses = []
        print('---------------')
        starting = True
        for run in run_list:
            text = run.text
            analysis = runInfo(run)
            analysis.text = text
            analyses.append(analysis)

            if starting:
                # Actual first non-white-space
                analysis.sentence_start_offsets.append(0)
                starting = False

            for match in self._END_SENTENCE_RE.finditer(text):
                first, last = match.span()
                matched_char = match.group(1)
                if matched_char == '!':
                    analysis.exclamation_mark_offsets.append(first)
                elif matched_char == '.':
                    analysis.stop_mark_offsets.append(first)
                else:
                    analysis.question_mark_offsets.append(first)
                # A mark followed by a space that closes the run means the
                # next run begins a sentence.
                if last >= len(text) and last - first > 1:
                    starting = True
        return analyses
# Handles sentence conversion if needed.
def convertParagraph(para, converter, unicodeFont, debugInfo=False):
    """Convert, in place, the old-font runs of one paragraph.

    Args:
        para: paragraph whose ``runs`` are examined.
        converter: conversion object. ``oldFonts`` lists the font names to
            convert; the optional ``forceFont`` attribute, when true,
            renames every run's font; the optional
            ``processParagraphRuns(para)`` hook is called after all runs
            have been handled.
        unicodeFont: font name assigned to converted (or forced) runs.
        debugInfo: accepted for interface compatibility; not used.

    Returns:
        The number of runs whose text was changed by the conversion.
    """
    numConverts = 0
    force_font = getattr(converter, 'forceFont', False)

    for run in para.runs:
        thisText = run.text
        fontObj = run.font
        # Remember the original name: the lookup below must use it even
        # when the font is forced to the Unicode font first.
        fontName = fontObj.name
        if force_font:
            fontObj.name = unicodeFont

        ## If font is not in the list to convert, skip it!
        try:
            font_index = converter.oldFonts.index(fontName)
        except ValueError:
            continue

        fontObj.name = unicodeFont  ## Change font even if text is empty.
        try:
            convertedText = checkAndConvertText(thisText, converter, font_index)
        except ValueError as e:
            # Keep going with the remaining runs, but say what went wrong
            # instead of treating the run as having an unknown font.
            print('** Conversion error %s in font %s' % (e, fontName))
            continue

        if thisText != convertedText:
            numConverts += 1
            try:
                run.text = convertedText
            except Exception as e:
                print('** Text assignment error %s with %s' % (e, convertedText))

    # Additional processing as needed. The hook is optional, and a failure
    # in it must not lose the conversions already made.
    process_runs = getattr(converter, 'processParagraphRuns', None)
    if process_runs is not None:
        try:
            process_runs(para)
        except Exception as e:
            print('** Paragraph post-processing error %s' % e)

    return numConverts
def convertTables(doc, converter, unicodeFont, debug=False):
    """Convert the paragraphs in every cell of every table of ``doc``.

    Args:
        doc: document whose ``tables`` are visited.
        converter: conversion object handed to convertParagraph.
        unicodeFont: font name handed to convertParagraph.
        debug: debug flag handed to convertParagraph.

    Returns:
        The total number of conversions reported by convertParagraph.
    """
    numConverted = 0
    for table in doc.tables:
        # Cells are fetched per row index through table.row_cells().
        for row_num, _row in enumerate(table.rows):
            for cell in table.row_cells(row_num):
                for para in cell.paragraphs:
                    numConverted += convertParagraph(
                        para, converter, unicodeFont, debug)
    return numConverted
def convertDoc(doc, converter, unicodeFont, debugInfo=None):
    """Convert the old-font text of a whole document in place.

    Styles whose name contains 'Default' get ``unicodeFont`` as their font.
    Header paragraphs, footer paragraphs, table cells and body paragraphs
    are then converted with convertParagraph / convertTables.

    Args:
        doc: the opened document.
        converter: conversion object passed to the paragraph converter.
        unicodeFont: name of the Unicode font to assign.
        debugInfo: debug flag forwarded for tables and body paragraphs.

    Returns:
        A tuple ``(numConverts, notConverted)``. ``notConverted`` is
        always 0 because nothing counts unconverted text here.
    """
    sections = doc.sections
    print(' %d sections' % len(sections))
    print(' %d tables' % len(doc.tables))
    print(' unicodeFont = %s' % unicodeFont)
    paragraphs = doc.paragraphs
    print(' %d paragraphs' % len(paragraphs))

    numConverts = 0
    notConverted = 0

    # Add font for Unicode
    for style in doc.styles:
        if 'Default' in style.name:
            print('DEFAULT %s' % style.font.name)
            style.font.name = unicodeFont

    # Headers and footers
    for section in sections:
        for p in section.header.paragraphs:
            numConverts += convertParagraph(p, converter, unicodeFont)
        footer = section.footer
        # NOTE(review): reading the content of a footer that has no
        # definition of its own is believed to create one in python-docx -
        # confirm. Only footers defined by this section are visited.
        if not footer.is_linked_to_previous:
            for p in footer.paragraphs:
                numConverts += convertParagraph(p, converter, unicodeFont)

    numConverts += convertTables(doc, converter, unicodeFont, debugInfo)

    for para in paragraphs:
        numConverts += convertParagraph(para, converter, unicodeFont, debugInfo)

    print(' %d values converted to Unicode' % numConverts)
    return (numConverts, notConverted)
def compareFonts(f1, f2):
    """Placeholder comparison of two fonts; always returns True for now."""
    # TODO: Figure this out.
    # The member lists are gathered but not compared yet.
    for font in (f1, f2):
        inspect.getmembers(font)
    return True
# Process one DOCX, substituting text in the old font with converted values.
def convertOneDoc(path_to_doc, converter,
                  outpath=None, isString=False):
    """Convert one .docx file and save the result when anything changed.

    Args:
        path_to_doc: path of the .docx file to open.
        converter: conversion object; its ``unicodeFont`` attribute names
            the output font.
        outpath: where to save the converted document. Defaults to the
            input path with its extension replaced by '.unicode.docx'.
        isString: accepted for interface compatibility; not used.

    Returns:
        The document object, converted in place. Nothing is written when
        no text was converted.
    """
    print('Converting in file: %s' % path_to_doc)
    unicodeFont = converter.unicodeFont
    doc = Document(path_to_doc)
    numConverts, _ = convertDoc(
        doc, converter, unicodeFont, debugInfo=debugFlag)

    if numConverts:
        unicode_path_to_doc = outpath
        if not unicode_path_to_doc:
            newName = os.path.splitext(path_to_doc)[0]
            unicode_path_to_doc = newName + '.unicode.docx'
        doc.save(unicode_path_to_doc)
        print(' ** Saved new version to file %s\n' % unicode_path_to_doc)
    else:
        print(' @@@ No conversion done, so no new file created.\n')
    return doc
def processArgs(argv):
    """Work out the list of input documents from the command line.

    Args:
        argv: the argument list, program name first.

    Returns:
        A list of document paths, or None when no argument was given (the
        usage text is printed) or when the file named after '-f' / '-i'
        produced no entries (an error is printed).
    """
    if len(argv) <= 1:
        print('Usage:')
        print(' convertWord.py inputFile.docx')
        print(' convertWord.py inputFile1.docx inputFile2.docx ... ')
        print(' convertWord.py -i fileWithFileNames')
        return None

    if len(argv) == 2:
        return [argv[1]]

    # The usage text advertises '-i'; '-f' is the spelling that was
    # recognised before, so both are accepted.
    if len(argv) == 3 and argv[1] in ('-f', '-i'):
        # Get the file containing conversion list and get all items.
        path_to_docs = convertUtil.infileToList(argv[2])
        if not path_to_docs:
            print('Error: no contents found in file %s' %
                  argv[2])
            return None
        return path_to_docs

    # Expect a list of files in the remaining arguments.
    return list(argv[1:])
def main(argv):
    """Convert every document named on the command line.

    Args:
        argv: the argument list, program name first.

    An Adlam converter is built once and applied to each document, either
    by modifying a copy of the file (the default) or, when
    ``create_new_doc`` is switched on, by building a new document with
    copyConverter.
    """
    if len(argv) <= 1:
        print('An input .docx file is required.')

    doc_list = processArgs(argv)
    if not doc_list:
        # processArgs has already explained what is missing.
        return

    adlamFonts = [['Fulfulde - Aissata', 'arab'], ]
    adlam_converter = adlamConversion.converter(adlamFonts)
    adlam_converter.setLowerMode(toLowerCase)
    adlam_converter.debug = False

    create_new_doc = False
    convertFileCount = 0
    for doc_path in doc_list:
        if create_new_doc:
            # Creates a copy
            copy_converter = copyConverter(doc_path)
            copy_converter.converter = adlam_converter
            copy_converter.convertDoc('Noto Sans Adlam New')
        else:
            # Make modified copy.
            convertOneDoc(doc_path, adlam_converter)
        convertFileCount += 1
    print('%d processed' % convertFileCount)
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main(sys.argv)