-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommand_line.py
123 lines (97 loc) · 3.14 KB
/
command_line.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from io import BytesIO
from io import StringIO
import glob
import logging
import os
import sys
import docx
from docx import Document
import adlamConversion
import ahomConversion
from mendeConverter import MendeConverter
import phkConversion
from convertDoc2 import ConvertDocx
# Read a .docx file from disk into a Document object.
def createDocFromFile(file_path):
    """Read a .docx file from disk and parse it into a Document.

    The whole file is read into memory and handed to ``Document`` through
    a ``BytesIO`` buffer.

    Args:
        file_path: Path of the file to open.

    Returns:
        A ``(doc, count)`` tuple, where ``doc`` is the parsed Document and
        ``count`` is the number of bytes read from the file. On failure
        the error is printed and ``(None, -1)`` is returned.
    """
    try:
        # The context manager guarantees the handle is closed even when
        # reading fails part-way through.
        with open(file_path, 'rb') as in_file:
            raw_bytes = in_file.read()
        doc = Document(BytesIO(raw_bytes))
    except Exception as err:
        # Exception rather than BaseException, so that Ctrl-C and
        # SystemExit still propagate to the caller.
        print('Cannot create Docx for %s. Err = %s' % (file_path, err))
        return None, -1
    return doc, len(raw_bytes)
def _newLangConverter(lang):
    """Instantiate the converter that matches a language code.

    Args:
        lang: Language code; 'ff', 'aho', 'phk' and 'men' are recognized.

    Returns:
        A ``(converter, sentence_mode)`` tuple. ``converter`` is None for
        an unrecognized code. ``sentence_mode`` is True only for 'ff'.
    """
    if lang == 'ff':
        return adlamConversion.AdlamConverter(), True
    if lang == 'aho':
        return ahomConversion.AhomConverter(), False
    if lang == 'phk':
        return phkConversion.PhakeConverter(), False
    if lang == 'men':
        return MendeConverter(), False
    return None, False


def convertThisDoc(lang, inputFileName):
    """Convert one .docx file and save the result next to it.

    The converted document is written to ``<base>_Unicode.docx``, where
    ``<base>`` is ``inputFileName`` without its extension. After the
    conversion, each entry of the converter's sorted word list (if any)
    is printed.

    Args:
        lang: Language code selecting the converter ('ff', 'aho', 'phk'
            or 'men').
        inputFileName: Path of the .docx file to convert.

    Returns:
        Whatever ``ConvertDocx.processDocx()`` returns, or None when the
        file is skipped because its name contains 'Unicode', the document
        cannot be opened, or the language code is not recognized.
    """
    base_name = os.path.splitext(inputFileName)[0]
    # Skip files that look like earlier output of this tool. Membership
    # (rather than ``find(...) > 0``) also catches a name that *starts*
    # with 'Unicode'.
    # NOTE(review): the check covers the whole path, so a directory named
    # 'Unicode' also causes a skip -- confirm this is intended.
    if 'Unicode' in base_name:
        return None
    out_file_name = base_name + '_Unicode.docx'

    doc, _file_size = createDocFromFile(inputFileName)
    if doc is None:
        logging.warning('No document %s opened', inputFileName)
        return None
    logging.info('Doc created from %s', inputFileName)

    lang_converter, sentence_mode = _newLangConverter(lang)
    if lang_converter is None:
        return None

    lang_converter.setScriptIndex(0)
    lang_converter.setLowerMode(True)
    lang_converter.setSentenceMode(sentence_mode)

    doc_converter = ConvertDocx(lang_converter, documentIn=doc,
                                reportProgressObj=None)
    if doc_converter:
        result = doc_converter.processDocx()
        doc.save(out_file_name)
    else:
        result = None

    # Best effort: a failure while fetching or printing the word list is
    # logged but must not discard the conversion result.
    try:
        word_frequencies = lang_converter.getSortedWordList()
        if word_frequencies:
            for item in word_frequencies:
                print(item)
    except Exception as err:
        logging.warning('FAILED TO GET WORD LIST: %s', err)

    return result
def main(argv):
    """Command-line entry point: convert every requested .docx file.

    Args:
        argv: Argument list shaped like ``sys.argv``: program name,
            language code, then one or more file or directory paths.
            A directory is expanded to the ``*.docx`` files directly
            inside it.

    With fewer than three entries a usage message is printed and nothing
    is converted.
    """
    if len(argv) < 3:
        print('Convert .docx files from font encodings to Unicode text')
        print('Usage: python3 command_line lang_code file1 file2 file ...')
        return

    lang_code, requested = argv[1], argv[2:]

    # Flatten the arguments: plain paths pass through unchanged, while
    # directories contribute their immediate .docx children.
    targets = []
    for entry in requested:
        if not os.path.isdir(entry):
            targets.append(entry)
            continue
        targets.extend(glob.glob(entry + "/*.docx"))

    for target in targets:
        print('Converting %s in document %s' % (lang_code, target))
        convertThisDoc(lang_code, target)
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)