-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrun.py
49 lines (45 loc) · 1.83 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import argparse
from os.path import join
from time import time
from Extractor import Extractor
# Directory that the input file name (--fname) is joined onto.
RFDIR = "data/"
# Directory that the output file name (--oname, or --fname) is joined onto.
WFDIR = "out/"


def _str2bool(value):
    """Convert a command-line token such as "True" / "false" / "0" to a bool.

    argparse's ``type=bool`` simply calls ``bool()`` on the raw string, so
    every non-empty value -- including "False" -- would parse as True.  This
    converter interprets the usual spellings instead.

    Args:
        value: The raw option value (a string), or an actual bool.

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays falsy, as it was under ``bool("")``.
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got %r" % (value,)
    )


parser = argparse.ArgumentParser()
# Input file name, resolved relative to RFDIR.
parser.add_argument("-f", "--fname", required=True, type=str, dest='fname')
# Optional output file name, resolved relative to WFDIR; when omitted the
# input file name is reused.
parser.add_argument("-o", "--oname", required=False,
                    default=None, type=str, dest='oname')
# Passed to extract_words() as score_thresh.
parser.add_argument("--thresh", required=False,
                    default=4.0, type=float, dest='thresh')
# Passed to extract_words() as cnt_thresh.
parser.add_argument("--count", required=False,
                    default=40, type=int, dest='count')
# Passed to Extractor() as max_len.
parser.add_argument("-n", "--ngram", required=False,
                    default=8, type=int, dest='ngram')
# Whether to write the extracted words to a file under WFDIR.
parser.add_argument("--save", required=False,
                    default=True, type=_str2bool, dest='save')
# When true, Extractor is given the file path (rfpath=) rather than the
# pre-read, stripped lines (text=).
parser.add_argument("--preprocess", required=False,
                    default=False, type=_str2bool, dest='preprocess')
# Script entry point: parse the CLI arguments, build an Extractor from the
# input file, extract words, optionally save them as TSV, and report timing.
if __name__ == '__main__':
    tic = time()
    args = parser.parse_args()
    rfpath = join(RFDIR, args.fname)
    print(args.preprocess, args.count)

    if not args.preprocess:
        # Read the file here and hand the stripped lines to Extractor.
        # First try the platform default encoding; fall back to UTF-8 only
        # when decoding fails, so unrelated errors (missing file, Ctrl-C)
        # are no longer swallowed and retried.
        try:
            with open(rfpath, "r") as fh:
                text = fh.readlines()
        except UnicodeDecodeError:
            with open(rfpath, "r", encoding="utf-8") as fh:
                text = fh.readlines()
        text = [line.strip() for line in text]
        extracter = Extractor(text=text, max_len=args.ngram)
    else:
        # Let Extractor load the file itself from its path.
        extracter = Extractor(rfpath=rfpath, max_len=args.ngram)

    words = extracter.extract_words(score_thresh=args.thresh,
                                    cnt_thresh=args.count)

    if args.save:
        # Use the explicit output name when given, otherwise reuse the input
        # file name; either way the file goes under WFDIR.
        opath = join(WFDIR, args.oname if args.oname else args.fname)
        # NOTE(review): `words` is presumably a pandas DataFrame (it exposes
        # to_csv) -- confirm against Extractor.extract_words.
        # Tab-separated output; the utf_8_sig codec prepends a BOM.
        words.to_csv(opath, encoding="utf_8_sig", index=False, sep='\t')

    print(words)
    toc = time()
    print("Total time: %.2fs" % (toc - tic))