forked from muskaancontlo/Pdf-Search-Engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpara.py
86 lines (56 loc) · 1.75 KB
/
para.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import json
# Compute `avg`: the mean stripped length of the lines in final-final.txt
# that are longer than 10 characters. Shorter lines are left out of the mean.
# The mean is used rather than the maximum line length because outlier lines
# distort the maximum.
total_length = 0  # named so the builtin sum() is not shadowed
count = 0
with open('final-final.txt', errors="ignore") as f:
    for line in f:
        length = len(line.strip())
        if length > 10:
            total_length += length
            count += 1
# A file with no line longer than 10 characters yields avg = 0 instead of a
# ZeroDivisionError; no line can then be shorter than the average.
avg = total_length / count if count else 0
# Paragraph markers kept as two parallel lists: `start` holds the line number
# at which each paragraph begins and `end` the line number at which it stops.
# NOTE(review): `start` is seeded with 0 while the lines below are counted
# from 1 -- confirm which convention the consumers of these lists expect.
start = [0]
end = []
count = 0
# A line closes a paragraph when it is non-empty, shorter than the average
# line length (`avg`) and finishes with a full stop.
with open('final-final.txt', errors="ignore") as f:
    for count, raw_line in enumerate(f, 1):
        text = raw_line.strip()
        if text.endswith('.') and len(text) < avg:
            end.append(count)
            # The following line opens the next paragraph.
            start.append(count + 1)
# Each paragraph end queued a start for a following paragraph (and the list
# was seeded with one), so drop the final entry to keep both lists the same
# length.
del start[-1]
# Print each paragraph's start and end line numbers (disabled)
# for i in range(len(start)):
# print(str(start[i])+ " " + str(end[i]))
# Alternative (disabled): build the paragraph markers with a dict instead of lists
# para = {0 : [0]}
# count = 0
# count_para = 0
# with open('AU127-1.txt') as f:
# for line in f:
# length = len(line.strip())
# count += 1
# if(length <(avg*2/3) and line.strip()[-1] == '.'):
# para[count_para].append(count)
# count_para += 1
# para[count_para] = [count+1]
# para.pop(count_para)
#Printing paragraph dict
# print(para)
########## #########
# Serialise the paragraph marker lists and write each one to its own JSON
# file (start.json and end.json) in the current working directory.
json_start = json.dumps(start)
json_end = json.dumps(end)
# `with` closes each file even when the write raises.
with open("start.json", 'w') as f:
    f.write(json_start)
with open("end.json", 'w') as f:
    f.write(json_end)