-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_output.py
72 lines (56 loc) · 2.07 KB
/
clean_output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import csv
import re
import csv
import string
# Word/frequency CSV that this script cleans in place.
file_path = 'data/output/3_clean.csv'

# A word whose first character is an ASCII letter is treated as English.
_ENGLISH_PREFIX = re.compile(r'[a-zA-Z]+')


def clean_word(word):
    """Return *word* without surrounding ASCII punctuation, or None to drop it.

    A word is dropped when nothing remains after stripping (it was empty or
    made only of punctuation) or when the stripped word starts with an ASCII
    letter. Classifying the stripped form, not the raw one, keeps the result
    stable when the cleaned file is fed through this script again.
    """
    stripped = word.strip(string.punctuation)
    if not stripped or _ENGLISH_PREFIX.match(stripped):
        return None
    return stripped


def filter_rows(rows):
    """Filter an iterable of CSV rows down to the cleaned (word, frequency) pairs.

    Args:
        rows: iterable of sequences of strings; field 0 is the word and
            field 1 is its frequency.

    Returns:
        A ``(kept, total)`` tuple: ``kept`` is the list of
        ``(cleaned_word, frequency)`` pairs in input order, ``total`` is the
        number of non-blank rows seen.
    """
    kept = []
    total = 0
    for row in rows:
        # csv.reader yields [] for blank lines; they carry no word at all.
        if not row:
            continue
        total += 1
        # A row without a frequency column is malformed: report and drop it.
        if len(row) < 2:
            print(*row)
            continue
        word, frequency = row[0], row[1]
        cleaned = clean_word(word)
        if cleaned is None:
            # Report every rejected row, as the script always did.
            print(word, frequency)
            continue
        if cleaned != word:
            # Echo words that had punctuation trimmed off.
            print(cleaned)
        kept.append((cleaned, frequency))
    return kept, total


def clean_file(path=file_path):
    """Rewrite the CSV at *path* so that it holds only the cleaned rows.

    The whole input is read and filtered before the file is reopened for
    writing, so reading and writing the same path is safe.

    Returns:
        A ``(total, kept)`` tuple of row counts.
    """
    # newline='' is what the csv module requires for correct line handling.
    with open(path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row; tolerate an empty file.
        kept, total = filter_rows(reader)

    with open(path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['word', 'frequency'])
        writer.writerows(kept)

    return total, len(kept)


def main():
    """Clean the default file and print how many rows were seen and removed."""
    total, kept = clean_file(file_path)
    print('Total words: ', total)
    print('Filtered words: ', total - kept)


if __name__ == '__main__':
    main()