-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWordFrequencies.py
69 lines (60 loc) · 1.71 KB
/
WordFrequencies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding: utf-8 -*-
import os
import re
import sys
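# Read a UTF-8 text file and compute word frequencies.
# Default output (output_tsv) is tab-separated "word<TAB>count", most frequent first;
# output_frequencies prints (word, count, relative_frequency) tuples instead.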
class frequency():
    """Count word occurrences in a UTF-8 text file and print the results.

    Typical use: construct with a path, call compute_map() once, then call
    one of the output_* methods. The instance can also be used as a context
    manager so the underlying file is closed even if counting never runs.

    Attributes:
        f: Text file handle opened by __init__; closed once compute_map()
            has consumed it (or by close()).
        count_map: dict mapping each word to its number of occurrences.
        word_count: Total number of words counted (sum of count_map values).
        lowerCase: When True (default), words are lower-cased before counting.
        reSplit: Compiled pattern used to split a line into words; by default
            it splits on runs of non-word characters.
    """

    def __init__(self, filepath):
        """Open *filepath* for reading as UTF-8 and reset all counters.

        Raises:
            OSError: if the file cannot be opened.
        """
        self.f = open(filepath, encoding="utf-8")
        self.count_map = {}
        self.word_count = 0
        self.lowerCase = True
        # Raw string: "\W" in a plain literal is an invalid escape sequence
        # and triggers a warning on modern Python versions.
        self.reSplit = re.compile(r"\W+")

    def close(self):
        """Close the input file; safe to call more than once."""
        if not self.f.closed:
            self.f.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Never suppress exceptions; just release the file handle.
        self.close()

    def compute_map(self):
        """Read the whole file and accumulate per-word counts.

        Updates count_map and word_count in place and closes the file when
        done. A second call is a no-op, matching the previous behaviour of
        iterating an already exhausted file.
        """
        if self.f.closed:
            return
        # The file object is its own context manager: it is closed when the
        # loop finishes or if reading raises (e.g. on a decode error).
        with self.f:
            for line in self.f:
                for word in self.reSplit.split(line):
                    if self.lowerCase:
                        word = word.lower()
                    # Drop surrounding whitespace. With the default pattern
                    # this changes nothing, but it matters if reSplit has
                    # been replaced by a pattern that leaves whitespace in.
                    word = word.strip()
                    # Splitting yields empty strings at line edges; skip them.
                    if not word:
                        continue
                    self.count_map[word] = self.count_map.get(word, 0) + 1
                    self.word_count += 1

    def output_frequencies(self):
        """Print one (word, count, relative_frequency) tuple per word.

        Words are printed in ascending alphabetical order; the relative
        frequency is count divided by word_count.
        """
        total = float(self.word_count)
        for word in sorted(self.count_map):
            count = self.count_map[word]
            print((word, count, count / total))

    @staticmethod
    def sort2(a, b):
        """Legacy cmp-style comparator for (word, count) pairs.

        Orders by descending count. On equal counts it returns 1 when
        a[0] < b[0] and -1 otherwise, which is the reverse of the tie order
        used by output_tsv. Nothing in this class calls it; it is kept for
        compatibility and would need functools.cmp_to_key to be used with
        sorted(). It is a staticmethod because it takes no self, so it can
        be called on the class or on an instance.
        """
        if a[1] == b[1]:
            # Same frequency: fall back to comparing the words.
            return 1 if a[0] < b[0] else -1
        # Different frequencies: higher count sorts first.
        return b[1] - a[1]

    def output_tsv(self):
        """Print "word<TAB>count" lines, most frequent first.

        Words with equal counts are printed in ascending alphabetical order.
        """
        # One sort on a composite key gives the same order as sorting
        # alphabetically and then stable-sorting by descending count.
        ranked = sorted(self.count_map.items(),
                        key=lambda item: (-item[1], item[0]))
        for word, count in ranked:
            print("%s\t%s" % (word, count))
def main(args):
    """Count the words of the file named by args[1] and print a TSV report.

    args is an argv-style list (program name first). When no file path is
    supplied the function does nothing and returns None.
    """
    if len(args) >= 2:
        counter = frequency(args[1])
        counter.compute_map()
        # output_frequencies() is the alternative report (alphabetical, with
        # relative frequencies); the TSV report is the one emitted here.
        counter.output_tsv()
# Run the word count only when this file is executed as a script (not when
# imported); main() reads the input file path from sys.argv[1].
if __name__ == "__main__":
    main(sys.argv)