-
Notifications
You must be signed in to change notification settings - Fork 69
/
Copy pathgeneratefeedvector.py
62 lines (48 loc) · 1.43 KB
/
generatefeedvector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import feedparser
import collections
import re
# Matches any HTML/XML tag so markup can be dropped before tokenizing.
_TAG_RE = re.compile(r'<[^>]+>')
# Matches every run of characters that is not an ASCII letter.  The range is
# spelled A-Za-z on purpose: a single A-z range would also cover the
# punctuation that sits between 'Z' and 'a' in ASCII ([, \, ], ^, _, `).
_NON_ALPHA_RE = re.compile(r'[^A-Za-z]+')


def getwords(html):
    """Return the lowercase alphabetic words contained in an HTML fragment.

    Tags are removed first (replaced by the empty string, so text on either
    side of a tag is joined), then the remaining text is split on every run
    of non-letter characters.

    Args:
        html: String that may contain HTML markup.

    Returns:
        A list of lowercase words in order of appearance; empty strings
        produced by the split are discarded.
    """
    text = _TAG_RE.sub('', html)
    return [word.lower() for word in _NON_ALPHA_RE.split(text) if word]
def getwordcounts(url):
    """Parse a feed and count how often each word occurs in its entries.

    Args:
        url: Feed location handed unchanged to ``feedparser.parse``.

    Returns:
        A ``(title, counts)`` tuple.  ``title`` is the feed's title, or the
        placeholder string ``'bogus data'`` when the parsed feed has no
        title (a message naming the url is printed in that case).
        ``counts`` is a ``collections.defaultdict(int)`` mapping each word
        found in the entries' titles and summaries to its number of
        occurrences.
    """
    d = feedparser.parse(url)
    wc = collections.defaultdict(int)

    for e in d.entries:
        # Prefer the summary; fall back to the description, and to an empty
        # string when the entry carries neither, instead of raising.
        if 'summary' in e:
            summary = e.summary
        else:
            summary = e.get('description', '')
        title = e.get('title', '')

        for word in getwords('%s %s' % (title, summary)):
            wc[word] += 1

    if 'title' not in d.feed:
        # A single pre-formatted argument prints the same text under both
        # Python 2 and Python 3.
        print('Invalid url %s' % url)
        return 'bogus data', wc
    return d.feed.title, wc
def _read_feed_urls(path):
    """Return the non-blank lines of the file at *path*, stripped of whitespace."""
    with open(path) as feedfile:
        stripped = (line.strip() for line in feedfile)
        return [url for url in stripped if url]


def _select_words(apcount, feed_total, lower=0.1, upper=0.5):
    """Return the words whose share of feeds lies strictly between the bounds."""
    total = float(feed_total)
    return [word for word, blogs in apcount.items()
            if lower < blogs / total < upper]


def _write_matrix(path, wordlist, wordcounts):
    """Write a tab-separated blog-by-word count table to the file at *path*."""
    with open(path, 'w') as out:
        out.write('\t'.join(['Blog'] + wordlist) + '\n')
        for blogname, counts in wordcounts.items():
            out.write(blogname)
            for word in wordlist:
                # .get avoids inserting missing words into a defaultdict.
                out.write('\t%d' % counts.get(word, 0))
            out.write('\n')


def main():
    """Build ``blogdata.txt`` from the feeds listed in ``feedlist.txt``.

    Reads one feed URL per line from ``feedlist.txt`` (blank lines are
    ignored), counts the words of every feed with ``getwordcounts``, keeps
    the words that appear in more than 10% and fewer than 50% of the feeds,
    and writes a tab-separated table to ``blogdata.txt``: a header row
    (``Blog`` followed by the kept words) and one row per feed title with
    the count of each kept word.
    """
    # TODO: write tests for the helpers above.
    feedlist = _read_feed_urls('feedlist.txt')

    apcount = collections.defaultdict(int)  # word -> number of feeds using it
    wordcounts = {}                         # feed title -> word counts
    for url in feedlist:
        title, wc = getwordcounts(url)
        wordcounts[title] = wc
        for word, count in wc.items():
            # A word only counts as used by a feed when it occurs there
            # more than once.
            if count > 1:
                apcount[word] += 1

    wordlist = _select_words(apcount, len(feedlist))
    _write_matrix('blogdata.txt', wordlist, wordcounts)
# Run the feed-vector generation only when executed as a script, not when
# this module is imported.
if __name__ == '__main__':
    main()