NMF.py
# -*- coding: utf-8 -*-
"""
Topic mining of the research literature about Wikipedia using non-negative
matrix factorization (NMF).
"""
from sklearn import decomposition  # used only in the optional sketch below
import numpy as np
import Data
import nltk.corpus
papers, abstracts = Data.load_data()
dictionary, corpus_tfidf, corpus = Data.prepare_data()
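# Data is a local helper module: load_data() is assumed to return the paper
# metadata dicts and raw abstracts, and prepare_data() a gensim Dictionary
# plus tf-idf and bag-of-words corpora (only the Dictionary is used below).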
# Tokenize and lowercase each abstract.
words = []
for paper in papers:
    words.append([w.lower() for w in nltk.word_tokenize(paper['Abstract'])])
all_words = [word for paper_words in words for word in paper_words]
## Earlier manual vocabulary construction, superseded by the gensim
## dictionary below:
## Count the occurrences of all words.
#wordcounts = dict([(t, all_words.count(t)) for t in set(all_words)])
#
#stopwords = nltk.corpus.stopwords.words('english')
#
#terms = {}
#for word, count in wordcounts.items():
#    if count > 2 and word not in stopwords and word.isalpha():
#        terms[word] = count
#
## Swap value and key so the list sorts by count.
#items = [(v, k) for k, v in terms.items()]
#
#for count, word in sorted(items, reverse=True)[:5]:
#    print("%5d %s" % (count, word))
#
## Wikipedia is the main topic of all the papers, so remove it as a term.
#terms.pop('wikipedia')
terms = list(dictionary.token2id)
# Construct a (papers x terms) bag-of-words count matrix.
M = np.asmatrix(np.zeros([len(papers), len(terms)]))
for n, paper_words in enumerate(words):
    for m, term in enumerate(terms):
        M[n, m] = paper_words.count(term)
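# An equivalent construction via gensim (a sketch, assuming `dictionary` is
# a gensim.corpora.Dictionary): doc2bow() counts each document in one pass
# instead of rescanning it once per vocabulary term, and indexing columns by
# token id matches the dictionary[i] lookup used when printing topics below.
#for n, paper_words in enumerate(words):
#    for token_id, count in dictionary.doc2bow(paper_words):
#        M[n, token_id] = count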
def nmf(M, components=5, iterations=5000):
    """Applies NMF to M and returns the factors (W, H)."""
    # Initialize the two factor matrices with random non-negative values.
    W = np.asmatrix(np.random.random([M.shape[0], components]))
    H = np.asmatrix(np.random.random([components, M.shape[1]]))
    for n in range(iterations):
        # Lee-Seung multiplicative updates for the Frobenius objective;
        # the small constant guards against division by zero.
        H = np.multiply(H, (W.T * M) / (W.T * W * H + 0.001))
        W = np.multiply(W, (M * H.T) / (W * (H * H.T) + 0.001))
        print("%d/%d" % (n, iterations))
    return (W, H)
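# An equivalent factorization via scikit-learn (a sketch using the
# decomposition module imported above, not the method run below):
# np.asarray() converts M because fit_transform expects an array-like,
# and NMF requires non-negative input, which the count matrix satisfies.
#model = decomposition.NMF(n_components=50, init='random', max_iter=200)
#W_sk = model.fit_transform(np.asarray(M))  # (papers x topics)
#H_sk = model.components_                   # (topics x terms)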
# Perform the actual factorization: 50 topics, 50 multiplicative-update
# iterations.
W, H = nmf(M, iterations=50, components=50)
# For each topic, print the top terms and the most strongly associated
# papers.
for component in range(W.shape[1]):
    print("=" * 80)
    print("TOPIC %d: " % (component,))
    indices = (-H[component, :]).getA1().argsort()
    print([dictionary[i] for i in indices[:6]])
    print("-")
    indices = (-W[:, component]).getA1().argsort()
    # 'Title' is assumed to be the metadata field naming each paper.
    print("\n".join([papers[i]['Title'] for i in indices[:5]]))