Tokenization.py
# -*- coding: UTF-8 -*-
import nltk
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
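# Requires the NLTK 'gutenberg', 'punkt', and 'stopwords' resources,
# e.g. nltk.download('gutenberg'), nltk.download('punkt'), nltk.download('stopwords').
# Load the raw text of Chesterton's "The Man Who Was Thursday" from the Gutenberg corpus.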
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
tokenized_sents = nltk.sent_tokenize(text)  # split the raw text into sentences
#print(tokenized_sents)
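# Tokenize into individual words and build a frequency distribution
# over the raw tokens (punctuation and stop words still included).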
tokenized_word = nltk.word_tokenize(text)
fdist = FreqDist(tokenized_word)
print(fdist)
print(fdist.most_common(2))
fdist.plot(30, cumulative=False)  # plot the 30 most frequent raw tokens
plt.show()
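# Build the set of English stop words for filtering.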
stop_words = set(stopwords.words("english"))
print(stop_words)
filtered_words = []
# Filter stop words at the word level: the original loop iterated over
# tokenized_sents, so whole sentences were compared against the stop-word
# set and nothing was ever removed. Iterate over the word tokens instead.
for w in tokenized_word:
    if w.lower() not in stop_words:  # the stop-word list is lowercase
        filtered_words.append(w)
#print("Tokenized words:", tokenized_word)
#print("Filtered words:", filtered_words)
# Recompute the frequency distribution over the filtered tokens.
fdist = FreqDist(filtered_words)
print(fdist)
print(fdist.most_common(2))
fdist.plot(30, cumulative=False)  # plot the 30 most frequent tokens after stop-word removal
plt.show()