-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtürkçe_vt_preprocessing.py
126 lines (103 loc) · 4.25 KB
/
türkçe_vt_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
"""türkçe_vt_preprocessing.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1m_uwuny5uWvljqripBzU6YV1uQBAjGXr
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'
# (Colab markdown cell, Turkish): "this program performs the preprocessing
# stage for the Turkish dataset".
"""**bu programda türkçe veri seti için ön işlem aşaması yapılmaktadır**"""
# Read the train and test tweets into DataFrames with the two expected columns.
cols=['text','sentiment']
train=pd.read_excel("train_tweets.xlsx",names=cols)
test=pd.read_excel("test_tweets.xlsx",names=cols)
train.head()
# Encode the 3 classes: olumsuz (negative) = -1, olumlu (positive) = 1, notr (neutral) = 0.
train["sentiment"]=train["sentiment"].map({'olumlu':1,'olumsuz':-1,'notr':0})
test["sentiment"]=test["sentiment"].map({'olumlu':1,'olumsuz':-1,'notr':0})
train.head()
# Per-class counts for the train set.
train.sentiment.value_counts()
# Per-class counts for the test set.
test.sentiment.value_counts()
# IPython/Colab shell magics — these two lines are not valid plain Python and
# only run inside a notebook environment.
!pip install TurkishStemmer
!pip install snowballstemmer
import re
import nltk
from nltk.tokenize import WordPunctTokenizer
from TurkishStemmer import TurkishStemmer
# Fetch the NLTK stopword corpora; needed for the Turkish stopword list below.
nltk.download('stopwords')
#from snowballstemmer import TurkishStemmer
# NOTE(review): `stemmer` is constructed but never applied — the stemming line
# inside clean_doc is commented out.
stemmer = TurkishStemmer()
WPT = nltk.WordPunctTokenizer()
stop_word_list = nltk.corpus.stopwords.words('turkish')
def handle_emojis(tweet):
    """Replace common text emoticons in *tweet* with sentiment placeholder
    tokens: positive emoticons become ' EMO_POS ' and negative ones
    ' EMO_NEG '.  All other text is left unchanged."""
    positive = (
        r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))',  # smiles: :), : ), :-), (:, ( :, (-:, :')
        r'(:\s?D|:-D|x-?D|X-?D)',            # laughs: :D, : D, :-D, xD, x-D, XD, X-D
        r'(<3|:\*)',                         # love:   <3, :*
        r'(;-?\)|;-?D|\(-?;)',               # winks:  ;-), ;), ;-D, ;D, (;, (-;
    )
    negative = (
        r'(:\s?\(|:-\(|\)\s?:|\)-:)',        # sad: :-(, : (, :(, ):, )-:
        r'(:,\(|:\'\(|:"\()',                # cry: :,(, :'(, :"(
    )
    for pattern in positive:
        tweet = re.sub(pattern, ' EMO_POS ', tweet)
    for pattern in negative:
        tweet = re.sub(pattern, ' EMO_NEG ', tweet)
    return tweet
def clean_doc(text):
    """Normalize one Turkish tweet for sentiment analysis.

    Steps: lower-case, map emoticons to EMO_POS/EMO_NEG, fold Turkish
    letters to ASCII, strip URLs / the "rt" marker / @mentions / #hashtags /
    punctuation / digits, collapse repeated characters, tokenize, and drop
    stopwords and tokens shorter than 3 characters.

    Returns the cleaned tweet as a single space-joined string.
    """
    # NOTE(review): str.lower() is locale-independent, so the Turkish
    # dotted/dotless I pair is not handled ('I' -> 'i', never 'ı') —
    # confirm this is acceptable for this dataset.
    text = text.lower()
    text = handle_emojis(text)
    # Fold Turkish-specific letters to ASCII so variants compare equal.
    for src, dst in (('ş', 's'), ('ı', 'i'), ('ö', 'o'),
                     ('ü', 'u'), ('ğ', 'g'), ('ç', 'c')):
        text = text.replace(src, dst)
    # Remove URLs, the bare retweet marker "rt", @mentions and #hashtags.
    text = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' ', text)
    text = re.sub(r'\brt\b', ' ', text)
    text = re.sub(r'((@[\S]+)|(#[\S]+))', ' ', text)
    # Strip punctuation and digit runs.
    text = re.sub(r"[{}]".format('\'"?!,.():;><_/'), " ", text)
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r'\'\b', ' ', text)
    # Collapse runs of 2+ sentence punctuation into one space.
    # (Fixed: the original pattern r'\.\?\!{2,}' only matched the literal
    # sequence ".?!!..." instead of a character class.)
    text = re.sub(r'[.?!]{2,}', ' ', text)
    # Collapse any character repeated 2+ times down to a single occurrence;
    # this also squeezes runs of spaces.  (Removed a no-op `.format(' ')`
    # that was chained onto the raw pattern string.)
    text = re.sub(r"(.)\1+", r'\1', text)
    text = text.strip()
    tokens = WPT.tokenize(text)
    # Fixed: the original computed the stopword-filtered list and then threw
    # it away by re-filtering `tokens` for the length check, so stopwords
    # longer than 2 characters leaked into the output.  Apply both filters
    # in one pass instead.
    # filtered_tokens = [stemmer.stem(t) for t in tokens if ...]  # stemming intentionally disabled
    filtered_tokens = [token for token in tokens
                       if token not in stop_word_list and len(token) > 2]
    return ' '.join(filtered_tokens)
# Quick manual smoke test of clean_doc on one raw tweet containing a URL,
# the retweet marker, Turkish characters, a hashtag and an @mention.
text="Hêvî Retweetledi:wwww.khjkhkh rtKoray ÇalışkanSon günlerde Turkcell;Aradığınız kişiye şu an ulaşılamıyor.Muhtemelen Güneydoğu'da.. #SilvanAndNusaybinUnderAttack Hêvî ekledi,Koray Çalışkan @koraycaliskanSatmayan"
print(clean_doc(text))
# Clean every training tweet and export the result.
# (Fixed: the original loop was trapped inside a triple-quoted string — a
# stray Colab "%%time" cell export — so `clean_tweet_texts` was never
# defined and the lines below raised NameError.)
clean_tweet_texts = [clean_doc(train['text'][i]) for i in range(len(train))]
len(clean_tweet_texts)
clean_df = pd.DataFrame(clean_tweet_texts, columns=['text'])
clean_df['sentiment'] = train.sentiment
# NOTE(review): dropped the `encoding=` kwarg — it has been removed from
# DataFrame.to_excel in modern pandas, and xlsx output is UTF-8 regardless.
clean_df.to_excel('clean_tweet_train.xlsx')
clean_df.head()
# Clean every test tweet and export the result.
# (Fixed: as with the training loop, the original code was trapped inside a
# triple-quoted string from a stray Colab "%%time" cell, so
# `clean_tweet_texts_test` was never defined and the lines below failed.)
print("Cleaning the tweets...\n")
clean_tweet_texts_test = [clean_doc(test['text'][i]) for i in range(len(test))]
clean_df_test = pd.DataFrame(clean_tweet_texts_test, columns=['text'])
clean_df_test['sentiment'] = test.sentiment
# NOTE(review): dropped the `encoding=` kwarg — removed from
# DataFrame.to_excel in modern pandas; xlsx output is UTF-8 regardless.
clean_df_test.to_excel('clean_tweet_test.xlsx')
clean_df_test.head()