forked from TimJheng/R_Projects
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathText Mining_Concept Clustering.R
93 lines (73 loc) · 2.42 KB
/
Text Mining_Concept Clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
###Concept Clustering###
# Library checker: make sure every package this script needs is installed,
# then attach each one.
libs <- c(
  "tidyverse", "data.table", "dplyr", "Rtsne",
  "readxl", "ggplot2", "factoextra"
)

# Scan the installed libraries once and install everything missing in a
# single call, instead of rescanning on every loop iteration.
missing_libs <- setdiff(libs, .packages(all.available = TRUE))
if (length(missing_libs) > 0) {
  install.packages(missing_libs)
}

# library() stops with an error if a package cannot be attached, so no
# follow-up require() pass is needed.
for (pkg in libs) {
  library(pkg, character.only = TRUE)
}
# Report the directory that relative paths below are resolved against.
message("Working directory: ", getwd())

# Folder that holds the workbook. Edit `data_dir` rather than calling setwd(),
# so the session's working directory is left untouched.
data_dir <- "."
workbook_path <- file.path(data_dir, "Concept Clustering.xlsx")

# Sheet "name&key": this script reads its `name` and `key` columns.
Key <- read_xlsx(path = workbook_path, sheet = "name&key") %>%
  as.data.frame()
stopifnot(all(c("name", "key") %in% names(Key)))

# Sheet "data": the first column is dropped; the first remaining column holds
# the documents that get tagged with the keywords.
data <- read_xlsx(path = workbook_path, sheet = "data")
data <- data[, -1] %>% as.data.frame()

# Capture the concept names BEFORE `Key` is narrowed to the keyword column;
# afterwards `Key$name` no longer exists.
name2 <- as.character(Key$name)
Key <- Key$key %>% as.data.frame() # keywords, used as regex patterns by grepl()

# Create dimension - Document Term Matrix:
# one row per document, one column per keyword, 1 where the keyword pattern
# matches the document and 0 where it does not.
documents <- data[, 1]
patterns <- as.character(Key[, 1])

# Preallocate the full matrix instead of growing it with cbind() in the loop.
colbase <- matrix(0, nrow = length(documents), ncol = length(patterns))
for (i in seq_along(patterns)) {
  colbase[, i] <- as.numeric(grepl(patterns[i], documents))
}

colbase <- as.data.frame(colbase)
colnames(colbase) <- name2
# final <- cbind(data, colbase)
# DTM format confirmed

# Discard documents that matched no keyword at all: add a temporary per-row
# total, keep the rows whose total is non-zero, then remove the helper column.
colbase <- colbase %>%
  mutate(sum = rowSums(.)) %>%
  filter(sum != 0) %>%
  select(-sum)

# Transpose so that each keyword column becomes a row of `newdata`.
newdata <- as.data.frame(t(colbase))
# t-SNE dimension reduction: embed every row of `newdata` (one row per
# keyword column of the document-term matrix) into two dimensions.
newdata <- newdata %>% as.matrix()

# Fix the RNG state so the stochastic embedding, and the k-means runs that
# follow when the script is executed top to bottom, are repeatable.
set.seed(42)

# Rtsne() stops with "perplexity is too large" unless
# 3 * perplexity <= nrow(X) - 1, so cap the preferred value of 50 to what the
# number of rows allows.
tsne_perplexity <- min(50, floor((nrow(newdata) - 1) / 3))
if (tsne_perplexity < 1) {
  stop("t-SNE needs at least 4 rows in `newdata`.", call. = FALSE)
}

tsne <- Rtsne(
  newdata,
  dims = 2,
  perplexity = tsne_perplexity,
  verbose = TRUE,
  max_iter = 1200,
  pca = FALSE
)

# NOTE: `T` masks the built-in shorthand for TRUE in the global environment.
# The name is kept because the rest of the script reads the embedding from it.
T <- tsne$Y %>% as.data.frame()
rownames(T) <- row.names(newdata)
# sp = ggplot(T, aes(T[,1], T[,2],label = rownames(T)))
# sp + geom_point()
# sp + geom_text()
# Silhouette check for the number of k-means clusters (1 to 15) on the t-SNE
# coordinates. The plot title is Chinese for "optimal number of clusters".
fviz_nbclust(
  x = T,
  FUNcluster = kmeans,
  method = "silhouette",
  k.max = 15
) +
  labs(title = "最佳分群數")
# + geom_vline(xintercept = 7, linetype = 2)
# Partition the embedded points into 7 k-means clusters.
Cluster <- kmeans(T, centers = 7)

# Shared plot base: the two t-SNE axes, labelled by row name and coloured by
# cluster id.
sp <- ggplot(
  data = T,
  mapping = aes(
    x = T[, 1],
    y = T[, 2],
    label = rownames(T),
    color = Cluster$cluster
  )
)

# Scatter of points, then the same layout drawn as text labels.
sp +
  geom_point() +
  scale_colour_gradientn(colours = rainbow(5))
sp +
  geom_text(size = 4.5) +
  scale_colour_gradientn(colours = rainbow(3))
# Interpretation: re-cluster with k = 3 and count the members of each cluster.
Cluster <- kmeans(T, centers = 3)

# Piping into as.data.frame() names the single column ".".
w <- Cluster$cluster %>%
  as.data.frame()
table(w[["."]])
#PCA for dimension reduction
#pca <- prcomp(newdata, scale=TRUE)
#plot(pca)
#rd <- pca$x %>% as.data.frame() %>% select(PC1,PC2,PC3,PC4,PC5,PC6)
#library(factoextra)
#fviz_nbclust(rd,
# FUNcluster = kmeans,
# method = "wss",
# k.max = 12)+labs(title="最佳分群數")+geom_vline(xintercept = 7,linetype = 2)
#Cluster<-kmeans(rd,8)
#w<-Cluster$cluster%>% as.data.frame()
#table(w$.)
#sp <- ggplot(rd, aes(rd[,1], rd[,2],label = rownames(rd)))
#sp + geom_point()
#sp + geom_text()