# Review code: this should help reviewers validate the code. Note that tests were also added in the Rcode/tests folder.
# We first run the code for the Ro_testdata_mbr project, then go back through it step by step.
# In the second part, we use the dataset created there to test that the SVM analysis runs.
## input variables, similar to those in master_noshiny.r
setwd("analysis")
library(randomForest)
library(ica)
library(e1071)    # SVM
require(Hmisc)    # binomial confidence
require(plotly)
library(rstatix)  # effect size calculation
library(coin)     # effect size calculation
library(osfr)     # access to OSF
library(tidyverse)
library(stringr)
library(gridExtra)
library(RGraphics)
source("Rcode/functions.r")
version = "noused" # version label (not used in this script)
PMeta = "../data/Projects_metadata.csv"
RECREATEMINFILE = TRUE # set to TRUE to recreate an existing min file; otherwise the code creates one only if it cannot find an existing one.
NO_svm = TRUE
# variable grouping, currently only working with HCS data.
groupingby = "Berlin" # other possibility: "Jhuang"
# choosing time windows for the analysis
selct_TW = c(8:9)
Npermutation = 1 # number of permutations to perform; set to 1 when testing (42 s per run with AOCF designation, 30 s with MIT)
Name_project ="Ro_testdata_mbr"
## run code
source("master_shiny.r")
#################---------------------#################
#################---------------------#################
#################---------------------#################
## review process point by point
## experiment metadata: (read in input.r file, saved with the data in Min_Ro_testdata_mbr.csv)
## ----------------------------------------------
metadata # information about animals and lab (light-on and light-off times) as entered in the metadata files
#### depending on the type of input data used (minute summary or mbr files), links to the data files are either here
metadata$Onemin_summary
#### or here
metadata$primary_behav_sequence
#### grouping variable (created in animal_groups.r from the master metadata file entry)
metadata$groupingvar
metadata$confoundvar # confounding variables, not interesting to test in themselves, but grouping by them may help the statistics
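# For a full overview of everything stored in metadata, an optional quick look
# (a sketch; str() simply prints the structure of the object shown above):
str(metadata)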
## The data (MIN_data, written to Min_Ro_testdata_mbr.csv)
## ----------------------------------------------
# The create_minfile.r code reads and pulls the data together; its output is saved in the Min_Ro_testdata_mbr file, available in the data folder.
# Unless the input is an hourly summary, it is a minute summary file, giving the amount of time spent on each behavior for each minute of the experiment.
# Metadata and the relation to light on/off times are added to the data.
MIN_data
# The file created by R from the raw behavior sequence should be very similar, if not identical, to the Excel file created with the HomeCageScan software.
# compare for instance:
View(MIN_data %>% filter(ID == "100"))
# with 150609_Testdata_HCS_groupB/150609_Testdata_HCS_min_2.xlsx
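# Another quick consistency check (a sketch, using only the ID column shown in
# the filter() call above): the number of recorded minutes should be comparable
# across animals from the same experiment.
MIN_data %>% count(ID)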
## the time-windowed summary data (Multi_datainput, written to timedwindowed_Ro_testdata_mbr.csv)
## this will become the input for the analysis (PCA, random forest, SVM)
## ----------------------------------------------
## time windows are created; only non-empty ones are displayed (this depends on the length of the experiment)
Timewindows
## for each selected time window
selct_TW
## the code takes the minute summary for each meta-behavior (two grouping choices are possible at the moment: Berlin, Jhuang),
names(behav_gp)
## and calculates the percentage of time spent on each one, in each time window, using the function get_windowsummary().
## Its output is Multi_datainput (metadata is not included, only the animal ID is present).
unique(Multi_datainput$ID)
# one number (representing the time window) is appended to each meta-behavior name in the header
names(Multi_datainput)
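# Illustrative sketch of the percentage calculation (toy data, not the project's
# get_windowsummary(); behaviors and window length are made up): the percentage
# is simply the summed minutes per behavior divided by the window length.
set.seed(74)
toy = data.frame(minute = 1:60,
                 behavior = sample(c("rest", "eat", "walk"), 60, replace = TRUE))
toy %>% count(behavior) %>% mutate(percent = 100 * n / sum(n))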
## Analysis
## ID is replaced by groupingvar (and confoundvar if it exists) in Multi_datainput_m (or Multi_datainput_m2)
Multi_datainput_m$groupingvar
## random forest + ICA: RF_selection_2rounds.r, ICA.r
RF_selec # subset of Multi_datainput_m, keeping a minimum of 8 variables:
# all variables having a MeanDecreaseGini > 0.95.
# The theoretical maximum is 20 variables, because 20 are chosen after a first random forest.
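# Minimal sketch of the selection idea (toy data; the actual logic lives in
# RF_selection_2rounds.r and may differ): fit a random forest, then keep the
# variables whose MeanDecreaseGini exceeds the threshold.
set.seed(74)
toy_x = as.data.frame(matrix(rnorm(200), nrow = 20))
toy_y = factor(rep(c("a", "b"), each = 10))
toy_rf = randomForest(toy_x, toy_y)
toy_imp = importance(toy_rf)[, "MeanDecreaseGini"]
names(toy_imp)[toy_imp > 0.95] # variables passing the (project's) threshold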
pls2 # 3D plot of the results of the ICA done over RF_selec, with groups as colors
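# Minimal sketch of the ICA step (toy data; ICA.r may use different settings):
# icafast() from the ica package extracts nc independent components, whose
# scores (one column each) can then be plotted in 3D.
toy_ic = ica::icafast(matrix(rnorm(200), nrow = 20), nc = 3)
head(toy_ic$S) # component scores, one column per independent component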
## PCA analysis
# done over Multi_datainput_m
Moddata # output with the principal components
Moddata$PC1 # first principal component, on which the statistics are done
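# Sketch of the PCA idea with base R's prcomp() (toy data; the project's PCA
# code may differ): Moddata$PC1 is analogous to the first column of the scores.
toy_pc = prcomp(matrix(rnorm(200), nrow = 20), scale. = TRUE)
toy_pc$x[, 1] # first principal component scores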
###------------------#####
###------------------#####
###------------------#####
### SVM tests
## Multi_datainput_m is the only input of the SVM analysis
Npermutation = 1 # make sure only one permutation is done.
## data input:
# source("analysis/reviews_and_tests.r") # set the working directory to the project first.
# seed was set to 74 in master_shiny.r
# prepare extra data to test multiple groups:
Multi_datainput_moriginal = Multi_datainput_m
Multi_datainput_mplus = Multi_datainput_moriginal
Multi_datainput_mplus$groupingvar = "third"
# test 1: the analysis should not run if group sizes are not equal
Multi_datainput_m = rbind(Multi_datainput_moriginal, Multi_datainput_mplus)
source("Rcode/svmtest.r")
if (!NO_svm){ source("Rcode/svmperf.r") } # will not run
# keep the same number of data points per group
Multi_datainput_m = Multi_datainput_m[1:30,]
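# Quick check (a sketch) that the three groups are now balanced:
table(Multi_datainput_m$groupingvar)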
# test 2: run with 3 groups of 10
NO_svm = FALSE
source("Rcode/svmtest.r")
if (!NO_svm){ source("Rcode/svmperf.r") } # runs this time
# running the code for 3 groups gives 3 text outputs (takes time, as one permutation is done 3 times)
p1
p2
p3
# test 3: with more than 15 animals per group, a testset and a trainset are created (2 groups)
Multi_datainput_mplus = Multi_datainput_moriginal
Multi_datainput_m = rbind(Multi_datainput_moriginal, Multi_datainput_mplus)
NO_svm = FALSE
source("Rcode/svmtest.r")
if (!NO_svm){ source("Rcode/svmperf.r") }
trainset$groupingvar
testset$groupingvar
Accuracy
# Accuracy is very good:
# trying more permutations to see whether this is a chance effect:
Npermutation = 30
if (!NO_svm){ source("Rcode/svmperf.r") }
hist(Acc_sampled, breaks = c(0:15)/15*2 - 1)
abline(v = Accuracyreal, col = "red")
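# A simple way to quantify this (a sketch, using the objects shown above): the
# empirical p-value is the fraction of permuted accuracies at least as high as
# the real one.
mean(Acc_sampled >= Accuracyreal)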
beepr::beep()
Npermutation = 10
Acc_sampled = c()
# The high Accuracyreal is indeed due to chance: of course, when the test data are part of the training data, the validation does not work!
# test 4: 2-out validation with 2 groups, with original data
Multi_datainput_m = Multi_datainput_moriginal
NO_svm = FALSE
source("Rcode/svmtest.r")
if (!NO_svm){ source("Rcode/svmperf.r") }
Accuracyreal
# Accuracyreal is very low, but this does not seem to be a bug:
# if we change the data to make the difference between groups bigger, and run it again:
Multi_datainput_m = Multi_datainput_moriginal
Multi_datainput_m[11:20, 4] = Multi_datainput_m[11:20, 4] - 2*Multi_datainput_m[11:20, 3] # subtract first, using the original column 3 values, so the total will still be 1
Multi_datainput_m[11:20, 3] = 3*Multi_datainput_m[11:20, 3]
Multi_datainput_m[1:20, 3]
Multi_datainput_m[1:20, 4]
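# Sanity check (a sketch, reusing the saved original data): the sum of columns
# 3 and 4 should be unchanged by the manipulation above.
all.equal(Multi_datainput_m[11:20, 3] + Multi_datainput_m[11:20, 4],
          Multi_datainput_moriginal[11:20, 3] + Multi_datainput_moriginal[11:20, 4])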
# the PCA strategy now shows a huge difference:
source("Rcode/PCA_strategy.r")
boxplotPCA1
# run SVM
NO_svm = FALSE
source("Rcode/svmtest.r")
if (!NO_svm){ source("Rcode/svmperf.r") }
Accuracyreal
# Accuracyreal is still very low; this is probably not a bug, but rather SVM not being good at 2-out validation, or a specific issue with this kind of data?
# Indeed, on closer inspection, the SVM tuning does not find any good parameters even for the training data, so it is no surprise that it cannot predict the test data.
objR$best.performance # the lower this score, the better the performance.
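# For reference, a self-contained example of how e1071's tuning reports
# best.performance (iris data, not the project's tuning call):
toy_tune = e1071::tune.svm(Species ~ ., data = iris,
                           gamma = 10^(-2:0), cost = 10^(0:2))
toy_tune$best.performance # cross-validation error of the best parameter set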
# restore the original dataset for further use:
Multi_datainput_m = Multi_datainput_moriginal