forked from jaechoullee/COVID-19-Policy-and-Changepoints
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis_GA-RWD_deaths_v1-gh.R
237 lines (178 loc) · 10.2 KB
/
analysis_GA-RWD_deaths_v1-gh.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#*[-----------------------------------------------------------------------------------------------]*#
#*[ Objectives : This program applies our changepoint method to COVID-19 daily deaths ]*#
#*[ using a random walk model with piecewise drifts. ]*#
#*[ Last update: 2021-04-28 ]*#
#*[ Author : Jaechoul Lee ]*#
#*[-----------------------------------------------------------------------------------------------]*#
### Setup data input and output directories
WD.lib <- c("L:/Home/JaechoulLee/!1Research/Paper/Epidemiology/P02_Covid19/R_library/")
WD.inp <- c("L:/Home/JaechoulLee/!1Research/Paper/Epidemiology/P02_Covid19/Data/")
WD.out <- c("L:/Home/JaechoulLee/!1Research/Paper/Epidemiology/P02_Covid19/Application_R1/Out-deaths_2/")
### Load the R code for the proposed piecewise drift random walk model and GA
source(file=paste(WD.lib,"lib_ga-RWD_v1-2.R",sep=""))
### Required packages
library(tidyverse)
library(doSNOW)
library(foreach)
library(doParallel)
#*[-----------------------------------------------------------------------------------------------]*#
### Data pre-processing
#*[-----------------------------------------------------------------------------------------------]*#
### Read the data trimmed for the study period 2020-03-08 to 2021-02-28
data.STATES <- read.csv(file=paste(WD.inp,"us-states_20210303.csv",sep=""))
names(table(data.STATES$state))
states.m <- c("Alabama","Alaska","Arizona","Arkansas","California",
"Colorado","Connecticut","Delaware","District of Columbia","Florida",
"Georgia","Hawaii","Idaho","Illinois","Indiana",
"Iowa","Kansas","Kentucky","Louisiana","Maine",
"Maryland","Massachusetts","Michigan","Minnesota","Mississippi",
"Missouri","Montana","Nebraska","Nevada","New Hampshire",
"New Jersey","New Mexico","New York","North Carolina","North Dakota",
"Ohio","Oklahoma","Oregon","Pennsylvania","Rhode Island",
"South Carolina","South Dakota","Tennessee","Texas","Utah",
"Vermont","Virginia","Washington","West Virginia","Wisconsin",
"Wyoming")
n.states <- length(states.m)
### Find the first and last recording dates for each state
date.first <- numeric()
date.last <- numeric()
for (i in 1:n.states) {
state_i <- states.m[i] # i-th state
data_i <- filter(data.STATES,state==state_i) # i-th state dataset
n_i <- nrow(data_i) # i-th state sample size
date.first_i <- data_i$date[1] # initial first date
date.last_i <- data_i$date[n_i] # last date
date.first[i] <- date.first_i
date.last[i] <- date.last_i
}
state.date <- data.frame(state=states.m,date.first,date.last)
state.date
table(state.date$date.first)
### Create dataset with common first and last dates (2020-03-15 to 2021-03-03)
date.first_target <- "2020-03-15"
data.STATES_target <- numeric()
for (i in 1:n.states) {
state_i <- states.m[i] # i-th state
data_i <- filter(data.STATES,state==state_i) # i-th state dataset
date.first_i <- data_i$date[1] # initial first date
if (date.first_i <= date.first_target) {
data_i.target <- filter(data_i,date>=date.first_target)
} else {
fips_i <- data_i$fips[1]
n.dif_i <- as.numeric(as.Date(date.first_i)-as.Date(date.first_target))
data_i.early <- data.frame(date=as.character(as.Date(date.first_target)+0:(n.dif_i-1)),
state=state_i,
fips=fips_i,
cases=0,
deaths=0)
data_i.target <- rbind(data_i.early,data_i)
}
data.STATES_target <- rbind(data.STATES_target,data_i.target)
}
dim(data.STATES_target)
### Cumulative deaths in the last day
death.last <- numeric()
for (i in 1:n.states) {
state_i <- states.m[i] # i-th state
data_i <- filter(data.STATES_target,state==state_i) # i-th state dataset
n_i.all <- nrow(data_i) # sample size in daily data
death.last_i <- filter(data_i,date==data_i$date[n_i.all])
death.last <- rbind(death.last,death.last_i)
}
death.last
filter(death.last,deaths<=720) # only for states <= 720(=2*n) deaths
date state fips cases deaths
1 2021-03-03 Alaska 2 58574 291
2 2021-03-03 Hawaii 15 27549 438
3 2021-03-03 Maine 23 45091 705
4 2021-03-03 Vermont 50 15487 207
5 2021-03-03 Wyoming 56 54616 682
#*[-----------------------------------------------------------------------------------------------]*#
### Changepoint estimation via GA method
#*[-----------------------------------------------------------------------------------------------]*#
### Setup parallel backend to use multiple cores
cores <- detectCores()
cl <- makeCluster(cores-1)
registerDoParallel(cl)
### Record run time
print(paste("#==========[ GA began at",Sys.time()," ]==========#"))
### GA for changepoints detection
GA.sol <- foreach (i=1:n.states,.packages='tidyverse') %dopar% {
state_i <- states.m[i] # i-th state
data_i <- filter(data.STATES_target,state==state_i) # i-th state dataset
n_i.all <- nrow(data_i) # sample size in daily data
if (data_i$deaths[n_i.all] > 720) { # [!CAUTION!] only for states > 720(=2*n) deaths, excluding
# Alaska,Hawaii,Maine,Vermont,Wyoming
t.1_i.loc <- 1+3 # location of first day for 7-day moving-average
t.n_i.loc <- n_i.all-3 # location of last day for 7-day moving-average
t.1_i <- as.Date(data_i$date[t.1_i.loc]) # first day for 7-day moving-average: 2020-03-18
t.n_i <- as.Date(data_i$date[t.n_i.loc]) # last day for 7-day moving-average: 2021-02-28
# Set up the series for 7-day moving-average
x.t_all <- c(data_i$deaths[1],diff(data_i$deaths)) # daily count of new deaths in data
x.t_all.mis <- ifelse(x.t_all>=0,x.t_all,NA) # NA if daily count is < 0
m.t_all <- mov.avg_q(x.t_all.mis,q=3) # 7(=2q+1)-day moving-average
t.t <- as.Date(data_i$date[t.1_i.loc:t.n_i.loc]) # day for study period
y.t <- data_i$deaths[t.1_i.loc:t.n_i.loc] # daily count of cumulative deaths
x.t <- x.t_all.mis[t.1_i.loc:t.n_i.loc] # daily count of new deaths, with negative as missing
m.t_0 <- m.t_all[t.1_i.loc:t.n_i.loc] # 2q+1=7 day moving-average
m.t <- m.t_0 # if m.t_0 < 0.25, m.t is set to be around 0.25
for (t in 1:length(m.t_0)) {
if (m.t_0[t] < 0.25) m.t[t] <- rnorm(1,mean=0.25,sd=0.01)
}
lm.t <- log(m.t) # use log(m.t) for change in growth rate
# GA changepoint estimation # use log(m.t) for change in growth rate
ga.out <- ga.cpt_ts(y=lm.t,fitness=pnllik.MDL_RWD,gen.size=200,max.itr=150,p.mut=0.05,
seed=10*(i-1)+1000*(i==10)+543,is.graphic=FALSE,is.print=FALSE,is.export=FALSE)
ga.sol <- ga.out$solution # GA estimated changepoints
ga.mdl <- ga.out$val.sol[length(ga.out$val.sol)] # optimized value of MDL
ga.day <- ga.sol[-1]+t.t[1]-1 # dates of changepoints
### ML estimation with GA estimated changepoints
out.ga <- fit.RWD(lm.t,cp=ga.sol) # use log(m.t) for change in growth rate
delta.ga <- out.ga$delta
sterr.ga <- out.ga$delta.se
sigma.ga <- out.ga$sigma
lnlik.ga <- out.ga$loglik
ml.out <- cbind(delta.ga,sterr.ga,sigma.ga,lnlik.ga)
list(state=state_i,ga.sol=ga.sol,ga.mdl=ga.mdl,ga.day=ga.day,ml.est=ml.out)
} # end of if-statement
}
### Record run time
print(paste("#==========[ GA ended at",Sys.time()," ]==========#"))
### Print GA changepoint and ML estimationi results
GA.sol
### End parallel backend
stopCluster(cl)
### Save to files
t.n_i <- "2021-02-28" # last day for 7-day moving-average (3-03 minus 3)
capture.output(GA.sol,file=paste(WD.out,"out_ga-RWD_deaths_",t.n_i,".txt",sep=""))
save(GA.sol,file=paste(WD.out,"out_ga-RWD_deaths_",t.n_i,".RData",sep=""))
#*[-----------------------------------------------------------------------------------------------]*#
### GA results summary
#*[-----------------------------------------------------------------------------------------------]*#
### Load GA changepoint analysis results
t.1_i <- "2020-03-18" # first day for 7-day moving-average: 3-(15+3)
t.n_i <- "2021-02-28" # last day for 7-day moving-average: 3-(03-3)
load(file=paste(WD.out,"out_ga-RWD_deaths_",t.n_i,".RData",sep="")) # Loaded is GA.sol
GA.sol
### GA changepoint days
GA.dates <- data.frame()
for (i in 1:n.states) {
ga.out_i <- GA.sol[[i]] # GA result for i-th state
GA.dates_i <- merge(ga.out_i$state,c(t.1_i,as.character(ga.out_i$ga.day))) # state & its changepoint dates
GA.dates <- rbind(GA.dates,GA.dates_i)
}
colnames(GA.dates) <- c("state","cpt.date")
GA.dates
### RWD estimates with GA changepoints considered
GA.mlest <- data.frame()
for (i in 1:n.states) {
ga.out_i <- GA.sol[[i]] # GA result for i-th state
GA.mlest_i <- ga.out_i$ml.est # RWD estimates
GA.mlest <- rbind(GA.mlest,GA.mlest_i)
}
GA.mlest
### All GA results
GA.result <- data.frame(GA.dates,GA.mlest)
GA.result
### Export output file in .csv format
write.csv(GA.result,file=paste(WD.out,"out_ga-RWD_deaths_",t.n_i,".csv",sep=""))