forked from Danko-Lab/polymeraseWaves
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpolymeraseWave.bw.R
525 lines (479 loc) · 24.1 KB
/
polymeraseWave.bw.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
###########################################################################
##
## Copyright 2013, 2014 Charles Danko and Minho Chae.
##
## This program is part of the groHMM R package
##
## groHMM is free software: you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
## or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
## for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program. If not, see <http://www.gnu.org/licenses/>.
##
##########################################################################
#' RgammaMLE fits a gamma distribution to a specified data vector using maximum
#' likelihood.
#'
#' @param X A vector of observations, assumed to be real numbers in the
#' inveraval [0,+Inf).
#' @return Returns a list of parameters for the best-fit gamma
#' distribution (shape and scale).
#' @author Charles G. Danko
RgammaMLE <- function(X) {
if(sum(X<0) > 0) message("Negative values not allowed!")
N <- as.double(NROW(X))
sumxis <- as.double(sum(X))
sumlogxis <- as.double(sum(log(X)))
Fit <- .Call("RgammaMLE", N, sumxis, sumlogxis, PACKAGE = "groHMM")
return(Fit)
}
#' Rnorm fits a normal distribution to a specified data vector using maximum
#' likelihood.
#'
#' @param X A vector of observations, assumed to be real numbers in the
#' inveraval (-Inf,+Inf).
#' @return Returns a list of parameters for the best-fit normal distribution
#' (mean and varience).
#' @author Charles G. Danko
Rnorm <- function(X) {
returnList <- list()
returnList$mean <- mean(X)
returnList$var <- var(X)
return(returnList)
}
##
## TODO: Re-factor to one function, allowing the model version to be specified
## as an argument (Charles).
##
##
##
#' Given GRO-seq data, identifies the location of the polymerase 'wave' in
#' up- or down- regulated genes.
#'
#' The model is a three state hidden Markov model (HMM). States represent:
#' (1) the 5' end of genes upstream of the transcription start site,
#' (2) upregulated sequence, and (3) the 3' end of the gene through the
#' polyadenylation site.
#'
#' The model computes differences in read counts between the two conditions.
#' Differences are assumed fit a functional form which can be specified by
#' the user (using the emissionDistAssumption argument).
#' Currently supported functional forms include a normal distribution
#' (good for GRO-seq data prepared using the circular ligation protocol),
#' a gamma distribution (good for 'spikey' ligation based GRO-seq data),
#' and a long-tailed normal+exponential distribution was implemented, but
#' never deployed.
#'
#' Initial parameter estimates are based on initial assumptions of
#' transcription rates taken from the literature. Subsequently all parameters
#' are fit using Baum-Welch expetation maximization.
#'
#' Reference: Danko CG, Hah N, Luo X, Martins AL, Core L, Lis JT, Siepel A,
#' Kraus WL. Signaling Pathways Differentially Affect RNA Polymerase II
#' Initiation, Pausing, and Elongation Rate in Cells. Mol Cell.
#' 2013 Mar 19. doi:pii: S1097-2765(13)00171-8. 10.1016/j.molcel.2013.02.015.
#'
#' Arguments:
#' @param reads1 Mapped reads in time point 1.
#' @param reads2 Mapped reads in time point 2.
#' @param genes A set of genes in which to search for the wave.
#' @param approxDist The approximate position of the wave.
#' Suggest using 2000 [bp/ min] * time [min], for mammalian data.
#' @param size The size of the moving window. Suggest using 50 for direct
#' ligation data, and 200 for circular ligation data. Default: 50.
#' @param upstreamDist The amount of upstream sequence to include
#' Default: 10 kb.
#' @param TSmooth Optimonally, outlying windows are set a maximum value
#' over the inter-quantile interval, specified by TSmooth.
#' Reasonable value: 20; Default: NA (for no smoothing). Users are encouraged
#' to use this parameter ONLY in combination with the normal distribution
#' assumptions.
#' @param NonMap Optionally, un-mappable positions are trated as missing data.
#' NonMap passes in the list() structure for un-mappable regions.
#' @param prefix Optionally, writes out png images of each gene examined for
#' a wave. 'Prefix' denotes the file prefix for image names written to disk.
#' Users are encouraged to create a new directory and write in a full path.
#' @param emissionDistAssumption Takes values "norm", "normExp", and "gamma".
#' Specifies the functional form of the 'emission' distribution for states
#' I and II (i.e. 5' of the gene, and inside of the wave).
#' In our experience, "gamma" works best for highly-variable 'spikey' data,
#' and "norm" works for smooth data. As a general rule of thumb, "gamma"
#' is used for libraries made using the direct ligation method, and "norm"
#' for circular ligation data. Default: "gamma".
#' @param finterWindowSize Method returns 'quality' information for
#' each gene to which a wave was fit. Included in these metrics are several
#' that define a moving window. The moving window size is specified by
#' filterWindowSize. Default: 10 kb.
#' @param limitPCRDups If true, counts only 1 read at each position
#' with >= 1 read. NOT recommended to set this to TRUE. Defulat: FALSE.
#' @param returnVal Takes value "simple" (default) or "alldata". "simple"
#' returns a data.frame with Pol II wave end positions. "alldata" returns all
#' of the availiable data from each gene, including the full posterior
#' distribution of the model after EM.
#' @param debug If TRUE, prints error messages.
#' @return Returns either a data.frame with Pol II wave end positions,
#' or a List() structure with additional data, as specified by returnVal.
#' @author Charles G. Danko
#' @examples
#' genes <- GRanges("chr7", IRanges(2394474,2420377), strand="+",
#' SYMBOL="CYP2W1", ID="54905")
#' reads1 <- as(readGAlignments(system.file("extdata", "S0mR1.bam",
#' package="groHMM")), "GRanges")
#' reads2 <- as(readGAlignments(system.file("extdata", "S40mR1.bam",
#' package="groHMM")), "GRanges")
#' approxDist <- 2000*10
#' # Not run:
#' # pw <- polymeraseWave(reads1, reads2, genes, approxDist)
## Given GRO-seq data, identifies the location of the polymerase wave in up-
## or down-regulated genes. This version is based on a full Baum-Welch EM
## implementation.
##
## This is a three state HMM -- initial state representing the intergenic
## region 5' of a gene, the second representing the initially upregulated
## region, and the third representing the remaining sequence of a gene.
##
## We assume that upstream region is intergenic, and thus its emmission
## distriubtion is assumed to be a constant, set based on a representative
## intergenic region. This is accomidated in my [1,*) framework by keeping
## the vairence constant, and scaling the mean for each gene.
##
## Test with GREB1: chr2:11,591,693-11,700,363
## GREB1 <- data.frame(chr="chr2", start=11591693, end=11700363, str="+")
polymeraseWaveBW <- function(reads1_plus, reads1_minus, reads2_plus, reads2_minus, genes, approxDist, size=50,
upstreamDist= 10000, TSmooth=20, NonMap=NULL, prefix=NULL,
emissionDistAssumption= "gamma", finterWindowSize=10000,
limitPCRDups=FALSE, returnVal="simple", debug=TRUE) {
if(debug) {
message("Analyzing windows")
}
genes <- as.data.frame(genes)
# genes <- genes[,c("seqnames", "start", "end", "strand", "SYMBOL", "ID")]
Fp1 <- load.bigWig(reads1_plus)
Fp2 <- load.bigWig(reads2_plus)
Fm1 <- load.bigWig(reads1_minus)
Fm2 <- load.bigWig(reads2_minus)
sizeP1 <- NROW(Fp1$mean*Fp1$basesCovered+abs(Fm1$mean)*Fm1$basesCovered)
sizeP2 <- NROW(Fp2$mean*Fp2$basesCovered+abs(Fm2$mean)*Fm2$basesCovered)
expCounts <- mean(sizeP1,sizeP2)
ANS <- rep(-1, NROW(genes))
ENDwave <- rep(-1,NROW(genes))
STRTwave <- rep(-1,NROW(genes))
KLdivFinal <- rep(-1,NROW(genes))
KLdivPar <- rep(-1,NROW(genes))
minWindLTMed <- rep(FALSE,NROW(genes))
minMeanWindLTMed <- rep(FALSE,NROW(genes))
nstates<-as.integer(3) # number of states in HMM.
unmap <- NA
## Possible return value.
dataList <- list()
## Run the model separately on each gene.
for(i in 1:NROW(genes)) {
geneData <- list()
if(debug) {
message("Starting HMM: ", genes[i,5])
}
############################################################################
#### Define the gene in terms of the windowed size.
## Pull the data for the gene.
if(genes[i,4] == "+") {
start <- floor((genes[i,2]-upstreamDist))#/size)
end <- ceiling(genes[i,3])#/size)
emis1 <- step.bpQuery.bigWig(Fp1, as.character(genes[i,1]), start, end, size)/sizeP1*expCounts
#(as.numeric(Fp1[[ as.character(genes[i,1]) ]]))[c(start:end)]
#/sizeP1*expCounts
emis2 <- step.bpQuery.bigWig(Fp2, as.character(genes[i,1]), start, end, size)/sizeP2*expCounts
#(as.numeric(Fp2[[ as.character(genes[i,1]) ]]))[c(start:end)]
#/sizeP2*expCounts
}
else {
start <- floor(genes[i,2])#/size)
end <- ceiling((genes[i,3]+upstreamDist))#/size)
emis1 <- abs(rev(step.bpQuery.bigWig(Fm1, as.character(genes[i,1]), start, end, size)))/sizeP1*expCounts
#rev((as.integer(Fm1[[ as.character(genes[i,1]) ]]))[c(start:end)])#/sizeP1*expCounts
emis2 <- abs(rev(step.bpQuery.bigWig(Fm2, as.character(genes[i,1]), start, end, size)))/sizeP2*expCounts
#rev((as.integer(Fm2[[ as.character(genes[i,1]) ]]))[c(start:end)])#/sizeP2*expCounts
}
## Scale to a minimum of 1 read at each position (for fitting Gamma).
gene <- as.numeric(emis1 - emis2)
if(emissionDistAssumption == "gamma") {
## Leave centered on 0 for the norm_exp/norm emission functions
gene <- gene +(-1)*(min(gene))+1
## Must translate points if gamma distributed (gamma undefined <0).
}
if(is.double(TSmooth)) { ## Interperts it as a fold over the inter
## quantile interval to filter.
message("TSmooth is.integer:", TSmooth)
medGene <- median(gene)
iqrGene <- IQR(gene)
gene[(medGene-gene)>(TSmooth*(iqrGene+1))] <-
medGene-(TSmooth*(iqrGene+1))
gene[(gene-medGene)>(TSmooth*(iqrGene+1))] <-
medGene+(TSmooth*(iqrGene+1))
} else if(!is.na(TSmooth)) {
gene <- smooth(gene, kind=TSmooth)
}
#write.table(gene, "TMP.gene.Rflat")
# uTrans<- as.integer(ceiling((upstreamDist)/size))
## Make the initial guess +5kb --> approxDist.
uTrans<- as.integer(ceiling((upstreamDist-5000)/size))
iTrans<- as.integer(ceiling((upstreamDist+approxDist)/size))
## Run Baum-Welch
if(debug) message("initial guess:", uTrans, iTrans, NROW(gene))
counter <- 0
###########################################################################
## Calculate a moving average and moving max.
# For each point. Left of point is defined as P, right is defined as Q.
# From min(gene):max(gene).
# Calculate histogram of points
# if(!is.null(prefix)) { ## Slow, but required for MinOfMax/Avg filter(s).
MovMeanSpd <- finterWindowSize#10000
KLdiv <- rep(0,NROW(gene))
KS <- rep(0,NROW(gene))
Means <- rep(0,NROW(gene))
MovMean <- rep(0,NROW(gene))
MovMax <- rep(0,NROW(gene))
dMovMean <- rep(0,NROW(gene))
for(k in c(2:(NROW(gene)-2))) {
left <- gene[c(1:k)]
right <- gene[c((k+1):NROW(gene))]
LeftHist <- hist(left, breaks=c((min(gene, na.rm=TRUE)-1)
:(max(gene, na.rm=TRUE)+1)), plot=FALSE)
RightHist <- hist(right, breaks=c((min(gene, na.rm=TRUE)-1)
:(max(gene, na.rm=TRUE)+1)), plot=FALSE)
minD <- 0.000001
KLdiv[k] <- sum((LeftHist$density*log(
(LeftHist$density+minD)/(RightHist$density+minD))))
KS[k] <- ks.test(left,right)$statistic[[1]]
Means[k] <- mean(left, na.rm=TRUE)-mean(right, na.rm=TRUE)
MovMean[k] <- mean(gene[max((k-(MovMeanSpd/size)),1)
:min((k+(MovMeanSpd/size)),NROW(gene))], na.rm=TRUE)
MovMax[k] <- max(gene[max((k-(MovMeanSpd/size)),1)
:min((k+(MovMeanSpd/size)),NROW(gene))], na.rm=TRUE)
dMovMean[k] <- MovMean[k-1] - MovMean[k]
}
# }
############################################################################
#### Set up initial paremeter estimates.
## Fit transition and initial probabilities.
tProb <- as.list(data.frame(
log(c((1-(1/uTrans)),(1/uTrans),0)),
log(c(0,(1-(1/(iTrans-uTrans))),(1/(iTrans-uTrans)))),
log(c(0, 0, 1)))) # Trans. prob.
iProb <- as.double(log(c(1, 0, 0))) # iProb.
## Fit initial distribution paremeters for emission probabilities.
parInt <- Rnorm(gene[c(1:uTrans)])
if(is.na(parInt$var) | parInt$var == 0) parInt$var = 0.00001
## Check that the varience of the intergenic state is NOT 0.
if(emissionDistAssumption == "norm") {
ePrDist <- c("norm", "norm", "norm")
# ePrDist <- c("norm", "normexp", "normexp")
parPsi <- Rnorm(gene[c((uTrans+1):iTrans)])
#Rnorm.exp(gene[c((uTrans+1):iTrans)], tol=1e-4) #
parBas <- Rnorm(gene[c((iTrans+1):NROW(gene))])
#Rnorm.exp(gene[c((iTrans+1):NROW(gene))], tol=1e-4) #
ePrVars <- data.frame(c(parInt$mean,
sqrt(parInt$var), -1, -1),
c(parPsi$mean, sqrt(parPsi$var), -1, -1),
c(parBas$mean, sqrt(parBas$var), -1, -1))
}
else if(emissionDistAssumption == "normExp") {
ePrDist <- c("norm", "normexp", "normexp")
parPsi <- Rnorm.exp(gene[c((uTrans+1):iTrans)], tol=1e-4) #
parBas <- Rnorm.exp(gene[c((iTrans+1):NROW(gene))], tol=1e-4) #
ePrVars <- data.frame(c(parInt$mean, sqrt(parInt$var), -1, -1),
parPsi$parameters, parBas$parameters)
}
else if(emissionDistAssumption == "gamma") {
ePrDist <- c("norm", "gamma", "gamma")
parPsi <- RgammaMLE(gene[c((uTrans+1):iTrans)])
parBas <- RgammaMLE(gene[c((iTrans+1):NROW(gene))])
ePrVars <- data.frame(c(parInt$mean, sqrt(parInt$var), -1),
c(parPsi$shape, parPsi$scale, -1),
c(parBas$shape, parBas$scale, -1))
}
else if(emissionDistAssumption == "2waves") {
ePrDist <- c("gamma", "gamma", "gamma")
parInt <- RgammaMLE(gene[c(1:uTrans)])
parPsi <- RgammaMLE(gene[c((uTrans+1):iTrans)])
parBas <- RgammaMLE(gene[c((iTrans+1):NROW(gene))])
ePrVars <- data.frame(c(parInt$mean, sqrt(parInt$var), -1),
c(parPsi$shape, parPsi$scale, -1),
c(parBas$shape, parBas$scale, -1))
}
else {
message("emissionDistAssumption should be set to: 'norm', 'normExp', or 'gamma'.")
stopifnot(FALSE) ## Stop here.
}
# print(data.frame(c(iMean, parInt$var, -1),c(shape, scale, -1),
# c(meanBas, stdeBas, -1)))
###########################################################################
## Finally, choose non-mappable windows and treat them as missing values.
if(!is.null(NonMap)) {
windowStarts <- seq(start, end, size)
## Start and end are like chromStart and chromEnd --
## strand invariant.
windowEnds <- windowStarts+size
windowsToSurvey <- data.frame(chrom= rep(as.character(genes[i,1]),
NROW(windowStarts)), chromStart= as.integer(windowStarts),
chromEnd= as.integer(windowEnds), strand=
rep("+",NROW(windowStarts)))
print(head(windowsToSurvey))
print(tail(windowsToSurvey))
unmap <- countMappableReadsInInterval(windowsToSurvey, NonMap)
if(genes[i,4] == "+") {
unmap <- rev(unmap)
}
gene[unmap/size < 0.10] <- NaN
## Missing data if more than 90% of the windows are unmappable.
}
############################################################################
## Now run the HMM.
if(debug) {
print(ePrVars)
print(tProb)
}
g <- list()
g[[1]] <- gene
ans <- list()
ans <- tryCatch(.Call("RBaumWelchEM", nstates, g, as.integer(1),
ePrDist, ePrVars, tProb, iProb, 0.01, c(TRUE,TRUE,TRUE),
c(TRUE, TRUE, TRUE), as.integer(10), TRUE, PACKAGE="groHMM"),
error=function(e) e)
## Update emis...
if(NROW(ans) < 3) {
## An error will have a length of 2 (is this guaranteed?!).
print("ERROR CAUGHT ON THE C SIDE")
print(ans)
### Will be a previous iteration of ans...?! Generated a new 'ans'
ans[[1]] <- NA
ans[[2]] <- NA
ans[[3]] <- c(rep(0, NROW(gene)-2), 1, 2)
ans[[4]] <- NA
ans[[5]] <- NA
}
ansVitervi <- ans[[3]][[1]]
DTs <- max(which(ansVitervi==0))
DTe <- max(which(ansVitervi==1))
### Calculate quality filters... Wrap up.
KLdivHMM <- 0
if(debug) print(paste("EM Converged to: ",DTs,DTe,NROW(gene)))
if((DTs >= 1) & (DTe > 1) & (DTs < DTe) &
(DTe < NROW(gene)) & (DTs < NROW(gene))) {
## iff convergest to something useful.
## Refit and calculate KL-divergence at that point.
pINew <- Rnorm(gene[c(1:DTs)])
if(emissionDistAssumption == "norm") {
pPNew <- Rnorm(gene[c((uTrans+1):iTrans)])
# Rnorm.exp(gene[c((uTrans+1):iTrans)], tol=1e-4)
#Rnorm(gene[c((uTrans+1):iTrans)])
pBNew <- Rnorm(gene[c((DTe+1):NROW(gene))])
#Rnorm.exp(gene[c((DTe+1):NROW(gene))], tol=1e-4) #
## Estimate KL-divergence.
PSI <- dnorm(c(min(gene):max(gene)), pBNew$mean,
sqrt(pBNew$var))
BAS <- dnorm(c(min(gene):max(gene)), pBNew$mean,
sqrt(pBNew$var))
}
else if(emissionDistAssumption == "normExp") {
pPNew <- Rnorm.exp(gene[c((uTrans+1):iTrans)], tol=1e-4)
#Rnorm(gene[c((uTrans+1):iTrans)])
pBNew <- Rnorm.exp(gene[c((DTe+1):NROW(gene))], tol=1e-4)
PSI <- (pPNew$parameters[1])*dnorm(c(min(gene):max(gene)),
pPNew$parameters[2], pPNew$parameters[3])+
(1-pPNew$parameters[1])*dexp(c(min(gene):max(gene)),
1/pPNew$parameters[4]) ## 1/rate
BAS <- (pBNew$parameters[1])*dnorm(c(min(gene):max(gene)),
pBNew$parameters[2], pBNew$parameters[3])+
(1-pBNew$parameters[1])*dexp(c(min(gene):max(gene)),
1/pBNew$parameters[4]) ## 1/rate
}
else if(emissionDistAssumption == "gamma") {
## Refit and calculate KL-divergence at that point.
pPNew <- RgammaMLE(gene[c((DTs+1):DTe)])
pBNew <- RgammaMLE(gene[c((DTe+1):NROW(gene))])
## Estimate KL-divergence.
PSI <- dgamma(c(min(gene):max(gene)), shape=pPNew$shape,
scale=pPNew$scale)
BAS <- dgamma(c(min(gene):max(gene)), pBNew$shape, pBNew$scale)
}
else if(emissionDistAssumption == "2waves") {
## Refit and calculate KL-divergence at that point.
pPNew <- RgammaMLE(gene[c((DTs+1):DTe)])
pBNew <- RgammaMLE(gene[c((DTe+1):NROW(gene))])
## Estimate KL-divergence.
PSI <- dgamma(c(min(gene):max(gene)), shape=pPNew$shape,
scale=pPNew$scale)
BAS <- dgamma(c(min(gene):max(gene)), pBNew$shape, pBNew$scale)
}
minD2 <- 1e-300
KLdivHMM <- sum((PSI*log((PSI+minD2)/(BAS+minD2))))
ANS[i] <- (DTe-DTs)*size
STRTwave[i] <- DTs*size
ENDwave[i] <- DTe*size
KLdivFinal[i] <- KLdiv[DTe]
##KLdivHMM## Try switching to the empirical KL-div.
KLdivPar[i] <- KLdivHMM
## Calculates min/max and min/avg filters.
medDns <- median(MovMax[c(max((which(ansVitervi == 1))+
round(MovMeanSpd/size)):NROW(MovMax))])
minMax <- min(MovMax[c(min(which(ansVitervi == 1))
:max(which(ansVitervi == 1)))])
minWindLTMed[i] <- (medDns < minMax)
## True if min(wave) > med(wave.upstream)
avgDns <- median(MovMean[c(max((which(ansVitervi == 1))+
round(MovMeanSpd/size)):NROW(MovMean))])
minAvg <- min(MovMean[c(min(which(ansVitervi == 1))
:max(which(ansVitervi == 1)))])
minMeanWindLTMed[i] <- (avgDns < minAvg)
## True if min(wave) > med(wave.upstream)
## Construct the return value...
geneData[[1]] <- gene[c(1:DTs)]
geneData[[2]] <- gene[c((DTs+1):DTe)]
geneData[[3]] <- gene[c((DTe+1):NROW(gene))]
geneData[[4]] <- emis1 ## Value of the gene... c1.
geneData[[5]] <- emis2 ## Value of the gene... c2.
geneData[[6]] <- ans[[4]] ## Matrix of posteriors.
geneData[[7]] <- ans[[5]]
## Posteriors of a transition from state 2->3.
geneData[[8]] <- unmap
dataList[[i]] <- geneData
## Draw an image...
if(!is.null(prefix)) {
png(paste(prefix,".",i,".",genes[i,5],".png",sep=""), width=750, height=600)
par(mar=c(5, 4, 4, 4) + 0.1)
plot((c(1:NROW(gene))*size), gene, axes=F, xlab="", ylab="", ylim=c(min(gene),max(gene)),
type="p",col="black", main=paste("WindowedReads; KLdiv:",KLdivHMM), cex=1, pch=20)
axis(2, ylim=c(min(gene),max(gene)),col="black")
wave <- rep(0,NROW(gene))
wave[which(ansVitervi == 1)] <- 1
points((c(1:NROW(gene))*size), (log(wave)+max(gene)), col="red", cex=1, pch=15)
abline(v=(DTs*size), lty="dotted", lwd=2, col="dark gray")
abline(v=(DTe*size), lty="dotted", lwd=2, col="dark gray")
points((c(1:NROW(gene))*size), gene, pch=20, cex=1)
points((c(1:NROW(gene))*size),MovMax,xlab="",ylab="",axes=F,type="l",col="blue",lwd=1.5)
abline(h=medDns, col="blue", lty="dotted", lwd=1.5)
axis(1,pretty(range(c(1:NROW(gene))*size),10))
mtext("Position (bp)",side=1,col="black",line=2.5)
box()
dev.off()
}
}
}
returnDF <- data.frame(StartWave= STRTwave, EndWave= ENDwave, Rate= ANS,
KLdiv= KLdivFinal, KLdivParametric= KLdivPar, minOfMax= minWindLTMed,
minOfAvg= minMeanWindLTMed, ID= genes[,5], ExternalID= genes[,6])
if(returnVal == "simple") {
return(returnDF)
}
if(returnVal == "alldata") {
dataList[[NROW(genes)+1]] <- returnDF
return(dataList)
}
}