-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrank_single_biomarkers_main.R
112 lines (93 loc) · 5.12 KB
/
rank_single_biomarkers_main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
## thein: last updated 13 March 2024
# script for identifying single biomarkers and validation in an independent dataset
# command line arguments
args = commandArgs(trailingOnly = TRUE)
# start timer
tm_start <- proc.time()
# load required packages & functions
packages <- c("ggplot2", "ggpattern", "writexl", "readxl", "parallel","tidyverse", "gghighlight")
lapply(packages, library, character.only = TRUE)
fn_names <- c('src_single/filter_data.R', 'src_single/subset_vp.R', 'src_single/add_counts.R', 'src_single/evaluate_biomarkers.R',
'src_single/plotting.R', 'src_single/calc_metrics.R', 'src_single/store_cutoffs.R', 'src_single/plot_cutoffs.R')
lapply(fn_names, source)
# Hard coded variables
# indices of unused quantities
unused_vars = c(1, 2, 5:10, 21, 24, 32:34, 47, 48, 62, 83, 86, 93:96, 97, 100, 102, 103, 115)
# number of virtual patient subsets/thresholds
VP_nsubsets = 8
# minimum number of patients per subset
min_VPs = 20
min_VPs_validation = 1
# method to identify thresholds: uniform (uniformly spaced cutoffs), quantiles
method_threshold = "quantiles"
# naming timepoint
timept_var = args[1]
# "positive"/"negative" for positive/negative predictive biomarkers
predictive=args[2]
if(args[3] == "R"){
consider_SD_as_responder = TRUE # considering SD as responders
}else if(args[3] == "NR"){
consider_SD_as_responder = FALSE
}else{
print("Error: unknown response status for SD")
}
# input file names
biomarker_file_name = paste0('input_files/', timept_var, '_biomarkers_trainset.csv')
biomarker_file_name_test = paste0('input_files/', timept_var, '_biomarkers_testset.csv')
annotations_file_name = "input_files/biomarker_names_units_v1.txt"
# read input files
# annotation file with biomarker names (col 1), units (col 2) & compartment (col 3)
annotations_raw <- read.delim(annotations_file_name, header = TRUE, sep = ",")
# a file with biomarker candidate levels (cols) for virtual patients (rows)
raw_data_train <- read.csv(biomarker_file_name) %>% data.frame() # train data
raw_data_test <- read.csv(biomarker_file_name_test) %>% data.frame() # test data
# filtering train data
filter_output = filter_data(raw_data_train, raw_data_test, annotations_raw, unused_vars)
raw_data_train = filter_output[[1]]
raw_data_test = filter_output[[2]]
annotations = filter_output[[3]]
# calculating total no. of VPs, responders and non-responders
add_counts_output <- add_counts(raw_data_train, consider_SD_as_responder)
tt_VPs_train = add_counts_output[[1]]
tt_responders_train = add_counts_output[[2]]
tt_nonresponders_train = add_counts_output[[3]]
# training
evaluate_biomarkers_output <- evaluate_biomarkers(raw_data_train,
VP_nsubsets,
min_VPs,
tt_responders_train,
tt_nonresponders_train,
annotations,
method_threshold,
NULL,
NULL,
consider_SD_as_responder,
predictive,
"train")
biomarker_summary_train = evaluate_biomarkers_output[[1]]
plotting(biomarker_summary_train, biomarker_summary_train, "train", method_threshold, VP_nsubsets, timept_var, predictive)
# calculating total no. of VPs, responders and non-responders
add_counts_output_test <- add_counts(raw_data_test, consider_SD_as_responder)
tt_VPs_test = add_counts_output_test[[1]]
tt_responders_test = add_counts_output_test[[2]]
tt_nonresponders_test = add_counts_output_test[[3]]
# training set results as input for validation
response_train_data <- read.csv(paste0('output_files/', timept_var, '/single_response_train.csv')) %>% data.frame()
ris_train_data <- read.csv(paste0('output_files/', timept_var, '/single_ris_train.csv')) %>% data.frame()
evaluate_biomarkers_output_test <- evaluate_biomarkers(raw_data_test,
VP_nsubsets,
min_VPs_validation,
tt_responders_test,
tt_nonresponders_test,
annotations,
method_threshold,
response_train_data,
ris_train_data,
consider_SD_as_responder,
predictive,
"test")
response_biomarker_summary_test = evaluate_biomarkers_output_test[[1]]
ris_biomarker_summary_test = evaluate_biomarkers_output_test[[2]]
plotting(response_biomarker_summary_test, ris_biomarker_summary_test, "test", method_threshold, VP_nsubsets, timept_var, predictive)
# stop timer
proc.time() - tm_start