-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplink2_workflow.wdl
218 lines (184 loc) · 5.7 KB
/
plink2_workflow.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# Prepares the phenotype inputs for PLINK2 by running /format_plink2_phenos.py
# from the workflow container. The task exposes two fixed-name files as outputs:
# the formatted phenotype table and a text file holding the string later passed
# to PLINK2's --parameters option (presumably both written by the script to the
# working directory -- confirm against format_plink2_phenos.py).
task process_phenos {

    # Raw phenotype file plus the column names the script needs.
    File phenofile
    String sample_id_header
    String outcome
    String exposure

    # Optional settings; covar_names and delimiter are passed quoted, missing
    # is passed unquoted (see the command below).
    String? covar_names
    String? delimiter
    String? missing

    # Memory request for this task, in GB.
    Int ppmem

    command {
        python3 /format_plink2_phenos.py ${phenofile} ${sample_id_header} ${outcome} ${exposure} "${covar_names}" "${delimiter}" ${missing}
    }

    runtime {
        docker: "quay.io/large-scale-gxe-methods/plink2-workflow"
        memory: "${ppmem}GB"
    }

    output {
        File pheno_fmt = "plink2_phenotypes.txt"
        File plink2_parameter_file = "plink2_parameters_string.txt"
    }
}
# Runs a PLINK2 `--glm interaction` model on one genotype fileset
# (.pgen/.psam/.pvar) while logging system-wide and plink2-specific resource
# usage in the background.
#
# Inputs:
#   genofile_pgen/psam/pvar  PLINK2 genotype fileset for this shard.
#   phenofile                Phenotype file passed to --pheno.
#   outcome                  Phenotype column name passed to --pheno-name.
#   binary_outcome           When true, adds --1 and expects a .logistic
#                            result file; otherwise expects a .linear one.
#   exposure, covar_names    Joined with a space and passed to --covar-name.
#   plink2_parameter_file    Text file whose content is passed to --parameters.
#   memory, disk             Runtime requests in GB.
#   threads                  Value passed to --threads.
#   monitoring_freq          Sampling interval (seconds) for dstat and atop.
#
# Outputs:
#   res                      The PLINK2 GLM result, renamed to "plink2_res".
#   system_resource_usage    dstat log.
#   process_resource_usage   atop log filtered to plink2 lines.
task run_interaction {

    File genofile_pgen
    File genofile_psam
    File genofile_pvar
    File phenofile
    String outcome
    Boolean binary_outcome
    String exposure
    String? covar_names
    File plink2_parameter_file
    Int? memory
    Int? disk
    Int threads
    Int monitoring_freq

    # covar_names is optional: fall back to "" so the concatenation is well
    # defined even when the caller does not supply any covariates.
    String covar_name_str = exposure + " " + select_first([covar_names, ""])
    String plink2_parameter_string = read_string(plink2_parameter_file)

    command {
        dstat -c -d -m --nocolor ${monitoring_freq} > system_resource_usage.log &
        atop -x -P PRM ${monitoring_freq} | grep '(plink2)' > process_resource_usage.log &
        /plink2 --pgen ${genofile_pgen} \
            --psam ${genofile_psam} \
            --pvar ${genofile_pvar} \
            --allow-extra-chr \
            --pheno-name ${outcome} \
            ${true="--1" false="" binary_outcome} \
            --pheno ${phenofile} \
            --covar-name ${covar_name_str} \
            --glm interaction \
            --parameters ${plink2_parameter_string} \
            --threads ${threads} \
            --out plink2_res
        mv plink2_res.${outcome}.glm.${true='logistic' false='linear' binary_outcome} plink2_res
    }

    # NOTE(review): no `cpu` is requested here although plink2 is started with
    # --threads ${threads}; confirm whether the backend should reserve cores.
    runtime {
        docker: "quay.io/large-scale-gxe-methods/plink2-workflow"
        memory: "${memory} GB"
        disks: "local-disk ${disk} HDD"
        gpu: false
        dx_timeout: "7D0H00M"
    }

    output {
        File res = "plink2_res"
        File system_resource_usage = "system_resource_usage.log"
        File process_resource_usage = "process_resource_usage.log"
    }
}
# Reformats a single PLINK2 result file by running /format_plink2_output.py.
# The formatted file is named after the input file's basename with ".fmt"
# appended, and is returned as `res_fmt`.
task standardize_output {

    File resfile
    String exposure
    Boolean binary_outcome

    # Output name derived from the input: <basename of resfile>.fmt
    String outfile_base = basename(resfile)
    String outfile = outfile_base + ".fmt"

    command {
        python3 /format_plink2_output.py ${resfile} ${exposure} ${binary_outcome} ${outfile}
    }

    runtime {
        docker: "quay.io/large-scale-gxe-methods/plink2-workflow"
        memory: "2 GB"
    }

    output {
        File res_fmt = outfile
    }
}
# Concatenates the per-shard result files into all_results.txt: the first line
# of the first file is written once, then every file is appended with its own
# first line skipped.
task cat_results {

    Array[File] results_array

    command {
        head -1 ${results_array[0]} > all_results.txt && \
        for res in ${sep=" " results_array}; do
            tail -n +2 $res >> all_results.txt
        done
    }

    runtime {
        docker: "quay.io/large-scale-gxe-methods/plink2-workflow"
        disks: "local-disk 5 HDD"
    }

    output {
        File all_results = "all_results.txt"
    }
}
# End-to-end PLINK2 gene-environment interaction workflow:
#   1. process_phenos      formats the phenotype file once.
#   2. run_interaction     runs one PLINK2 GLM per genotype fileset (scattered
#                          over the index of genofiles_pgen; the psam/pvar
#                          arrays are indexed in parallel and must line up).
#   3. standardize_output  reformats each per-fileset result.
#   4. cat_results         concatenates the formatted results into one file.
# Outputs the combined results plus the per-shard resource-usage logs.
workflow run_plink2 {

    Array[File] genofiles_pgen
    Array[File] genofiles_psam
    Array[File] genofiles_pvar
    File phenofile
    String sample_id_header
    String outcome
    Boolean binary_outcome
    String exposure_names
    String? covar_names = ""
    String? delimiter = ","
    String? missing = "NA"
    # Declared but not passed to any task; kept so existing input files that
    # set it remain valid.
    Boolean? robust
    Int? memory = 10
    Int? disk = 20
    Int? threads = 1
    Int? monitoring_freq = 1

    # Memory (GB) for phenotype processing: twice the phenotype file size,
    # rounded up, plus one.
    Int ppmem = 2 * ceil(size(phenofile, "GB")) + 1

    call process_phenos {
        input:
            phenofile = phenofile,
            sample_id_header = sample_id_header,
            outcome = outcome,
            exposure = exposure_names,
            covar_names = covar_names,
            delimiter = delimiter,
            missing = missing,
            ppmem = ppmem
    }

    scatter (i in range(length(genofiles_pgen))) {
        call run_interaction {
            input:
                genofile_pgen = genofiles_pgen[i],
                genofile_psam = genofiles_psam[i],
                genofile_pvar = genofiles_pvar[i],
                phenofile = process_phenos.pheno_fmt,
                outcome = outcome,
                binary_outcome = binary_outcome,
                exposure = exposure_names,
                covar_names = covar_names,
                plink2_parameter_file = process_phenos.plink2_parameter_file,
                memory = memory,
                disk = disk,
                threads = threads,
                monitoring_freq = monitoring_freq
        }
    }

    scatter (resfile in run_interaction.res) {
        call standardize_output {
            input:
                resfile = resfile,
                exposure = exposure_names,
                binary_outcome = binary_outcome
        }
    }

    call cat_results {
        input:
            results_array = standardize_output.res_fmt
    }

    output {
        File results = cat_results.all_results
        Array[File] system_resource_usage = run_interaction.system_resource_usage
        Array[File] process_resource_usage = run_interaction.process_resource_usage
    }

    parameter_meta {
        genofiles_pgen: "Array of PLINK2 genotype (.pgen) filepaths."
        genofiles_psam: "Array of PLINK2 sample (.psam) filepaths."
        genofiles_pvar: "Array of PLINK2 variant (.pvar) filepaths."
        phenofile: "Phenotype filepath. Does not need to be in PLINK format (will be processed as part of the workflow)."
        sample_id_header: "Column header name of sample ID in phenotype file."
        outcome: "Column header name of phenotype data in phenotype file."
        binary_outcome: "Boolean: is the outcome binary? Otherwise, quantitative is assumed."
        exposure_names: "Column header name(s) of the exposures for genotype interaction testing (space-delimited). Only one exposure is currently allowed."
        covar_names: "Column header name(s) of any covariates for which only main effects should be included (space-delimited). This set should not overlap with exposures."
        delimiter: "Delimiter used in the phenotype file."
        missing: "Missing value key of phenotype file."
        robust: "Currently unused by this workflow."
        memory: "Requested memory (in GB)."
        disk: "Requested disk space (in GB)."
        threads: "Number of threads passed to PLINK2 (--threads)."
        monitoring_freq: "Delay between each output for process monitoring (in seconds). Default is 1 second."
    }

    meta {
        author: "Kenny Westerman"
        email: "[email protected]"
        description: "Run interaction tests using PLINK2 and return summary statistics for 1-DF and 2-DF tests."
    }
}