forked from MelcyPhilip/METASEED
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_collecting.sh
122 lines (109 loc) · 6.35 KB
/
read_collecting.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/bin/bash
# Collects, for one sample, the reads that match the seed sequences and writes
# them out as one pair of FASTQ files per seed inside the temporary folder.
#
# The variables bowtie2_app, samtools_app, blast_app and bbmap_app are expanded
# in front of the respective tool invocations further down but are never set in
# this script; presumably they are launcher prefixes exported by the caller
# (TODO confirm).
#
# Positional arguments:
sample_id="$1"             # prefix of all files written to tmp_folder; also the
                           # name of the abundance column picked from
                           # seed_abundance_file
R1_file="$2"               # R1 reads: mapped with bowtie2, later filtered by name
R2_file="$3"               # R2 reads: mapped with bowtie2, later filtered by name
dbase="$4"                 # seed database name given to bowtie2 -x and blastn -db
tmp_folder="$5"            # folder receiving all intermediate and result files
seed_abundance_file="$6"   # tab-delimited table; first column is taken as seed id
max_n="$7"                 # reads whose best hits cover more than this many
                           # distinct seeds are discarded
threads="$8"               # thread count handed to bowtie2, samtools and blastn
# Map each read file to the seed database on its own, as unpaired reads (-U) in
# local alignment mode, and keep only the reads that aligned.
#   samtools view -b   converts the bowtie2 SAM stream to BAM
#   samtools fasta     writes the reads as FASTA; -F 4 leaves out unmapped reads.
#                      The output goes to the -0 file (see the samtools fasta
#                      manual for which reads land there; assumed to be all of
#                      them here since the reads were mapped unpaired).
# $bowtie2_app and $samtools_app are deliberately left unquoted so that they can
# expand to several words (or to nothing). All paths and values are quoted so
# that spaces or glob characters in them cannot split or expand the arguments.
echo "*** Mapping $R1_file reads to seeds with bowtie2, and collecting matching reads"
$bowtie2_app bowtie2 --threads "$threads" --local -x "$dbase" -U "$R1_file" \
  | $samtools_app samtools view --threads "$threads" -b - \
  | $samtools_app samtools fasta -0 "$tmp_folder/${sample_id}_R1.fasta" -F 4 -
echo "*** Mapping $R2_file reads to seeds with bowtie2, and collecting matching reads"
$bowtie2_app bowtie2 --threads "$threads" --local -x "$dbase" -U "$R2_file" \
  | $samtools_app samtools view --threads "$threads" -b - \
  | $samtools_app samtools fasta -0 "$tmp_folder/${sample_id}_R2.fasta" -F 4 -
# Re-align the collected reads to the seeds with blastn, keeping hits of at least
# 99 percent identity. The tabular output (outfmt 6) has exactly the eleven
# columns listed in -outfmt; the R step that follows reads them in that order.
# $blast_app is deliberately left unquoted so that it can expand to several
# words (or to nothing); paths and values are quoted against word splitting.
echo "*** Aligning collected R1-reads to seeds with BLAST..."
$blast_app blastn -db "$dbase" -query "$tmp_folder/${sample_id}_R1.fasta" -perc_identity 99 -num_threads "$threads" \
  -outfmt '6 qseqid sseqid pident gapopen qlen qstart qend slen sstart send bitscore' \
  -penalty -1 -reward 1 -gapopen 2 -gapextend 1 -out "$tmp_folder/${sample_id}_R1.txt"
# The message below used to say R1 although this command aligns the R2 reads.
echo "*** Aligning collected R2-reads to seeds with BLAST..."
$blast_app blastn -db "$dbase" -query "$tmp_folder/${sample_id}_R2.fasta" -perc_identity 99 -num_threads "$threads" \
  -outfmt '6 qseqid sseqid pident gapopen qlen qstart qend slen sstart send bitscore' \
  -penalty -1 -reward 1 -gapopen 2 -gapextend 1 -out "$tmp_folder/${sample_id}_R2.txt"
echo "*** Reading BLAST results and finding the matching read-pairs"
# Reads the two BLAST tables, assigns every read-pair to at most one seed and
# writes three kinds of files to the temporary folder:
#   <sample_id>_R1_tags.txt / <sample_id>_R2_tags.txt  read names to collect
#   <sample_id>_b.tbl                                  read-pair to seed table
# The R program is passed inside a double-quoted bash string, so all R strings
# use single quotes and every R column extraction is written with an escaped
# dollar sign to keep bash from expanding it.
Rscript -e "suppressMessages(library(tidyverse))
args <- commandArgs(trailingOnly = TRUE)
seed_abundance_file <- args[1]
tmp_folder <- args[2]
sample_id <- args[3]
# Command-line arguments arrive as character strings. max_n must be converted
# to a number, otherwise the n_hits <= max_n filter below compares as text
# (for instance '10' <= '5' is TRUE and '2' <= '10' is FALSE).
max_n <- suppressWarnings(as.numeric(args[4]))
if (is.na(max_n)) stop('max_n must be a number, got: ', args[4], call. = FALSE)

# Seed abundances: first column is the seed id, the column named after this
# sample holds its abundance.
seed.tbl <- suppressMessages(read_delim(seed_abundance_file, delim = '\t')) %>%
  select(1, all_of(sample_id))
colnames(seed.tbl) <- c('seed_id', 'abundance')

# Column names of the BLAST tables, in the order requested with -outfmt
blast_cols <- c('read_id', 'seed_id', 'pident', 'gapopen',
                'qlen', 'qstart', 'qend',
                'slen', 'sstart', 'send', 'bitscore')
r1_blast_file <- file.path(tmp_folder, str_c(sample_id, '_R1.txt'))
r2_blast_file <- file.path(tmp_folder, str_c(sample_id, '_R2.txt'))
b1.tbl <- suppressMessages(read_delim(r1_blast_file, delim = '\t', col_names = blast_cols)) %>%
  mutate(type = 'R1')
b2.tbl <- suppressMessages(read_delim(r2_blast_file, delim = '\t', col_names = blast_cols)) %>%
  mutate(type = 'R2')

# Alignment categories assigned below:
#   inside     the alignment covers the read from its first to its last base
#   start_end  the alignment starts at the read start, stops before the read
#              end and reaches the end of the seed (send is slen on the plus
#              strand, 1 on the minus strand)
#   end_start  the alignment stops at the read end, starts after the read start
#              and begins at the start of the seed
#   local      everything else; these alignments are discarded
b.tbl <- bind_rows(b1.tbl, b2.tbl) %>%
  mutate(readpair_id = str_remove(read_id, '/[12]$')) %>%     # read 1 and 2 may have /1 and /2 at the end of the name
  filter(gapopen == 0) %>%                                    # only alignments without gap openings
  mutate(strand = if_else(sstart < send, '+', '-')) %>%
  mutate(match = if_else(qstart == 1 & qend == qlen, 'inside', 'local')) %>%
  mutate(match = if_else((qstart == 1 & qend < qlen) & ((send == slen & strand == '+') | (send == 1 & strand == '-')), 'start_end', match)) %>%
  mutate(match = if_else((qstart > 1 & qend == qlen) & ((sstart == 1 & strand == '+') | (sstart == slen & strand == '-')), 'end_start', match)) %>%
  filter(match != 'local') %>%
  distinct(read_id, seed_id, .keep_all = TRUE) %>%            # only one combination of read and seed
  left_join(seed.tbl, by = 'seed_id') %>%                     # include abundance column
  arrange(desc(abundance)) %>%                                # arrange by abundance, descending order
  group_by(read_id) %>%                                       # group by read_id
  mutate(max_score = max(bitscore)) %>%                       # compute max_score for each group
  filter(bitscore == max_score) %>%                           # keep only alignments with max_score
  mutate(n_hits = length(unique(seed_id))) %>%                # then count how many different seeds a read matches
  filter(n_hits <= max_n) %>%                                 # keep only those with max_n hits or less (numeric comparison)
  slice(1) %>%                                                # keep only the largest abundance within group
  ungroup() %>%
  group_by(readpair_id) %>%                                   # in case the two reads in a pair disagree on seed
  slice(1) %>%                                                # again we keep the seed with largest abundance
  ungroup() %>%
  arrange(desc(n_hits), readpair_id, desc(abundance)) %>%
  select(readpair_id, read_id, seed_id, n_hits, abundance)
cat('Found', nrow(b.tbl), 'alignments matching', length(unique(b.tbl\$seed_id)), 'unique seeds...\n')

# Only one read of each pair is left in b.tbl, so the name of its mate is
# derived by swapping the /1 or /2 suffix; names without suffix are unchanged.
b.tbl %>%
  mutate(read_id = str_replace(read_id, '/2$', '/1')) %>%
  distinct(read_id) %>%
  write_delim(delim = '\t', file = file.path(tmp_folder, str_c(sample_id, '_R1_tags.txt')), col_names = FALSE)
b.tbl %>%
  mutate(read_id = str_replace(read_id, '/1$', '/2')) %>%
  distinct(read_id) %>%
  write_delim(delim = '\t', file = file.path(tmp_folder, str_c(sample_id, '_R2_tags.txt')), col_names = FALSE)
write_delim(b.tbl, delim = '\t', file = file.path(tmp_folder, str_c(sample_id, '_b.tbl')))" "$seed_abundance_file" "$tmp_folder" "$sample_id" "$max_n"
# Pull the selected reads out of the original read files. The names= files are
# the tag lists written by the preceding R step (one read name per line), and
# include=t is passed so that the listed reads are the ones written to out=.
# $bbmap_app is deliberately left unquoted so that it can expand to several
# words (or to nothing); paths are quoted against word splitting and globbing.
echo "*** Collecting the actual read-pairs"
$bbmap_app filterbyname.sh in="$R1_file" out="$tmp_folder/${sample_id}_R1.fq" names="$tmp_folder/${sample_id}_R1_tags.txt" include=t
$bbmap_app filterbyname.sh in="$R2_file" out="$tmp_folder/${sample_id}_R2.fq" names="$tmp_folder/${sample_id}_R2_tags.txt" include=t
echo "*** Distributing reads by seeds"
# Splits the collected reads into one R1 and one R2 FASTQ file per seed, named
# <sample_id>_<seed_id>_R1.fq and <sample_id>_<seed_id>_R2.fq in the temporary
# folder. The R program is passed inside a double-quoted bash string, so R
# strings use single quotes and R column extractions use an escaped dollar sign.
Rscript -e "suppressMessages(library(tidyverse))
suppressMessages(library(microseq))
args <- commandArgs(trailingOnly = TRUE)
tmp_folder <- args[1]
sample_id <- args[2]
# The table was written tab-delimited, so read it back with an explicit
# delimiter instead of relying on delimiter guessing.
b.tbl <- suppressMessages(read_delim(file.path(tmp_folder, str_c(sample_id, '_b.tbl')), delim = '\t'))
# Keep only the first word of each FASTQ header so that it can be compared
# with the read names stored in b.tbl.
fq1 <- readFastq(file.path(tmp_folder, str_c(sample_id, '_R1.fq'))) %>%
  mutate(Header = word(Header, 1))
fq2 <- readFastq(file.path(tmp_folder, str_c(sample_id, '_R2.fq'))) %>%
  mutate(Header = word(Header, 1))
# Loop over the seed ids themselves. Indexing with 1:length(...) would run the
# body for i = 1 and i = 0 when no seed was found at all.
for (seed in unique(b.tbl\$seed_id)) {
  seed_reads <- b.tbl\$read_id[b.tbl\$seed_id == seed]
  # b.tbl holds one read per pair; derive the name of each mate by suffix swap
  r1_ids <- str_replace(seed_reads, '/2$', '/1')
  r2_ids <- str_replace(seed_reads, '/1$', '/2')
  fq1 %>%
    filter(Header %in% r1_ids) %>%
    writeFastq(file.path(tmp_folder, str_c(sample_id, '_', seed, '_R1.fq')))
  fq2 %>%
    filter(Header %in% r2_ids) %>%
    writeFastq(file.path(tmp_folder, str_c(sample_id, '_', seed, '_R2.fq')))
}" "$tmp_folder" "$sample_id"