Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into mk/ann_fork
Browse files Browse the repository at this point in the history
  • Loading branch information
karasikov committed Jul 19, 2021
2 parents 575baac + 0d156d3 commit 2ac20d1
Show file tree
Hide file tree
Showing 29 changed files with 1,114 additions and 113 deletions.
7 changes: 6 additions & 1 deletion metagraph/integration_tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ def _clean(graph, output, extra_params=''):
def _annotate_graph(input, graph_path, output, anno_repr,
separate=False, no_fork_opt=False, no_anchor_opt=False):
target_anno = anno_repr
if anno_repr in {'row_sparse'} or anno_repr.endswith('brwt') or anno_repr.startswith('row_diff'):
if (anno_repr in {'row_sparse', 'column_coord'} or
anno_repr.endswith('brwt') or
anno_repr.startswith('row_diff')):
target_anno = anno_repr
anno_repr = 'column'
elif anno_repr in {'flat', 'rbfish'}:
Expand All @@ -117,6 +119,9 @@ def _annotate_graph(input, graph_path, output, anno_repr,
-i {graph_path} --anno-type {anno_repr} \
-o {output} {input}'

if target_anno.endswith('_coord'):
command += ' --coordinates'

with_counts = target_anno.endswith('int_brwt')
if with_counts:
command += ' --count-kmers'
Expand Down
60 changes: 60 additions & 0 deletions metagraph/integration_tests/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,66 @@ def test_simple_align_all_graphs(self, representation):
self.assertEqual(last_split[1], "AACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA")
self.assertEqual(last_split[4], "22")

@parameterized.expand(GRAPH_TYPES)
def test_simple_align_map_all_graphs(self, representation):

self._build_graph(input=TEST_DATA_DIR + '/genome.MT.fa',
output=self.tempdir.name + '/genome.MT',
k=11, repr=representation,
extra_params="--mask-dummy")

res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation])
params_str = res.stdout.decode().split('\n')[2:]
self.assertEqual('k: 11', params_str[0])
self.assertEqual('nodes (k): 16438', params_str[1])
self.assertEqual('mode: basic', params_str[2])

stats_command = '{exe} align -i {graph} --map --count-kmers {reads}'.format(
exe=METAGRAPH,
graph=self.tempdir.name + '/genome.MT' + graph_file_extension[representation],
reads=TEST_DATA_DIR + '/genome_MT1.fq',
)
res = subprocess.run(stats_command.split(), stdout=PIPE)
self.assertEqual(res.returncode, 0)
params_str = res.stdout.decode().rstrip().split('\n')
self.assertEqual(len(params_str), 6)
self.assertEqual(params_str[0], 'MT-10/1\t1/140/1')
self.assertEqual(params_str[1], 'MT-8/1\t140/140/140')
self.assertEqual(params_str[2], 'MT-6/1\t140/140/140')
self.assertEqual(params_str[3], 'MT-4/1\t0/140/0')
self.assertEqual(params_str[4], 'MT-2/1\t140/140/140')
self.assertEqual(params_str[5], 'MT-11/1\t1/140/1')

@parameterized.expand(GRAPH_TYPES)
def test_simple_align_map_canonical_all_graphs(self, representation):

self._build_graph(input=TEST_DATA_DIR + '/genome.MT.fa',
output=self.tempdir.name + '/genome.MT',
k=11, repr=representation, mode='canonical',
extra_params="--mask-dummy")

res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation])
params_str = res.stdout.decode().split('\n')[2:]
self.assertEqual('k: 11', params_str[0])
self.assertEqual('nodes (k): 32782', params_str[1])
self.assertEqual('mode: canonical', params_str[2])

stats_command = '{exe} align -i {graph} --map --count-kmers {reads}'.format(
exe=METAGRAPH,
graph=self.tempdir.name + '/genome.MT' + graph_file_extension[representation],
reads=TEST_DATA_DIR + '/genome_MT1.fq',
)
res = subprocess.run(stats_command.split(), stdout=PIPE)
self.assertEqual(res.returncode, 0)
params_str = res.stdout.decode().rstrip().split('\n')
self.assertEqual(len(params_str), 6)
self.assertEqual(params_str[0], 'MT-10/1\t140/140/140')
self.assertEqual(params_str[1], 'MT-8/1\t140/140/140')
self.assertEqual(params_str[2], 'MT-6/1\t140/140/140')
self.assertEqual(params_str[3], 'MT-4/1\t129/140/129')
self.assertEqual(params_str[4], 'MT-2/1\t140/140/139')
self.assertEqual(params_str[5], 'MT-11/1\t2/140/2')

@parameterized.expand(['succinct'])
def test_simple_align_json_all_graphs(self, representation):

Expand Down
253 changes: 253 additions & 0 deletions metagraph/integration_tests/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from tempfile import TemporaryDirectory
import glob
import os
import numpy as np
from helpers import get_test_class_name
from base import TestingBase, METAGRAPH, TEST_DATA_DIR, graph_file_extension

Expand All @@ -16,6 +17,7 @@
PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")

anno_file_extension = {'column': '.column.annodbg',
'column_coord': '.column_coord.annodbg',
'row': '.row.annodbg',
'row_diff': '.row_diff.annodbg',
'row_sparse': '.row_sparse.annodbg',
Expand Down Expand Up @@ -503,6 +505,257 @@ def test_batch_query_with_tiny_batch(self):
self.assertEqual(res.returncode, 0)
self.assertEqual(len(res.stdout), 136959)

def test_query_coordinates(self):
if not self.anno_repr.endswith('_coord'):
self.skipTest('annotation does not support coordinates')

query_command = f'{METAGRAPH} query --query-coords \
-i {self.tempdir.name}/graph{graph_file_extension[self.graph_repr]} \
-a {self.tempdir.name}/annotation{anno_file_extension[self.anno_repr]} \
--discovery-fraction 0.05 {TEST_DATA_DIR}/transcripts_100.fa'

res = subprocess.run(query_command.split(), stdout=PIPE)
self.assertEqual(res.returncode, 0)
self.assertEqual(len(res.stdout), 2155983)

query_command = f'{METAGRAPH} query --query-coords \
-i {self.tempdir.name}/graph{graph_file_extension[self.graph_repr]} \
-a {self.tempdir.name}/annotation{anno_file_extension[self.anno_repr]} \
--discovery-fraction 0.95 {TEST_DATA_DIR}/transcripts_100.fa'

res = subprocess.run(query_command.split(), stdout=PIPE)
self.assertEqual(res.returncode, 0)
self.assertEqual(len(res.stdout), 687712)


@parameterized_class(('graph_repr', 'anno_repr'),
input_values=product(
[repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)],
['int_brwt', 'row_diff_int_brwt']
),
class_name_func=get_test_class_name
)
class TestQueryCounts(TestingBase):
@classmethod
def setUpClass(cls):
cls.tempdir = TemporaryDirectory()

cls.kmer_counts_1 = {
'AAA': 1,
'AAC': 2,
'ACC': 3,
'CCC': 4,
'CCG': 5,
'CGG': 6,
'GGG': 7,
'GGT': 8,
'GTT': 9,
'TTT': 10,
'TTA': 11,
'TAA': 12,
}
cls.kmer_counts_2 = {
'AAA': 11,
'AAC': 12,
'ACC': 13,
'CCC': 14,
'CCG': 15,
'CGG': 16,
'GGG': 17,
'GGT': 18,
'GTT': 19,
'TTT': 20,
}
fasta_file = cls.tempdir.name + '/file.fa'
with open(fasta_file, 'w') as f:
for kmer, count in cls.kmer_counts_1.items():
f.write(f'>L1\n{kmer}\n' * count)

for kmer, count in cls.kmer_counts_2.items():
f.write(f'>L2\n{kmer}\n' * count)

cls.k = 3

cls.with_bloom = False
if cls.graph_repr == 'succinct_bloom':
cls.graph_repr = 'succinct'
cls.with_bloom = True

cls.mask_dummy = False
if cls.graph_repr == 'succinct_mask':
cls.graph_repr = 'succinct'
cls.mask_dummy = True

construct_command = f"{METAGRAPH} build {'--mask-dummy' if cls.mask_dummy else ''} -p {NUM_THREADS} \
--graph {cls.graph_repr} -k {cls.k} -o {cls.tempdir.name}/graph {fasta_file}"

res = subprocess.run([construct_command], shell=True)
assert(res.returncode == 0)

stats_command = '{exe} stats {graph}'.format(
exe=METAGRAPH,
graph=cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr],
)
res = subprocess.run(stats_command.split(), stdout=PIPE)
assert(res.returncode == 0)
params_str = res.stdout.decode().split('\n')[2:]
assert('k: 3' == params_str[0])
if cls.graph_repr != 'succinct' or cls.mask_dummy:
assert('nodes (k): 12' == params_str[1])
assert('mode: basic' == params_str[2])

if cls.with_bloom:
convert_command = '{exe} transform -o {outfile} --initialize-bloom {bloom_param} {input}'.format(
exe=METAGRAPH,
outfile=cls.tempdir.name + '/graph',
bloom_param='--bloom-fpp 0.1',
input=cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr],
)
res = subprocess.run([convert_command], shell=True)
assert(res.returncode == 0)

def check_suffix(anno_repr, suffix):
match = anno_repr.endswith(suffix)
if match:
anno_repr = anno_repr[:-len(suffix)]
return anno_repr, match

cls.anno_repr, separate = check_suffix(cls.anno_repr, '_separate')
cls.anno_repr, no_fork_opt = check_suffix(cls.anno_repr, '_no_fork_opt')
cls.anno_repr, no_anchor_opt = check_suffix(cls.anno_repr, '_no_anchor_opt')

cls._annotate_graph(
fasta_file,
cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr],
cls.tempdir.name + '/annotation',
cls.anno_repr,
separate,
no_fork_opt,
no_anchor_opt
)

# check annotation
anno_stats_command = '{exe} stats -a {annotation}'.format(
exe=METAGRAPH,
annotation=cls.tempdir.name + '/annotation' + anno_file_extension[cls.anno_repr],
)
res = subprocess.run(anno_stats_command.split(), stdout=PIPE)
assert(res.returncode == 0)
params_str = res.stdout.decode().split('\n')[2:]
assert('labels: 2' == params_str[0])
if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy):
assert('objects: 12' == params_str[1])
assert('representation: ' + cls.anno_repr == params_str[3])

def test_count_query(self):
query_file = self.tempdir.name + '/query.fa'
queries = [
'AAA',
'AAAA',
'AAAAAAAAAAAAA',
'CCC',
'CCCC',
'CCCCCCCCCCCCC',
'TTT',
'AAACCCGGGTTT',
'AAACCCGGGTTTTTT',
'AAACCCGGGTTTAAA',
'TTTAAACCCGGG',
'ACACACACACACATTTAAACCCGGG',
]
for discovery_rate in np.linspace(0, 1, 5):
expected_output = ''
with open(query_file, 'w') as f:
for i, s in enumerate(queries):
f.write(f'>s{i}\n{s}\n')
expected_output += f'{i}\ts{i}'
def get_count(d, kmer):
try:
return d[kmer]
except:
return 0

num_kmers = len(s) - self.k + 1

num_matches_1 = sum([get_count(self.kmer_counts_1, s[i:i + self.k]) > 0 for i in range(num_kmers)])
count_1 = sum([get_count(self.kmer_counts_1, s[i:i + self.k]) for i in range(len(s) - self.k + 1)])

num_matches_2 = sum([get_count(self.kmer_counts_2, s[i:i + self.k]) > 0 for i in range(num_kmers)])
count_2 = sum([get_count(self.kmer_counts_2, s[i:i + self.k]) for i in range(len(s) - self.k + 1)])

for (c, i, n) in [(count_1, 1, num_matches_1), (count_2, 0, num_matches_2)]:
if n >= discovery_rate * num_kmers:
expected_output += f'\t<L{2-i}>:{c}'

expected_output += '\n'

query_command = f'{METAGRAPH} query --fast --count-kmers \
-i {self.tempdir.name}/graph{graph_file_extension[self.graph_repr]} \
-a {self.tempdir.name}/annotation{anno_file_extension[self.anno_repr]} \
--discovery-fraction {discovery_rate} {query_file}'

res = subprocess.run(query_command.split(), stdout=PIPE)
self.assertEqual(res.returncode, 0)
self.assertEqual(res.stdout.decode(), expected_output)

def test_count_quantiles(self):
query_file = self.tempdir.name + '/query.fa'
queries = [
'AAA',
'AAAA',
'AAAAAAAAAAAAA',
'CCC',
'CCCC',
'CCCCCCCCCCCCC',
'TTT',
'AAACCCGGGTTT',
'AAACCCGGGTTTTTT',
'AAACCCGGGTTTAAA',
'TTTAAACCCGGG',
'ACACACACACACATTTAAACCCGGG',
]
quantiles = np.linspace(0, 1, 100)
expected_output = ''
with open(query_file, 'w') as f:
for i, s in enumerate(queries):
f.write(f'>s{i}\n{s}\n')
expected_output += f'{i}\ts{i}\t<L1>'
def get_count(d, kmer):
try:
return d[kmer]
except:
return 0
for p in quantiles:
counts = [get_count(self.kmer_counts_1, s[i:i + self.k]) for i in range(len(s) - self.k + 1)]
expected_output += f':{np.quantile(counts, p, interpolation="lower")}'
expected_output += f'\t<L2>'
for p in quantiles:
counts = [get_count(self.kmer_counts_2, s[i:i + self.k]) for i in range(len(s) - self.k + 1)]
expected_output += f':{np.quantile(counts, p, interpolation="lower")}'
expected_output += '\n'

query_command = f'{METAGRAPH} query --fast --count-quantiles <SET_BELOW> \
-i {self.tempdir.name}/graph{graph_file_extension[self.graph_repr]} \
-a {self.tempdir.name}/annotation{anno_file_extension[self.anno_repr]} \
--discovery-fraction 0.0 {query_file}'

query_command = query_command.split()
query_command[4] = ' '.join([str(p) for p in quantiles])
res = subprocess.run(query_command, stdout=PIPE)
self.assertEqual(res.returncode, 0)
self.assertEqual(res.stdout.decode(), expected_output)

query_command = f'{METAGRAPH} query --fast --count-quantiles <SET_BELOW> \
-i {self.tempdir.name}/graph{graph_file_extension[self.graph_repr]} \
-a {self.tempdir.name}/annotation{anno_file_extension[self.anno_repr]} \
--discovery-fraction 1.0 {query_file}'

query_command = query_command.split()
query_command[4] = ' '.join([str(p) for p in quantiles])
res = subprocess.run(query_command, stdout=PIPE)
self.assertEqual(res.returncode, 0)
self.assertEqual(len(res.stdout.decode()), 5230)


@parameterized_class(('graph_repr', 'anno_repr'),
input_values=(product(list(set(GRAPH_TYPES) - {'hashstr'}), ANNO_TYPES) +
Expand Down
Loading

0 comments on commit 2ac20d1

Please sign in to comment.