diff --git a/bio2zarr/vcf_utils.py b/bio2zarr/vcf_utils.py index 30fc9d3..78690fc 100644 --- a/bio2zarr/vcf_utils.py +++ b/bio2zarr/vcf_utils.py @@ -441,9 +441,9 @@ def count_variants(self, region): return sum(1 for _ in self.variants(region)) def variants(self, region): - # Need to filter because of indels overlapping the region start = 1 if region.start is None else region.start for var in self.vcf(str(region)): + # Need to filter because of indels overlapping the region if var.POS >= start: yield var diff --git a/tests/data/vcf/chr_m_indels.vcf.gz b/tests/data/vcf/chr_m_indels.vcf.gz new file mode 100644 index 0000000..a2510e9 Binary files /dev/null and b/tests/data/vcf/chr_m_indels.vcf.gz differ diff --git a/tests/data/vcf/chr_m_indels.vcf.gz.csi b/tests/data/vcf/chr_m_indels.vcf.gz.csi new file mode 100644 index 0000000..25bedd7 Binary files /dev/null and b/tests/data/vcf/chr_m_indels.vcf.gz.csi differ diff --git a/tests/test_core.py b/tests/test_core.py index c6e8f45..6fd71f8 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -192,8 +192,8 @@ def test_5_chunk_1(self, n, expected): # It *might* work in CI, but it may well not either, as it's # probably dependent on a whole bunch of things. Expect to fail # at some point. - ("tests/data", 4960266), - ("tests/data/vcf", 4948129), + ("tests/data", 4973315), + ("tests/data/vcf", 4961178), ("tests/data/vcf/sample.vcf.gz", 1089), ], ) diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index 07545b1..909efc7 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -837,6 +837,7 @@ def test_duplicate_paths(self, tmp_path): "1kg_2020_chrM.vcf.gz", "field_type_combos.vcf.gz", "out_of_order_contigs.vcf.gz", + "chr_m_indels.vcf.gz", ], ) def test_by_validating(name, tmp_path): diff --git a/tests/test_vcf_utils.py b/tests/test_vcf_utils.py index 4800176..5c1e75f 100644 --- a/tests/test_vcf_utils.py +++ b/tests/test_vcf_utils.py @@ -29,6 +29,13 @@ def test_context_manager_error(self): with vcf_utils.IndexedVcf(data_path / "no-such-file.bcf"): pass + def test_indels_filtered(self): + with vcf_utils.IndexedVcf(data_path / "chr_m_indels.vcf.gz") as vfile: + # Hand-picked example that results in filtering + region = vcf_utils.Region("chrM", 300, 314) + pos = [var.POS for var in vfile.variants(region)] + assert pos == [307, 308, 309, 312, 313, 314] + # values computed using bcftools index -s @pytest.mark.parametrize( ("index_file", "expected"), @@ -58,6 +65,7 @@ def test_context_manager_error(self): ("1kg_2020_chr20_annotations.bcf.csi", {"chr20": 21}), ("NA12878.prod.chr20snippet.g.vcf.gz.tbi", {"20": 301778}), ("multi_contig.vcf.gz.tbi", {str(j): 933 for j in range(5)}), + ("chr_m_indels.vcf.gz.csi", {"chrM": 155}), ], ) def test_contig_record_counts(self, index_file, expected): @@ -82,6 +90,7 @@ def test_contig_record_counts(self, index_file, expected): ("1kg_2020_chr20_annotations.bcf.csi", ["chr20:60070-"]), ("NA12878.prod.chr20snippet.g.vcf.gz.tbi", ["20:60001-"]), ("multi_contig.vcf.gz.tbi", [f"{j}:1-" for j in range(5)]), + ("chr_m_indels.vcf.gz.csi", ["chrM:26-"]), ], ) def test_partition_into_one_part(self, index_file, expected): @@ -106,6 +115,7 @@ def test_partition_into_one_part(self, index_file, expected): ("1kg_2020_chr20_annotations.bcf.csi", 1, 21), ("NA12878.prod.chr20snippet.g.vcf.gz.tbi", 59, 301778), ("multi_contig.vcf.gz.tbi", 5, 5 * 933), + ("chr_m_indels.vcf.gz.csi", 1, 155), ], ) def test_partition_into_max_parts(self, index_file, num_expected, total_records):