Skip to content

Commit

Permalink
Merge pull request #178 from jeromekelleher/one-based
Browse files Browse the repository at this point in the history
Support one-based coordinates in explode/encode partition
  • Loading branch information
jeromekelleher authored May 3, 2024
2 parents 55801f7 + 6ae035e commit a8ef611
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 34 deletions.
35 changes: 25 additions & 10 deletions bio2zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ def list_commands(self, ctx):
help="Force overwriting of existing directories",
)

one_based = click.option(
"--one-based",
is_flag=True,
flag_value=True,
help="Partition indexes are interpreted as one-based",
)

version = click.version_option(version=f"{provenance.__version__}")

worker_processes = click.option(
Expand Down Expand Up @@ -226,13 +233,18 @@ def dexplode_init(
@icf_path
@partition
@verbose
def dexplode_partition(icf_path, partition, verbose):
@one_based
def dexplode_partition(icf_path, partition, verbose, one_based):
"""
Convert a VCF partition to intermediate columnar format. Must be called *after*
the ICF path has been initialised with dexplode_init. Partition indexes must be
from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
Convert a VCF partition to intermediate columnar format. Must be called
after the ICF path has been initialised with dexplode_init. By default,
partition indexes are from 0 to the number of partitions N (returned by
dexplode_init), exclusive. If the --one-based option is specifed,
partition indexes are in the range 1 to N, inclusive.
"""
setup_logging(verbose)
if one_based:
partition -= 1
vcf.explode_partition(icf_path, partition)


Expand Down Expand Up @@ -371,14 +383,17 @@ def dencode_init(
@zarr_path
@partition
@verbose
def dencode_partition(zarr_path, partition, verbose):
"""
Convert a partition from intermediate columnar format to VCF Zarr.
Must be called *after* the Zarr path has been initialised with dencode_init.
Partition indexes must be from 0 (inclusive) to the number of paritions
returned by dencode_init (exclusive).
@one_based
def dencode_partition(zarr_path, partition, verbose, one_based):
"""
Convert a partition from intermediate columnar format to VCF Zarr. Must be
called after the Zarr path has been initialised with dencode_init. By
default, partition indexes are from 0 to the number of partitions N
(returned by dencode_init), exclusive. If the --one-based option is
specifed, partition indexes are in the range 1 to N, inclusive."""
setup_logging(verbose)
if one_based:
partition -= 1
vcf.encode_partition(zarr_path, partition)


Expand Down
14 changes: 5 additions & 9 deletions bio2zarr/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,8 +382,8 @@ def sanitise_value_bool(buff, j, value):
def sanitise_value_float_scalar(buff, j, value):
x = value
if value is None:
x = FLOAT32_MISSING
buff[j] = x
x = [FLOAT32_MISSING]
buff[j] = x[0]


def sanitise_value_int_scalar(buff, j, value):
Expand All @@ -392,7 +392,7 @@ def sanitise_value_int_scalar(buff, j, value):
# print("MISSING", INT_MISSING, INT_FILL)
x = [INT_MISSING]
else:
x = sanitise_int_array([value], ndmin=1, dtype=np.int32)
x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
buff[j] = x[0]


Expand Down Expand Up @@ -1148,9 +1148,7 @@ def explode(self, *, worker_processes=1, show_progress=False):
def explode_partition(self, partition):
self.load_metadata()
if partition < 0 or partition >= self.num_partitions:
raise ValueError(
"Partition index must be in the range 0 <= index < num_partitions"
)
raise ValueError("Partition index not in the valid range")
self.process_partition(partition)

def finalise(self):
Expand Down Expand Up @@ -1801,9 +1799,7 @@ def partition_array_path(self, partition_index, name):
def encode_partition(self, partition_index):
self.load_metadata()
if partition_index < 0 or partition_index >= self.num_partitions:
raise ValueError(
"Partition index must be in the range 0 <= index < num_partitions"
)
raise ValueError("Partition index not in the valid range")
partition_path = self.wip_partition_path(partition_index)
partition_path.mkdir(exist_ok=True)
logger.info(f"Encoding partition {partition_index} to {partition_path}")
Expand Down
62 changes: 49 additions & 13 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,23 @@ def test_vcf_dexplode_partition(self, mocked, tmp_path):
str(icf_path), 1, **DEFAULT_DEXPLODE_PARTITION_ARGS
)

@mock.patch("bio2zarr.vcf.explode_partition")
def test_vcf_dexplode_partition_one_based(self, mocked, tmp_path):
runner = ct.CliRunner(mix_stderr=False)
icf_path = tmp_path / "icf"
icf_path.mkdir()
result = runner.invoke(
cli.vcf2zarr,
f"dexplode-partition {icf_path} 1 --one-based",
catch_exceptions=False,
)
assert result.exit_code == 0
assert len(result.stdout) == 0
assert len(result.stderr) == 0
mocked.assert_called_once_with(
str(icf_path), 0, **DEFAULT_DEXPLODE_PARTITION_ARGS
)

@mock.patch("bio2zarr.vcf.explode_partition")
def test_vcf_dexplode_partition_missing_dir(self, mocked, tmp_path):
runner = ct.CliRunner(mix_stderr=False)
Expand Down Expand Up @@ -436,6 +453,23 @@ def test_vcf_dencode_partition(self, mocked, tmp_path):
str(zarr_path), 1, **DEFAULT_DENCODE_PARTITION_ARGS
)

@mock.patch("bio2zarr.vcf.encode_partition")
def test_vcf_dencode_partition_one_based(self, mocked, tmp_path):
runner = ct.CliRunner(mix_stderr=False)
zarr_path = tmp_path / "zarr"
zarr_path.mkdir()
result = runner.invoke(
cli.vcf2zarr,
f"dencode-partition {zarr_path} 1 --one-based",
catch_exceptions=False,
)
assert result.exit_code == 0
assert len(result.stdout) == 0
assert len(result.stderr) == 0
mocked.assert_called_once_with(
str(zarr_path), 0, **DEFAULT_DENCODE_PARTITION_ARGS
)

@mock.patch("bio2zarr.vcf.encode_finalise")
def test_vcf_dencode_finalise(self, mocked, tmp_path):
runner = ct.CliRunner(mix_stderr=False)
Expand Down Expand Up @@ -489,7 +523,8 @@ def test_convert_plink(self, mocked):
class TestVcfEndToEnd:
vcf_path = "tests/data/vcf/sample.vcf.gz"

def test_dexplode(self, tmp_path):
@pytest.mark.parametrize("one_based", [False, True])
def test_dexplode(self, tmp_path, one_based):
icf_path = tmp_path / "icf"
runner = ct.CliRunner(mix_stderr=False)
result = runner.invoke(
Expand All @@ -501,11 +536,11 @@ def test_dexplode(self, tmp_path):
assert result.stdout.strip() == "3"

for j in range(3):
result = runner.invoke(
cli.vcf2zarr,
f"dexplode-partition {icf_path} {j}",
catch_exceptions=False,
)
if one_based:
cmd = f"dexplode-partition {icf_path} {j + 1} --one-based"
else:
cmd = f"dexplode-partition {icf_path} {j}"
result = runner.invoke(cli.vcf2zarr, cmd, catch_exceptions=False)
assert result.exit_code == 0
result = runner.invoke(
cli.vcf2zarr, f"dexplode-finalise {icf_path}", catch_exceptions=False
Expand Down Expand Up @@ -552,7 +587,8 @@ def test_encode(self, tmp_path):
# Arbitrary check
assert "variant_position" in result.stdout

def test_dencode(self, tmp_path):
@pytest.mark.parametrize("one_based", [False, True])
def test_dencode(self, tmp_path, one_based):
icf_path = tmp_path / "icf"
zarr_path = tmp_path / "zarr"
runner = ct.CliRunner(mix_stderr=False)
Expand All @@ -569,12 +605,12 @@ def test_dencode(self, tmp_path):
assert result.stdout.split()[0] == "3"

for j in range(3):
result = runner.invoke(
cli.vcf2zarr,
f"dencode-partition {zarr_path} {j}",
catch_exceptions=False,
)
assert result.exit_code == 0
if one_based:
cmd = f"dencode-partition {zarr_path} {j + 1} --one-based"
else:
cmd = f"dencode-partition {zarr_path} {j}"
result = runner.invoke(cli.vcf2zarr, cmd, catch_exceptions=False)
assert result.exit_code == 0

result = runner.invoke(
cli.vcf2zarr, f"dencode-finalise {zarr_path}", catch_exceptions=False
Expand Down
2 changes: 1 addition & 1 deletion tests/test_icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def test_double_explode_partition(self, tmp_path):
def test_explode_partition_out_of_range(self, tmp_path, partition):
icf_path = tmp_path / "x.icf"
vcf.explode_init(icf_path, [self.data_path])
with pytest.raises(ValueError, match="Partition index must be in the range"):
with pytest.raises(ValueError, match="Partition index not in the valid range"):
vcf.explode_partition(icf_path, partition)

def test_explode_same_file_twice(self, tmp_path):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,5 +502,5 @@ def test_double_encode_partition(self, icf_path, tmp_path, caplog):
def test_encode_partition_out_of_range(self, icf_path, tmp_path, partition):
zarr_path = tmp_path / "x.zarr"
vcf.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3)
with pytest.raises(ValueError, match="Partition index must be in the range"):
with pytest.raises(ValueError, match="Partition index not in the valid range"):
vcf.encode_partition(zarr_path, partition)

0 comments on commit a8ef611

Please sign in to comment.