Skip to content

Commit

Permalink
Simple implementation of compute_laa and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromekelleher committed Jan 14, 2025
1 parent d347e37 commit 4accfc5
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 31 deletions.
45 changes: 14 additions & 31 deletions bio2zarr/vcf2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,44 +513,27 @@ def fromdict(d):
return ret


def compute_laa_field(genotypes) -> np.ndarray:
    """
    Computes the value of the LAA field for each sample given the genotypes
    for a variant.

    The LAA field is a list of one-based indices into the ALT alleles
    that indicates which alternate alleles are observed in the sample.

    Each output row holds the distinct positive allele indices found in the
    corresponding row of ``genotypes``, in ascending order, padded at the
    end with -2. Zero and negative genotype values contribute nothing to
    the output. The result has the same shape and dtype as ``genotypes``,
    and the input array is not modified.

    Raises ValueError if any genotype value is >= 2**31 - 1, because that
    value is reserved internally as a sort sentinel.
    """
    pad = -2
    # Largest int32 value, used as a temporary "sorts last" marker.
    sentinel = np.iinfo(np.int32).max
    if np.any(genotypes >= sentinel):
        raise ValueError("Extreme allele value not supported")
    # astype copies, so the caller's array is left untouched. We promote to
    # 32 bit so that the sentinel is representable whatever the input dtype.
    G = genotypes.astype(np.int32)
    # Anything <= 0 gets mapped to the pad value in the output, which comes
    # last. To get the ordering right we remap those entries to the largest
    # value for sorting, then map back at the end. The guard above ensures
    # no real allele is equal to the sentinel.
    G[G <= 0] = sentinel
    G.sort(axis=1)
    # An allele observed more than once in a sample (e.g. a homozygous ALT
    # call) must be listed only once. After sorting, repeats are adjacent:
    # blank out every entry equal to its left neighbour and re-sort so the
    # blanks move to the end of the row.
    repeated = G[:, 1:] == G[:, :-1]
    G[:, 1:][repeated] = sentinel
    G.sort(axis=1)
    G[G == sentinel] = pad
    return G.astype(genotypes.dtype)


@dataclasses.dataclass
Expand Down
30 changes: 30 additions & 0 deletions tests/test_local_alleles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import numpy as np
import numpy.testing as nt
import pytest

from bio2zarr.vcf2zarr.vcz import compute_laa_field


class TestComputeLAA:
    """Unit tests for ``compute_laa_field``."""

    @pytest.mark.parametrize(
        ("genotypes", "expected"),
        [
            ([[]], [[]]),
            ([[0, 0]], [[-2, -2]]),
            ([[0, 0], [0, 0]], [[-2, -2], [-2, -2]]),
            ([[0, 1], [3, 2], [3, 0]], [[1, -2], [2, 3], [3, -2]]),
            ([[0, 0], [2, 3]], [[-2, -2], [2, 3]]),
            ([[2, 3], [0, 0]], [[2, 3], [-2, -2]]),
            ([[128, 0], [6, 5]], [[128, -2], [5, 6]]),
            ([[0, -1], [-1, 5]], [[-2, -2], [5, -2]]),
        ],
    )
    def test_simple_examples(self, genotypes, expected):
        # Each case is a (samples x ploidy) genotype matrix and the LAA
        # rows expected for it; -2 is the pad value.
        G = np.array(genotypes)
        result = compute_laa_field(G)
        nt.assert_array_equal(result, expected)

    def test_dtype_preserved_and_input_unchanged(self):
        # The result is cast back to the input dtype, and the function
        # works on a copy rather than sorting the caller's array in place.
        G = np.array([[0, 3], [2, 1]], dtype=np.int8)
        before = G.copy()
        result = compute_laa_field(G)
        assert result.dtype == G.dtype
        nt.assert_array_equal(result, [[3, -2], [1, 2]])
        nt.assert_array_equal(G, before)

    # 2**31 - 1 is the smallest rejected value (it is reserved as the
    # internal sort sentinel), so probe the boundary as well as values
    # beyond it.
    @pytest.mark.parametrize("value", [2**31 - 1, 2**31, 2**32 - 1])
    def test_extreme_value(self, value):
        G = np.array([[0, value]])
        with pytest.raises(ValueError, match="Extreme"):
            compute_laa_field(G)

0 comments on commit 4accfc5

Please sign in to comment.