(fix): indexing performance in backed mode for boolean case (#1233)
* (feat): first pass heuristic

* (feat): the function is faster within the call?

* (feat): add break-even point

* (chore): light refactor

* (fix): not everything can be converted to slices

* (chore): add comment.

* (feat): first pass sparse from elems

* (chore): rename

* (chore): try refactor

* (chore): add test

* (feat): add benchmark

* (fix): higher numpy version bug

* (refactor): `efficient` -> `compressed`, index method

* (refactor): use two conditions again

* Minor simplification

* Release note

---------

Co-authored-by: Isaac Virshup <[email protected]>
ilan-gold and ivirshup authored Jan 10, 2024
1 parent c5531b7 commit ab43f8d
Showing 4 changed files with 102 additions and 4 deletions.
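Before the per-file diffs, a minimal standalone sketch of the idea this commit implements: a boolean mask along the major axis is converted into contiguous slices, and the slice-based read is used only when the runs are long enough to beat element-wise gathering. The helper `np.ma.extras._ezclump` and the break-even point of 7 appear in the code below; the example mask here is made up for illustration.

```python
from math import floor

import numpy as np

# A mask made of a few long contiguous runs of True values.
mask = np.zeros(100, dtype=bool)
mask[10:40] = True
mask[60:90] = True

# _ezclump converts the mask into the list of contiguous True runs as slices.
slices = np.ma.extras._ezclump(mask)  # [slice(10, 40), slice(60, 90)]

# The PR's "mean slice length": the span covered by the runs divided by the
# number of runs (gaps between runs count toward the span).
mean_slice_length = floor((slices[-1].stop - slices[0].start) / len(slices))  # 40

# Slice-based reads only pay off when runs are long enough; otherwise fall
# back to gathering individual rows via np.where(mask)[0].
use_slice_path = mean_slice_length > 7  # True here
```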
54 changes: 51 additions & 3 deletions anndata/_core/sparse_dataset.py
@@ -16,6 +16,7 @@
import warnings
from abc import ABC
from itertools import accumulate, chain
from math import floor
from pathlib import Path
from typing import TYPE_CHECKING, Literal, NamedTuple

@@ -164,7 +165,6 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix:
slice_len(row, self.shape[0]),
slice_len(col, self.shape[1]),
)

if out_shape[0] == 1:
return self._get_intXslice(slice_as_int(row, self.shape[0]), col)
elif out_shape[1] == self.shape[1] and out_shape[0] < self.shape[0]:
@@ -246,6 +246,26 @@ def get_compressed_vectors(
return data, indices, indptr


def get_compressed_vectors_for_slices(
x: BackedSparseMatrix, slices: Iterable[slice]
) -> tuple[Sequence, Sequence, Sequence]:
indptr_sels = [x.indptr[slice(s.start, s.stop + 1)] for s in slices]
data = np.concatenate([x.data[s[0] : s[-1]] for s in indptr_sels])
indices = np.concatenate([x.indices[s[0] : s[-1]] for s in indptr_sels])
# Need to track the size of the gaps in the slices to each indptr subselection
total = indptr_sels[0][0]
offsets = [total]
for i, sel in enumerate(indptr_sels[1:]):
total = (sel[0] - indptr_sels[i][-1]) + total
offsets.append(total)
start_indptr = indptr_sels[0] - offsets[0]
end_indptr = np.concatenate(
[s[1:] - offsets[i + 1] for i, s in enumerate(indptr_sels[1:])]
)
indptr = np.concatenate([start_indptr, end_indptr])
return data, indices, indptr


def get_compressed_vector(
x: BackedSparseMatrix, idx: int
) -> tuple[Sequence, Sequence, Sequence]:
@@ -256,6 +276,21 @@ def get_compressed_vector(
return data, indices, indptr


def subset_by_major_axis_mask(
mtx: ss.spmatrix, mask: np.ndarray
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
slices = np.ma.extras._ezclump(mask)

def mean_slice_length(slices):
return floor((slices[-1].stop - slices[0].start) / len(slices))

# heuristic for whether slicing should be optimized
if mean_slice_length(slices) <= 7:
return get_compressed_vectors(mtx, np.where(mask)[0])
else:
return get_compressed_vectors_for_slices(mtx, slices)


def get_format(data: ss.spmatrix) -> str:
for fmt, _, memory_class in FORMATS:
if isinstance(data, memory_class):
@@ -351,9 +386,22 @@ def __repr__(self) -> str:
return f"{type(self).__name__}: backend {self.backend}, shape {self.shape}, data_dtype {self.dtype}"

def __getitem__(self, index: Index | tuple[()]) -> float | ss.spmatrix:
row, col = self._normalize_index(index)
indices = self._normalize_index(index)
row, col = indices
mtx = self._to_backed()
sub = mtx[row, col]

# Handle masked indexing along major axis
if self.format == "csr" and np.array(row).dtype == bool:
sub = ss.csr_matrix(
subset_by_major_axis_mask(mtx, row), shape=(row.sum(), mtx.shape[1])
)[:, col]
elif self.format == "csc" and np.array(col).dtype == bool:
sub = ss.csc_matrix(
subset_by_major_axis_mask(mtx, col), shape=(mtx.shape[0], col.sum())
)[row, :]
else:
sub = mtx[row, col]

# If indexing is array x array it returns a backed_sparse_matrix
# Not sure what the performance is on that operation
if isinstance(sub, BackedSparseMatrix):
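For orientation, a minimal usage sketch of the path added above (file name, shape, and density are illustrative; `sparse_dataset` and `write_elem` are the experimental API also used in the benchmark further down). Passing a boolean mask directly to the backed matrix now takes the slice-based branch for CSR rows (and symmetrically for CSC columns), while integer indexing remains available and produces the same result:

```python
import h5py
import numpy as np
import scipy.sparse as ss

from anndata.experimental import sparse_dataset, write_elem

mat = ss.random(10_000, 500, density=0.01, format="csr")

with h5py.File("example.h5", "w") as f:
    write_elem(f, "X", mat)

with h5py.File("example.h5", "r") as f:
    X = sparse_dataset(f["X"])

    mask = np.zeros(X.shape[0], dtype=bool)
    mask[1_000:6_000] = True  # one long run -> slice-based read

    fast = X[mask, :]                   # boolean mask: new major-axis branch
    gathered = X[np.where(mask)[0], :]  # integer indices: per-row gather

    assert (fast != gathered).nnz == 0  # identical in-memory csr_matrix results
```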
35 changes: 35 additions & 0 deletions anndata/tests/test_backed_sparse.py
@@ -95,6 +95,41 @@ def test_backed_indexing(
assert_equal(csr_mem[:, var_idx].X, dense_disk[:, var_idx].X)


# test behavior from https://github.com/scverse/anndata/pull/1233
def test_consecutive_bool(
ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
):
_, csr_disk, csc_disk, _ = ondisk_equivalent_adata

randomized_mask = np.zeros(csr_disk.shape[0], dtype=bool)
inds = np.random.choice(csr_disk.shape[0], 20, replace=False)
inds.sort()
for i in range(0, len(inds) - 1, 2):
randomized_mask[inds[i] : inds[i + 1]] = True

# non-random indices, with alternating one false and n true
def make_alternating_mask(n):
mask_alternating = np.ones(csr_disk.shape[0], dtype=bool)
for i in range(0, csr_disk.shape[0], n):
mask_alternating[i] = False
return mask_alternating

alternating_mask = make_alternating_mask(10)

# indexing needs to be on `X` directly to trigger the optimization.
# `_normalize_indices`, which is used by `AnnData`, converts bools to ints with `np.where`
assert_equal(
csr_disk.X[alternating_mask, :], csr_disk.X[np.where(alternating_mask)]
)
assert_equal(
csc_disk.X[:, alternating_mask], csc_disk.X[:, np.where(alternating_mask)[0]]
)
assert_equal(csr_disk.X[randomized_mask, :], csr_disk.X[np.where(randomized_mask)])
assert_equal(
csc_disk.X[:, randomized_mask], csc_disk.X[:, np.where(randomized_mask)[0]]
)


@pytest.mark.parametrize(
["sparse_format", "append_method"],
[
15 changes: 14 additions & 1 deletion benchmarks/benchmarks/sparse_dataset.py
@@ -7,13 +7,26 @@
from anndata.experimental import sparse_dataset, write_elem


def make_alternating_mask(n):
mask_alternating = np.ones(10_000, dtype=bool)
for i in range(0, 10_000, n):
mask_alternating[i] = False
return mask_alternating


class SparseCSRContiguousSlice:
params = (
[
(10_000, 10_000),
# (10_000, 500)
],
[slice(0, 1000), slice(0, 9000), slice(None, 9000, -1), slice(None, None, 2)],
[
slice(0, 1000),
slice(0, 9000),
slice(None, 9000, -1),
slice(None, None, 2),
make_alternating_mask(10),
],
)
param_names = ["shape", "slice"]

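The timing methods of this benchmark class fall outside the hunk shown here. Purely as a hypothetical illustration of how an asv-style benchmark consumes such parameters (the class name, method names, file name, and density below are made up, not the repository's actual code):

```python
import h5py
import numpy as np
from scipy import sparse

from anndata.experimental import sparse_dataset, write_elem

_mask = np.ones(10_000, dtype=bool)
_mask[::10] = False  # same pattern as make_alternating_mask(10)


class SparseCSRMaskedRead:  # hypothetical; follows the asv params/param_names convention
    params = ([(10_000, 10_000)], [_mask])
    param_names = ["shape", "index"]

    def setup(self, shape, index):
        x = sparse.random(*shape, density=0.01, format="csr")
        self.file = h5py.File("bench_data.h5", "w")
        write_elem(self.file, "X", x)
        self.ds = sparse_dataset(self.file["X"])

    def time_getitem(self, shape, index):
        self.ds[index]
```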
2 changes: 2 additions & 0 deletions docs/release-notes/0.10.5.md
@@ -10,3 +10,5 @@

```{rubric} Performance
```

* Improved performance when indexing backed sparse matrices with boolean masks along their major axis {pr}`1233` {user}`ilan-gold`
