(fix): indexing performance in backed mode for boolean case (#1233)
* (feat): first pass heuristic

* (feat): the function is faster within the call?

* (feat): add break-even point

* (chore): light refactor

* (fix): not everything can be converted to slices

* (chore): add comment.

* (feat): first pass sparse from elems

* (chore): rename

* (chore): try refactor

* (chore): add test

* (feat): add benchmark

* (fix): higher numpy version bug

* (refactor): `efficient` -> `compressed`, index method

* (refactor): use two conditions again

* Minor simplification

* Release note

---------

Co-authored-by: Isaac Virshup <[email protected]>
ilan-gold and ivirshup authored Jan 10, 2024
1 parent c5531b7 commit ab43f8d
Showing 4 changed files with 102 additions and 4 deletions.
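Before the per-file diffs, a minimal standalone sketch of the idea this commit implements: a boolean mask along the major axis is converted into contiguous slices, and the slice-based read is used only when the runs are long enough to beat element-wise gathering. The helper `np.ma.extras._ezclump` and the break-even point of 7 appear in the code below; the example mask here is made up for illustration.

```python
from math import floor

import numpy as np

# A mask made of a few long contiguous runs of True values.
mask = np.zeros(100, dtype=bool)
mask[10:40] = True
mask[60:90] = True

# _ezclump converts the mask into the list of contiguous True runs as slices.
slices = np.ma.extras._ezclump(mask)  # [slice(10, 40), slice(60, 90)]

# The PR's "mean slice length": the span covered by the runs divided by the
# number of runs (gaps between runs count toward the span).
mean_slice_length = floor((slices[-1].stop - slices[0].start) / len(slices))  # 40

# Slice-based reads only pay off when runs are long enough; otherwise fall
# back to gathering individual rows via np.where(mask)[0].
use_slice_path = mean_slice_length > 7  # True here
```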
54 changes: 51 additions & 3 deletions anndata/_core/sparse_dataset.py
@@ -16,6 +16,7 @@
import warnings
from abc import ABC
from itertools import accumulate, chain
from math import floor
from pathlib import Path
from typing import TYPE_CHECKING, Literal, NamedTuple

@@ -164,7 +165,6 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix:
slice_len(row, self.shape[0]),
slice_len(col, self.shape[1]),
)

if out_shape[0] == 1:
return self._get_intXslice(slice_as_int(row, self.shape[0]), col)
elif out_shape[1] == self.shape[1] and out_shape[0] < self.shape[0]:
@@ -246,6 +246,26 @@ def get_compressed_vectors(
return data, indices, indptr


def get_compressed_vectors_for_slices(
x: BackedSparseMatrix, slices: Iterable[slice]
) -> tuple[Sequence, Sequence, Sequence]:
indptr_sels = [x.indptr[slice(s.start, s.stop + 1)] for s in slices]
data = np.concatenate([x.data[s[0] : s[-1]] for s in indptr_sels])
indices = np.concatenate([x.indices[s[0] : s[-1]] for s in indptr_sels])
# Need to track the size of the gaps in the slices to each indptr subselection
total = indptr_sels[0][0]
offsets = [total]
for i, sel in enumerate(indptr_sels[1:]):
total = (sel[0] - indptr_sels[i][-1]) + total
offsets.append(total)
start_indptr = indptr_sels[0] - offsets[0]
end_indptr = np.concatenate(
[s[1:] - offsets[i + 1] for i, s in enumerate(indptr_sels[1:])]
)
indptr = np.concatenate([start_indptr, end_indptr])
return data, indices, indptr


def get_compressed_vector(
x: BackedSparseMatrix, idx: int
) -> tuple[Sequence, Sequence, Sequence]:
@@ -256,6 +276,21 @@ def get_compressed_vector(
return data, indices, indptr


def subset_by_major_axis_mask(
mtx: ss.spmatrix, mask: np.ndarray
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
slices = np.ma.extras._ezclump(mask)

def mean_slice_length(slices):
return floor((slices[-1].stop - slices[0].start) / len(slices))

# heuristic for whether slicing should be optimized
if mean_slice_length(slices) <= 7:
return get_compressed_vectors(mtx, np.where(mask)[0])
else:
return get_compressed_vectors_for_slices(mtx, slices)


def get_format(data: ss.spmatrix) -> str:
for fmt, _, memory_class in FORMATS:
if isinstance(data, memory_class):
@@ -351,9 +386,22 @@ def __repr__(self) -> str:
return f"{type(self).__name__}: backend {self.backend}, shape {self.shape}, data_dtype {self.dtype}"

def __getitem__(self, index: Index | tuple[()]) -> float | ss.spmatrix:
row, col = self._normalize_index(index)
indices = self._normalize_index(index)
row, col = indices
mtx = self._to_backed()
sub = mtx[row, col]

# Handle masked indexing along major axis
if self.format == "csr" and np.array(row).dtype == bool:
sub = ss.csr_matrix(
subset_by_major_axis_mask(mtx, row), shape=(row.sum(), mtx.shape[1])
)[:, col]
elif self.format == "csc" and np.array(col).dtype == bool:
sub = ss.csc_matrix(
subset_by_major_axis_mask(mtx, col), shape=(mtx.shape[0], col.sum())
)[row, :]
else:
sub = mtx[row, col]

# If indexing is array x array it returns a backed_sparse_matrix
# Not sure what the performance is on that operation
if isinstance(sub, BackedSparseMatrix):
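For orientation, a minimal usage sketch of the path added above (file name, shape, and density are illustrative; `sparse_dataset` and `write_elem` are the experimental API also used in the benchmark further down). Passing a boolean mask directly to the backed matrix now takes the slice-based branch for CSR rows (and symmetrically for CSC columns), while integer indexing remains available and produces the same result:

```python
import h5py
import numpy as np
import scipy.sparse as ss

from anndata.experimental import sparse_dataset, write_elem

mat = ss.random(10_000, 500, density=0.01, format="csr")

with h5py.File("example.h5", "w") as f:
    write_elem(f, "X", mat)

with h5py.File("example.h5", "r") as f:
    X = sparse_dataset(f["X"])

    mask = np.zeros(X.shape[0], dtype=bool)
    mask[1_000:6_000] = True  # one long run -> slice-based read

    fast = X[mask, :]                   # boolean mask: new major-axis branch
    gathered = X[np.where(mask)[0], :]  # integer indices: per-row gather

    assert (fast != gathered).nnz == 0  # identical in-memory csr_matrix results
```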
35 changes: 35 additions & 0 deletions anndata/tests/test_backed_sparse.py
@@ -95,6 +95,41 @@ def test_backed_indexing(
assert_equal(csr_mem[:, var_idx].X, dense_disk[:, var_idx].X)


# test behavior from https://github.com/scverse/anndata/pull/1233
def test_consecutive_bool(
ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
):
_, csr_disk, csc_disk, _ = ondisk_equivalent_adata

randomized_mask = np.zeros(csr_disk.shape[0], dtype=bool)
inds = np.random.choice(csr_disk.shape[0], 20, replace=False)
inds.sort()
for i in range(0, len(inds) - 1, 2):
randomized_mask[inds[i] : inds[i + 1]] = True

# non-random indices, with alternating one false and n true
def make_alternating_mask(n):
mask_alternating = np.ones(csr_disk.shape[0], dtype=bool)
for i in range(0, csr_disk.shape[0], n):
mask_alternating[i] = False
return mask_alternating

alternating_mask = make_alternating_mask(10)

# indexing needs to be on `X` directly to trigger the optimization.
# `_normalize_indices`, which is used by `AnnData`, converts bools to ints with `np.where`
assert_equal(
csr_disk.X[alternating_mask, :], csr_disk.X[np.where(alternating_mask)]
)
assert_equal(
csc_disk.X[:, alternating_mask], csc_disk.X[:, np.where(alternating_mask)[0]]
)
assert_equal(csr_disk.X[randomized_mask, :], csr_disk.X[np.where(randomized_mask)])
assert_equal(
csc_disk.X[:, randomized_mask], csc_disk.X[:, np.where(randomized_mask)[0]]
)


@pytest.mark.parametrize(
["sparse_format", "append_method"],
[
15 changes: 14 additions & 1 deletion benchmarks/benchmarks/sparse_dataset.py
@@ -7,13 +7,26 @@
from anndata.experimental import sparse_dataset, write_elem


def make_alternating_mask(n):
mask_alternating = np.ones(10_000, dtype=bool)
for i in range(0, 10_000, n):
mask_alternating[i] = False
return mask_alternating


class SparseCSRContiguousSlice:
params = (
[
(10_000, 10_000),
# (10_000, 500)
],
[slice(0, 1000), slice(0, 9000), slice(None, 9000, -1), slice(None, None, 2)],
[
slice(0, 1000),
slice(0, 9000),
slice(None, 9000, -1),
slice(None, None, 2),
make_alternating_mask(10),
],
)
param_names = ["shape", "slice"]

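The timing methods of this benchmark class fall outside the hunk shown here. Purely as a hypothetical illustration of how an asv-style benchmark consumes such parameters (the class name, method names, file name, and density below are made up, not the repository's actual code):

```python
import h5py
import numpy as np
from scipy import sparse

from anndata.experimental import sparse_dataset, write_elem

_mask = np.ones(10_000, dtype=bool)
_mask[::10] = False  # same pattern as make_alternating_mask(10)


class SparseCSRMaskedRead:  # hypothetical; follows the asv params/param_names convention
    params = ([(10_000, 10_000)], [_mask])
    param_names = ["shape", "index"]

    def setup(self, shape, index):
        x = sparse.random(*shape, density=0.01, format="csr")
        self.file = h5py.File("bench_data.h5", "w")
        write_elem(self.file, "X", x)
        self.ds = sparse_dataset(self.file["X"])

    def time_getitem(self, shape, index):
        self.ds[index]
```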
2 changes: 2 additions & 0 deletions docs/release-notes/0.10.5.md
@@ -10,3 +10,5 @@

```{rubric} Performance
```

* Improved performance when indexing backed sparse matrices with boolean masks along their major axis {pr}`1233` {user}`ilan-gold`
