From 8b296a0dff95bc301ed360b479a18392918a2b6f Mon Sep 17 00:00:00 2001
From: mtsch <matijacufar@gmail.com>
Date: Tue, 4 Jun 2024 16:34:44 +1200
Subject: [PATCH] Fix terrible performance with custom filtrations (#171)

* Fix terrible performance with custom filtrations

* Fix type params, prevent unnecessary overflows in high-dimensional simplices

* fixup aqua tests

* update changelog

* Fix aqua and formatting

* Update doctests

* Update test/aqua.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update test/doctests.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* don't fix in doctest

* revert doctests

* cleanup Project.toml

* remove future

* Add assertion to Mod

* Fixup docs

* Format

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 CHANGELOG.md                     |   4 +
 Project.toml                     |  12 +-
 docs/make.jl                     |   8 +-
 docs/src/examples/malaria.jl     | 213 -------------------------------
 src/base/abstractcell.jl         |   5 +-
 src/base/abstractfiltration.jl   |   5 +-
 src/base/abstractsimplex.jl      |   6 +-
 src/base/chain.jl                |   2 +-
 src/base/primefield.jl           |   7 +-
 src/extra/circularcoordinates.jl |   2 +-
 src/filtrations/alpha.jl         |   8 +-
 src/filtrations/custom.jl        |  19 ++-
 test/aqua.jl                     |   7 +-
 13 files changed, 52 insertions(+), 246 deletions(-)
 delete mode 100644 docs/src/examples/malaria.jl

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a9788a21..0ff34845 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# v0.16.13
+
+Fix performance issues with high-dimensional `Custom` filtrations.
+
 # v0.16.9
 
 Replace LightGraps with Graphs.
diff --git a/Project.toml b/Project.toml
index 86605ebe..3cd5d77c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,13 +1,12 @@
 name = "Ripserer"
 uuid = "aa79e827-bd0b-42a8-9f10-2b302677a641"
 authors = ["mtsch <matijacufar@gmail.com>"]
-version = "0.16.12"
+version = "0.16.13"
 
 [deps]
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
-Future = "9fa8497b-333b-5362-9e8d-4d0656e87820"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -21,17 +20,26 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 
 [compat]
+Aqua = "0.8"
 Compat = "^3.10, 4"
 DataStructures = "0.17, 0.18"
 Distances = "0.8, 0.9, 0.10"
+Documenter = "1"
 Graphs = "1"
 IterTools = "1"
+LinearAlgebra = "1"
+MLJBase = "1"
 MLJModelInterface = "^0.3.5, 0.4, 1"
 MiniQhull = "0.2, 0.3, 0.4"
 PersistenceDiagrams = "0.9"
 ProgressMeter = "1"
+Random = "1"
 RecipesBase = "1"
+SafeTestsets = "0.1"
+SparseArrays = "1"
 StaticArrays = "0.12, 1"
+Suppressor = "0.2"
+Test = "1"
 TupleTools = "1"
 julia = "1.6"
 
diff --git a/docs/make.jl b/docs/make.jl
index 712719a6..6fb7f6aa 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -28,12 +28,8 @@ makedocs(;
         "Home" => "index.md",
         "Usage Guide" => "generated/basics.md",
         "API" => "api.md",
-        "Examples" => [
-            "generated/stability.md",
-            "generated/cocycles.md",
-            "generated/cubical.md",
-            "generated/malaria.md",
-        ],
+        "Examples" =>
+            ["generated/stability.md", "generated/cocycles.md", "generated/cubical.md"],
         "Benchmarks" => "benchmarks.md",
         "Related Julia Packages" => "related-work.md",
         "Acknowledgements and References" => "references.md",
diff --git a/docs/src/examples/malaria.jl b/docs/src/examples/malaria.jl
deleted file mode 100644
index be18f275..00000000
--- a/docs/src/examples/malaria.jl
+++ /dev/null
@@ -1,213 +0,0 @@
-# # Image Classification With Cubical Persistent Homology
-
-# In this example, we will show how to use Ripserer in an image classification
-# context. Persistent homology is not a predictive algorithm, but it can be used to extract
-# useful features from data.
-
-# ## Setting up
-
-using Ripserer
-using PersistenceDiagrams
-using Images # also required: ImageIO to read .png files
-using Plots
-using ProgressMeter
-using Random
-Random.seed!(1337)
-
-data_dir = joinpath(@__DIR__, "../assets/data/malaria") # replace with the correct path.
-nothing # hide
-
-# Let's load the data. We will use a a [data
-# set](https://lhncbc.nlm.nih.gov/publication/pub9932) with microscope images of healthy
-# cells and cells infected with malaria. The original data set is quite large, but we can
-# pretend we were only given 200 images to work with. We have chosen the 200 images
-# randomly.
-
-uninfected = shuffle!(Images.load.(readdir(joinpath(data_dir, "uninfected"); join=true)))
-infected = shuffle!(Images.load.(readdir(joinpath(data_dir, "infected"); join=true)))
-
-images = [uninfected; infected]
-classes = [fill(false, length(uninfected)); fill(true, length(infected))]
-nothing # hide
-
-# Let's see what the images look like.
-
-plot(
-    plot(uninfected[1]; title="Healthy"),
-    plot(uninfected[2]; title="Healthy"),
-    plot(infected[1]; title="Infected"),
-    plot(infected[2]; title="Infected"),
-)
-
-# To make the images work with Ripserer, we convert them to floating gray scale values. We
-# do not have to resize the images. Maybe some additional preprocessing, such as
-# normalization would help, but we'll skip it for this example.
-
-inputs = [Gray.(image) for image in images]
-nothing # hide
-
-# Now we can compute persistence diagrams. Since we are working with images, we have to use
-# the `Cubical` filtration type. Cubical persistent homology should detect the dark spots
-# (local minima) in the images. It's pretty efficient, so this should only take a few
-# seconds.
-
-diagrams = @showprogress [ripserer(Cubical(i)) for i in inputs]
-
-# This is what some of the diagrams look like.
-
-plot(plot(images[1]; title="Healthy"), plot(diagrams[1]))
-
-#
-
-plot(plot(images[end]; title="Infected"), plot(diagrams[end]))
-
-# Notice that there is a lot more going on in the middle of the infected diagram, especially
-# in ``H_0``.
-
-# The persistence diagrams might look nice, but are hard to use with machine learning
-# algorithms. The number of points in the diagram may be different for every image, even
-# when images are of the same size. We can solve this problem by using a vectorization
-# method, such as converting all diagrams to persistence images.
-
-# Persistence images work by weighting each point in the diagram with a distribution. The
-# distribution defaults to a Gaussian, but any function of two arguments can be used. Each
-# point is also weighted by a weighting function that should be equal to zero along the
-# ``x``-axis. It defaults to a function that is zero on the ``x``-axis and linearly
-# increases to the maximum persistence in the diagram.
-
-# We start by splitting the diagrams into their 0 and 1 dimensional components.
-
-dim_0 = first.(diagrams)
-dim_1 = last.(diagrams)
-nothing; # hide
-
-# We feed the diagram to the `PersistenceImage` constructor which will choose ranges that
-# will fit all the diagrams. We set the sigma value to 0.1, since all persistence pairs are
-# in the ``[0,1]×[0,1]`` square and the default sigma of 1 would be too wide. We will use
-# the default image size, which is 5×5.
-
-image_0 = PersistenceImage(dim_0; sigma=0.1)
-
-#
-
-image_1 = PersistenceImage(dim_1; sigma=0.1)
-
-# Let's see how some of the images look like.
-
-plot(plot(dim_0[end]; persistence=true), heatmap(image_0(dim_0[end]); aspect_ratio=1))
-
-#
-
-plot(plot(dim_1[end]; persistence=true), heatmap(image_1(dim_1[end]); aspect_ratio=1))
-
-# Next, we convert all diagrams to images and use `vec` to turn them into flat vectors. We
-# then concatenate the zero and one-dimensional images. The result is a vector of length 50
-# for each diagram.
-
-persims = [[vec(image_0(dim_0[i])); vec(image_1(dim_1[i]))] for i in 1:length(diagrams)]
-
-# ## Fitting A Model
-
-# Now it's time to fit our model. We will use
-# [GLMNet.jl](https://github.com/JuliaStats/GLMNet.jl) to fit a regularized linear model.
-
-using GLMNet
-
-# Convert the image vectors to a matrix that will be understood by `glmnet`.
-
-X = reduce(hcat, persims)'
-y = classes
-nothing; # hide
-
-# Start by randomly splitting the data into two sets, a training and a testing set.
-
-perm = shuffle(1:200)
-train_x = X[perm[1:100], :]
-train_y = y[perm[1:100]]
-test_x = X[perm[101:end], :]
-test_y = y[perm[101:end]]
-nothing; # hide
-
-# Fit the model and predict.
-
-path = glmnet(train_x, train_y)
-cv = glmnetcv(train_x, train_y)
-
-λ = path.lambda[argmin(cv.meanloss)]
-path = glmnet(train_x, train_y; lambda=[λ])
-
-predictions = .!iszero.(round.(GLMNet.predict(path, test_x)))
-nothing; # hide
-
-# Get the classification accuracy.
-
-count(predictions .== test_y) / length(test_y)
-
-# Not half bad considering we haven't touched the images and we left pretty much all
-# settings on default.
-
-# Now let's look at the misclassified examples.
-
-missed = findall(predictions .!= test_y)
-label = ("Healthy", "Infected")
-plts = [plot(images[i]; title="$(label[test_y[i] + 1])", ticks=nothing) for i in missed]
-plot(plts...)
-
-# Finally, let's look at which parts of the persistence images `glmnet` considered important.
-
-plot(
-    heatmap(reshape(path.betas[1:25], (5, 5)); title="H₀ coefficients"),
-    heatmap(reshape(path.betas[26:50], (5, 5)); title="H₁ coefficients"),
-)
-
-# These correspond to the area we identified at the beginning. Also note that in this case,
-# the classifier does not care about ``H_1`` at all.
-
-# ## Using MLJ
-
-# Another, more straightforward way to execute a similar pipeline is to use Ripserer's
-# [MLJ.jl](https://github.com/alan-turing-institute/MLJ.jl) integration. We will use a
-# random forest classifier for this example.
-
-# We start by loading MLJ and the classifier. Not that
-# [MLJDecisionTreeInterface.jl](https://github.com/bensadeghi/DecisionTree.jl) needs to be
-# installed for this to work.
-
-using MLJ
-tree = @load RandomForestClassifier pkg = "DecisionTree" verbosity = 0
-
-# We create a pipeline of `CubicalPersistentHomology` followed by the classifier. In this
-# case, `CubicalPersistentHomology` takes care of both the homology computation and the
-# conversion to persistence images.
-
-pipe = @pipeline(CubicalPersistentHomology(), tree)
-
-# We train the pipeline the same way you would fit any other MLJ model. Remember, we need to
-# use grayscale versions of images stored in `inputs`.
-
-classes = coerce(classes, Binary)
-train, test = partition(eachindex(classes), 0.7; shuffle=true, rng=1337)
-mach = machine(pipe, inputs, classes)
-fit!(mach; rows=train)
-
-# Next, we predict the classes on the test data and print out the classification accuracy.
-
-yhat = predict_mode(mach, inputs[test])
-accuracy(yhat, classes[test])
-
-# The result is quite a bit worse than before. We can try mitigating that by using a
-# different vectorizer.
-
-pipe.cubical_persistent_homology.vectorizer = PersistenceCurveVectorizer()
-mach = machine(pipe, inputs, classes)
-fit!(mach; rows=train)
-
-yhat = predict_mode(mach, inputs[test])
-accuracy(yhat, classes[test])
-
-# The result could be improved further by choosing a different model and
-# vectorizer. However, this is just a short introduction. Please see the [MLJ.jl
-# documentation](https://alan-turing-institute.github.io/MLJ.jl/dev/) for more information
-# on model tuning and selection, and the [PersistenceDiagrams.jl
-# documentation](https://mtsch.github.io/PersistenceDiagrams.jl/dev/mlj/) for a list of
-# vectorizers and their options.
diff --git a/src/base/abstractcell.jl b/src/base/abstractcell.jl
index 7c57263f..2ccb9ab6 100644
--- a/src/base/abstractcell.jl
+++ b/src/base/abstractcell.jl
@@ -13,9 +13,8 @@ is determined by the following.
 * The `birth` of type `T` determines when the cell enters the filtration. Note that two
   cells with the same `index` should also have the same `birth`.
 
-* `vertices` should return the cell's vertices as a tuple. For 0-cells (vertices), the
-  `vertices` are also used to index into a filtration's
-  [`vertices`](vertices(::AbstractFiltration)).
+* `vertices` should return the cell's vertices as a tuple. For 0-cells, these are also used
+  to index into a filtration's [`vertices`](@ref vertices(::AbstractFiltration)).
 
 * The `sign` determens its orientation. Note that `cell == -cell`.
 
diff --git a/src/base/abstractfiltration.jl b/src/base/abstractfiltration.jl
index 07257617..9e78ca2d 100644
--- a/src/base/abstractfiltration.jl
+++ b/src/base/abstractfiltration.jl
@@ -162,10 +162,7 @@ julia> vertices(Rips([0 1 1; 1 0 1; 1 1 0]))
 Base.OneTo(3)
 
 julia> vertices(Cubical([0 1 1; 1 0 1; 1 1 0]))
-3×3 CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}:
- CartesianIndex(1, 1)  CartesianIndex(1, 2)  CartesianIndex(1, 3)
- CartesianIndex(2, 1)  CartesianIndex(2, 2)  CartesianIndex(2, 3)
- CartesianIndex(3, 1)  CartesianIndex(3, 2)  CartesianIndex(3, 3)
+CartesianIndices((3, 3))
 
 ```
 """
diff --git a/src/base/abstractsimplex.jl b/src/base/abstractsimplex.jl
index 4d016c5b..62cc1549 100644
--- a/src/base/abstractsimplex.jl
+++ b/src/base/abstractsimplex.jl
@@ -65,7 +65,11 @@ Use binary search to find index of first vertex in `(K - 1)`-dimensional simplex
 """
 function _first_vertex(index::I, ::Val{K}) where {I,K}
     lo = I(K - 1)
-    hi = I(K + 100)
+    if K < 3
+        hi = I(K + 100)
+    else
+        hi = I(K)
+    end
     while _binomial(hi, Val(K)) ≤ index
         lo = hi
         hi <<= 0x01
diff --git a/src/base/chain.jl b/src/base/chain.jl
index b18b200b..d3476ca5 100644
--- a/src/base/chain.jl
+++ b/src/base/chain.jl
@@ -22,7 +22,7 @@ julia> data = [(rand(), rand(), rand()) for _ in 1:100];
 julia> result = ripserer(data; reps=true, modulus=7);
 
 julia> chain = result[end][end].representative
-28-element Chain{Ripserer.Mod{7},Ripserer.Simplex{1, Float64, Int64}}:
+28-element Chain{Mod{7},Simplex{1, Float64, Int64}}:
  +Simplex{1}((68, 54), 0.25178097927369875) => 6 mod 7
  +Simplex{1}((54, 46), 0.2575262844682746) => 1 mod 7
  +Simplex{1}((88, 56), 0.25936896586973557) => 1 mod 7
diff --git a/src/base/primefield.jl b/src/base/primefield.jl
index b33bcf0d..dd5106c6 100644
--- a/src/base/primefield.jl
+++ b/src/base/primefield.jl
@@ -27,7 +27,12 @@ is_prime(::Any) = false
 Like `mod`, but with prime `M`.
 """
 function mod_prime(i, ::Val{M}) where {M}
-    is_prime(M) || throw(DomainError(M, "modulus must be a prime number"))
+    if !is_prime(M)
+        throw(DomainError(M, "modulus must be a prime number"))
+    end
+    if !(1 < M ≤ 3037000499)
+        throw(DomainError(M, "modulus must be smaller than √(typemax(Int))"))
+    end
     i = i % M
     return i + ifelse(signbit(i), M, 0)
 end
diff --git a/src/extra/circularcoordinates.jl b/src/extra/circularcoordinates.jl
index 22299fba..bcb2b954 100644
--- a/src/extra/circularcoordinates.jl
+++ b/src/extra/circularcoordinates.jl
@@ -192,7 +192,7 @@ CircularCoordinates(
   radius=(0.9510565162951535,),
   n_landmarks=10,
   partition=linear,
-  metric=Distances.Euclidean(0.0),
+  metric=Euclidean(0.0),
 )
 
 julia> summary(cc(data))
diff --git a/src/filtrations/alpha.jl b/src/filtrations/alpha.jl
index ce4b86c2..bd7a27c1 100644
--- a/src/filtrations/alpha.jl
+++ b/src/filtrations/alpha.jl
@@ -180,13 +180,13 @@ julia> length(Ripserer.edges(rips))
 
 julia> sort(ripserer(alpha)[2], by=persistence)[end]
 [0.375, 2.01) with:
- birth_simplex: Ripserer.Simplex{1, Float64, Int64}
- death_simplex: Ripserer.Simplex{2, Float64, Int64}
+ birth_simplex: Simplex{1, Float64, Int64}
+ death_simplex: Simplex{2, Float64, Int64}
 
 julia> sort(ripserer(rips)[2], by=persistence)[end]
 [0.375, 2.01) with:
- birth_simplex: Ripserer.Simplex{1, Float64, Int64}
- death_simplex: Ripserer.Simplex{2, Float64, Int64}
+ birth_simplex: Simplex{1, Float64, Int64}
+ death_simplex: Simplex{2, Float64, Int64}
 ```
 """
 struct Alpha{I,P<:SVector} <: AbstractCustomFiltration{I,Float64}
diff --git a/src/filtrations/custom.jl b/src/filtrations/custom.jl
index af4d79a1..56d53316 100644
--- a/src/filtrations/custom.jl
+++ b/src/filtrations/custom.jl
@@ -100,8 +100,7 @@ struct Custom{I,T} <: AbstractCustomFiltration{I,T}
     threshold::T
 end
 
-@inline insert_simplex!(::Vector{Dict{I,T}}, ::Tuple{}, _, _) where {I,T} = nothing
-@inline function insert_simplex!(
+function insert_simplex!(
     dicts::Vector{Dict{I,T}}, vertices::NTuple{N,I}, birth, threshold
 ) where {N,T,I}
     if birth > threshold
@@ -111,11 +110,19 @@ end
         dim = N - 1
         d_dict = dicts[dim + 1]
         d_dict[idx] = min(birth, get(d_dict, idx, typemax(T)))
-        for vs in IterTools.subsets(vertices, Val(N - 1))
-            # Some extra work is being done here, but this tends to be type stable.
-            insert_simplex!(dicts, vs, birth, threshold)
-        end
+        _insert_simplex_facets!(dicts, vertices, birth, Val(N - 1))
+    end
+end
+@inline _insert_simplex_facets!(_, _, _, ::Val{0}) = nothing
+@inline function _insert_simplex_facets!(dicts, vertices, birth, ::Val{N}) where {N}
+    dim = N - 1
+    d_dict = dicts[dim + 1]
+    T = valtype(d_dict)
+    for vs in IterTools.subsets(vertices, Val(N))
+        idx = index(vs)
+        d_dict[idx] = min(birth, get(d_dict, idx, typemax(T)))
     end
+    return _insert_simplex_facets!(dicts, vertices, birth, Val(N - 1))
 end
 
 function _adjacency_matrix(dicts)
diff --git a/test/aqua.jl b/test/aqua.jl
index 14654fef..32ffc713 100644
--- a/test/aqua.jl
+++ b/test/aqua.jl
@@ -1,7 +1,6 @@
 using Aqua
 using Ripserer
+using PersistenceDiagrams
 
-if VERSION < v"1.8-DEV"
-    # Note: will have to change Project.toml for 1.8
-    Aqua.test_all(Ripserer; ambiguities=false)
-end
+Aqua.test_all(Ripserer; ambiguities=false, piracies=false)
+Aqua.test_piracies(Ripserer; treat_as_own=[PersistenceDiagram, PersistenceInterval])