From 8b296a0dff95bc301ed360b479a18392918a2b6f Mon Sep 17 00:00:00 2001 From: mtsch Date: Tue, 4 Jun 2024 16:34:44 +1200 Subject: [PATCH] Fix terrible performance with custom filtrations (#171) * Fix terrible performance with custom filtrations * Fix type params, prevent unnecessary overflows in high-dimensional simplices * fixup aqua tests * update changelog * Fix aqua and formatting * Update doctests * Update test/aqua.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update test/doctests.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * don't fix in doctest * revert doctests * cleanup Project.toml * remove future * Add assertion to Mod * Fixup docs * Format --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- CHANGELOG.md | 4 + Project.toml | 12 +- docs/make.jl | 8 +- docs/src/examples/malaria.jl | 213 ------------------------------- src/base/abstractcell.jl | 5 +- src/base/abstractfiltration.jl | 5 +- src/base/abstractsimplex.jl | 6 +- src/base/chain.jl | 2 +- src/base/primefield.jl | 7 +- src/extra/circularcoordinates.jl | 2 +- src/filtrations/alpha.jl | 8 +- src/filtrations/custom.jl | 19 ++- test/aqua.jl | 7 +- 13 files changed, 52 insertions(+), 246 deletions(-) delete mode 100644 docs/src/examples/malaria.jl diff --git a/CHANGELOG.md b/CHANGELOG.md index a9788a21..0ff34845 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# v0.16.13 + +Fix performance issues with high-dimensional `Custom` filtrations. + # v0.16.9 Replace LightGraps with Graphs. diff --git a/Project.toml b/Project.toml index 86605ebe..3cd5d77c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,13 +1,12 @@ name = "Ripserer" uuid = "aa79e827-bd0b-42a8-9f10-2b302677a641" authors = ["mtsch "] -version = "0.16.12" +version = "0.16.13" [deps] Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" -Future = "9fa8497b-333b-5362-9e8d-4d0656e87820" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -21,17 +20,26 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" [compat] +Aqua = "0.8" Compat = "^3.10, 4" DataStructures = "0.17, 0.18" Distances = "0.8, 0.9, 0.10" +Documenter = "1" Graphs = "1" IterTools = "1" +LinearAlgebra = "1" +MLJBase = "1" MLJModelInterface = "^0.3.5, 0.4, 1" MiniQhull = "0.2, 0.3, 0.4" PersistenceDiagrams = "0.9" ProgressMeter = "1" +Random = "1" RecipesBase = "1" +SafeTestsets = "0.1" +SparseArrays = "1" StaticArrays = "0.12, 1" +Suppressor = "0.2" +Test = "1" TupleTools = "1" julia = "1.6" diff --git a/docs/make.jl b/docs/make.jl index 712719a6..6fb7f6aa 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -28,12 +28,8 @@ makedocs(; "Home" => "index.md", "Usage Guide" => "generated/basics.md", "API" => "api.md", - "Examples" => [ - "generated/stability.md", - "generated/cocycles.md", - "generated/cubical.md", - "generated/malaria.md", - ], + "Examples" => + ["generated/stability.md", "generated/cocycles.md", "generated/cubical.md"], "Benchmarks" => "benchmarks.md", "Related Julia Packages" => "related-work.md", "Acknowledgements and References" => "references.md", diff --git a/docs/src/examples/malaria.jl b/docs/src/examples/malaria.jl deleted file mode 100644 index be18f275..00000000 --- a/docs/src/examples/malaria.jl +++ /dev/null @@ -1,213 +0,0 @@ -# # Image Classification With Cubical Persistent Homology - -# In this example, we will show how to use Ripserer in an image classification -# context. Persistent homology is not a predictive algorithm, but it can be used to extract -# useful features from data. - -# ## Setting up - -using Ripserer -using PersistenceDiagrams -using Images # also required: ImageIO to read .png files -using Plots -using ProgressMeter -using Random -Random.seed!(1337) - -data_dir = joinpath(@__DIR__, "../assets/data/malaria") # replace with the correct path. -nothing # hide - -# Let's load the data. We will use a a [data -# set](https://lhncbc.nlm.nih.gov/publication/pub9932) with microscope images of healthy -# cells and cells infected with malaria. The original data set is quite large, but we can -# pretend we were only given 200 images to work with. We have chosen the 200 images -# randomly. - -uninfected = shuffle!(Images.load.(readdir(joinpath(data_dir, "uninfected"); join=true))) -infected = shuffle!(Images.load.(readdir(joinpath(data_dir, "infected"); join=true))) - -images = [uninfected; infected] -classes = [fill(false, length(uninfected)); fill(true, length(infected))] -nothing # hide - -# Let's see what the images look like. - -plot( - plot(uninfected[1]; title="Healthy"), - plot(uninfected[2]; title="Healthy"), - plot(infected[1]; title="Infected"), - plot(infected[2]; title="Infected"), -) - -# To make the images work with Ripserer, we convert them to floating gray scale values. We -# do not have to resize the images. Maybe some additional preprocessing, such as -# normalization would help, but we'll skip it for this example. - -inputs = [Gray.(image) for image in images] -nothing # hide - -# Now we can compute persistence diagrams. Since we are working with images, we have to use -# the `Cubical` filtration type. Cubical persistent homology should detect the dark spots -# (local minima) in the images. It's pretty efficient, so this should only take a few -# seconds. - -diagrams = @showprogress [ripserer(Cubical(i)) for i in inputs] - -# This is what some of the diagrams look like. - -plot(plot(images[1]; title="Healthy"), plot(diagrams[1])) - -# - -plot(plot(images[end]; title="Infected"), plot(diagrams[end])) - -# Notice that there is a lot more going on in the middle of the infected diagram, especially -# in ``H_0``. - -# The persistence diagrams might look nice, but are hard to use with machine learning -# algorithms. The number of points in the diagram may be different for every image, even -# when images are of the same size. We can solve this problem by using a vectorization -# method, such as converting all diagrams to persistence images. - -# Persistence images work by weighting each point in the diagram with a distribution. The -# distribution defaults to a Gaussian, but any function of two arguments can be used. Each -# point is also weighted by a weighting function that should be equal to zero along the -# ``x``-axis. It defaults to a function that is zero on the ``x``-axis and linearly -# increases to the maximum persistence in the diagram. - -# We start by splitting the diagrams into their 0 and 1 dimensional components. - -dim_0 = first.(diagrams) -dim_1 = last.(diagrams) -nothing; # hide - -# We feed the diagram to the `PersistenceImage` constructor which will choose ranges that -# will fit all the diagrams. We set the sigma value to 0.1, since all persistence pairs are -# in the ``[0,1]×[0,1]`` square and the default sigma of 1 would be too wide. We will use -# the default image size, which is 5×5. - -image_0 = PersistenceImage(dim_0; sigma=0.1) - -# - -image_1 = PersistenceImage(dim_1; sigma=0.1) - -# Let's see how some of the images look like. - -plot(plot(dim_0[end]; persistence=true), heatmap(image_0(dim_0[end]); aspect_ratio=1)) - -# - -plot(plot(dim_1[end]; persistence=true), heatmap(image_1(dim_1[end]); aspect_ratio=1)) - -# Next, we convert all diagrams to images and use `vec` to turn them into flat vectors. We -# then concatenate the zero and one-dimensional images. The result is a vector of length 50 -# for each diagram. - -persims = [[vec(image_0(dim_0[i])); vec(image_1(dim_1[i]))] for i in 1:length(diagrams)] - -# ## Fitting A Model - -# Now it's time to fit our model. We will use -# [GLMNet.jl](https://github.com/JuliaStats/GLMNet.jl) to fit a regularized linear model. - -using GLMNet - -# Convert the image vectors to a matrix that will be understood by `glmnet`. - -X = reduce(hcat, persims)' -y = classes -nothing; # hide - -# Start by randomly splitting the data into two sets, a training and a testing set. - -perm = shuffle(1:200) -train_x = X[perm[1:100], :] -train_y = y[perm[1:100]] -test_x = X[perm[101:end], :] -test_y = y[perm[101:end]] -nothing; # hide - -# Fit the model and predict. - -path = glmnet(train_x, train_y) -cv = glmnetcv(train_x, train_y) - -λ = path.lambda[argmin(cv.meanloss)] -path = glmnet(train_x, train_y; lambda=[λ]) - -predictions = .!iszero.(round.(GLMNet.predict(path, test_x))) -nothing; # hide - -# Get the classification accuracy. - -count(predictions .== test_y) / length(test_y) - -# Not half bad considering we haven't touched the images and we left pretty much all -# settings on default. - -# Now let's look at the misclassified examples. - -missed = findall(predictions .!= test_y) -label = ("Healthy", "Infected") -plts = [plot(images[i]; title="$(label[test_y[i] + 1])", ticks=nothing) for i in missed] -plot(plts...) - -# Finally, let's look at which parts of the persistence images `glmnet` considered important. - -plot( - heatmap(reshape(path.betas[1:25], (5, 5)); title="H₀ coefficients"), - heatmap(reshape(path.betas[26:50], (5, 5)); title="H₁ coefficients"), -) - -# These correspond to the area we identified at the beginning. Also note that in this case, -# the classifier does not care about ``H_1`` at all. - -# ## Using MLJ - -# Another, more straightforward way to execute a similar pipeline is to use Ripserer's -# [MLJ.jl](https://github.com/alan-turing-institute/MLJ.jl) integration. We will use a -# random forest classifier for this example. - -# We start by loading MLJ and the classifier. Not that -# [MLJDecisionTreeInterface.jl](https://github.com/bensadeghi/DecisionTree.jl) needs to be -# installed for this to work. - -using MLJ -tree = @load RandomForestClassifier pkg = "DecisionTree" verbosity = 0 - -# We create a pipeline of `CubicalPersistentHomology` followed by the classifier. In this -# case, `CubicalPersistentHomology` takes care of both the homology computation and the -# conversion to persistence images. - -pipe = @pipeline(CubicalPersistentHomology(), tree) - -# We train the pipeline the same way you would fit any other MLJ model. Remember, we need to -# use grayscale versions of images stored in `inputs`. - -classes = coerce(classes, Binary) -train, test = partition(eachindex(classes), 0.7; shuffle=true, rng=1337) -mach = machine(pipe, inputs, classes) -fit!(mach; rows=train) - -# Next, we predict the classes on the test data and print out the classification accuracy. - -yhat = predict_mode(mach, inputs[test]) -accuracy(yhat, classes[test]) - -# The result is quite a bit worse than before. We can try mitigating that by using a -# different vectorizer. - -pipe.cubical_persistent_homology.vectorizer = PersistenceCurveVectorizer() -mach = machine(pipe, inputs, classes) -fit!(mach; rows=train) - -yhat = predict_mode(mach, inputs[test]) -accuracy(yhat, classes[test]) - -# The result could be improved further by choosing a different model and -# vectorizer. However, this is just a short introduction. Please see the [MLJ.jl -# documentation](https://alan-turing-institute.github.io/MLJ.jl/dev/) for more information -# on model tuning and selection, and the [PersistenceDiagrams.jl -# documentation](https://mtsch.github.io/PersistenceDiagrams.jl/dev/mlj/) for a list of -# vectorizers and their options. diff --git a/src/base/abstractcell.jl b/src/base/abstractcell.jl index 7c57263f..2ccb9ab6 100644 --- a/src/base/abstractcell.jl +++ b/src/base/abstractcell.jl @@ -13,9 +13,8 @@ is determined by the following. * The `birth` of type `T` determines when the cell enters the filtration. Note that two cells with the same `index` should also have the same `birth`. -* `vertices` should return the cell's vertices as a tuple. For 0-cells (vertices), the - `vertices` are also used to index into a filtration's - [`vertices`](vertices(::AbstractFiltration)). +* `vertices` should return the cell's vertices as a tuple. For 0-cells, these are also used + to index into a filtration's [`vertices`](@ref vertices(::AbstractFiltration)). * The `sign` determens its orientation. Note that `cell == -cell`. diff --git a/src/base/abstractfiltration.jl b/src/base/abstractfiltration.jl index 07257617..9e78ca2d 100644 --- a/src/base/abstractfiltration.jl +++ b/src/base/abstractfiltration.jl @@ -162,10 +162,7 @@ julia> vertices(Rips([0 1 1; 1 0 1; 1 1 0])) Base.OneTo(3) julia> vertices(Cubical([0 1 1; 1 0 1; 1 1 0])) -3×3 CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}: - CartesianIndex(1, 1) CartesianIndex(1, 2) CartesianIndex(1, 3) - CartesianIndex(2, 1) CartesianIndex(2, 2) CartesianIndex(2, 3) - CartesianIndex(3, 1) CartesianIndex(3, 2) CartesianIndex(3, 3) +CartesianIndices((3, 3)) ``` """ diff --git a/src/base/abstractsimplex.jl b/src/base/abstractsimplex.jl index 4d016c5b..62cc1549 100644 --- a/src/base/abstractsimplex.jl +++ b/src/base/abstractsimplex.jl @@ -65,7 +65,11 @@ Use binary search to find index of first vertex in `(K - 1)`-dimensional simplex """ function _first_vertex(index::I, ::Val{K}) where {I,K} lo = I(K - 1) - hi = I(K + 100) + if K < 3 + hi = I(K + 100) + else + hi = I(K) + end while _binomial(hi, Val(K)) ≤ index lo = hi hi <<= 0x01 diff --git a/src/base/chain.jl b/src/base/chain.jl index b18b200b..d3476ca5 100644 --- a/src/base/chain.jl +++ b/src/base/chain.jl @@ -22,7 +22,7 @@ julia> data = [(rand(), rand(), rand()) for _ in 1:100]; julia> result = ripserer(data; reps=true, modulus=7); julia> chain = result[end][end].representative -28-element Chain{Ripserer.Mod{7},Ripserer.Simplex{1, Float64, Int64}}: +28-element Chain{Mod{7},Simplex{1, Float64, Int64}}: +Simplex{1}((68, 54), 0.25178097927369875) => 6 mod 7 +Simplex{1}((54, 46), 0.2575262844682746) => 1 mod 7 +Simplex{1}((88, 56), 0.25936896586973557) => 1 mod 7 diff --git a/src/base/primefield.jl b/src/base/primefield.jl index b33bcf0d..dd5106c6 100644 --- a/src/base/primefield.jl +++ b/src/base/primefield.jl @@ -27,7 +27,12 @@ is_prime(::Any) = false Like `mod`, but with prime `M`. """ function mod_prime(i, ::Val{M}) where {M} - is_prime(M) || throw(DomainError(M, "modulus must be a prime number")) + if !is_prime(M) + throw(DomainError(M, "modulus must be a prime number")) + end + if !(1 < M ≤ 3037000499) + throw(DomainError(M, "modulus must be smaller than √(typemax(Int))")) + end i = i % M return i + ifelse(signbit(i), M, 0) end diff --git a/src/extra/circularcoordinates.jl b/src/extra/circularcoordinates.jl index 22299fba..bcb2b954 100644 --- a/src/extra/circularcoordinates.jl +++ b/src/extra/circularcoordinates.jl @@ -192,7 +192,7 @@ CircularCoordinates( radius=(0.9510565162951535,), n_landmarks=10, partition=linear, - metric=Distances.Euclidean(0.0), + metric=Euclidean(0.0), ) julia> summary(cc(data)) diff --git a/src/filtrations/alpha.jl b/src/filtrations/alpha.jl index ce4b86c2..bd7a27c1 100644 --- a/src/filtrations/alpha.jl +++ b/src/filtrations/alpha.jl @@ -180,13 +180,13 @@ julia> length(Ripserer.edges(rips)) julia> sort(ripserer(alpha)[2], by=persistence)[end] [0.375, 2.01) with: - birth_simplex: Ripserer.Simplex{1, Float64, Int64} - death_simplex: Ripserer.Simplex{2, Float64, Int64} + birth_simplex: Simplex{1, Float64, Int64} + death_simplex: Simplex{2, Float64, Int64} julia> sort(ripserer(rips)[2], by=persistence)[end] [0.375, 2.01) with: - birth_simplex: Ripserer.Simplex{1, Float64, Int64} - death_simplex: Ripserer.Simplex{2, Float64, Int64} + birth_simplex: Simplex{1, Float64, Int64} + death_simplex: Simplex{2, Float64, Int64} ``` """ struct Alpha{I,P<:SVector} <: AbstractCustomFiltration{I,Float64} diff --git a/src/filtrations/custom.jl b/src/filtrations/custom.jl index af4d79a1..56d53316 100644 --- a/src/filtrations/custom.jl +++ b/src/filtrations/custom.jl @@ -100,8 +100,7 @@ struct Custom{I,T} <: AbstractCustomFiltration{I,T} threshold::T end -@inline insert_simplex!(::Vector{Dict{I,T}}, ::Tuple{}, _, _) where {I,T} = nothing -@inline function insert_simplex!( +function insert_simplex!( dicts::Vector{Dict{I,T}}, vertices::NTuple{N,I}, birth, threshold ) where {N,T,I} if birth > threshold @@ -111,11 +110,19 @@ end dim = N - 1 d_dict = dicts[dim + 1] d_dict[idx] = min(birth, get(d_dict, idx, typemax(T))) - for vs in IterTools.subsets(vertices, Val(N - 1)) - # Some extra work is being done here, but this tends to be type stable. - insert_simplex!(dicts, vs, birth, threshold) - end + _insert_simplex_facets!(dicts, vertices, birth, Val(N - 1)) + end +end +@inline _insert_simplex_facets!(_, _, _, ::Val{0}) = nothing +@inline function _insert_simplex_facets!(dicts, vertices, birth, ::Val{N}) where {N} + dim = N - 1 + d_dict = dicts[dim + 1] + T = valtype(d_dict) + for vs in IterTools.subsets(vertices, Val(N)) + idx = index(vs) + d_dict[idx] = min(birth, get(d_dict, idx, typemax(T))) end + return _insert_simplex_facets!(dicts, vertices, birth, Val(N - 1)) end function _adjacency_matrix(dicts) diff --git a/test/aqua.jl b/test/aqua.jl index 14654fef..32ffc713 100644 --- a/test/aqua.jl +++ b/test/aqua.jl @@ -1,7 +1,6 @@ using Aqua using Ripserer +using PersistenceDiagrams -if VERSION < v"1.8-DEV" - # Note: will have to change Project.toml for 1.8 - Aqua.test_all(Ripserer; ambiguities=false) -end +Aqua.test_all(Ripserer; ambiguities=false, piracies=false) +Aqua.test_piracies(Ripserer; treat_as_own=[PersistenceDiagram, PersistenceInterval])