diff --git a/Project.toml b/Project.toml
index 7a8ee3f..7016bd6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "BlockFactorizations"
 uuid = "5c499583-5bfe-4591-9b59-c1e192d48697"
 authors = ["Sebastian Ament "]
-version = "1.2.0"
+version = "1.2.1"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
diff --git a/src/block.jl b/src/block.jl
index 50971ba..27fe6d0 100644
--- a/src/block.jl
+++ b/src/block.jl
@@ -140,102 +140,35 @@ end
 function LinearAlgebra.mul!(y::AbstractVector, B::BlockFactorization, x::AbstractVector, α::Real = 1, β::Real = 0)
     xx = [@view x[B.mindices[i] : B.mindices[i+1]-1] for i in 1:length(B.mindices)-1]
     yy = [@view y[B.nindices[i] : B.nindices[i+1]-1] for i in 1:length(B.nindices)-1]
-    strided = Val(false)
-    blockmul!(yy, B.A, xx, strided, α, β)
+    blockmul!(yy, B.A, xx, α, β)
     return y
 end
 
 function LinearAlgebra.mul!(Y::AbstractMatrix, B::BlockFactorization, X::AbstractMatrix, α::Real = 1, β::Real = 0)
     XX = [@view X[B.mindices[i] : B.mindices[i+1]-1, :] for i in 1:length(B.mindices)-1]
     YY = [@view Y[B.nindices[i] : B.nindices[i+1]-1, :] for i in 1:length(B.nindices)-1]
-    strided = Val(false)
-    blockmul!(YY, B.A, XX, strided, α, β)
+    blockmul!(YY, B.A, XX, α, β)
     return Y
 end
 
 # carries out multiplication for general BlockFactorization
-function blockmul!(y::AbstractVecOfVecOrMat, G::AbstractMatrix,
-                   x::AbstractVecOfVecOrMat, strided::Val{false} = Val(false), α::Real = 1, β::Real = 0)
+function blockmul!(y::AbstractVecOfVecOrMat, G::AbstractMatrix, x::AbstractVecOfVecOrMat, α::Real = 1, β::Real = 0)
     @threads for i in eachindex(y)
         @. y[i] = β * y[i]
         for j in eachindex(x)
-            Gij = G[i, j] # if it is not strided, we can't pre-allocate memory for blocks
-            mul!(y[i], Gij, x[j], α, 1) # woodbury still allocates here because of Diagonal
+            mul!(y[i], G[i, j], x[j], α, 1)
         end
     end
     return y
 end
 
-# carries out block multiplication for strided block factorization
-function LinearAlgebra.mul!(y::AbstractVector, B::StridedBlockFactorization, x::AbstractVector, α::Real = 1, β::Real = 0)
-    length(x) == size(B, 2) || throw(DimensionMismatch("length(x) = $(length(x)) ≠ $(size(B, 2)) = size(B, 2)"))
-    length(y) == size(B, 1) || throw(DimensionMismatch("length(y) = $(length(y)) ≠ $(size(B, 1)) = size(B, 1)"))
-    X, Y = reshape(x, B.mindices.step, :), reshape(y, B.nindices.step, :)
-    xx, yy = [c for c in eachcol(X)], [c for c in eachcol(Y)]
-    strided = Val(true)
-    blockmul!(yy, B.A, xx, strided, α, β)
-    return y
-end
-
-function LinearAlgebra.mul!(Y::AbstractMatrix, B::StridedBlockFactorization, X::AbstractMatrix, α::Real = 1, β::Real = 0)
-    size(X, 1) == size(B, 2) || throw(DimensionMismatch("size(X, 1) = $(size(X, 1)) ≠ $(size(B, 2)) = size(B, 2)"))
-    size(Y, 1) == size(B, 1) || throw(DimensionMismatch("size(Y, 1) = $(size(Y, 1)) ≠ $(size(B, 1)) = size(B, 1)"))
-    k = size(Y, 2)
-    size(Y, 2) == size(X, 2) || throw(DimensionMismatch("size(Y, 2) = $(size(Y, 2)) ≠ $(size(X, 1)) = size(X, 1)"))
-    XR, YR = reshape(X, B.mindices.step, :, k), reshape(Y, B.nindices.step, :, k)
-    n, m = size(XR, 2), size(YR, 2)
-    XX, YY = @views [XR[:, i, :] for i in 1:n], [YR[:, i, :] for i in 1:m]
-    strided = Val(true)
-    blockmul!(YY, B.A, XX, strided, α, β)
-    return Y
-end
-
-# recursively calls mul!, thereby avoiding memory allocation of block-matrix multiplication
-function blockmul!(y::AbstractVecOfVecOrMat, G::AbstractMatrix,
-                   x::AbstractVecOfVecOrMat, strided::Val{true}, α::Real = 1, β::Real = 0)
-    # pre-allocate temporary storage for matrix elements (needs to be done better, "similar"?)
-    Gijs = [G[1, 1] for _ in 1:nthreads()] # IDEA could be nothing if G is a AbstractMatrix{<:Matrix}
-    @threads for i in eachindex(y)
-        @. y[i] = β * y[i]
-        Gij = Gijs[threadid()]
-        for j in eachindex(x)
-            Gij = evaluate_block!(Gij, G, i, j) # evaluating G[i, j] but can be more efficient if block has special structure (e.g. Woodbury)
-            mul!(y[i], Gij, x[j], α, 1) # woodbury still allocates here because of Diagonal
-        end
-    end
-    return y
-end
-
-# fallback for generic matrices or factorizations
-# does not overwrite Gij in this case, only for more advanced data structures,
-# that are not already fully allocated
-function evaluate_block!(Gij, G::AbstractMatrix, i::Int, j::Int)
-    G[i, j]
-end
-
 ################## specialization for diagonal block factorizations ############
 # const DiagonalBlockFactorization{T} = BlockFactorization{T, <:Diagonal}
 # carries out multiplication for Diagonal BlockFactorization
 function blockmul!(y::AbstractVecOfVecOrMat, G::Diagonal,
-                   x::AbstractVecOfVecOrMat, strided::Val{false}, α::Real = 1, β::Real = 0)
+                   x::AbstractVecOfVecOrMat, α::Real = 1, β::Real = 0)
     @threads for i in eachindex(y)
-        @. y[i] = β * y[i] # IDEA: y[i] .*= β ?
-        Gii = G[i, i] # if it is not strided, we can't pre-allocate memory for blocks
-        mul!(y[i], Gii, x[i], α, 1) # woodbury still allocates here because of Diagonal
-    end
-    return y
-end
-
-# recursively calls mul!, thereby avoiding memory allocation of block-matrix multiplication
-function blockmul!(y::AbstractVecOfVecOrMat, G::Diagonal,
-                   x::AbstractVecOfVecOrMat, strided::Val{true}, α::Real = 1, β::Real = 0)
-    # pre-allocate temporary storage for matrix elements (needs to be done better, "similar"?)
-    Giis = [G[1, 1] for _ in 1:nthreads()]
-    @threads for i in eachindex(y)
-        @. y[i] = β * y[i]
-        Gii = Giis[threadid()]
-        Gii = evaluate_block!(Gii, G, i, i) # evaluating G[i, j] but can be more efficient if block has special structure (e.g. Woodbury)
-        mul!(y[i], Gii, x[i], α, 1) # woodbury still allocates here because of Diagonal
+        mul!(y[i], G[i, i], x[i], α, β)
     end
     return y
 end