From ebf39214268676eb78e18a83272b4d1b98c97639 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Fri, 29 Oct 2021 18:30:16 +0200
Subject: [PATCH 01/34] implement network_basis kernel

---
 src/Polar/Constraints/basis.jl       | 101 +++++++++++++++++++++++++++
 src/Polar/Constraints/constraints.jl |  10 ++-
 src/Polar/kernels.jl                 |  51 ++++++++++++++
 src/PowerSystem/matpower.jl          |  24 +++++++
 test/Polar/autodiff.jl               |  10 +--
 5 files changed, 189 insertions(+), 7 deletions(-)
 create mode 100644 src/Polar/Constraints/basis.jl

diff --git a/src/Polar/Constraints/basis.jl b/src/Polar/Constraints/basis.jl
new file mode 100644
index 00000000..dd5a71b2
--- /dev/null
+++ b/src/Polar/Constraints/basis.jl
@@ -0,0 +1,101 @@
+function network_basis end
+is_constraint(::typeof(network_basis)) = true
+
+function size_constraint(polar::PolarForm, ::typeof(network_basis))
+    return PS.get(polar.network, PS.NumberOfBuses()) + 2 * PS.get(polar.network, PS.NumberOfLines())
+end
+
+# Evaluate the network basis from the voltage magnitudes and angles passed in
+function _network_basis(polar::PolarForm, cons, vmag, vang)
+    nbus = PS.get(polar.network, PS.NumberOfBuses())
+    nlines = PS.get(polar.network, PS.NumberOfLines())
+
+    ev = basis_kernel!(polar.device)(
+        cons, vmag, vang,
+        polar.topology.f_buses, polar.topology.t_buses, nlines, nbus,
+        ndrange=(nlines+nbus, size(cons, 2)),
+        dependencies=Event(polar.device)
+    )
+    wait(ev)
+    return
+end
+
+function network_basis(polar::PolarForm, cons, vmag, vang, pnet, qnet, pload, qload)
+    _network_basis(polar, cons, vmag, vang)
+end
+function network_basis(polar::PolarForm, cons, buffer)
+    _network_basis(polar, cons, buffer.vmag, buffer.vang)
+end
+
+function adjoint!(
+    polar::PolarForm,
+    pbm::AutoDiff.TapeMemory{F, S, I},
+    cons, ∂cons,
+    vmag, ∂vmag,
+    vang, ∂vang,
+    pnet, ∂pnet,
+    pload, qload,
+) where {F<:typeof(network_basis), S, I}
+    nl = PS.get(polar.network, PS.NumberOfLines())
+    nb = PS.get(polar.network, PS.NumberOfBuses())
+    top = polar.topology
+    f = top.f_buses
+    t = top.t_buses
+
+    fill!(pbm.intermediate.∂edge_vm_fr , 0.0)
+    fill!(pbm.intermediate.∂edge_vm_to , 0.0)
+    fill!(pbm.intermediate.∂edge_va_fr , 0.0)
+    fill!(pbm.intermediate.∂edge_va_to , 0.0)
+
+    ndrange = (nl+nb, size(∂cons, 2))
+    ev = adj_basis_kernel!(polar.device)(
+        ∂cons,
+        ∂vmag,
+        pbm.intermediate.∂edge_vm_fr,
+        pbm.intermediate.∂edge_vm_to,
+        pbm.intermediate.∂edge_va_fr,
+        pbm.intermediate.∂edge_va_to,
+        vmag, vang, f, t, nl, nb,
+        ndrange=ndrange, dependencies=Event(polar.device),
+    )
+    wait(ev)
+
+    Cf = sparse(f, 1:nl, ones(nl), nb, nl)       # connection matrix for line & from buses
+    Ct = sparse(t, 1:nl, ones(nl), nb, nl)       # connection matrix for line & to buses
+    mul!(∂vmag, Cf, pbm.intermediate.∂edge_vm_fr, 1.0, 1.0)
+    mul!(∂vmag, Ct, pbm.intermediate.∂edge_vm_to, 1.0, 1.0)
+    mul!(∂vang, Cf, pbm.intermediate.∂edge_va_fr, 1.0, 1.0)
+    mul!(∂vang, Ct, pbm.intermediate.∂edge_va_to, 1.0, 1.0)
+    return
+end
+
+# Sparse Jacobian of the network basis w.r.t. the entries selected by X, from complex derivatives.
+function matpower_jacobian(polar::PolarForm, X::Union{State, Control}, ::typeof(network_basis), V)
+    nbus = get(polar, PS.NumberOfBuses())
+    nlines = get(polar, PS.NumberOfLines())
+    ref, pv, pq = index_buses_host(polar)
+    npv, npq = length(pv), length(pq)
+
+    # Complex line-wise derivatives: real part -> first row block, imaginary part -> second.
+    dS_dVm, dS_dVa = PS._matpower_basis_jacobian(V, polar.network.lines)
+    # Third row block: diagonal matrix holding 2 * |V|.
+    dV2 = 2 * sparse(1:nbus, 1:nbus, abs.(V), nbus, nbus)
+
+    if isa(X, State)
+        # Columns: angles at [pv; pq], then magnitudes at pq.
+        ang, mag = [pv; pq], pq
+        return [
+            real(dS_dVa[:, ang]) real(dS_dVm[:, mag])
+            imag(dS_dVa[:, ang]) imag(dS_dVm[:, mag])
+            spzeros(nbus, npv + npq) dV2[:, mag]
+        ]::SparseMatrixCSC{Float64, Int}
+    end
+
+    # Control: magnitudes at [ref; pv], then npv trailing columns that are structurally zero.
+    mag = [ref; pv]
+    return [
+        real(dS_dVm[:, mag]) spzeros(nlines, npv)
+        imag(dS_dVm[:, mag]) spzeros(nlines, npv)
+        dV2[:, mag] spzeros(nbus, npv)
+    ]::SparseMatrixCSC{Float64, Int}
+end
diff --git a/src/Polar/Constraints/constraints.jl b/src/Polar/Constraints/constraints.jl
index e173aa49..45c24af4 100644
--- a/src/Polar/Constraints/constraints.jl
+++ b/src/Polar/Constraints/constraints.jl
@@ -14,16 +14,22 @@ include("reactive_power.jl")
 include("line_flow.jl")
 include("ramping_rate.jl")
 include("network_operation.jl")
+include("basis.jl")
 
 # By default, function does not have any intermediate state
 _get_intermediate_stack(polar::PolarForm, func::Function, VT, nbatch) = nothing
 
 function _get_intermediate_stack(
     polar::PolarForm, func::F, VT, nbatch
-) where {F <: Union{typeof(reactive_power_constraints), typeof(flow_constraints), typeof(power_balance), typeof(bus_power_injection)}}
+) where {F <: Union{typeof(reactive_power_constraints), typeof(flow_constraints), typeof(power_balance), typeof(bus_power_injection), typeof(network_basis)}}
     nlines = PS.get(polar.network, PS.NumberOfLines())
     # Take care that flow_constraints needs a buffer with a different size
-    nnz = isa(func, typeof(flow_constraints)) ? nlines : length(polar.topology.ybus_im.nzval)
+    nnz = if isa(func, typeof(flow_constraints))  || isa(func, typeof(network_basis))
+        nlines
+    else
+        length(polar.topology.ybus_im.nzval)
+    end
+
     # Return a NamedTuple storing all the intermediate states
     if nbatch == 1
         return (
diff --git a/src/Polar/kernels.jl b/src/Polar/kernels.jl
index 1b533613..aebae94e 100644
--- a/src/Polar/kernels.jl
+++ b/src/Polar/kernels.jl
@@ -753,3 +753,54 @@ function adj_branch_flow!(
     )
     wait(ev)
 end
+
+KA.@kernel function basis_kernel!(
+    cons, @Const(vmag), @Const(vang), @Const(f), @Const(t), nlines, nbus,
+)
+    i, j = @index(Global, NTuple)
+
+    if i <= nlines
+        ℓ = i
+        fr_bus = f[ℓ]
+        to_bus = t[ℓ]
+        Δθ = vang[fr_bus, j] - vang[to_bus, j]
+        cosθ = cos(Δθ)
+        sinθ = sin(Δθ)
+        cons[ℓ,        j] = vmag[fr_bus, j] * vmag[to_bus, j] * cosθ
+        cons[ℓ+nlines, j] = vmag[fr_bus, j] * vmag[to_bus, j] * sinθ
+    else i <= nlines + nbus
+        b = i - nlines
+        cons[b+2*nlines, j] = vmag[b, j] * vmag[b, j]
+    end
+end
+
+KA.@kernel function adj_basis_kernel!(
+    ∂cons, adj_vmag, adj_vmag_fr, adj_vmag_to,
+    adj_vang_fr, adj_vang_to,
+    @Const(vmag), @Const(vang), @Const(f), @Const(t), nlines, nbus,
+)
+    i, j = @index(Global, NTuple)
+
+    if i <= nlines
+        ℓ = i
+        fr_bus = f[ℓ]
+        to_bus = t[ℓ]
+        Δθ = vang[fr_bus, j] - vang[to_bus, j]
+        cosθ = cos(Δθ)
+        sinθ = sin(Δθ)
+
+        adj_vang_fr[i]  = -vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
+        adj_vang_fr[i] +=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
+        adj_vang_to[i]  =  vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
+        adj_vang_to[i] -=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
+
+        adj_vmag_fr[i] =  vmag[to_bus, j] * cosθ * ∂cons[ℓ, j]
+        adj_vmag_fr[i] += vmag[to_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
+
+        adj_vmag_to[i] =  vmag[fr_bus, j] * cosθ * ∂cons[ℓ, j]
+        adj_vmag_to[i] += vmag[fr_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
+    else i <= nlines + nbus
+        b = i - nlines
+        adj_vmag[b, j] = 2.0 * vmag[b, j] * ∂cons[b+2*nlines, j]
+    end
+end
diff --git a/src/PowerSystem/matpower.jl b/src/PowerSystem/matpower.jl
index bb0f7dcd..62a9df35 100644
--- a/src/PowerSystem/matpower.jl
+++ b/src/PowerSystem/matpower.jl
@@ -78,6 +78,30 @@ function matpower_lineflow_power_jacobian(V, branches::Branches)
     return dH_dVm, dH_dVa
 end
 
+# Derivatives of (Cf V) .* conj(Ct V) w.r.t. voltage magnitudes and voltage angles.
+function _matpower_basis_jacobian(V, branches::Branches)
+    from, to = branches.from_buses, branches.to_buses
+    nbus, nlines = size(V, 1), length(from)
+    line_ids, bus_ids = 1:nlines, 1:nbus
+
+    # Line-to-bus connection matrices (one unit entry per line).
+    Cf = sparse(line_ids, from, ones(nlines), nlines, nbus)
+    Ct = sparse(line_ids, to, ones(nlines), nlines, nbus)
+
+    # Diagonal matrices: V ./ |V|, bus voltages, and from/to voltages per line.
+    unitV = sparse(bus_ids, bus_ids, V ./ abs.(V), nbus, nbus)
+    busV = sparse(bus_ids, bus_ids, V, nbus, nbus)
+    fromV = sparse(line_ids, line_ids, Cf * V, nlines, nlines)
+    toV = sparse(line_ids, line_ids, Ct * V, nlines, nlines)
+
+    # One term per end of the line in each derivative.
+    dS_dVm = fromV * Ct * conj(unitV) + conj(toV) * Cf * unitV
+    dS_dVa = im * (-fromV * Ct * conj(busV) + conj(toV) * Cf * busV)
+
+    return (dS_dVm, dS_dVa)
+end
+
+
 # Hessian vector-product H*λ, H = Jₓₓ (from Matpower)
 # Suppose ordering is correct
 function _matpower_hessian(V, Ybus, λ)
diff --git a/test/Polar/autodiff.jl b/test/Polar/autodiff.jl
index caebe25a..6935a11f 100644
--- a/test/Polar/autodiff.jl
+++ b/test/Polar/autodiff.jl
@@ -15,7 +15,7 @@ function test_constraints_jacobian(polar, device, MT)
     println(devnull, jx)
 
     # Solve power flow
-    conv = powerflow(polar, jx, cache, NewtonRaphson(tol=1e-12))
+    conv = powerflow(polar, jx, cache, NewtonRaphson(tol=1e-14); linear_solver=LS.DirectSolver(jx.J))
     # Get solution in complex form.
     V = ExaPF.voltage_host(cache)
 
@@ -27,6 +27,7 @@ function test_constraints_jacobian(polar, device, MT)
         ExaPF.reactive_power_constraints,
         ExaPF.flow_constraints,
         ExaPF.bus_power_injection,
+        ExaPF.network_basis,
     ]
         m = ExaPF.size_constraint(polar, cons)
         # Allocation
@@ -41,7 +42,6 @@ function test_constraints_jacobian(polar, device, MT)
         J = AutoDiff.jacobian!(polar, xjacobianAD, cache)
         # Matpower Jacobian
         Jmat_x = ExaPF.matpower_jacobian(polar, State(), cons, V)
-        # Evaluate Jacobian transpose vector product
 
         # Compare with FiniteDiff
         function jac_fd_x(x)
@@ -60,8 +60,8 @@ function test_constraints_jacobian(polar, device, MT)
         ∂cons = pbm.stack
 
         @test size(J) == (m, length(x))
-        @test isapprox(Jd, Jx, rtol=1e-5)
-        @test isapprox(Jmat_x, Jx, rtol=1e-4)
+        @test isapprox(Jd, Jx, rtol=1e-6)
+        @test isapprox(Jmat_x, Jx, rtol=1e-6)
         @test isapprox(∂cons.∂x, xjacobianAD.J' * tgt, rtol=1e-6)
 
         ## CONTROL JACOBIAN
@@ -86,7 +86,7 @@ function test_constraints_jacobian(polar, device, MT)
         if !isnothing(ujacobianAD.J)
             Ju = ujacobianAD.J |> SparseMatrixCSC |> Array
             @test size(J) == (m, length(u))
-            @test isapprox(Jd, Ju, rtol=1e-5)
+            @test isapprox(Jd, Ju, rtol=1e-6)
             @test isapprox(Jmat_u, Ju, rtol=1e-6)
             @test isapprox(∂cons.∂u, ujacobianAD.J' * tgt, rtol=1e-6)
         end

From cc51a45c6387da82ca82122451fdb69a781d5596 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Fri, 29 Oct 2021 16:59:34 -0500
Subject: [PATCH 02/34] fix code on GPU

---
 src/Polar/Constraints/basis.jl       |  7 ++-
 src/Polar/Constraints/constraints.jl | 16 ++++++
 src/Polar/kernels.jl                 | 75 ++++++++++++++++------------
 3 files changed, 61 insertions(+), 37 deletions(-)

diff --git a/src/Polar/Constraints/basis.jl b/src/Polar/Constraints/basis.jl
index dd5a71b2..54f2ca4c 100644
--- a/src/Polar/Constraints/basis.jl
+++ b/src/Polar/Constraints/basis.jl
@@ -13,7 +13,7 @@ function _network_basis(polar::PolarForm, cons, vmag, vang)
     ev = basis_kernel!(polar.device)(
         cons, vmag, vang,
         polar.topology.f_buses, polar.topology.t_buses, nlines, nbus,
-        ndrange=(nlines+nbus, size(cons, 2)),
+        ndrange=(2 * nlines+nbus, size(cons, 2)),
         dependencies=Event(polar.device)
     )
     wait(ev)
@@ -46,7 +46,6 @@ function adjoint!(
     fill!(pbm.intermediate.∂edge_vm_to , 0.0)
     fill!(pbm.intermediate.∂edge_va_fr , 0.0)
     fill!(pbm.intermediate.∂edge_va_to , 0.0)
-
     ndrange = (nl+nb, size(∂cons, 2))
     ev = adj_basis_kernel!(polar.device)(
         ∂cons,
@@ -60,8 +59,8 @@ function adjoint!(
     )
     wait(ev)
 
-    Cf = sparse(f, 1:nl, ones(nl), nb, nl)       # connection matrix for line & from buses
-    Ct = sparse(t, 1:nl, ones(nl), nb, nl)       # connection matrix for line & to buses
+    Cf = pbm.intermediate.Cf
+    Ct = pbm.intermediate.Ct
     mul!(∂vmag, Cf, pbm.intermediate.∂edge_vm_fr, 1.0, 1.0)
     mul!(∂vmag, Ct, pbm.intermediate.∂edge_vm_to, 1.0, 1.0)
     mul!(∂vang, Cf, pbm.intermediate.∂edge_va_fr, 1.0, 1.0)
diff --git a/src/Polar/Constraints/constraints.jl b/src/Polar/Constraints/constraints.jl
index 45c24af4..8f96c7ce 100644
--- a/src/Polar/Constraints/constraints.jl
+++ b/src/Polar/Constraints/constraints.jl
@@ -29,10 +29,26 @@ function _get_intermediate_stack(
     else
         length(polar.topology.ybus_im.nzval)
     end
+    nl = nlines
+    nb = PS.get(polar.network, PS.NumberOfBuses())
+
+    # TODO: URGENT This part is very dirty
+    if isa(func, typeof(network_basis))
+        Cf = sparse(polar.network.lines.from_buses, 1:nl, ones(nl), nb, nl)
+        Ct = sparse(polar.network.lines.to_buses, 1:nl, ones(nl), nb, nl)
+        if isa(polar.device, GPU)
+            Cf = CUSPARSE.CuSparseMatrixCSR(Cf)
+            Ct = CUSPARSE.CuSparseMatrixCSR(Ct)
+        end
+    else
+        Cf = nothing
+        Ct = nothing
+    end
 
     # Return a NamedTuple storing all the intermediate states
     if nbatch == 1
         return (
+            Cf=Cf, Ct=Ct,
             ∂edge_vm_fr = VT(undef, nnz),
             ∂edge_va_fr = VT(undef, nnz),
             ∂edge_vm_to = VT(undef, nnz),
diff --git a/src/Polar/kernels.jl b/src/Polar/kernels.jl
index aebae94e..453fdf14 100644
--- a/src/Polar/kernels.jl
+++ b/src/Polar/kernels.jl
@@ -759,18 +759,25 @@ KA.@kernel function basis_kernel!(
 )
     i, j = @index(Global, NTuple)
 
-    if i <= nlines
-        ℓ = i
-        fr_bus = f[ℓ]
-        to_bus = t[ℓ]
-        Δθ = vang[fr_bus, j] - vang[to_bus, j]
-        cosθ = cos(Δθ)
-        sinθ = sin(Δθ)
-        cons[ℓ,        j] = vmag[fr_bus, j] * vmag[to_bus, j] * cosθ
-        cons[ℓ+nlines, j] = vmag[fr_bus, j] * vmag[to_bus, j] * sinθ
-    else i <= nlines + nbus
-        b = i - nlines
-        cons[b+2*nlines, j] = vmag[b, j] * vmag[b, j]
+    @inbounds begin
+        if i <= nlines
+            ℓ = i
+            fr_bus = f[ℓ]
+            to_bus = t[ℓ]
+            Δθ = vang[fr_bus, j] - vang[to_bus, j]
+            cosθ = cos(Δθ)
+            cons[i,        j] = vmag[fr_bus, j] * vmag[to_bus, j] * cosθ
+        elseif i <= 2 * nlines
+            ℓ = i - nlines
+            fr_bus = f[ℓ]
+            to_bus = t[ℓ]
+            Δθ = vang[fr_bus, j] - vang[to_bus, j]
+            sinθ = sin(Δθ)
+            cons[i, j] = vmag[fr_bus, j] * vmag[to_bus, j] * sinθ
+        elseif i <= 2 * nlines + nbus
+            b = i - 2 * nlines
+            cons[i, j] = vmag[b, j] * vmag[b, j]
+        end
     end
 end
 
@@ -781,26 +788,28 @@ KA.@kernel function adj_basis_kernel!(
 )
     i, j = @index(Global, NTuple)
 
-    if i <= nlines
-        ℓ = i
-        fr_bus = f[ℓ]
-        to_bus = t[ℓ]
-        Δθ = vang[fr_bus, j] - vang[to_bus, j]
-        cosθ = cos(Δθ)
-        sinθ = sin(Δθ)
-
-        adj_vang_fr[i]  = -vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
-        adj_vang_fr[i] +=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
-        adj_vang_to[i]  =  vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
-        adj_vang_to[i] -=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
-
-        adj_vmag_fr[i] =  vmag[to_bus, j] * cosθ * ∂cons[ℓ, j]
-        adj_vmag_fr[i] += vmag[to_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
-
-        adj_vmag_to[i] =  vmag[fr_bus, j] * cosθ * ∂cons[ℓ, j]
-        adj_vmag_to[i] += vmag[fr_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
-    else i <= nlines + nbus
-        b = i - nlines
-        adj_vmag[b, j] = 2.0 * vmag[b, j] * ∂cons[b+2*nlines, j]
+    @inbounds begin
+        if i <= nlines
+            ℓ = i
+            fr_bus = f[ℓ]
+            to_bus = t[ℓ]
+            Δθ = vang[fr_bus, j] - vang[to_bus, j]
+            cosθ = cos(Δθ)
+            sinθ = sin(Δθ)
+            # Edge buffers are indexed with the batch column j so columns do not overwrite each other.
+            adj_vang_fr[i, j]  = -vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
+            adj_vang_fr[i, j] +=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
+            adj_vang_to[i, j]  =  vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
+            adj_vang_to[i, j] -=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
+            # Row ℓ is weighted by cosθ and row ℓ + nlines by sinθ ("from" magnitude adjoint).
+            adj_vmag_fr[i, j]  = vmag[to_bus, j] * cosθ * ∂cons[ℓ, j]
+            adj_vmag_fr[i, j] += vmag[to_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
+            # Same two rows, for the magnitude adjoint at the "to" end of the line.
+            adj_vmag_to[i, j]  = vmag[fr_bus, j] * cosθ * ∂cons[ℓ, j]
+            adj_vmag_to[i, j] += vmag[fr_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
+        elseif i <= nlines + nbus
+            b = i - nlines
+            adj_vmag[b, j] = 2.0 * vmag[b, j] * ∂cons[b+2*nlines, j]
+        end
+    end
 end

From 230cb83e32bec1d84f8f6ced17e6879b3492aac3 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Tue, 21 Dec 2021 21:02:24 -0600
Subject: [PATCH 03/34] polar: better accumulation for line flow constraints

---
 src/Polar/Constraints/constraints.jl | 23 +++++++++--------------
 src/Polar/Constraints/line_flow.jl   | 10 ++++++++--
 src/Polar/kernels.jl                 | 23 ++++++++++++++---------
 test/Polar/autodiff.jl               | 10 ++++++----
 4 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/src/Polar/Constraints/constraints.jl b/src/Polar/Constraints/constraints.jl
index 8f96c7ce..26e309f3 100644
--- a/src/Polar/Constraints/constraints.jl
+++ b/src/Polar/Constraints/constraints.jl
@@ -23,26 +23,20 @@ function _get_intermediate_stack(
     polar::PolarForm, func::F, VT, nbatch
 ) where {F <: Union{typeof(reactive_power_constraints), typeof(flow_constraints), typeof(power_balance), typeof(bus_power_injection), typeof(network_basis)}}
     nlines = PS.get(polar.network, PS.NumberOfLines())
+    nbus = PS.get(polar.network, PS.NumberOfBuses())
     # Take care that flow_constraints needs a buffer with a different size
     nnz = if isa(func, typeof(flow_constraints))  || isa(func, typeof(network_basis))
         nlines
     else
         length(polar.topology.ybus_im.nzval)
     end
-    nl = nlines
-    nb = PS.get(polar.network, PS.NumberOfBuses())
-
-    # TODO: URGENT This part is very dirty
-    if isa(func, typeof(network_basis))
-        Cf = sparse(polar.network.lines.from_buses, 1:nl, ones(nl), nb, nl)
-        Ct = sparse(polar.network.lines.to_buses, 1:nl, ones(nl), nb, nl)
-        if isa(polar.device, GPU)
-            Cf = CUSPARSE.CuSparseMatrixCSR(Cf)
-            Ct = CUSPARSE.CuSparseMatrixCSR(Ct)
-        end
-    else
-        Cf = nothing
-        Ct = nothing
+
+    Cf = nothing
+    Ct = nothing
+    if isa(func, typeof(network_basis)) || isa(func, typeof(flow_constraints))
+        SMT, _ = get_jacobian_types(polar.device)
+        Cf = sparse(polar.network.lines.from_buses, 1:nlines, ones(nlines), nbus, nlines) |> SMT
+        Ct = sparse(polar.network.lines.to_buses, 1:nlines, ones(nlines), nbus, nlines) |> SMT
     end
 
     # Return a NamedTuple storing all the intermediate states
@@ -56,6 +50,7 @@ function _get_intermediate_stack(
         )
     else
         return (
+            Cf=Cf, Ct=Ct,
             ∂edge_vm_fr = VT(undef, nnz, nbatch),
             ∂edge_va_fr = VT(undef, nnz, nbatch),
             ∂edge_vm_to = VT(undef, nnz, nbatch),
diff --git a/src/Polar/Constraints/line_flow.jl b/src/Polar/Constraints/line_flow.jl
index b4f37b18..b27ff684 100644
--- a/src/Polar/Constraints/line_flow.jl
+++ b/src/Polar/Constraints/line_flow.jl
@@ -39,13 +39,16 @@ function flow_constraints_grad!(polar::PolarForm, cons_grad, buffer, weights)
     fill!(∂edge_vm_to, 0)
     fill!(∂edge_va_fr, 0)
     fill!(∂edge_va_to, 0)
+    SMT, _ = get_jacobian_types(polar.device)
+    Cf = sparse(polar.network.lines.from_buses, 1:nlines, ones(nlines), nbus, nlines) |> SMT
+    Ct = sparse(polar.network.lines.to_buses, 1:nlines, ones(nlines), nbus, nlines) |> SMT
     adj_branch_flow!(weights, buffer.vmag, adj_vmag,
             buffer.vang, adj_vang,
             ∂edge_vm_fr, ∂edge_vm_to,
             ∂edge_va_fr, ∂edge_va_to,
             PT.yff_re, PT.yft_re, PT.ytf_re, PT.ytt_re,
             PT.yff_im, PT.yft_im, PT.ytf_im, PT.ytt_im,
-            PT.f_buses, PT.t_buses, nlines, polar.device
+            PT.f_buses, PT.t_buses, Cf, Ct, nlines, polar.device
     )
     return cons_grad
 end
@@ -87,7 +90,10 @@ function adjoint!(
         pbm.intermediate.∂edge_va_to,
         top.yff_re, top.yft_re, top.ytf_re, top.ytt_re,
         top.yff_im, top.yft_im, top.ytf_im, top.ytt_im,
-        top.f_buses, top.t_buses, nlines, polar.device
+        top.f_buses, top.t_buses,
+        pbm.intermediate.Cf,
+        pbm.intermediate.Ct,
+        nlines, polar.device
     )
 end
 
diff --git a/src/Polar/kernels.jl b/src/Polar/kernels.jl
index 453fdf14..14a9698a 100644
--- a/src/Polar/kernels.jl
+++ b/src/Polar/kernels.jl
@@ -730,7 +730,7 @@ function adj_branch_flow!(
         adj_vm_from_lines, adj_va_from_lines, adj_vm_to_lines, adj_va_to_lines,
         yff_re, yft_re, ytf_re, ytt_re,
         yff_im, yft_im, ytf_im, ytt_im,
-        f, t, nlines, device
+        f, t, Cf, Ct, nlines, device
     )
     nvbus = length(vang)
     kernel_edge! = adj_branch_flow_edge_kernel!(device)
@@ -745,13 +745,18 @@ function adj_branch_flow!(
             dependencies=Event(device)
     )
     wait(ev)
-    ev = kernel_node!(
-            vmag, adj_vm, vang, adj_va,
-            adj_va_to_lines, adj_va_from_lines, adj_vm_to_lines, adj_vm_from_lines,
-            f, t, nlines, ndrange = (nvbus, size(adj_slines, 2)),
-            dependencies=Event(device)
-    )
-    wait(ev)
+
+    mul!(adj_vm, Cf, adj_vm_from_lines, 1.0, 1.0)
+    mul!(adj_vm, Ct, adj_vm_to_lines, 1.0, 1.0)
+    mul!(adj_va, Cf, adj_va_from_lines, 1.0, 1.0)
+    mul!(adj_va, Ct, adj_va_to_lines, 1.0, 1.0)
+    # ev = kernel_node!(
+    #         vmag, adj_vm, vang, adj_va,
+    #         adj_va_to_lines, adj_va_from_lines, adj_vm_to_lines, adj_vm_from_lines,
+    #         f, t, nlines, ndrange = (nvbus, size(adj_slines, 2)),
+    #         dependencies=Event(device)
+    # )
+    # wait(ev)
 end
 
 KA.@kernel function basis_kernel!(
@@ -766,7 +771,7 @@ KA.@kernel function basis_kernel!(
             to_bus = t[ℓ]
             Δθ = vang[fr_bus, j] - vang[to_bus, j]
             cosθ = cos(Δθ)
-            cons[i,        j] = vmag[fr_bus, j] * vmag[to_bus, j] * cosθ
+            cons[i, j] = vmag[fr_bus, j] * vmag[to_bus, j] * cosθ
         elseif i <= 2 * nlines
             ℓ = i - nlines
             fr_bus = f[ℓ]
diff --git a/test/Polar/autodiff.jl b/test/Polar/autodiff.jl
index 6935a11f..5c57df14 100644
--- a/test/Polar/autodiff.jl
+++ b/test/Polar/autodiff.jl
@@ -15,7 +15,7 @@ function test_constraints_jacobian(polar, device, MT)
     println(devnull, jx)
 
     # Solve power flow
-    conv = powerflow(polar, jx, cache, NewtonRaphson(tol=1e-14); linear_solver=LS.DirectSolver(jx.J))
+    conv = powerflow(polar, jx, cache, NewtonRaphson(tol=1e-12))
     # Get solution in complex form.
     V = ExaPF.voltage_host(cache)
 
@@ -42,6 +42,7 @@ function test_constraints_jacobian(polar, device, MT)
         J = AutoDiff.jacobian!(polar, xjacobianAD, cache)
         # Matpower Jacobian
         Jmat_x = ExaPF.matpower_jacobian(polar, State(), cons, V)
+        # Evaluate Jacobian transpose vector product
 
         # Compare with FiniteDiff
         function jac_fd_x(x)
@@ -60,8 +61,8 @@ function test_constraints_jacobian(polar, device, MT)
         ∂cons = pbm.stack
 
         @test size(J) == (m, length(x))
-        @test isapprox(Jd, Jx, rtol=1e-6)
-        @test isapprox(Jmat_x, Jx, rtol=1e-6)
+        @test isapprox(Jd, Jx, rtol=1e-5)
+        @test isapprox(Jmat_x, Jx, rtol=1e-4)
         @test isapprox(∂cons.∂x, xjacobianAD.J' * tgt, rtol=1e-6)
 
         ## CONTROL JACOBIAN
@@ -86,7 +87,7 @@ function test_constraints_jacobian(polar, device, MT)
         if !isnothing(ujacobianAD.J)
             Ju = ujacobianAD.J |> SparseMatrixCSC |> Array
             @test size(J) == (m, length(u))
-            @test isapprox(Jd, Ju, rtol=1e-6)
+            @test isapprox(Jd, Ju, rtol=1e-5)
             @test isapprox(Jmat_u, Ju, rtol=1e-6)
             @test isapprox(∂cons.∂u, ujacobianAD.J' * tgt, rtol=1e-6)
         end
@@ -117,6 +118,7 @@ function test_constraints_adjoint(polar, device, MT)
         ExaPF.flow_constraints,
         ExaPF.bus_power_injection,
         ExaPF.network_operations,
+        ExaPF.network_basis,
     ]
         m = ExaPF.size_constraint(polar, cons)
         pbm = AutoDiff.TapeMemory(polar, cons, typeof(u))

From 803614a974baa7d0de42414348ec217840318fa5 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Thu, 23 Dec 2021 11:30:51 -0600
Subject: [PATCH 04/34] clean code before PR

---
 src/Polar/kernels.jl | 33 ++++-----------------------------
 1 file changed, 4 insertions(+), 29 deletions(-)

diff --git a/src/Polar/kernels.jl b/src/Polar/kernels.jl
index 14a9698a..76c930be 100644
--- a/src/Polar/kernels.jl
+++ b/src/Polar/kernels.jl
@@ -706,25 +706,6 @@ KA.@kernel function adj_branch_flow_edge_kernel!(
     adj_va_to_lines[ℓ, j] -= adj_Δθ
 end
 
-KA.@kernel function adj_branch_flow_node_kernel!(
-    @Const(vmag), adj_vm, @Const(vang), adj_va,
-    @Const(adj_va_to_lines), @Const(adj_va_from_lines),
-    @Const(adj_vm_to_lines), @Const(adj_vm_from_lines),
-    @Const(f), @Const(t), nlines
-)
-    i, j = @index(Global, NTuple)
-    @inbounds for ℓ in 1:nlines
-        if f[ℓ] == i
-            adj_vm[i, j] += adj_vm_from_lines[ℓ, j]
-            adj_va[i, j] += adj_va_from_lines[ℓ, j]
-        end
-        if t[ℓ] == i
-            adj_vm[i, j] += adj_vm_to_lines[ℓ, j]
-            adj_va[i, j] += adj_va_to_lines[ℓ, j]
-        end
-    end
-end
-
 function adj_branch_flow!(
         adj_slines, vmag, adj_vm, vang, adj_va,
         adj_vm_from_lines, adj_va_from_lines, adj_vm_to_lines, adj_va_to_lines,
@@ -733,10 +714,8 @@ function adj_branch_flow!(
         f, t, Cf, Ct, nlines, device
     )
     nvbus = length(vang)
-    kernel_edge! = adj_branch_flow_edge_kernel!(device)
-    kernel_node! = adj_branch_flow_node_kernel!(device)
 
-    ev = kernel_edge!(
+    ev = adj_branch_flow_edge_kernel!(device)(
             adj_slines, vmag, adj_vm, vang, adj_va,
             adj_va_to_lines, adj_va_from_lines, adj_vm_to_lines, adj_vm_from_lines,
             yff_re, yft_re, ytf_re, ytt_re,
@@ -746,17 +725,13 @@ function adj_branch_flow!(
     )
     wait(ev)
 
+    # Aggregate the adjoints on the nodes using the bus-node adjacency matrices.
+    # mul! should be overloaded on the GPU to work with dual numbers
+    # (needed to evaluate the Hessian using forward over reverse)
     mul!(adj_vm, Cf, adj_vm_from_lines, 1.0, 1.0)
     mul!(adj_vm, Ct, adj_vm_to_lines, 1.0, 1.0)
     mul!(adj_va, Cf, adj_va_from_lines, 1.0, 1.0)
     mul!(adj_va, Ct, adj_va_to_lines, 1.0, 1.0)
-    # ev = kernel_node!(
-    #         vmag, adj_vm, vang, adj_va,
-    #         adj_va_to_lines, adj_va_from_lines, adj_vm_to_lines, adj_vm_from_lines,
-    #         f, t, nlines, ndrange = (nvbus, size(adj_slines, 2)),
-    #         dependencies=Event(device)
-    # )
-    # wait(ev)
 end
 
 KA.@kernel function basis_kernel!(

From 98e0562cd39e88033601960bd7b5b2bed23d87b0 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Thu, 23 Dec 2021 15:26:18 -0600
Subject: [PATCH 05/34] fix tests on GPU

---
 test/gpu.jl      | 34 ++++++++++++++++++++++++++++++++++
 test/runtests.jl | 10 ++--------
 2 files changed, 36 insertions(+), 8 deletions(-)
 create mode 100644 test/gpu.jl

diff --git a/test/gpu.jl b/test/gpu.jl
new file mode 100644
index 00000000..15ea334e
--- /dev/null
+++ b/test/gpu.jl
@@ -0,0 +1,34 @@
+using LinearAlgebra
+using CUDAKernels
+using CUDA.CUSPARSE
+
+CUDA_ARCH = (CUDADevice(), CuArray, CuSparseMatrixCSR)
+push!(ARCHS, CUDA_ARCH)
+
+# Default sparse matrix on CUDA GPU
+ExaPF.default_sparse_matrix(::CUDADevice) = CuSparseMatrixCSR
+
+# LinearAlgebra.mul!
+@kernel function _spmm_kernel!(Y, X, colVal, rowPtr, nzVal, alpha, beta, n, m)
+    i, k = @index(Global, NTuple)
+    Y[i, k] *= beta
+    @inbounds for c in rowPtr[i]:rowPtr[i+1]-1
+        j = colVal[c]
+        Y[i, k] += alpha * nzVal[c] * X[j, k]
+    end
+end
+
+function LinearAlgebra.mul!(Y::AbstractArray{T, 2}, A::CuSparseMatrixCSR, X::AbstractArray{T, 2}, alpha::Number, beta::Number) where {T <: ForwardDiff.Dual}
+    # In-place Y = alpha * A * X + beta * Y for ForwardDiff.Dual entries, via _spmm_kernel!.
+    n, m = size(A)
+    p = size(X, 2)
+    # Throw instead of `@assert`: assertions are not meant for input validation.
+    size(Y, 1) == n || throw(DimensionMismatch("Y has $(size(Y, 1)) rows, A has $n"))
+    size(X, 1) == m || throw(DimensionMismatch("X has $(size(X, 1)) rows, A has $m columns"))
+    size(Y, 2) == p || throw(DimensionMismatch("Y has $(size(Y, 2)) columns, X has $p"))
+    ev = _spmm_kernel!(CUDADevice())(
+        Y, X, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m, ndrange=(n, p),
+    )
+    wait(ev)
+    return Y  # mul! returns its destination
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index d5487ceb..8efe04a5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -19,11 +19,7 @@ const CASES = ["case9.m", "case30.m"]
 
 ARCHS = Any[(CPU(), Array, SparseMatrixCSC)]
 if has_cuda_gpu()
-    using CUDAKernels
-    using CUDA.CUSPARSE
-    ExaPF.default_sparse_matrix(::CUDADevice) = CuSparseMatrixCSR
-    CUDA_ARCH = (CUDADevice(), CuArray, CuSparseMatrixCSR)
-    push!(ARCHS, CUDA_ARCH)
+    include("gpu.jl")
 end
 
 # Load test modules
@@ -65,9 +61,7 @@ init_time = time()
     end
     println()
 
-    @testset "Test Documentation" begin
-        include("quickstart.jl")
-    end
+    include("quickstart.jl")
 
     @testset "Test Benchmark script" begin
         empty!(ARGS)

From fb688c55026dc0bf3ca2916f3928bcc13701f235 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Thu, 30 Dec 2021 09:22:34 -0600
Subject: [PATCH 06/34] hotfix

---
 test/gpu.jl      | 2 +-
 test/runtests.jl | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/gpu.jl b/test/gpu.jl
index 15ea334e..4162f48b 100644
--- a/test/gpu.jl
+++ b/test/gpu.jl
@@ -8,7 +8,7 @@ push!(ARCHS, CUDA_ARCH)
 # Default sparse matrix on CUDA GPU
 ExaPF.default_sparse_matrix(::CUDADevice) = CuSparseMatrixCSR
 
-# LinearAlgebra.mul!
+# Differentiable LinearAlgebra.mul! for ForwardDiff
 @kernel function _spmm_kernel!(Y, X, colVal, rowPtr, nzVal, alpha, beta, n, m)
     i, k = @index(Global, NTuple)
     Y[i, k] *= beta
diff --git a/test/runtests.jl b/test/runtests.jl
index 8efe04a5..abd974d0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,6 +6,7 @@ using SparseArrays
 using CUDA
 using KernelAbstractions
 
+using ForwardDiff
 using FiniteDiff
 
 using ExaPF

From 18b517ce1da71a4ac5ab4ab53b93fec24f3373af Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Wed, 5 Jan 2022 15:52:20 -0600
Subject: [PATCH 07/34] reimplement all constraints with nonlinear basis

---
 src/Polar/functions.jl           | 306 +++++++++++++++++++++++++++++++
 src/Polar/kernels.jl             |  66 ++++++-
 src/Polar/polar.jl               |   1 +
 src/PowerSystem/PowerSystem.jl   |   1 +
 src/PowerSystem/power_network.jl |  44 +++++
 5 files changed, 413 insertions(+), 5 deletions(-)
 create mode 100644 src/Polar/functions.jl

diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
new file mode 100644
index 00000000..56fa259e
--- /dev/null
+++ b/src/Polar/functions.jl
@@ -0,0 +1,306 @@
+
+
+abstract type AbstractStack end
+
+struct NetworkStack{VT} <: AbstractStack
+    vmag::VT # voltage magnitudes
+    vang::VT # voltage angles
+    pgen::VT # active power generations
+    qgen::VT # reactive power generations
+    ψ::VT    # nonlinear basis ψ(vmag, vang)
+end
+
+function NetworkStack(polar::PolarForm{T,VI,VT,MT}) where {T,VI,VT,MT}
+    nbus = get(polar, PS.NumberOfBuses())
+    nlines = get(polar, PS.NumberOfLines())
+
+    vmag = abs.(polar.network.vbus) |> VT
+    vang = angle.(polar.network.vbus) |> VT
+    pgen = get(polar.network, PS.ActivePower()) |> VT
+    qgen = get(polar.network, PS.ReactivePower()) |> VT
+
+    ψ = VT(undef, 2*nlines + nbus) ; fill!(ψ, zero(T))
+
+    return NetworkStack{VT}(vmag, vang, pgen, qgen, ψ)
+end
+
+function fw_stack(polar)
+    VT = Vector{ForwardDiff.Dual{Nothing, Float64, 1}}
+    nbus = get(polar, PS.NumberOfBuses())
+    nlines = get(polar, PS.NumberOfLines())
+
+    vmag = abs.(polar.network.vbus) |> VT
+    vang = angle.(polar.network.vbus) |> VT
+    pgen = get(polar.network, PS.ActivePower()) |> VT
+    qgen = get(polar.network, PS.ReactivePower()) |> VT
+
+    ψ = VT(undef, 2*nlines + nbus)
+
+    return NetworkStack{VT}(vmag, vang, pgen, qgen, ψ)
+end
+
+function Base.empty!(state::NetworkStack)
+    fill!(state.vmag, 0.0)
+    fill!(state.vang, 0.0)
+    fill!(state.pgen, 0.0)
+    fill!(state.qgen, 0.0)
+    fill!(state.ψ, 0.0)
+    return
+end
+
+
+# update basis
+function forward_eval_intermediate(polar::PolarForm, state::NetworkStack)
+    _network_basis(polar, state.ψ, state.vmag, state.vang)
+end
+
+function reverse_eval_intermediate(polar::PolarForm, ∂state::NetworkStack, state::NetworkStack, intermediate)
+    nl = PS.get(polar.network, PS.NumberOfLines())
+    nb = PS.get(polar.network, PS.NumberOfBuses())
+    top = polar.topology
+    f = top.f_buses
+    t = top.t_buses
+
+    fill!(intermediate.∂edge_vm_fr , 0.0)
+    fill!(intermediate.∂edge_vm_to , 0.0)
+    fill!(intermediate.∂edge_va_fr , 0.0)
+    fill!(intermediate.∂edge_va_to , 0.0)
+
+    # Accumulate on edges
+    ndrange = (nl+nb, size(∂state.vmag, 2))
+    ev = adj_basis_kernel!(polar.device)(
+        ∂state.ψ,
+        ∂state.vmag,
+        intermediate.∂edge_vm_fr,
+        intermediate.∂edge_vm_to,
+        intermediate.∂edge_va_fr,
+        intermediate.∂edge_va_to,
+        state.vmag, state.vang, f, t, nl, nb,
+        ndrange=ndrange, dependencies=Event(polar.device),
+    )
+    wait(ev)
+
+    # Accumulate on nodes
+    Cf = intermediate.Cf
+    Ct = intermediate.Ct
+    mul!(∂state.vmag, Cf, intermediate.∂edge_vm_fr, 1.0, 1.0)
+    mul!(∂state.vmag, Ct, intermediate.∂edge_vm_to, 1.0, 1.0)
+    mul!(∂state.vang, Cf, intermediate.∂edge_va_fr, 1.0, 1.0)
+    mul!(∂state.vang, Ct, intermediate.∂edge_va_to, 1.0, 1.0)
+    return
+end
+
+#=
+    Generic expression
+=#
+
+abstract type AbstractExpression end
+
+function jacobian_transpose_product!(polar::PolarForm, pbm::AutoDiff.TapeMemory, jv, state, ∂v)
+    ∂state = pbm.stack
+    empty!(∂state)
+    adjoint!(pbm.func, ∂state, state, ∂v)
+    # Accumulate on vmag and vang
+    reverse_eval_intermediate(polar, ∂state, state, pbm.intermediate)
+    # Accumulate on x and u
+    reverse_transfer!(
+        polar, jv, ∂state,
+    )
+end
+
+
+#=
+    CostFunction
+=#
+
+struct CostFunction{VT, MT} <: AbstractExpression
+    gen_ref::Vector{Int}
+    M::MT
+    c::VT
+    c0::VT
+    c1::VT
+    c2::VT
+end
+
+function CostFunction(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
+    ngen = get(polar, PS.NumberOfGenerators())
+    SMT = default_sparse_matrix(polar.device)
+    # Load indexing
+    ref = polar.network.ref
+    ref_gen = polar.indexing.index_ref_to_gen
+    # Assemble matrix
+    M_tot = PS.get_basis_matrix(polar.network)
+    M = M_tot[ref, :] |> SMT
+
+    # costs
+    c = VT(undef, ngen)
+    # coefficients
+    coefs = polar.costs_coefficients
+    c0 = @view coefs[:, 2]
+    c1 = @view coefs[:, 3]
+    c2 = @view coefs[:, 4]
+    return CostFunction{VT, SMT}(ref_gen, M, c, c0, c1, c2)
+end
+
+Base.size(::CostFunction) = (1,)
+
+function (func::CostFunction)(state)
+    state.pgen[func.gen_ref] .= func.M * state.ψ
+    # func.c .= func.c0 .+ func.c1 .* state.pgen .+ func.c2 .* state.pgen.^2
+    return sum(func.c0 .+ func.c1 .* state.pgen .+ func.c2 .* state.pgen.^2)
+    # return sum(func.c)
+end
+
+function adjoint!(func::CostFunction, ∂state, state, ∂v)
+    ∂state.pgen .+= ∂v .* (func.c1 .+ 2.0 .* func.c2 .* state.pgen)
+    ∂state.ψ .-= func.M' * ∂state.pgen[func.gen_ref]
+    return
+end
+
+
+struct PowerFlowBalance{VT, MT} <: AbstractExpression
+    M::MT
+    Cg::MT
+    τ::VT
+end
+
+function PowerFlowBalance(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
+    SMT = default_sparse_matrix(polar.device)
+
+    pf = polar.network
+    ngen = pf.ngen
+    nbus = pf.nbus
+    gen = pf.gen2bus
+    pv = pf.pv
+    npq = length(pf.pq)
+
+    # Assemble matrices
+    Cg_tot = sparse(gen, 1:ngen, ones(ngen), nbus, ngen)
+    Cg = -[Cg_tot[pv, :] ; spzeros(2*npq, ngen)] |> SMT
+    M_tot = PS.get_basis_matrix(polar.network)
+    M = -M_tot[[pf.pv; pf.pq; nbus .+ pf.pq], :] |> SMT
+
+    # constant term
+    pload = PS.get(polar.network, PS.ActiveLoad())
+    qload = PS.get(polar.network, PS.ReactiveLoad())
+    τ = [pload[pf.pv]; pload[pf.pq]; qload[pf.pq]] |> VT
+
+    return PowerFlowBalance{VT, SMT}(M, Cg, τ)
+end
+
+Base.size(func::PowerFlowBalance) = size(func.τ)
+
+function (func::PowerFlowBalance)(cons, state)
+    cons .= func.τ .+ func.M * state.ψ .+ func.Cg * state.pgen
+    return
+end
+
+function adjoint!(func::PowerFlowBalance, ∂state, state, ∂v)
+    mul!(∂state.ψ, func.M', ∂v, 1.0, -1.0)
+    mul!(∂state.pgen, func.Cg', ∂v, 1.0, 1.0)
+    return
+end
+
+
+struct VoltageMagnitudePQ <: AbstractExpression
+    pq::Vector{Int}
+
+end
+VoltageMagnitudePQ(polar::PolarForm) = VoltageMagnitudePQ(polar.network.pq)
+
+Base.size(func::VoltageMagnitudePQ) = (length(func.pq),)
+
+function (func::VoltageMagnitudePQ)(cons, state)
+    cons .= state.vmag[func.pq]
+end
+
+function adjoint!(func::VoltageMagnitudePQ, ∂state, state, ∂v)
+    ∂state.vmag[func.pq] .+= ∂v
+end
+
+
+struct PowerGenerationBounds{VT, MT} <: AbstractExpression
+    M::MT
+    τ::VT
+end
+
+function PowerGenerationBounds(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
+    SMT = default_sparse_matrix(polar.device)
+    pf = polar.network
+    M_tot = get_basis_matrix(pf)
+
+    M = -M_tot[[pf.ref; nbus .+ pf.ref; nbus .+ pf.pv], :]
+
+    pload = PS.get(polar.network, PS.ActiveLoad())
+    qload = PS.get(polar.network, PS.ReactiveLoad())
+    τ = [pload[pf.ref]; qload[pf.ref]; qload[pf.pv]]
+
+    return PowerGenerationBounds{VT, SMT}(M, τ)
+end
+
+function (func::PowerGenerationBounds)(cons, state)
+    cons .= func.τ .+ func.M * state.ψ
+    return
+end
+
+function adjoint!(func::PowerGenerationBounds, ∂state, state, ∂v)
+    mul!(∂state.ψ, func.M', ∂v, 1.0, -1.0)
+    return
+end
+
+
+struct LineFlows{VT, MT} <: AbstractExpression
+    nlines::Int
+    Lfp::MT
+    Lfq::MT
+    Ltp::MT
+    Ltq::MT
+    sfp::VT
+    sfq::VT
+    stp::VT
+    stq::VT
+end
+
+function LineFlows(polar::PolarForm{T,VI,VT,MT}) where {T,VI,VT,MT}
+    nlines = get(polar, PS.NumberOfLines())
+    Lfp, Lfq, Ltp, Ltq = PS.get_line_flow_matrices(polar.network)
+    sfp = VT(undef, nlines)
+    sfq = VT(undef, nlines)
+    stp = VT(undef, nlines)
+    stq = VT(undef, nlines)
+    return LineFlows{VT,MT}(nlines, Lfp, Lfq, Ltp, Ltq, sfp, sfq, stp, stq)
+end
+
+Base.size(func::LineFlows) = 2 * func.nlines
+
+function (func::LineFlows)(cons, state)
+    mul!(func.sfp, func.Lfp, state.ψ)
+    mul!(func.sfq, func.Lfq, state.ψ)
+    mul!(func.stp, func.Ltp, state.ψ)
+    mul!(func.stq, func.Ltq, state.ψ)
+    cons[1:func.nlines] .= func.sfp.^2 .+ func.sfq.^2
+    cons[1+func.nlines:2*func.nlines] .= func.stp.^2 .+ func.stq.^2
+    return
+end
+
+function adjoint!(func::LineFlows, ∂state, state, ∂v)
+    nlines = func.nlines
+    mul!(func.sfp, func.Lfp, state.ψ)
+    mul!(func.sfq, func.Lfq, state.ψ)
+    mul!(func.stp, func.Ltp, state.ψ)
+    mul!(func.stq, func.Ltq, state.ψ)
+
+    func.sfp .*= ∂v[1:nlines]
+    func.sfq .*= ∂v[1:nlines]
+    func.stp .*= ∂v[1+nlines:2*nlines]
+    func.stq .*= ∂v[1+nlines:2*nlines]
+
+    # Accumulate adjoint
+    mul!(∂state.ψ, func.Lfp', func.sfp, 2.0, -1.0)
+    mul!(∂state.ψ, func.Lfq', func.sfq, 2.0, -1.0)
+    mul!(∂state.ψ, func.Ltp', func.stp, 2.0, -1.0)
+    mul!(∂state.ψ, func.Ltq', func.stq, 2.0, -1.0)
+
+    return
+end
+
diff --git a/src/Polar/kernels.jl b/src/Polar/kernels.jl
index 76c930be..7c9976f0 100644
--- a/src/Polar/kernels.jl
+++ b/src/Polar/kernels.jl
@@ -386,6 +386,62 @@ function adjoint_transfer!(
     wait(ev)
 end
 
+KA.@kernel function _reverse_transfer_kernel2!(
+        output, @Const(adj_vmag), @Const(adj_vang), @Const(adj_pgen), @Const(pv), @Const(pq), @Const(ref), @Const(pv2gen),
+    npq, npv, nref, ngen,
+)
+    i, j = @index(Global, NTuple)
+
+    output[i, j] = if i <= npv
+        # x (vang_pv)
+        k = pv[i]
+        adj_vang[k, j]
+    elseif i <= npv + npq
+        k = pq[i - npv]
+        # x (vang_pq)
+        adj_vang[k, j]
+    elseif i <= npv + 2*npq
+        # x (vmag_pq)
+        k = pq[i - npv - npq]
+        adj_vmag[k, j]
+    elseif i <= npv + 2*npq + nref
+        # u (vmag_ref)
+        k = ref[i - npv - 2*npq]
+        adj_vmag[k, j]
+    elseif i <= npv + 2*npq + nref + npv
+        # u (vmag_pv)
+        k = pv[i - npv - 2*npq - nref]
+        adj_vmag[k, j]
+    elseif i <= npv + 2*npq + nref + npv + ngen
+        # u (vmag_pg)
+        k = pv2gen[i - 2*npv - 2*npq - nref]
+        adj_pgen[k, j]
+    end
+
+end
+
+function reverse_transfer!(
+    polar::PolarForm,
+    output, ∂state,
+)
+    nx = get(polar, ExaPF.NumberOfState())
+    nu = get(polar, ExaPF.NumberOfControl())
+    nbus = get(polar, PS.NumberOfBuses())
+    pv = polar.indexing.index_pv
+    pq = polar.indexing.index_pq
+    ref = polar.indexing.index_ref
+    pv2gen = polar.indexing.index_pv_to_gen
+    ev = _reverse_transfer_kernel2!(polar.device)(
+        output,
+        ∂state.vmag, ∂state.vang, ∂state.pgen,
+        pv, pq, ref, pv2gen,
+        length(pq), length(pv), length(ref), length(pv2gen),
+        ndrange=(nx+nu, size(output, 2)),
+        dependencies=Event(polar.device)
+    )
+    wait(ev)
+end
+
 KA.@kernel function active_power_slack!(
     cons, vmag, vang, ref, pd,
     @Const(ybus_re_nzval), @Const(ybus_re_colptr), @Const(ybus_re_rowval), @Const(ybus_im_nzval),
@@ -777,19 +833,19 @@ KA.@kernel function adj_basis_kernel!(
             cosθ = cos(Δθ)
             sinθ = sin(Δθ)
 
-            adj_vang_fr[i]  = -vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
+            adj_vang_fr[i] += -vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
             adj_vang_fr[i] +=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
-            adj_vang_to[i]  =  vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
+            adj_vang_to[i] +=  vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
             adj_vang_to[i] -=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
 
-            adj_vmag_fr[i] =  vmag[to_bus, j] * cosθ * ∂cons[ℓ, j]
+            adj_vmag_fr[i] +=  vmag[to_bus, j] * cosθ * ∂cons[ℓ, j]
             adj_vmag_fr[i] += vmag[to_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
 
-            adj_vmag_to[i] =  vmag[fr_bus, j] * cosθ * ∂cons[ℓ, j]
+            adj_vmag_to[i] +=  vmag[fr_bus, j] * cosθ * ∂cons[ℓ, j]
             adj_vmag_to[i] += vmag[fr_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
         else i <= nlines + nbus
             b = i - nlines
-            adj_vmag[b, j] = 2.0 * vmag[b, j] * ∂cons[b+2*nlines, j]
+            adj_vmag[b, j] += 2.0 * vmag[b, j] * ∂cons[b+2*nlines, j]
         end
     end
 end
diff --git a/src/Polar/polar.jl b/src/Polar/polar.jl
index 3f5450e7..ef4d6de6 100644
--- a/src/Polar/polar.jl
+++ b/src/Polar/polar.jl
@@ -42,6 +42,7 @@ include("Constraints/constraints.jl")
 include("powerflow.jl")
 include("objective.jl")
 include("batch.jl")
+include("functions.jl")
 
 function PolarForm(pf::PS.PowerNetwork, device::KA.Device)
     if isa(device, KA.CPU)
diff --git a/src/PowerSystem/PowerSystem.jl b/src/PowerSystem/PowerSystem.jl
index 32188052..d75a8e5a 100644
--- a/src/PowerSystem/PowerSystem.jl
+++ b/src/PowerSystem/PowerSystem.jl
@@ -1,6 +1,7 @@
 module PowerSystem
 
 using Printf
+using LinearAlgebra
 using SparseArrays
 
 import Base: show, get
diff --git a/src/PowerSystem/power_network.jl b/src/PowerSystem/power_network.jl
index a14fbf1d..2f8b8f0a 100644
--- a/src/PowerSystem/power_network.jl
+++ b/src/PowerSystem/power_network.jl
@@ -303,3 +303,47 @@ function get_costs_coefficients(pf::PowerNetwork)
     return coefficients
 end
 
+function get_basis_matrix(pf::PowerNetwork)
+    nb = pf.nbus
+    nl = size(pf.branches, 1)
+    Yff, Yft, Ytf, Ytt = pf.lines.Yff, pf.lines.Yft, pf.lines.Ytf, pf.lines.Ytt
+    f, t = pf.lines.from_buses, pf.lines.to_buses
+
+    Cf = sparse(f, 1:nl, ones(nl), nb, nl)       # connection matrix for line & from buses
+    Ct = sparse(t, 1:nl, ones(nl), nb, nl)       # connection matrix for line & to buses
+
+    ysh = (pf.buses[:, 5] .+ 1im .* pf.buses[:, 6]) ./ pf.baseMVA # vector of shunt admittances
+    Ysh = sparse(1:nb, 1:nb, ysh, nb, nb)
+
+    # Build matrix
+    Yc = Cf * Diagonal(Yft) + Ct * Diagonal(Ytf)
+    Ys = Cf * Diagonal(Yft) - Ct * Diagonal(Ytf)
+    Yd = Cf * Diagonal(Yff) * Cf' + Ct * Diagonal(Ytt) * Ct' + Ysh
+
+    return [-real(Yc) -imag(Ys) -real(Yd);
+             imag(Yc)  -real(Ys)  imag(Yd)]
+end
+
+function get_line_flow_matrices(pf::PowerNetwork)
+    nb = pf.nbus
+    nl = size(pf.branches, 1)
+    Yff, Yft, Ytf, Ytt = pf.lines.Yff, pf.lines.Yft, pf.lines.Ytf, pf.lines.Ytt
+
+    yff = Diagonal(Yff)
+    yft = Diagonal(Yft)
+    ytf = Diagonal(Ytf)
+    ytt = Diagonal(Ytt)
+
+    f, t = pf.lines.from_buses, pf.lines.to_buses
+
+    Cf = sparse(f, 1:nl, ones(nl), nb, nl)       # connection matrix for line & from buses
+    Ct = sparse(t, 1:nl, ones(nl), nb, nl)       # connection matrix for line & to buses
+
+    # Build matrix
+    Lfp = [real(yft)  imag(yft)  real(yff) * Cf']
+    Lfq = [-imag(yft) real(yft) -imag(yff) * Cf']
+    Ltp = [real(ytf)  -imag(ytf)  real(ytt) * Ct']
+    Ltq = [-imag(ytf) -real(ytf) -imag(ytt) * Ct']
+    return (Lfp, Lfq, Ltp, Ltq)
+end
+

From 600ee8a5cd2b27af4c077414f145eede71f0d995 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Wed, 5 Jan 2022 20:24:17 -0600
Subject: [PATCH 08/34] reimplement Jacobian code

---
 src/Polar/Constraints/constraints.jl |   2 +-
 src/Polar/first_order.jl             | 112 +++++++++++++++++++++++++++
 src/Polar/functions.jl               |  94 +++++++++++++---------
 3 files changed, 170 insertions(+), 38 deletions(-)
 create mode 100644 src/Polar/first_order.jl

diff --git a/src/Polar/Constraints/constraints.jl b/src/Polar/Constraints/constraints.jl
index 26e309f3..e261f6db 100644
--- a/src/Polar/Constraints/constraints.jl
+++ b/src/Polar/Constraints/constraints.jl
@@ -113,7 +113,7 @@ function jacobian_transpose_product!(
 end
 
 ## Sparsity detection
-function jacobian_sparsity(polar::PolarForm, func::Function, xx::AbstractVariable)
+function jacobian_sparsity(polar::PolarForm, func, xx::AbstractVariable)
     nbus = get(polar, PS.NumberOfBuses())
     Vre = Float64[i for i in 1:nbus]
     Vim = Float64[i for i in nbus+1:2*nbus]
diff --git a/src/Polar/first_order.jl b/src/Polar/first_order.jl
new file mode 100644
index 00000000..3efdf94f
--- /dev/null
+++ b/src/Polar/first_order.jl
@@ -0,0 +1,112 @@
+
+function jacobian_transpose_product!(polar::PolarForm, pbm::AutoDiff.TapeMemory, jv, state, ∂v)
+    ∂state = pbm.stack
+    empty!(∂state)
+    adjoint!(pbm.func, ∂state, state, ∂v)
+    # Accumulate on vmag and vang
+    reverse_eval_intermediate(polar, ∂state, state, pbm.intermediate)
+    # Accumulate on x and u
+    reverse_transfer!(
+        polar, jv, ∂state,
+    )
+end
+
+struct MyJacobian{Func, VD, SMT, MT, VI, VP}
+    func::Func
+    stack::NetworkStack{VD}
+    J::SMT
+    compressedJ::MT
+    coloring::VI
+    map::VI
+    t1sseeds::VP
+    t1sF::VD
+end
+
+
+function my_map(polar::PolarForm, ::State)
+    nbus = get(polar, PS.NumberOfBuses())
+    ref, pv, pq = index_buses_device(polar)
+    return Int[nbus .+ pv; nbus .+ pq; pq]
+end
+function my_map(polar::PolarForm, ::Control)
+    nbus = get(polar, PS.NumberOfBuses())
+    ref, pv, pq = index_buses_device(polar)
+    pv2gen = polar.network.pv2gen
+    return Int[ref; pv; 2*nbus .+ pv2gen]
+end
+
+number(polar::PolarForm, ::State) = get(polar, NumberOfState())
+number(polar::PolarForm, ::Control) = get(polar, NumberOfControl())
+
+function MyJacobian(
+    polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, variable,
+) where {T, VI, VT, MT}
+    (SMT, A) = get_jacobian_types(polar.device)
+
+    pf = polar.network
+    nbus = PS.get(pf, PS.NumberOfBuses())
+    nlines = PS.get(pf, PS.NumberOfLines())
+    ngen = PS.get(pf, PS.NumberOfGenerators())
+
+    # Sparsity pattern
+    J = jacobian_sparsity(polar, func, variable)
+    # Coloring
+    coloring = AutoDiff.SparseDiffTools.matrix_colors(J)
+    ncolor = size(unique(coloring),1)
+
+    m = size(J, 1)
+
+    map = my_map(polar, variable)
+    nmap = number(polar, variable)
+
+    # Move Jacobian to the GPU
+    J = convert(SMT, J)
+
+    # Seedings
+    t1s{N} = ForwardDiff.Dual{Nothing,Float64, N} where N
+    stack = NetworkStack(nbus, ngen, nlines, A{t1s{ncolor}})
+    t1sF = A{t1s{ncolor}}(zeros(Float64, m))
+    t1sseeds = AutoDiff.init_seed(coloring, ncolor, nmap)
+
+    # Move the seeds over to the device, if necessary
+    gput1sseeds = A{ForwardDiff.Partials{ncolor,Float64}}(t1sseeds)
+    compressedJ = MT(zeros(Float64, ncolor, m))
+
+    return MyJacobian(
+        func, stack, J, compressedJ, coloring, map, gput1sseeds, t1sF,
+    )
+end
+
+@kernel function _seed_kernel2!(
+    duals::AbstractArray{ForwardDiff.Dual{T, V, N}}, @Const(x),
+    @Const(seeds), @Const(map),
+) where {T,V,N}
+    i = @index(Global, Linear)
+    duals[map[i]] = ForwardDiff.Dual{T,V,N}(x[map[i]], seeds[i])
+end
+
+function myseed!(dest, src, seeds, map, device)
+    y = dest.input
+    x = src.input
+    ev = _seed_kernel2!(device)(
+        y, x, seeds, map, ndrange=length(map), dependencies=Event(device))
+    wait(ev)
+end
+
+function jacobian!(
+    polar::PolarForm, jac::MyJacobian, state,
+)
+    # init
+    jac.stack.input .= state.input
+    jac.t1sF .= 0.0
+    # seed
+    myseed!(jac.stack, state, jac.t1sseeds, jac.map, polar.device)
+    # forward pass
+    forward_eval_intermediate(polar, jac.stack)
+    jac.func(jac.t1sF, jac.stack)
+    # uncompress
+    AutoDiff.getpartials_kernel!(jac.compressedJ, jac.t1sF, polar.device)
+    AutoDiff.uncompress_kernel!(jac.J, jac.compressedJ, jac.coloring, polar.device)
+    return jac.J
+end
+
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 56fa259e..8ccdd7f5 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -3,47 +3,47 @@
 abstract type AbstractStack end
 
 struct NetworkStack{VT} <: AbstractStack
+    # INPUT
+    input::VT
     vmag::VT # voltage magnitudes
     vang::VT # voltage angles
     pgen::VT # active power generations
-    qgen::VT # reactive power generations
+    # INTERMEDIATE
     ψ::VT    # nonlinear basis ψ(vmag, vang)
 end
 
-function NetworkStack(polar::PolarForm{T,VI,VT,MT}) where {T,VI,VT,MT}
-    nbus = get(polar, PS.NumberOfBuses())
-    nlines = get(polar, PS.NumberOfLines())
-
-    vmag = abs.(polar.network.vbus) |> VT
-    vang = angle.(polar.network.vbus) |> VT
-    pgen = get(polar.network, PS.ActivePower()) |> VT
-    qgen = get(polar.network, PS.ReactivePower()) |> VT
+function NetworkStack(nbus, ngen, nlines, VT)
+    input = VT(undef, 2*nbus + ngen) ; fill!(input, 0.0)
+    # Wrap directly array x to avoid dealing with views
+    p0 = pointer(input)
+    vmag = unsafe_wrap(VT, p0, nbus)
+    p1 = pointer(input, nbus+1)
+    vang = unsafe_wrap(VT, p1, nbus)
+    p2 = pointer(input, 2*nbus+1)
+    pgen = unsafe_wrap(VT, p2, ngen)
 
-    ψ = VT(undef, 2*nlines + nbus) ; fill!(ψ, zero(T))
+    ψ = VT(undef, 2*nlines + nbus) ; fill!(ψ, 0.0)
 
-    return NetworkStack{VT}(vmag, vang, pgen, qgen, ψ)
+    return NetworkStack{VT}(input, vmag, vang, pgen, ψ)
 end
 
-function fw_stack(polar)
-    VT = Vector{ForwardDiff.Dual{Nothing, Float64, 1}}
+function NetworkStack(polar::PolarForm{T,VI,VT,MT}) where {T,VI,VT,MT}
     nbus = get(polar, PS.NumberOfBuses())
+    ngen = get(polar, PS.NumberOfGenerators())
     nlines = get(polar, PS.NumberOfLines())
 
-    vmag = abs.(polar.network.vbus) |> VT
-    vang = angle.(polar.network.vbus) |> VT
-    pgen = get(polar.network, PS.ActivePower()) |> VT
-    qgen = get(polar.network, PS.ReactivePower()) |> VT
+    stack = NetworkStack(nbus, ngen, nlines, VT)
 
-    ψ = VT(undef, 2*nlines + nbus)
-
-    return NetworkStack{VT}(vmag, vang, pgen, qgen, ψ)
+    copyto!(stack.vmag, abs.(polar.network.vbus))
+    copyto!(stack.vang, angle.(polar.network.vbus))
+    copyto!(stack.pgen, get(polar.network, PS.ActivePower()))
+    return stack
 end
 
 function Base.empty!(state::NetworkStack)
     fill!(state.vmag, 0.0)
     fill!(state.vang, 0.0)
     fill!(state.pgen, 0.0)
-    fill!(state.qgen, 0.0)
     fill!(state.ψ, 0.0)
     return
 end
@@ -96,17 +96,8 @@ end
 
 abstract type AbstractExpression end
 
-function jacobian_transpose_product!(polar::PolarForm, pbm::AutoDiff.TapeMemory, jv, state, ∂v)
-    ∂state = pbm.stack
-    empty!(∂state)
-    adjoint!(pbm.func, ∂state, state, ∂v)
-    # Accumulate on vmag and vang
-    reverse_eval_intermediate(polar, ∂state, state, pbm.intermediate)
-    # Accumulate on x and u
-    reverse_transfer!(
-        polar, jv, ∂state,
-    )
-end
+
+include("first_order.jl")
 
 
 #=
@@ -146,9 +137,8 @@ Base.size(::CostFunction) = (1,)
 
 function (func::CostFunction)(state)
     state.pgen[func.gen_ref] .= func.M * state.ψ
-    # func.c .= func.c0 .+ func.c1 .* state.pgen .+ func.c2 .* state.pgen.^2
-    return sum(func.c0 .+ func.c1 .* state.pgen .+ func.c2 .* state.pgen.^2)
-    # return sum(func.c)
+    func.c .= func.c0 .+ func.c1 .* state.pgen .+ func.c2 .* state.pgen.^2
+    return sum(func.c)
 end
 
 function adjoint!(func::CostFunction, ∂state, state, ∂v)
@@ -191,7 +181,9 @@ end
 Base.size(func::PowerFlowBalance) = size(func.τ)
 
 function (func::PowerFlowBalance)(cons, state)
-    cons .= func.τ .+ func.M * state.ψ .+ func.Cg * state.pgen
+    cons .= func.τ
+    mul!(cons, func.M, state.ψ, 1.0, 1.0)
+    mul!(cons, func.Cg, state.pgen, 1.0, 1.0)
     return
 end
 
@@ -227,7 +219,8 @@ end
 function PowerGenerationBounds(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
     SMT = default_sparse_matrix(polar.device)
     pf = polar.network
-    M_tot = get_basis_matrix(pf)
+    nbus = pf.nbus
+    M_tot = PS.get_basis_matrix(pf)
 
     M = -M_tot[[pf.ref; nbus .+ pf.ref; nbus .+ pf.pv], :]
 
@@ -238,6 +231,8 @@ function PowerGenerationBounds(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT
     return PowerGenerationBounds{VT, SMT}(M, τ)
 end
 
+Base.size(func::PowerGenerationBounds) = size(func.τ)
+
 function (func::PowerGenerationBounds)(cons, state)
     cons .= func.τ .+ func.M * state.ψ
     return
@@ -304,3 +299,28 @@ function adjoint!(func::LineFlows, ∂state, state, ∂v)
     return
 end
 
+function matpower_jacobian(polar::PolarForm, X::Union{State, Control}, func::PowerFlowBalance, V)
+    nbus = get(polar, PS.NumberOfBuses())
+    pf = polar.network
+    ref, pv, pq = index_buses_host(polar)
+    nref = length(ref)
+    npv = length(pv)
+    npq = length(pq)
+    Ybus = pf.Ybus
+
+    dSbus_dVm, dSbus_dVa = PS.matpower_residual_jacobian(V, Ybus)
+
+    if isa(X, State)
+        j11 = real(dSbus_dVa[[pv; pq], [pv; pq]])
+        j12 = real(dSbus_dVm[[pv; pq], pq])
+        j21 = imag(dSbus_dVa[pq, [pv; pq]])
+        j22 = imag(dSbus_dVm[pq, pq])
+        return [j11 j12; j21 j22]::SparseMatrixCSC{Float64, Int}
+    elseif isa(X, Control)
+        j11 = real(dSbus_dVm[[pv; pq], [ref; pv]])
+        j12 = sparse(I, npv + npq, npv)
+        j21 = imag(dSbus_dVm[pq, [ref; pv]])
+        j22 = spzeros(npq, npv)
+        return [j11 -j12; j21 j22]::SparseMatrixCSC{Float64, Int}
+    end
+end

From b2bc6fa1c3682945758728fa790e1f3d25ee1d83 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Wed, 5 Jan 2022 21:58:04 -0600
Subject: [PATCH 09/34] reimplement Hessian

---
 src/Polar/functions.jl    |  1 +
 src/Polar/second_order.jl | 71 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 src/Polar/second_order.jl

diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 8ccdd7f5..999464b6 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -98,6 +98,7 @@ abstract type AbstractExpression end
 
 
 include("first_order.jl")
+include("second_order.jl")
 
 
 #=
diff --git a/src/Polar/second_order.jl b/src/Polar/second_order.jl
new file mode 100644
index 00000000..8f28f6bf
--- /dev/null
+++ b/src/Polar/second_order.jl
@@ -0,0 +1,71 @@
+
+struct MyHessian{Func, VD, VI, T1, T2, Buff} <: AutoDiff.AbstractHessian
+    func::Func
+    state::NetworkStack{VD}
+    ∂state::NetworkStack{VD}
+    host_t1sseeds::T1 # Needed because seeds have to be created on the host
+    t1sseeds::T2
+    t1sF::VD
+    ∂t1sF::VD
+    map::VI
+    buffer::Buff
+end
+
+function MyHessian(polar::PolarForm{T, VI, VT, MT}, func) where {T, VI, VT, MT}
+    (SMT, A) = get_jacobian_types(polar.device)
+
+    pf = polar.network
+    nbus = PS.get(pf, PS.NumberOfBuses())
+    nlines = PS.get(pf, PS.NumberOfLines())
+    ngen = PS.get(pf, PS.NumberOfGenerators())
+
+    n_cons = size(func)[1]
+
+    map = [my_map(polar, State()); my_map(polar, Control())] |> VI
+    nmap = length(map)
+
+    t1s{N} = ForwardDiff.Dual{Nothing,Float64, N} where N
+    VD = A{t1s{1}}
+
+    # ̇x
+    stack = NetworkStack(nbus, ngen, nlines, VD)
+    # ̄y
+    ∂stack = NetworkStack(nbus, ngen, nlines, VD)
+
+    t1sF = zeros(Float64, n_cons) |> VD
+    adj_t1sF = similar(t1sF)
+
+    # Seedings
+    host_t1sseeds = Vector{ForwardDiff.Partials{1, Float64}}(undef, nmap)
+    t1sseeds = A{ForwardDiff.Partials{1, Float64}}(undef, nmap)
+
+    intermediate = _get_intermediate_stack(polar, network_basis, VD, 1)
+    return MyHessian(
+        func, stack, ∂stack, host_t1sseeds, t1sseeds, t1sF, adj_t1sF, map, intermediate,
+    )
+end
+
+function hprod!(
+    polar, H::MyHessian, hv, state, λ, v,
+)
+    @assert length(hv) == length(v)
+
+    # Init dual variables
+    H.state.input .= state.input
+    empty!(H.∂state)
+    H.∂t1sF .= λ
+
+    # Seeding
+    nmap = length(H.map)
+    # Init seed
+    _init_seed_hessian!(H.t1sseeds, H.host_t1sseeds, v, nmap)
+    myseed!(H.state, state, H.t1sseeds, H.map, polar.device)
+
+    # Reverse
+    adjoint!(H.func, H.∂state, H.state, H.∂t1sF)
+    reverse_eval_intermediate(polar, H.∂state, H.state, H.buffer)
+
+    AutoDiff.getpartials_kernel!(hv, H.∂state.input, H.map, polar.device)
+    return
+end
+

From 48ad8a0c40f4b6925a01fec337eedf17dae0cc3c Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Thu, 6 Jan 2022 12:15:08 -0600
Subject: [PATCH 10/34] add proper intermediate state

---
 src/Polar/functions.jl | 112 ++++++++++++++++++-----------------------
 src/Polar/legacy.jl    |  74 +++++++++++++++++++++++++++
 src/architectures.jl   |   4 +-
 3 files changed, 126 insertions(+), 64 deletions(-)
 create mode 100644 src/Polar/legacy.jl

diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 999464b6..b6a0cb8b 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -2,7 +2,7 @@
 
 abstract type AbstractStack end
 
-struct NetworkStack{VT} <: AbstractStack
+struct NetworkStack{VT,NT} <: AbstractStack
     # INPUT
     input::VT
     vmag::VT # voltage magnitudes
@@ -10,6 +10,7 @@ struct NetworkStack{VT} <: AbstractStack
     pgen::VT # active power generations
     # INTERMEDIATE
     ψ::VT    # nonlinear basis ψ(vmag, vang)
+    intermediate::NT
 end
 
 function NetworkStack(nbus, ngen, nlines, VT)
@@ -22,9 +23,18 @@ function NetworkStack(nbus, ngen, nlines, VT)
     p2 = pointer(input, 2*nbus+1)
     pgen = unsafe_wrap(VT, p2, ngen)
 
+    # Basis function
     ψ = VT(undef, 2*nlines + nbus) ; fill!(ψ, 0.0)
+    # Intermediate expressions to avoid unecessary allocations
+    intermediate = (
+        c = VT(undef, ngen),     # buffer for costs
+        sfp = VT(undef, nlines), # buffer for line-flow
+        sfq = VT(undef, nlines), # buffer for line-flow
+        stp = VT(undef, nlines), # buffer for line-flow
+        stq = VT(undef, nlines), # buffer for line-flow
+    )
 
-    return NetworkStack{VT}(input, vmag, vang, pgen, ψ)
+    return NetworkStack(input, vmag, vang, pgen, ψ, intermediate)
 end
 
 function NetworkStack(polar::PolarForm{T,VI,VT,MT}) where {T,VI,VT,MT}
@@ -108,7 +118,6 @@ include("second_order.jl")
 struct CostFunction{VT, MT} <: AbstractExpression
     gen_ref::Vector{Int}
     M::MT
-    c::VT
     c0::VT
     c1::VT
     c2::VT
@@ -124,22 +133,21 @@ function CostFunction(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
     M_tot = PS.get_basis_matrix(polar.network)
     M = M_tot[ref, :] |> SMT
 
-    # costs
-    c = VT(undef, ngen)
     # coefficients
     coefs = polar.costs_coefficients
     c0 = @view coefs[:, 2]
     c1 = @view coefs[:, 3]
     c2 = @view coefs[:, 4]
-    return CostFunction{VT, SMT}(ref_gen, M, c, c0, c1, c2)
+    return CostFunction{VT, SMT}(ref_gen, M, c0, c1, c2)
 end
 
 Base.size(::CostFunction) = (1,)
 
 function (func::CostFunction)(state)
+    costs = state.intermediate.c
     state.pgen[func.gen_ref] .= func.M * state.ψ
-    func.c .= func.c0 .+ func.c1 .* state.pgen .+ func.c2 .* state.pgen.^2
-    return sum(func.c)
+    costs .= func.c0 .+ func.c1 .* state.pgen .+ func.c2 .* state.pgen.^2
+    return sum(costs)
 end
 
 function adjoint!(func::CostFunction, ∂state, state, ∂v)
@@ -251,77 +259,57 @@ struct LineFlows{VT, MT} <: AbstractExpression
     Lfq::MT
     Ltp::MT
     Ltq::MT
-    sfp::VT
-    sfq::VT
-    stp::VT
-    stq::VT
 end
 
 function LineFlows(polar::PolarForm{T,VI,VT,MT}) where {T,VI,VT,MT}
+    SMT = default_sparse_matrix(polar.device)
     nlines = get(polar, PS.NumberOfLines())
     Lfp, Lfq, Ltp, Ltq = PS.get_line_flow_matrices(polar.network)
-    sfp = VT(undef, nlines)
-    sfq = VT(undef, nlines)
-    stp = VT(undef, nlines)
-    stq = VT(undef, nlines)
-    return LineFlows{VT,MT}(nlines, Lfp, Lfq, Ltp, Ltq, sfp, sfq, stp, stq)
+    return LineFlows{VT,SMT}(nlines, Lfp, Lfq, Ltp, Ltq)
 end
 
 Base.size(func::LineFlows) = 2 * func.nlines
 
-function (func::LineFlows)(cons, state)
-    mul!(func.sfp, func.Lfp, state.ψ)
-    mul!(func.sfq, func.Lfq, state.ψ)
-    mul!(func.stp, func.Ltp, state.ψ)
-    mul!(func.stq, func.Ltq, state.ψ)
-    cons[1:func.nlines] .= func.sfp.^2 .+ func.sfq.^2
-    cons[1+func.nlines:2*func.nlines] .= func.stp.^2 .+ func.stq.^2
+function (func::LineFlows)(cons::VT, state::NetworkStack{VT,S}) where {VT<:AbstractVector, S}
+    sfp = state.intermediate.sfp::VT
+    sfq = state.intermediate.sfq::VT
+    stp = state.intermediate.stp::VT
+    stq = state.intermediate.stq::VT
+
+    # TODO: When Dual numbers are used, mul! dispatches on
+    # default Julia implementation, to slow for our use case
+    mul!(sfp, func.Lfp, state.ψ)
+    mul!(sfq, func.Lfq, state.ψ)
+    mul!(stp, func.Ltp, state.ψ)
+    mul!(stq, func.Ltq, state.ψ)
+    cons[1:func.nlines] .= sfp.^2 .+ sfq.^2
+    cons[1+func.nlines:2*func.nlines] .= stp.^2 .+ stq.^2
     return
 end
 
 function adjoint!(func::LineFlows, ∂state, state, ∂v)
     nlines = func.nlines
-    mul!(func.sfp, func.Lfp, state.ψ)
-    mul!(func.sfq, func.Lfq, state.ψ)
-    mul!(func.stp, func.Ltp, state.ψ)
-    mul!(func.stq, func.Ltq, state.ψ)
-
-    func.sfp .*= ∂v[1:nlines]
-    func.sfq .*= ∂v[1:nlines]
-    func.stp .*= ∂v[1+nlines:2*nlines]
-    func.stq .*= ∂v[1+nlines:2*nlines]
+    sfp = ∂state.intermediate.sfp
+    sfq = ∂state.intermediate.sfq
+    stp = ∂state.intermediate.stp
+    stq = ∂state.intermediate.stq
+    mul!(sfp, func.Lfp, state.ψ)
+    mul!(sfq, func.Lfq, state.ψ)
+    mul!(stp, func.Ltp, state.ψ)
+    mul!(stq, func.Ltq, state.ψ)
+
+    sfp .*= ∂v[1:nlines]
+    sfq .*= ∂v[1:nlines]
+    stp .*= ∂v[1+nlines:2*nlines]
+    stq .*= ∂v[1+nlines:2*nlines]
 
     # Accumulate adjoint
-    mul!(∂state.ψ, func.Lfp', func.sfp, 2.0, -1.0)
-    mul!(∂state.ψ, func.Lfq', func.sfq, 2.0, -1.0)
-    mul!(∂state.ψ, func.Ltp', func.stp, 2.0, -1.0)
-    mul!(∂state.ψ, func.Ltq', func.stq, 2.0, -1.0)
+    mul!(∂state.ψ, func.Lfp', sfp, 2.0, -1.0)
+    mul!(∂state.ψ, func.Lfq', sfq, 2.0, -1.0)
+    mul!(∂state.ψ, func.Ltp', stp, 2.0, -1.0)
+    mul!(∂state.ψ, func.Ltq', stq, 2.0, -1.0)
 
     return
 end
 
-function matpower_jacobian(polar::PolarForm, X::Union{State, Control}, func::PowerFlowBalance, V)
-    nbus = get(polar, PS.NumberOfBuses())
-    pf = polar.network
-    ref, pv, pq = index_buses_host(polar)
-    nref = length(ref)
-    npv = length(pv)
-    npq = length(pq)
-    Ybus = pf.Ybus
-
-    dSbus_dVm, dSbus_dVa = PS.matpower_residual_jacobian(V, Ybus)
-
-    if isa(X, State)
-        j11 = real(dSbus_dVa[[pv; pq], [pv; pq]])
-        j12 = real(dSbus_dVm[[pv; pq], pq])
-        j21 = imag(dSbus_dVa[pq, [pv; pq]])
-        j22 = imag(dSbus_dVm[pq, pq])
-        return [j11 j12; j21 j22]::SparseMatrixCSC{Float64, Int}
-    elseif isa(X, Control)
-        j11 = real(dSbus_dVm[[pv; pq], [ref; pv]])
-        j12 = sparse(I, npv + npq, npv)
-        j21 = imag(dSbus_dVm[pq, [ref; pv]])
-        j22 = spzeros(npq, npv)
-        return [j11 -j12; j21 j22]::SparseMatrixCSC{Float64, Int}
-    end
-end
+include("legacy.jl")
diff --git a/src/Polar/legacy.jl b/src/Polar/legacy.jl
new file mode 100644
index 00000000..9b487414
--- /dev/null
+++ b/src/Polar/legacy.jl
@@ -0,0 +1,74 @@
+
+function matpower_jacobian(polar::PolarForm, X::Union{State, Control}, func::PowerFlowBalance, V)
+    nbus = get(polar, PS.NumberOfBuses())
+    pf = polar.network
+    ref, pv, pq = index_buses_host(polar)
+    nref = length(ref)
+    npv = length(pv)
+    npq = length(pq)
+    Ybus = pf.Ybus
+
+    dSbus_dVm, dSbus_dVa = PS.matpower_residual_jacobian(V, Ybus)
+
+    if isa(X, State)
+        j11 = real(dSbus_dVa[[pv; pq], [pv; pq]])
+        j12 = real(dSbus_dVm[[pv; pq], pq])
+        j21 = imag(dSbus_dVa[pq, [pv; pq]])
+        j22 = imag(dSbus_dVm[pq, pq])
+        return [j11 j12; j21 j22]::SparseMatrixCSC{Float64, Int}
+    elseif isa(X, Control)
+        j11 = real(dSbus_dVm[[pv; pq], [ref; pv]])
+        j12 = sparse(I, npv + npq, npv)
+        j21 = imag(dSbus_dVm[pq, [ref; pv]])
+        j22 = spzeros(npq, npv)
+        return [j11 -j12; j21 j22]::SparseMatrixCSC{Float64, Int}
+    end
+end
+
+function matpower_jacobian(
+    polar::PolarForm,
+    X::Union{State,Control},
+    func::VoltageMagnitudePQ,
+    V,
+)
+
+    m = size(func)[1]
+    nᵤ = get(polar, NumberOfControl())
+    nₓ = get(polar, NumberOfState())
+    npv = PS.get(polar.network, PS.NumberOfPVBuses())
+    npq = PS.get(polar.network, PS.NumberOfPQBuses())
+    shift = npq + npv
+
+    I = 1:m
+    J = (shift+1):(shift+npq)
+    V = ones(m)
+    if isa(X, State)
+        return sparse(I, J, V, m, nₓ)
+    elseif isa(X, Control)
+        return spzeros(m, nᵤ)
+    end
+end
+
+function matpower_jacobian(polar::PolarForm, X::Union{State,Control}, func::LineFlows, V)
+    nbus = get(polar, PS.NumberOfBuses())
+    nlines = get(polar, PS.NumberOfLines())
+    pf = polar.network
+    ref, pv, pq = index_buses_host(polar)
+    nref = length(ref)
+    npv  = length(pv)
+    npq  = length(pq)
+    lines = pf.lines
+
+    dSl_dVm, dSl_dVa = PS.matpower_lineflow_power_jacobian(V, lines)
+
+    if isa(X, State)
+        j11 = dSl_dVa[:, [pv; pq]]
+        j12 = dSl_dVm[:, pq]
+        return [j11 j12]::SparseMatrixCSC{Float64, Int}
+    elseif isa(X, Control)
+        j11 = dSl_dVm[:, [ref; pv]]
+        j12 = spzeros(2 * nlines, npv)
+        return [j11 j12]::SparseMatrixCSC{Float64, Int}
+    end
+end
+
diff --git a/src/architectures.jl b/src/architectures.jl
index 30ee31fc..b3591ee7 100644
--- a/src/architectures.jl
+++ b/src/architectures.jl
@@ -7,7 +7,7 @@ xnorm(x::CUDA.CuVector) = CUBLAS.nrm2(x)
 
 xnorm_inf(a) = maximum(abs.(a))
 
-default_sparse_matrix(::CPU) = SparseMatrixCSC
+default_sparse_matrix(::CPU) = SparseMatrixCSC{Float64,Int}
 
 function get_jacobian_types(::CPU)
     SMT = SparseMatrixCSC{Float64,Int}
@@ -31,4 +31,4 @@ function get_batch_jacobian_types(::GPU)
     SMT = CUSPARSE.CuSparseMatrixCSR{Float64}
     A = CUDA.CuArray
     return SMT, A
-end
\ No newline at end of file
+end

From bf20c2a93984d197e524acc246da22cae8c36cfd Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Thu, 6 Jan 2022 15:01:12 -0600
Subject: [PATCH 11/34] update Jacobian and Hessian Autodiff to work with
 mapping

---
 src/Polar/first_order.jl  |  36 +++++++++----
 src/Polar/functions.jl    |   1 +
 src/Polar/legacy.jl       | 105 ++++++++++++++++++++++----------------
 src/Polar/second_order.jl |   7 +--
 4 files changed, 90 insertions(+), 59 deletions(-)

diff --git a/src/Polar/first_order.jl b/src/Polar/first_order.jl
index 3efdf94f..9ff1329f 100644
--- a/src/Polar/first_order.jl
+++ b/src/Polar/first_order.jl
@@ -23,6 +23,8 @@ struct MyJacobian{Func, VD, SMT, MT, VI, VP}
 end
 
 
+# Ordering: [vmag, vang, pgen]
+
 function my_map(polar::PolarForm, ::State)
     nbus = get(polar, PS.NumberOfBuses())
     ref, pv, pq = index_buses_device(polar)
@@ -38,8 +40,26 @@ end
 number(polar::PolarForm, ::State) = get(polar, NumberOfState())
 number(polar::PolarForm, ::Control) = get(polar, NumberOfControl())
 
+# Coloring
+function jacobian_sparsity(polar::PolarForm, func::AbstractExpression)
+    nbus = get(polar, PS.NumberOfBuses())
+    Vre = Float64[i for i in 1:nbus]
+    Vim = Float64[i for i in nbus+1:2*nbus]
+    V = Vre .+ im .* Vim
+    return matpower_jacobian(polar, func, V)
+end
+
+function get_jacobian_colors(polar::PolarForm, func::AbstractExpression, map::Vector{Int})
+    # Sparsity pattern
+    J = jacobian_sparsity(polar, func)
+    Jsub = J[:, map]
+    # Coloring
+    colors = AutoDiff.SparseDiffTools.matrix_colors(Jsub)
+    return (Jsub, colors)
+end
+
 function MyJacobian(
-    polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, variable,
+    polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, map::Vector{Int},
 ) where {T, VI, VT, MT}
     (SMT, A) = get_jacobian_types(polar.device)
 
@@ -48,19 +68,13 @@ function MyJacobian(
     nlines = PS.get(pf, PS.NumberOfLines())
     ngen = PS.get(pf, PS.NumberOfGenerators())
 
-    # Sparsity pattern
-    J = jacobian_sparsity(polar, func, variable)
-    # Coloring
-    coloring = AutoDiff.SparseDiffTools.matrix_colors(J)
+    J_host, coloring = get_jacobian_colors(polar, func, map)
     ncolor = size(unique(coloring),1)
 
-    m = size(J, 1)
-
-    map = my_map(polar, variable)
-    nmap = number(polar, variable)
+    J = J_host |> SMT
 
-    # Move Jacobian to the GPU
-    J = convert(SMT, J)
+    m = size(J, 1)
+    nmap = length(map)
 
     # Seedings
     t1s{N} = ForwardDiff.Dual{Nothing,Float64, N} where N
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index b6a0cb8b..1077f837 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -151,6 +151,7 @@ function (func::CostFunction)(state)
 end
 
 function adjoint!(func::CostFunction, ∂state, state, ∂v)
+    state.pgen[func.gen_ref] .= -func.M * state.ψ
     ∂state.pgen .+= ∂v .* (func.c1 .+ 2.0 .* func.c2 .* state.pgen)
     ∂state.ψ .-= func.M' * ∂state.pgen[func.gen_ref]
     return
diff --git a/src/Polar/legacy.jl b/src/Polar/legacy.jl
index 9b487414..dff43d1f 100644
--- a/src/Polar/legacy.jl
+++ b/src/Polar/legacy.jl
@@ -1,8 +1,11 @@
+# Matpower Jacobian (order of columns: [vmag, vang, pgen])
 
-function matpower_jacobian(polar::PolarForm, X::Union{State, Control}, func::PowerFlowBalance, V)
-    nbus = get(polar, PS.NumberOfBuses())
+function matpower_jacobian(polar::PolarForm, func::PowerFlowBalance, V)
     pf = polar.network
+    nbus = pf.nbus
+    ngen = pf.ngen
     ref, pv, pq = index_buses_host(polar)
+    gen2bus = pf.gen2bus
     nref = length(ref)
     npv = length(pv)
     npq = length(pq)
@@ -10,50 +13,64 @@ function matpower_jacobian(polar::PolarForm, X::Union{State, Control}, func::Pow
 
     dSbus_dVm, dSbus_dVa = PS.matpower_residual_jacobian(V, Ybus)
 
-    if isa(X, State)
-        j11 = real(dSbus_dVa[[pv; pq], [pv; pq]])
-        j12 = real(dSbus_dVm[[pv; pq], pq])
-        j21 = imag(dSbus_dVa[pq, [pv; pq]])
-        j22 = imag(dSbus_dVm[pq, pq])
-        return [j11 j12; j21 j22]::SparseMatrixCSC{Float64, Int}
-    elseif isa(X, Control)
-        j11 = real(dSbus_dVm[[pv; pq], [ref; pv]])
-        j12 = sparse(I, npv + npq, npv)
-        j21 = imag(dSbus_dVm[pq, [ref; pv]])
-        j22 = spzeros(npq, npv)
-        return [j11 -j12; j21 j22]::SparseMatrixCSC{Float64, Int}
-    end
+    Cg_tot = sparse(gen2bus, 1:ngen, -ones(ngen), nbus, ngen)
+    Cg = Cg_tot[[pv; pq], :]
+
+    j11 = real(dSbus_dVm[[pv; pq], :])
+    j12 = real(dSbus_dVa[[pv; pq], :])
+    j13 = Cg #sparse(gen2bus, 1:ngen, -ones(ngen), npv + npq, ngen)
+    j21 = imag(dSbus_dVm[pq, :])
+    j22 = imag(dSbus_dVa[pq, :])
+    j23 = spzeros(npq, ngen)
+
+    return [
+        j11 j12 j13;
+        j21 j22 j23
+    ]::SparseMatrixCSC{Float64, Int}
 end
 
-function matpower_jacobian(
-    polar::PolarForm,
-    X::Union{State,Control},
-    func::VoltageMagnitudePQ,
-    V,
-)
+function matpower_jacobian(polar::PolarForm, func::VoltageMagnitudePQ, V)
+    pf = polar.network
+    ngen = pf.ngen
+    nbus = pf.nbus
+    ref, pv, pq = index_buses_host(polar)
+    j11 = sparse(1:npq, pq, ones(npq), npq, nbus)
+    j12 = spzeros(npq, nbus + ngen)
+    return [j11 j12]::SparseMatrixCSC{Float64, Int}
+end
 
-    m = size(func)[1]
-    nᵤ = get(polar, NumberOfControl())
-    nₓ = get(polar, NumberOfState())
-    npv = PS.get(polar.network, PS.NumberOfPVBuses())
-    npq = PS.get(polar.network, PS.NumberOfPQBuses())
-    shift = npq + npv
+function matpower_jacobian(polar::PolarForm, func::PowerGenerationBounds, V)
+    pf = polar.network
+    nbus = pf.nbus
+    ngen = pf.ngen
+    gen2bus = pf.gen2bus
+    ref, pv, pq = index_buses_host(polar)
+    nref = length(ref)
+    npv = length(pv)
+    npq = length(pq)
+    Ybus = pf.Ybus
 
-    I = 1:m
-    J = (shift+1):(shift+npq)
-    V = ones(m)
-    if isa(X, State)
-        return sparse(I, J, V, m, nₓ)
-    elseif isa(X, Control)
-        return spzeros(m, nᵤ)
-    end
+    dSbus_dVm, dSbus_dVa = PS.matpower_residual_jacobian(V, Ybus)
+    j11 = real(dSbus_dVm[ref, :])
+    j12 = real(dSbus_dVa[ref, :])
+    j13 = spzeros(nref, ngen)
+
+    j21 = imag(dSbus_dVm[gen2bus, :])
+    j22 = imag(dSbus_dVa[gen2bus, :])
+    j23 = spzeros(ngen, ngen)
+    # w.r.t. control
+    return [
+        j11 j12 j13 ;
+        j21 j22 j23
+    ]::SparseMatrixCSC{Float64, Int}
 end
 
-function matpower_jacobian(polar::PolarForm, X::Union{State,Control}, func::LineFlows, V)
+function matpower_jacobian(polar::PolarForm, func::LineFlows, V)
     nbus = get(polar, PS.NumberOfBuses())
     nlines = get(polar, PS.NumberOfLines())
     pf = polar.network
     ref, pv, pq = index_buses_host(polar)
+    ngen = pf.ngen
     nref = length(ref)
     npv  = length(pv)
     npq  = length(pq)
@@ -61,14 +78,12 @@ function matpower_jacobian(polar::PolarForm, X::Union{State,Control}, func::Line
 
     dSl_dVm, dSl_dVa = PS.matpower_lineflow_power_jacobian(V, lines)
 
-    if isa(X, State)
-        j11 = dSl_dVa[:, [pv; pq]]
-        j12 = dSl_dVm[:, pq]
-        return [j11 j12]::SparseMatrixCSC{Float64, Int}
-    elseif isa(X, Control)
-        j11 = dSl_dVm[:, [ref; pv]]
-        j12 = spzeros(2 * nlines, npv)
-        return [j11 j12]::SparseMatrixCSC{Float64, Int}
-    end
+    j11 = dSl_dVm
+    j12 = dSl_dVa
+    j13 = spzeros(2 * nlines, ngen)
+
+    return [
+        j11 j12 j13;
+    ]::SparseMatrixCSC{Float64, Int}
 end
 
diff --git a/src/Polar/second_order.jl b/src/Polar/second_order.jl
index 8f28f6bf..12f7d21c 100644
--- a/src/Polar/second_order.jl
+++ b/src/Polar/second_order.jl
@@ -11,7 +11,7 @@ struct MyHessian{Func, VD, VI, T1, T2, Buff} <: AutoDiff.AbstractHessian
     buffer::Buff
 end
 
-function MyHessian(polar::PolarForm{T, VI, VT, MT}, func) where {T, VI, VT, MT}
+function MyHessian(polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, map::Vector{Int}) where {T, VI, VT, MT}
     (SMT, A) = get_jacobian_types(polar.device)
 
     pf = polar.network
@@ -21,8 +21,8 @@ function MyHessian(polar::PolarForm{T, VI, VT, MT}, func) where {T, VI, VT, MT}
 
     n_cons = size(func)[1]
 
-    map = [my_map(polar, State()); my_map(polar, Control())] |> VI
     nmap = length(map)
+    map_device = map |> VI
 
     t1s{N} = ForwardDiff.Dual{Nothing,Float64, N} where N
     VD = A{t1s{1}}
@@ -41,7 +41,7 @@ function MyHessian(polar::PolarForm{T, VI, VT, MT}, func) where {T, VI, VT, MT}
 
     intermediate = _get_intermediate_stack(polar, network_basis, VD, 1)
     return MyHessian(
-        func, stack, ∂stack, host_t1sseeds, t1sseeds, t1sF, adj_t1sF, map, intermediate,
+        func, stack, ∂stack, host_t1sseeds, t1sseeds, t1sF, adj_t1sF, map_device, intermediate,
     )
 end
 
@@ -60,6 +60,7 @@ function hprod!(
     # Init seed
     _init_seed_hessian!(H.t1sseeds, H.host_t1sseeds, v, nmap)
     myseed!(H.state, state, H.t1sseeds, H.map, polar.device)
+    forward_eval_intermediate(polar, H.state)
 
     # Reverse
     adjoint!(H.func, H.∂state, H.state, H.∂t1sF)

From cd6b960ab7778c0139dad0dc864a700875e97523 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Thu, 6 Jan 2022 17:32:16 -0600
Subject: [PATCH 12/34] fix wrong sign in Hessian computation

---
 src/Polar/functions.jl    | 24 +++++++++++++-----------
 src/Polar/second_order.jl |  1 +
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 1077f837..274378b1 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -131,7 +131,7 @@ function CostFunction(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
     ref_gen = polar.indexing.index_ref_to_gen
     # Assemble matrix
     M_tot = PS.get_basis_matrix(polar.network)
-    M = M_tot[ref, :] |> SMT
+    M = -M_tot[ref, :] |> SMT
 
     # coefficients
     coefs = polar.costs_coefficients
@@ -150,10 +150,14 @@ function (func::CostFunction)(state)
     return sum(costs)
 end
 
+function (func::CostFunction)(output, state)
+    output[1] = func(state)
+    return
+end
+
 function adjoint!(func::CostFunction, ∂state, state, ∂v)
-    state.pgen[func.gen_ref] .= -func.M * state.ψ
     ∂state.pgen .+= ∂v .* (func.c1 .+ 2.0 .* func.c2 .* state.pgen)
-    ∂state.ψ .-= func.M' * ∂state.pgen[func.gen_ref]
+    ∂state.ψ .+= func.M' * ∂state.pgen[func.gen_ref]
     return
 end
 
@@ -198,7 +202,7 @@ function (func::PowerFlowBalance)(cons, state)
 end
 
 function adjoint!(func::PowerFlowBalance, ∂state, state, ∂v)
-    mul!(∂state.ψ, func.M', ∂v, 1.0, -1.0)
+    mul!(∂state.ψ, func.M', ∂v, 1.0, 1.0)
     mul!(∂state.pgen, func.Cg', ∂v, 1.0, 1.0)
     return
 end
@@ -249,7 +253,7 @@ function (func::PowerGenerationBounds)(cons, state)
 end
 
 function adjoint!(func::PowerGenerationBounds, ∂state, state, ∂v)
-    mul!(∂state.ψ, func.M', ∂v, 1.0, -1.0)
+    mul!(∂state.ψ, func.M', ∂v, 1.0, 1.0)
     return
 end
 
@@ -277,8 +281,6 @@ function (func::LineFlows)(cons::VT, state::NetworkStack{VT,S}) where {VT<:Abstr
     stp = state.intermediate.stp::VT
     stq = state.intermediate.stq::VT
 
-    # TODO: When Dual numbers are used, mul! dispatches on
-    # default Julia implementation, to slow for our use case
     mul!(sfp, func.Lfp, state.ψ)
     mul!(sfq, func.Lfq, state.ψ)
     mul!(stp, func.Ltp, state.ψ)
@@ -305,10 +307,10 @@ function adjoint!(func::LineFlows, ∂state, state, ∂v)
     stq .*= ∂v[1+nlines:2*nlines]
 
     # Accumulate adjoint
-    mul!(∂state.ψ, func.Lfp', sfp, 2.0, -1.0)
-    mul!(∂state.ψ, func.Lfq', sfq, 2.0, -1.0)
-    mul!(∂state.ψ, func.Ltp', stp, 2.0, -1.0)
-    mul!(∂state.ψ, func.Ltq', stq, 2.0, -1.0)
+    mul!(∂state.ψ, func.Lfp', sfp, 2.0, 1.0)
+    mul!(∂state.ψ, func.Lfq', sfq, 2.0, 1.0)
+    mul!(∂state.ψ, func.Ltp', stp, 2.0, 1.0)
+    mul!(∂state.ψ, func.Ltq', stq, 2.0, 1.0)
 
     return
 end
diff --git a/src/Polar/second_order.jl b/src/Polar/second_order.jl
index 12f7d21c..4b877b5a 100644
--- a/src/Polar/second_order.jl
+++ b/src/Polar/second_order.jl
@@ -61,6 +61,7 @@ function hprod!(
     _init_seed_hessian!(H.t1sseeds, H.host_t1sseeds, v, nmap)
     myseed!(H.state, state, H.t1sseeds, H.map, polar.device)
     forward_eval_intermediate(polar, H.state)
+    H.func(H.t1sF, H.state)
 
     # Reverse
     adjoint!(H.func, H.∂state, H.state, H.∂t1sF)

From ca72e5f60339f099f0db79778588cbafa38c42e5 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Thu, 6 Jan 2022 22:33:34 -0600
Subject: [PATCH 13/34] add MultiExpressions for aggregations

---
 src/Polar/functions.jl    | 40 +++++++++++++++++++++++++++++++++------
 src/Polar/legacy.jl       |  7 +++++++
 src/Polar/second_order.jl |  2 +-
 3 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 274378b1..004dc317 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -141,7 +141,7 @@ function CostFunction(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
     return CostFunction{VT, SMT}(ref_gen, M, c0, c1, c2)
 end
 
-Base.size(::CostFunction) = (1,)
+Base.length(::CostFunction) = 1
 
 function (func::CostFunction)(state)
     costs = state.intermediate.c
@@ -192,7 +192,7 @@ function PowerFlowBalance(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
     return PowerFlowBalance{VT, SMT}(M, Cg, τ)
 end
 
-Base.size(func::PowerFlowBalance) = size(func.τ)
+Base.length(func::PowerFlowBalance) = length(func.τ)
 
 function (func::PowerFlowBalance)(cons, state)
     cons .= func.τ
@@ -214,7 +214,7 @@ struct VoltageMagnitudePQ <: AbstractExpression
 end
 VoltageMagnitudePQ(polar::PolarForm) = VoltageMagnitudePQ(polar.network.pq)
 
-Base.size(func::VoltageMagnitudePQ) = (length(func.pq),)
+Base.length(func::VoltageMagnitudePQ) = length(func.pq)
 
 function (func::VoltageMagnitudePQ)(cons, state)
     cons .= state.vmag[func.pq]
@@ -245,7 +245,7 @@ function PowerGenerationBounds(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT
     return PowerGenerationBounds{VT, SMT}(M, τ)
 end
 
-Base.size(func::PowerGenerationBounds) = size(func.τ)
+Base.length(func::PowerGenerationBounds) = length(func.τ)
 
 function (func::PowerGenerationBounds)(cons, state)
     cons .= func.τ .+ func.M * state.ψ
@@ -273,9 +273,9 @@ function LineFlows(polar::PolarForm{T,VI,VT,MT}) where {T,VI,VT,MT}
     return LineFlows{VT,SMT}(nlines, Lfp, Lfq, Ltp, Ltq)
 end
 
-Base.size(func::LineFlows) = 2 * func.nlines
+Base.length(func::LineFlows) = 2 * func.nlines
 
-function (func::LineFlows)(cons::VT, state::NetworkStack{VT,S}) where {VT<:AbstractVector, S}
+function (func::LineFlows)(cons::AbstractVector, state::NetworkStack{VT,S}) where {VT<:AbstractVector, S}
     sfp = state.intermediate.sfp::VT
     sfq = state.intermediate.sfq::VT
     stp = state.intermediate.stp::VT
@@ -315,4 +315,32 @@ function adjoint!(func::LineFlows, ∂state, state, ∂v)
     return
 end
 
+# Aggregate expressions together
+struct MultiExpressions <: AbstractExpression
+    exprs::Vector{AbstractExpression}
+end
+
+Base.length(func::MultiExpressions) = sum(length.(func.exprs))
+
+function (func::MultiExpressions)(output, state)
+    k = 0
+    for expr in func.exprs
+        m = length(expr)
+        y = view(output, k+1:k+m)
+        expr(y, state)
+        k += m
+    end
+end
+
+function adjoint!(func::MultiExpressions, ∂state, state, ∂v)
+    k = 0
+    for expr in func.exprs
+        m = length(expr)
+        y = view(∂v, k+1:k+m)
+        adjoint!(expr, ∂state, state, y)
+        k += m
+    end
+end
+
 include("legacy.jl")
+
diff --git a/src/Polar/legacy.jl b/src/Polar/legacy.jl
index dff43d1f..3c670e77 100644
--- a/src/Polar/legacy.jl
+++ b/src/Polar/legacy.jl
@@ -34,6 +34,8 @@ function matpower_jacobian(polar::PolarForm, func::VoltageMagnitudePQ, V)
     ngen = pf.ngen
     nbus = pf.nbus
     ref, pv, pq = index_buses_host(polar)
+    npq = length(pq)
+
     j11 = sparse(1:npq, pq, ones(npq), npq, nbus)
     j12 = spzeros(npq, nbus + ngen)
     return [j11 j12]::SparseMatrixCSC{Float64, Int}
@@ -87,3 +89,8 @@ function matpower_jacobian(polar::PolarForm, func::LineFlows, V)
     ]::SparseMatrixCSC{Float64, Int}
 end
 
+function matpower_jacobian(polar::PolarForm, func::MultiExpressions, V)
+    return vcat([matpower_jacobian(polar, expr, V) for expr in func.exprs]...)
+end
+
+
diff --git a/src/Polar/second_order.jl b/src/Polar/second_order.jl
index 4b877b5a..1c9716fd 100644
--- a/src/Polar/second_order.jl
+++ b/src/Polar/second_order.jl
@@ -19,7 +19,7 @@ function MyHessian(polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, ma
     nlines = PS.get(pf, PS.NumberOfLines())
     ngen = PS.get(pf, PS.NumberOfGenerators())
 
-    n_cons = size(func)[1]
+    n_cons = length(func)
 
     nmap = length(map)
     map_device = map |> VI

From 70be81cd223311d5afb62bc58491a1fa5688310b Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Fri, 7 Jan 2022 09:35:40 -0600
Subject: [PATCH 14/34] rewrite transfer function with mapping

---
 src/Polar/kernels.jl | 42 ++++--------------------------------------
 1 file changed, 4 insertions(+), 38 deletions(-)

diff --git a/src/Polar/kernels.jl b/src/Polar/kernels.jl
index 7c9976f0..229e5772 100644
--- a/src/Polar/kernels.jl
+++ b/src/Polar/kernels.jl
@@ -387,37 +387,10 @@ function adjoint_transfer!(
 end
 
 KA.@kernel function _reverse_transfer_kernel2!(
-        output, @Const(adj_vmag), @Const(adj_vang), @Const(adj_pgen), @Const(pv), @Const(pq), @Const(ref), @Const(pv2gen),
-    npq, npv, nref, ngen,
+    dest, @Const(src), @Const(map),
 )
     i, j = @index(Global, NTuple)
-
-    output[i, j] = if i <= npv
-        # x (vang_pv)
-        k = pv[i]
-        adj_vang[k, j]
-    elseif i <= npv + npq
-        k = pq[i - npv]
-        # x (vang_pq)
-        adj_vang[k, j]
-    elseif i <= npv + 2*npq
-        # x (vmag_pq)
-        k = pq[i - npv - npq]
-        adj_vmag[k, j]
-    elseif i <= npv + 2*npq + nref
-        # u (vmag_ref)
-        k = ref[i - npv - 2*npq]
-        adj_vmag[k, j]
-    elseif i <= npv + 2*npq + nref + npv
-        # u (vmag_pv)
-        k = pv[i - npv - 2*npq - nref]
-        adj_vmag[k, j]
-    elseif i <= npv + 2*npq + nref + npv + ngen
-        # u (vmag_pg)
-        k = pv2gen[i - 2*npv - 2*npq - nref]
-        adj_pgen[k, j]
-    end
-
+    dest[i, j] = src[map[i], j]
 end
 
 function reverse_transfer!(
@@ -426,16 +399,9 @@ function reverse_transfer!(
 )
     nx = get(polar, ExaPF.NumberOfState())
     nu = get(polar, ExaPF.NumberOfControl())
-    nbus = get(polar, PS.NumberOfBuses())
-    pv = polar.indexing.index_pv
-    pq = polar.indexing.index_pq
-    ref = polar.indexing.index_ref
-    pv2gen = polar.indexing.index_pv_to_gen
+    map = [my_map(polar, State()); my_map(polar, Control())]
     ev = _reverse_transfer_kernel2!(polar.device)(
-        output,
-        ∂state.vmag, ∂state.vang, ∂state.pgen,
-        pv, pq, ref, pv2gen,
-        length(pq), length(pv), length(ref), length(pv2gen),
+        output, ∂state.input, map,
         ndrange=(nx+nu, size(output, 2)),
         dependencies=Event(polar.device)
     )

From 97061ba61fd03e320140432f3ded9ad4a337baac Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Fri, 7 Jan 2022 17:06:25 -0600
Subject: [PATCH 15/34] update powerflow solver

---
 src/Polar/first_order.jl  | 49 +++++++++++++++++++-----------------
 src/Polar/functions.jl    |  3 ++-
 src/Polar/newton.jl       | 52 +++++++++++++++++++++++++++++++++++++++
 src/Polar/powerflow.jl    | 13 ++++++++++
 src/Polar/second_order.jl | 20 +++++++--------
 5 files changed, 103 insertions(+), 34 deletions(-)
 create mode 100644 src/Polar/newton.jl

diff --git a/src/Polar/first_order.jl b/src/Polar/first_order.jl
index 9ff1329f..7b6750ba 100644
--- a/src/Polar/first_order.jl
+++ b/src/Polar/first_order.jl
@@ -11,17 +11,19 @@ function jacobian_transpose_product!(polar::PolarForm, pbm::AutoDiff.TapeMemory,
     )
 end
 
-struct MyJacobian{Func, VD, SMT, MT, VI, VP}
+struct MyJacobian{Model, Func, VD, SMT, MT, VI, VP}
+    model::Model
     func::Func
+    map::VI
     stack::NetworkStack{VD}
-    J::SMT
     compressedJ::MT
     coloring::VI
-    map::VI
     t1sseeds::VP
     t1sF::VD
+    J::SMT
 end
 
+Base.size(jac::MyJacobian, n::Int) = size(jac.J, n)
 
 # Ordering: [vmag, vang, pgen]
 
@@ -58,9 +60,7 @@ function get_jacobian_colors(polar::PolarForm, func::AbstractExpression, map::Ve
     return (Jsub, colors)
 end
 
-function MyJacobian(
-    polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, map::Vector{Int},
-) where {T, VI, VT, MT}
+function MyJacobian(polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, map::Vector{Int}) where {T, VI, VT, MT}
     (SMT, A) = get_jacobian_types(polar.device)
 
     pf = polar.network
@@ -68,32 +68,35 @@ function MyJacobian(
     nlines = PS.get(pf, PS.NumberOfLines())
     ngen = PS.get(pf, PS.NumberOfGenerators())
 
+    n_cons = length(func)
+
+    nmap = length(map)
+    map_device = map |> VI
+
     J_host, coloring = get_jacobian_colors(polar, func, map)
     ncolor = size(unique(coloring),1)
 
-    J = J_host |> SMT
+    t1s{N} = ForwardDiff.Dual{Nothing,Float64, N} where N
+    VD = A{t1s{ncolor}}
 
-    m = size(J, 1)
-    nmap = length(map)
+    J = J_host |> SMT
 
     # Seedings
-    t1s{N} = ForwardDiff.Dual{Nothing,Float64, N} where N
-    stack = NetworkStack(nbus, ngen, nlines, A{t1s{ncolor}})
-    t1sF = A{t1s{ncolor}}(zeros(Float64, m))
+    stack = NetworkStack(nbus, ngen, nlines, VD)
+    t1sF = zeros(Float64, n_cons) |> VD
     t1sseeds = AutoDiff.init_seed(coloring, ncolor, nmap)
 
     # Move the seeds over to the device, if necessary
     gput1sseeds = A{ForwardDiff.Partials{ncolor,Float64}}(t1sseeds)
-    compressedJ = MT(zeros(Float64, ncolor, m))
+    compressedJ = MT(zeros(Float64, ncolor, n_cons))
 
     return MyJacobian(
-        func, stack, J, compressedJ, coloring, map, gput1sseeds, t1sF,
+        polar, func, map_device, stack, compressedJ, coloring, gput1sseeds, t1sF, J,
     )
 end
 
-@kernel function _seed_kernel2!(
-    duals::AbstractArray{ForwardDiff.Dual{T, V, N}}, @Const(x),
-    @Const(seeds), @Const(map),
+@kernel function _seed_kernel!(
+    duals::AbstractArray{ForwardDiff.Dual{T, V, N}}, @Const(x), @Const(seeds), @Const(map),
 ) where {T,V,N}
     i = @index(Global, Linear)
     duals[map[i]] = ForwardDiff.Dual{T,V,N}(x[map[i]], seeds[i])
@@ -102,25 +105,25 @@ end
 function myseed!(dest, src, seeds, map, device)
     y = dest.input
     x = src.input
-    ev = _seed_kernel2!(device)(
+    ev = _seed_kernel!(device)(
         y, x, seeds, map, ndrange=length(map), dependencies=Event(device))
     wait(ev)
 end
 
 function jacobian!(
-    polar::PolarForm, jac::MyJacobian, state,
+    jac::MyJacobian, state,
 )
     # init
     jac.stack.input .= state.input
     jac.t1sF .= 0.0
     # seed
-    myseed!(jac.stack, state, jac.t1sseeds, jac.map, polar.device)
+    myseed!(jac.stack, state, jac.t1sseeds, jac.map, jac.model.device)
     # forward pass
-    forward_eval_intermediate(polar, jac.stack)
+    forward_eval_intermediate(jac.model, jac.stack)
     jac.func(jac.t1sF, jac.stack)
     # uncompress
-    AutoDiff.getpartials_kernel!(jac.compressedJ, jac.t1sF, polar.device)
-    AutoDiff.uncompress_kernel!(jac.J, jac.compressedJ, jac.coloring, polar.device)
+    AutoDiff.getpartials_kernel!(jac.compressedJ, jac.t1sF, jac.model.device)
+    AutoDiff.uncompress_kernel!(jac.J, jac.compressedJ, jac.coloring, jac.model.device)
     return jac.J
 end
 
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 004dc317..5bca83d0 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -43,7 +43,7 @@ function NetworkStack(polar::PolarForm{T,VI,VT,MT}) where {T,VI,VT,MT}
     nlines = get(polar, PS.NumberOfLines())
 
     stack = NetworkStack(nbus, ngen, nlines, VT)
-
+    # Initialize with the initial solution
     copyto!(stack.vmag, abs.(polar.network.vbus))
     copyto!(stack.vang, angle.(polar.network.vbus))
     copyto!(stack.pgen, get(polar.network, PS.ActivePower()))
@@ -342,5 +342,6 @@ function adjoint!(func::MultiExpressions, ∂state, state, ∂v)
     end
 end
 
+include("newton.jl")
 include("legacy.jl")
 
diff --git a/src/Polar/newton.jl b/src/Polar/newton.jl
new file mode 100644
index 00000000..04a11c31
--- /dev/null
+++ b/src/Polar/newton.jl
@@ -0,0 +1,52 @@
+
+struct NLBuffer{VT}
+    x::VT
+    y::VT
+end
+NLBuffer{VT}(n::Int) where VT = NLBuffer(VT(undef, n), VT(undef, n))
+
+function extract_values!(dest, src)
+    @assert length(dest) == length(src)
+    for i in eachindex(dest)
+        dest[i] = src[i].value
+    end
+end
+
+function nlsolve!(
+    algo::NewtonRaphson,
+    jac::MyJacobian,
+    state::NetworkStack{VT,Buf};
+    linear_solver=DirectSolver(),
+    nl_buffer=NLBuffer{VT}(size(jac, 2)),
+) where {VT, Buf}
+    iter = 0
+    converged = false
+    normF = Inf
+    linsol_iters = Int[]
+
+    map = jac.map
+    x = view(state.input, map)
+
+    residual = nl_buffer.y
+    Δx = nl_buffer.x
+
+    for i in 1:algo.maxiter
+        J = jacobian!(jac, state)
+        extract_values!(residual, jac.t1sF)
+
+        normF = xnorm(residual)
+        if xnorm(residual) < algo.tol
+            converged = true
+            break
+        end
+
+        # Update
+        n_iters = LS.ldiv!(linear_solver, Δx, J, residual)
+        x .= x .- Δx
+
+        push!(linsol_iters, n_iters)
+
+        iter += 1
+    end
+    return ConvergenceStatus(converged, iter, normF, sum(linsol_iters))
+end
diff --git a/src/Polar/powerflow.jl b/src/Polar/powerflow.jl
index cfa85117..ca8ed577 100644
--- a/src/Polar/powerflow.jl
+++ b/src/Polar/powerflow.jl
@@ -213,3 +213,16 @@ function batch_powerflow(
     return ConvergenceStatus(converged, iter, sum(normF), 0)
 end
 
+# Run a Newton-Raphson power flow on `state`; the result of `nlsolve!` is returned unchanged.
+function run_pf(
+    polar::PolarForm, state::NetworkStack;
+    rtol=1e-8, max_iter=20,
+)
+    newton = NewtonRaphson(tol=rtol, maxiter=max_iter)
+    balance = PowerFlowBalance(polar)
+    # Jacobian storage for the balance expression, built with the index map `polar.mapx`.
+    balance_jac = MyJacobian(polar, balance, polar.mapx)
+    return nlsolve!(newton, balance_jac, state)
+end
+
+
diff --git a/src/Polar/second_order.jl b/src/Polar/second_order.jl
index 1c9716fd..c2a0a2a8 100644
--- a/src/Polar/second_order.jl
+++ b/src/Polar/second_order.jl
@@ -1,13 +1,14 @@
 
-struct MyHessian{Func, VD, VI, T1, T2, Buff} <: AutoDiff.AbstractHessian
+struct MyHessian{Model, Func, VD, VI, T1, T2, Buff} <: AutoDiff.AbstractHessian
+    model::Model
     func::Func
+    map::VI
     state::NetworkStack{VD}
     ∂state::NetworkStack{VD}
     host_t1sseeds::T1 # Needed because seeds have to be created on the host
     t1sseeds::T2
     t1sF::VD
     ∂t1sF::VD
-    map::VI
     buffer::Buff
 end
 
@@ -27,9 +28,7 @@ function MyHessian(polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, ma
     t1s{N} = ForwardDiff.Dual{Nothing,Float64, N} where N
     VD = A{t1s{1}}
 
-    # ̇x
     stack = NetworkStack(nbus, ngen, nlines, VD)
-    # ̄y
     ∂stack = NetworkStack(nbus, ngen, nlines, VD)
 
     t1sF = zeros(Float64, n_cons) |> VD
@@ -41,12 +40,13 @@ function MyHessian(polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, ma
 
     intermediate = _get_intermediate_stack(polar, network_basis, VD, 1)
     return MyHessian(
-        func, stack, ∂stack, host_t1sseeds, t1sseeds, t1sF, adj_t1sF, map_device, intermediate,
+        polar, func, map_device, stack, ∂stack, host_t1sseeds, t1sseeds, t1sF, adj_t1sF,
+        intermediate,
     )
 end
 
 function hprod!(
-    polar, H::MyHessian, hv, state, λ, v,
+    H::MyHessian, hv, state, λ, v,
 )
     @assert length(hv) == length(v)
 
@@ -59,15 +59,15 @@ function hprod!(
     nmap = length(H.map)
     # Init seed
     _init_seed_hessian!(H.t1sseeds, H.host_t1sseeds, v, nmap)
-    myseed!(H.state, state, H.t1sseeds, H.map, polar.device)
-    forward_eval_intermediate(polar, H.state)
+    myseed!(H.state, state, H.t1sseeds, H.map, H.model.device)
+    forward_eval_intermediate(H.model, H.state)
     H.func(H.t1sF, H.state)
 
     # Reverse
     adjoint!(H.func, H.∂state, H.state, H.∂t1sF)
-    reverse_eval_intermediate(polar, H.∂state, H.state, H.buffer)
+    reverse_eval_intermediate(H.model, H.∂state, H.state, H.buffer)
 
-    AutoDiff.getpartials_kernel!(hv, H.∂state.input, H.map, polar.device)
+    AutoDiff.getpartials_kernel!(hv, H.∂state.input, H.map, H.model.device)
     return
 end
 

From 794c505847c160de52596dcf34010b1d3196eb5b Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Fri, 7 Jan 2022 20:36:56 -0600
Subject: [PATCH 16/34] [polar] rewrite tests

---
 src/Polar/caches.jl      |   2 +-
 src/Polar/first_order.jl |   5 +
 src/Polar/functions.jl   |  27 +++++
 test/Polar/api.jl        |  67 ++++-------
 test/Polar/autodiff.jl   | 241 ++++++++++++++-------------------------
 test/Polar/gradient.jl   | 166 ++++++---------------------
 test/Polar/hessian.jl    | 108 +++++-------------
 7 files changed, 206 insertions(+), 410 deletions(-)

diff --git a/src/Polar/caches.jl b/src/Polar/caches.jl
index ef220adc..6bd2bbe7 100644
--- a/src/Polar/caches.jl
+++ b/src/Polar/caches.jl
@@ -79,7 +79,7 @@ function Base.iszero(buf::PolarNetworkState)
 end
 
 voltage(buf::PolarNetworkState) = buf.vmag .* exp.(im .* buf.vang)
-voltage_host(buf::PolarNetworkState) = voltage(buf) |> Array
+voltage_host(buf) = voltage(buf) |> Array
 
 "Store topology of the network on target device."
 struct NetworkTopology{VTI, VTD}
diff --git a/src/Polar/first_order.jl b/src/Polar/first_order.jl
index 7b6750ba..3919cc58 100644
--- a/src/Polar/first_order.jl
+++ b/src/Polar/first_order.jl
@@ -1,3 +1,8 @@
+function get_tape(polar::PolarForm, expr::AbstractExpression, ∂stack::NetworkStack{VT, Buf}) where {VT, Buf}
+    # TODO (left open by the author). The buffer is always requested for `network_basis`, trailing arg 1.
+    intermediate = _get_intermediate_stack(polar, ExaPF.network_basis, VT, 1)
+    return AutoDiff.TapeMemory(expr, ∂stack, intermediate)  # tape bundles expression, `∂stack` and buffer
+end
 
 function jacobian_transpose_product!(polar::PolarForm, pbm::AutoDiff.TapeMemory, jv, state, ∂v)
     ∂state = pbm.stack
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 5bca83d0..53857c90 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -58,6 +58,8 @@ function Base.empty!(state::NetworkStack)
     return
 end
 
+# Complex voltages assembled elementwise from the polar components: vmag * exp(im * vang).
+voltage(buf::NetworkStack) = @. buf.vmag * exp(im * buf.vang)
 
 # update basis
 function forward_eval_intermediate(polar::PolarForm, state::NetworkStack)
@@ -194,6 +196,11 @@ end
 
 Base.length(func::PowerFlowBalance) = length(func.τ)
 
+# Both bounds are zero vectors of length `length(func)`, each allocated separately as a `VT`.
+function bounds(polar::PolarForm{T,VI,VT,MT}, func::PowerFlowBalance) where {T,VI,VT,MT}
+    return ntuple(_ -> fill!(VT(undef, length(func)), zero(T)), 2)
+end
+
 function (func::PowerFlowBalance)(cons, state)
     cons .= func.τ
     mul!(cons, func.M, state.ψ, 1.0, 1.0)
@@ -216,6 +223,11 @@ VoltageMagnitudePQ(polar::PolarForm) = VoltageMagnitudePQ(polar.network.pq)
 
 Base.length(func::VoltageMagnitudePQ) = length(func.pq)
 
+function bounds(polar::PolarForm{T,VI,VT,MT}, func::VoltageMagnitudePQ) where {T,VI,VT,MT}
+    vmag_lb, vmag_ub = PS.bounds(polar.network, PS.Buses(), PS.VoltageMagnitude())  # bus-level limits
+    return (convert(VT, vmag_lb[func.pq]), convert(VT, vmag_ub[func.pq]))  # keep only the `func.pq` entries
+end
+
 function (func::VoltageMagnitudePQ)(cons, state)
     cons .= state.vmag[func.pq]
 end
@@ -247,6 +259,16 @@ end
 
 Base.length(func::PowerGenerationBounds) = length(func.τ)
 
+# Bounds stacked as: active-power limits at indices `ref2gen`, then the full reactive-power limits.
+function bounds(polar::PolarForm{T,VI,VT,MT}, func::PowerGenerationBounds) where {T,VI,VT,MT}
+    pg_lb, pg_ub = PS.bounds(polar.network, PS.Generators(), PS.ActivePower())
+    qg_lb, qg_ub = PS.bounds(polar.network, PS.Generators(), PS.ReactivePower())
+    _, ref2gen, _ = index_generators_host(polar)
+    lower = vcat(pg_lb[ref2gen], qg_lb)
+    upper = vcat(pg_ub[ref2gen], qg_ub)
+    return (convert(VT, lower), convert(VT, upper))
+end
+
 function (func::PowerGenerationBounds)(cons, state)
     cons .= func.τ .+ func.M * state.ψ
     return
@@ -275,6 +297,11 @@ end
 
 Base.length(func::LineFlows) = 2 * func.nlines
 
+function bounds(polar::PolarForm{T,VI,VT,MT}, func::LineFlows) where {T,VI,VT,MT}
+    flow_lb, flow_ub = PS.bounds(polar.network, PS.Lines(), PS.ActivePower())  # per-line limits
+    return map(v -> convert(VT, vcat(v, v)), (flow_lb, flow_ub))  # each limit vector is stacked twice
+end
+
 function (func::LineFlows)(cons::AbstractVector, state::NetworkStack{VT,S}) where {VT<:AbstractVector, S}
     sfp = state.intermediate.sfp::VT
     sfq = state.intermediate.sfq::VT
diff --git a/test/Polar/api.jl b/test/Polar/api.jl
index 318ce77b..6fce6d13 100644
--- a/test/Polar/api.jl
+++ b/test/Polar/api.jl
@@ -47,78 +47,57 @@ end
 function test_polar_api(polar, device, M)
     pf = polar.network
     tolerance = 1e-8
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
+    stack = ExaPF.NetworkStack(polar)
+    ExaPF.forward_eval_intermediate(polar, stack)
+    power_balance = ExaPF.PowerFlowBalance(polar)
     # Test that values are matching
-    @test myisapprox(pf.vbus, cache.vmag .* exp.(im .* cache.vang))
-    @test myisapprox(pf.sbus, (cache.pnet .- cache.pload) .+ im .* (cache.qnet .- cache.qload))
+    @test myisapprox(pf.vbus, stack.vmag .* exp.(im .* stack.vang))
     xₖ = ExaPF.initial(polar, State())
-    # Init AD factory
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
 
     # Check that initial residual is correct
     mis = pf.vbus .* conj.(pf.Ybus * pf.vbus) .- pf.sbus
     f_mat = [real(mis[[pf.pv; pf.pq]]); imag(mis[pf.pq])];
 
-    cons = cache.balance
-    ExaPF.power_balance(polar, cons, cache)
+    cons = similar(xₖ)
+    power_balance(cons, stack)
     @test myisapprox(cons, f_mat)
 
     # Test powerflow with cache signature
-    conv = powerflow(polar, jx, cache, NewtonRaphson(tol=tolerance))
+    conv = ExaPF.run_pf(polar, stack)
     @test conv.has_converged
 
-    # Get current state
-    ExaPF.get!(polar, State(), xₖ, cache)
-
-    # Bounds on state and control
-    u_min, u_max = ExaPF.bounds(polar, Control())
-    x_min, x_max = ExaPF.bounds(polar, State())
-
-    # Get current control
-    u = similar(u_min)
-    ExaPF.get!(polar, Control(), u, cache)
-
-    h_u_min, h_u_max = u_min |> Array, u_max |> Array
-    h_x_min, h_x_max = x_min |> Array, x_max |> Array
-    @test isless(h_u_min, h_u_max)
-    @test isless(h_x_min, h_x_max)
-
     # Test callbacks
     ## Power Balance
-    ExaPF.power_balance(polar, cons, cache)
+    ExaPF.forward_eval_intermediate(polar, stack)
+    power_balance(cons, stack)
     # As we run powerflow before, the balance should be below tolerance
     @test ExaPF.xnorm_inf(cons) < tolerance
 
     ## Cost Production
-    c2 = ExaPF.cost_production(polar, cache)
+    cost_production = ExaPF.CostFunction(polar)
+    c2 = cost_production(stack)
     @test isa(c2, Real)
     return nothing
 end
 
 function test_polar_constraints(polar, device, M)
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
-    # Test that default function is not flagged as a constraint
-    foo(x) = 2*x
-    @test !ExaPF.is_constraint(foo)
-
-    ## Inequality constraint
-    @testset "Function $cons_function" for cons_function in [
-        ExaPF.voltage_magnitude_constraints,
-        ExaPF.active_power_constraints,
-        ExaPF.reactive_power_constraints,
-        ExaPF.flow_constraints,
-        ExaPF.power_balance,
+    stack = ExaPF.NetworkStack(polar)
+
+    @testset "Expressions $expr" for expr in [
+        ExaPF.VoltageMagnitudePQ,
+        ExaPF.PowerGenerationBounds,
+        ExaPF.LineFlows,
+        ExaPF.PowerFlowBalance,
     ]
-        @test ExaPF.is_constraint(cons_function)
-        m = ExaPF.size_constraint(polar, cons_function)
+        # Instantiate
+        constraints = expr(polar)
+        m = length(constraints)
         @test isa(m, Int)
         g = M{Float64, 1}(undef, m) # TODO: this signature is not great
         fill!(g, 0)
-        cons_function(polar, g, cache)
+        constraints(g, stack)
 
-        g_min, g_max = ExaPF.bounds(polar, cons_function)
+        g_min, g_max = ExaPF.bounds(polar, constraints)
         @test length(g_min) == m
         @test length(g_max) == m
         @test isa(g_min, M)
diff --git a/test/Polar/autodiff.jl b/test/Polar/autodiff.jl
index 5c57df14..8b9261f1 100644
--- a/test/Polar/autodiff.jl
+++ b/test/Polar/autodiff.jl
@@ -1,206 +1,137 @@
 function test_constraints_jacobian(polar, device, MT)
-    pf = polar.network
-    nbus = pf.nbus
-    pv = pf.pv ; npv = length(pv)
-    pq = pf.pq ; npq = length(pq)
-    ref = pf.ref ; nref = length(ref)
+    nx = length(polar.mapx)
+    nu = length(polar.mapu)
 
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
+    stack = ExaPF.NetworkStack(polar)
+    ∂stack = ExaPF.NetworkStack(polar)
 
-    u = ExaPF.initial(polar, Control())
-
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-
-    println(devnull, jx)
+    mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
+    # n = length(stack.input)
+    # mymap = collect(1:n)
 
     # Solve power flow
-    conv = powerflow(polar, jx, cache, NewtonRaphson(tol=1e-12))
+    conv = ExaPF.run_pf(polar, stack)
     # Get solution in complex form.
-    V = ExaPF.voltage_host(cache)
+    V = ExaPF.voltage_host(stack)
 
     # Test Jacobian w.r.t. State
-    @testset "Constraint $(cons)" for cons in [
-        ExaPF.voltage_magnitude_constraints,
-        ExaPF.power_balance,
-        ExaPF.active_power_constraints,
-        ExaPF.reactive_power_constraints,
-        ExaPF.flow_constraints,
-        ExaPF.bus_power_injection,
-        ExaPF.network_basis,
+    @testset "Jacobian $(expr)" for expr in [
+        ExaPF.VoltageMagnitudePQ,
+        ExaPF.PowerFlowBalance,
+        # ExaPF.PowerGenerationBounds, TODO
+        ExaPF.LineFlows,
     ]
-        m = ExaPF.size_constraint(polar, cons)
-        # Allocation
-        pbm = AutoDiff.TapeMemory(polar, cons, typeof(u))
-        tgt = rand(m) |> MT
-        c = zeros(m) |> MT
+        constraint = expr(polar)
+        m = length(constraint)
+        pbm = ExaPF.get_tape(polar, constraint, ∂stack)
 
-        ## STATE JACOBIAN
+        # Allocation
 
-        xjacobianAD = ExaPF.AutoDiff.Jacobian(polar, cons, State())
+        jac = ExaPF.MyJacobian(polar, constraint, mymap)
         # Evaluate Jacobian with AD
-        J = AutoDiff.jacobian!(polar, xjacobianAD, cache)
+        J = ExaPF.jacobian!(jac, stack)
         # Matpower Jacobian
-        Jmat_x = ExaPF.matpower_jacobian(polar, State(), cons, V)
-        # Evaluate Jacobian transpose vector product
+        Jmat = ExaPF.matpower_jacobian(polar, constraint, V)
+        Jmat = Jmat[:, mymap]
 
         # Compare with FiniteDiff
         function jac_fd_x(x)
-            cache.vang[pv] .= x[1:npv]
-            cache.vang[pq] .= x[npv+1:npv+npq]
-            cache.vmag[pq] .= x[npv+npq+1:end]
-            c = zeros(m) |> MT
-            cons(polar, c, cache.vmag, cache.vang, cache.pnet, cache.qnet, cache.pload, cache.qload)
+            stack.input[mymap] .= x
+            ExaPF.forward_eval_intermediate(polar, stack)
+            c = zeros(m)
+            constraint(c, stack)
             return c
         end
-        x = [cache.vang[pv]; cache.vang[pq]; cache.vmag[pq]]
+        x = copy(stack.input[mymap])
         Jd = FiniteDiff.finite_difference_jacobian(jac_fd_x, x) |> Array
-        Jx = xjacobianAD.J |> SparseMatrixCSC |> Array
+        Jx = jac.J |> SparseMatrixCSC |> Array
+
         ## JACOBIAN VECTOR PRODUCT
-        ExaPF.jacobian_transpose_product!(polar, pbm, cache, tgt)
-        ∂cons = pbm.stack
+        tgt = rand(m) |> MT
+        output = zeros(nx+nu) |> MT
+        ExaPF.jacobian_transpose_product!(polar, pbm, output, stack, tgt)
 
-        @test size(J) == (m, length(x))
+        @test size(J) == (m, length(mymap))
         @test isapprox(Jd, Jx, rtol=1e-5)
-        @test isapprox(Jmat_x, Jx, rtol=1e-4)
-        @test isapprox(∂cons.∂x, xjacobianAD.J' * tgt, rtol=1e-6)
-
-        ## CONTROL JACOBIAN
-        # Matpower Jacobian
-        Jmat_u = ExaPF.matpower_jacobian(polar, Control(), cons, V)
-        Jacobian = ExaPF.is_linear(polar, cons) ? ExaPF.AutoDiff.ConstantJacobian : ExaPF.AutoDiff.Jacobian
-        ujacobianAD = Jacobian(polar, cons, Control())
-        # Evaluate Jacobian with AD
-        J = AutoDiff.jacobian!(polar, ujacobianAD, cache)
-
-        # Compare with FiniteDiff
-        function jac_fd_u(u)
-            cache.vmag[ref] .= u[1:nref]
-            cache.vmag[pv] .= u[nref+1:npv+nref]
-            cache.pnet[pv] .= u[nref+npv+1:end]
-            c = zeros(m) |> MT
-            cons(polar, c, cache.vmag, cache.vang, cache.pnet, cache.qnet, cache.pload, cache.qload)
-            return c
-        end
-        u = [cache.vmag[ref]; cache.vmag[pv]; cache.pnet[pv]]
-        Jd = FiniteDiff.finite_difference_jacobian(jac_fd_u, u) |> Array
-        if !isnothing(ujacobianAD.J)
-            Ju = ujacobianAD.J |> SparseMatrixCSC |> Array
-            @test size(J) == (m, length(u))
-            @test isapprox(Jd, Ju, rtol=1e-5)
-            @test isapprox(Jmat_u, Ju, rtol=1e-6)
-            @test isapprox(∂cons.∂u, ujacobianAD.J' * tgt, rtol=1e-6)
-        end
+        @test isapprox(Jmat, Jx, rtol=1e-5)
+        @test isapprox(Jmat, Jd, rtol=1e-5)
+        @test isapprox(∂stack.input[mymap], Jx' * tgt, rtol=1e-6)
     end
 end
 
 function test_constraints_adjoint(polar, device, MT)
-    pf = polar.network
-    nbus = pf.nbus
-    pv = pf.pv ; npv = length(pv)
-    pq = pf.pq ; npq = length(pq)
-    ref = pf.ref ; nref = length(ref)
+    nx = length(polar.mapx)
+    nu = length(polar.mapu)
+    mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
 
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
+    stack = ExaPF.NetworkStack(polar)
+    ∂stack = ExaPF.NetworkStack(polar)
 
-    u = ExaPF.initial(polar, Control())
+    conv = ExaPF.run_pf(polar, stack)
 
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-    conv = powerflow(polar, jx, cache, NewtonRaphson(tol=1e-12))
+    ExaPF.forward_eval_intermediate(polar, stack)
 
-    # Test Jacobian w.r.t. State
-    @testset "Constraint $(cons)" for cons in [
-        ExaPF.voltage_magnitude_constraints,
-        ExaPF.power_balance,
-        ExaPF.active_power_constraints,
-        ExaPF.reactive_power_constraints,
-        ExaPF.flow_constraints,
-        ExaPF.bus_power_injection,
-        ExaPF.network_operations,
-        ExaPF.network_basis,
+    @testset "Adjoint $(expr)" for expr in [
+        ExaPF.CostFunction,
+        ExaPF.VoltageMagnitudePQ,
+        ExaPF.PowerFlowBalance,
+        ExaPF.PowerGenerationBounds,
+        ExaPF.LineFlows,
     ]
-        m = ExaPF.size_constraint(polar, cons)
-        pbm = AutoDiff.TapeMemory(polar, cons, typeof(u))
+        constraint = expr(polar)
+        m = length(constraint)
+        pbm = ExaPF.get_tape(polar, constraint, ∂stack)
         tgt = rand(m) |> MT
+        output = zeros(nx+nu) |> MT
+
         c = zeros(m) |> MT
-        # ADJOINT
-        cons(polar, c, cache)
-        ExaPF.adjoint!(polar, pbm, tgt, c, cache)
-        function test_fd(vvm)
-            cache.vmag .= vvm[1:nbus]
-            cache.vang .= vvm[1+nbus:2*nbus]
-            cons(polar, c, cache.vmag, cache.vang, cache.pnet, cache.qnet, cache.pload, cache.qload)
+        constraint(c, stack)
+
+        ExaPF.jacobian_transpose_product!(polar, pbm, output, stack, tgt)
+        function test_fd(x)
+            stack.input[mymap] .= x
+            ExaPF.forward_eval_intermediate(polar, stack)
+            constraint(c, stack)
             return dot(c, tgt)
         end
-        vv = [cache.vmag; cache.vang]
-        vv_fd = FiniteDiff.finite_difference_jacobian(test_fd, vv)
+        x = copy(stack.input[mymap])
+        adj_fd = FiniteDiff.finite_difference_jacobian(test_fd, x) |> Array
         # Loosen the tolerance to 1e-5 there (finite_difference_jacobian
         # is less accurate than finite_difference_gradient)
-        @test myisapprox(vv_fd[1:nbus], pbm.stack.∂vm, rtol=1e-5)
-        @test myisapprox(vv_fd[1+nbus:2*nbus], pbm.stack.∂va, rtol=1e-5)
+        @test isapprox(∂stack.input[mymap], adj_fd[:], rtol=1e-5)
     end
 end
 
 function test_full_space_jacobian(polar, device, MT)
-    pf = polar.network
-    pv = pf.pv ; npv = length(pv)
-    pq = pf.pq ; npq = length(pq)
-    ref = pf.ref ; nref = length(ref)
+    stack = ExaPF.NetworkStack(polar)
+    ExaPF.forward_eval_intermediate(polar, stack)
+
+    n = length(stack.input)
+    mymap = collect(1:n)
+
     constraints = [
-        ExaPF.voltage_magnitude_constraints,
-        ExaPF.active_power_constraints,
-        ExaPF.reactive_power_constraints,
-        ExaPF.flow_constraints,
+        ExaPF.VoltageMagnitudePQ(polar),
+        # ExaPF.PowerGenerationBounds(polar),
+        ExaPF.LineFlows(polar),
+        ExaPF.PowerFlowBalance(polar),
     ]
+    mycons = ExaPF.MultiExpressions(constraints)
 
-    m = sum(ExaPF.size_constraint.(Ref(polar), constraints))
+    m = length(mycons)
 
-    buffer = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, buffer)
-    # Init Jacobian storage
-    jac = ExaPF.ConstraintsJacobianStorage(polar, constraints)
-    # Update State and Control Jacobians
-    ExaPF.update_full_jacobian!(polar, jac, buffer)
-    Jx = jac.Jx |> SparseMatrixCSC |> Array
-    Ju = jac.Ju |> SparseMatrixCSC |> Array
+    jac = ExaPF.MyJacobian(polar, mycons, mymap)
+    J = ExaPF.jacobian!(jac, stack)
 
+    y = zeros(m)
     function jac_fd_x(x)
-        buffer.vang[pv] .= x[1:npv]
-        buffer.vang[pq] .= x[npv+1:npv+npq]
-        buffer.vmag[pq] .= x[npv+npq+1:end]
-        g = similar(x, m)
-        f, t = 1, 0
-        for cons in constraints
-            m_ = ExaPF.size_constraint(polar, cons)
-            t += m_
-            g_ = @view g[f:t]
-            cons(polar, g_, buffer.vmag, buffer.vang, buffer.pnet, buffer.qnet, buffer.pload, buffer.qload)
-            f += m_
-        end
-        return g
+        stack.input .= x
+        ExaPF.forward_eval_intermediate(polar, stack)
+        c = zeros(m)
+        mycons(c, stack)
+        return c
     end
-    x = [buffer.vang[pv]; buffer.vang[pq]; buffer.vmag[pq]]
+    x = copy(stack.input)
     Jd = FiniteDiff.finite_difference_jacobian(jac_fd_x, x) |> Array
-    @test isapprox(Jd, Jx, rtol=1e-5)
-
-    function jac_fd_u(u)
-        buffer.vmag[ref] .= u[1:nref]
-        buffer.vmag[pv] .= u[nref+1:npv+nref]
-        buffer.pnet[pv] .= u[nref+npv+1:end]
-        g = similar(x, m)
-        f, t = 1, 0
-        for cons in constraints
-            m_ = ExaPF.size_constraint(polar, cons)
-            t += m_
-            g_ = @view g[f:t]
-            cons(polar, g_, buffer.vmag, buffer.vang, buffer.pnet, buffer.qnet, buffer.pload, buffer.qload)
-            f += m_
-        end
-        return g
-    end
-    u = [buffer.vmag[ref]; buffer.vmag[pv]; buffer.pnet[pv]]
-    Jd = FiniteDiff.finite_difference_jacobian(jac_fd_u, u) |> Array
-    @test isapprox(Jd, Ju, rtol=1e-5)
+    @test isapprox(Jd, J, rtol=1e-5)
 end
 
diff --git a/test/Polar/gradient.jl b/test/Polar/gradient.jl
index 951e0d55..cd93a8ca 100644
--- a/test/Polar/gradient.jl
+++ b/test/Polar/gradient.jl
@@ -1,38 +1,45 @@
 function test_reduced_gradient(polar, device, MT)
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
+    stack = ExaPF.NetworkStack(polar)
+    ∂stack = ExaPF.NetworkStack(polar)
 
-    u = ExaPF.initial(polar, Control())
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-    ju = AutoDiff.Jacobian(polar, ExaPF.power_balance, Control())
-    ∂obj = ExaPF.AdjointStackObjective(polar)
-    pbm = AutoDiff.TapeMemory(ExaPF.cost_production, ∂obj, nothing)
+    power_balance = ExaPF.PowerFlowBalance(polar)
+
+    mapx = ExaPF.my_map(polar, State())
+    mapu = ExaPF.my_map(polar, Control())
+    nx = length(mapx)
+    nu = length(mapu)
+
+    jx = ExaPF.MyJacobian(polar, power_balance, mapx)
+    ju = ExaPF.MyJacobian(polar, power_balance, mapu)
 
     # Solve power flow
-    conv = powerflow(polar, jx, cache, NewtonRaphson(tol=1e-12))
+    solver = NewtonRaphson(tol=1e-12)
+    ExaPF.nlsolve!(solver, jx, stack)
     # No need to recompute ∇gₓ
     ∇gₓ = jx.J
-    ∇gᵤ = AutoDiff.jacobian!(polar, ju, cache)
-    # test jacobian wrt x
-    ∇gᵥ = AutoDiff.jacobian!(polar, jx, cache)
-    @test isequal(∇gₓ, ∇gᵥ)
+    ∇gᵤ = ExaPF.jacobian!(ju, stack)
 
     # Test with Matpower's Jacobian
-    V = ExaPF.voltage_host(cache)
-    Jx = ExaPF.matpower_jacobian(polar, State(), ExaPF.power_balance, V)
-    Ju = ExaPF.matpower_jacobian(polar, Control(), ExaPF.power_balance, V)
+    V = ExaPF.voltage_host(stack)
+    J = ExaPF.matpower_jacobian(polar, power_balance, V)
     h∇gₓ = ∇gₓ |> SparseMatrixCSC |> Array
     h∇gᵤ = ∇gᵤ |> SparseMatrixCSC |> Array
-    @test isapprox(h∇gₓ, Jx)
-    @test isapprox(h∇gᵤ, Ju)
+    @test isapprox(h∇gₓ, J[:, mapx])
+    @test isapprox(h∇gᵤ, J[:, mapu])
 
-    ExaPF.cost_production(polar, cache)
-    ExaPF.gradient_objective!(polar, pbm, cache)
-    ∇fₓ = ∂obj.∇fₓ
-    ∇fᵤ = ∂obj.∇fᵤ
+    cost_production = ExaPF.CostFunction(polar)
+    ExaPF.forward_eval_intermediate(polar, stack)
+    obj = cost_production(stack)
+    pbm = ExaPF.get_tape(polar, cost_production, ∂stack)
+
+    grad = similar(stack.input, nx+nu)
 
-    h∇fₓ = Array(∇fₓ)
-    h∇fᵤ = Array(∇fᵤ)
+    ExaPF.jacobian_transpose_product!(polar, pbm, grad, stack, 1.0)
+    ∇fₓ = grad[1:nx]
+    ∇fᵤ = grad[1+nx:nx+nu]
+
+    h∇fₓ = ∇fₓ |> Array
+    h∇fᵤ = ∇fᵤ |> Array
     ## ADJOINT
     # lamba calculation
     λk  = -(h∇gₓ') \ h∇fₓ
@@ -44,63 +51,17 @@ function test_reduced_gradient(polar, device, MT)
 
     # Compare with finite difference
     function reduced_cost(u_)
-        # Ensure we remain in the manifold
-        ExaPF.transfer!(polar, cache, u_)
-        convergence = powerflow(polar, jx, cache, NewtonRaphson(tol=1e-14))
-        return ExaPF.cost_production(polar, cache)
+        stack.input[mapu] .= u_
+        ExaPF.nlsolve!(solver, jx, stack)
+        ExaPF.forward_eval_intermediate(polar, stack)
+        return cost_production(stack)
     end
 
+    u = stack.input[mapu]
     grad_fd = FiniteDiff.finite_difference_jacobian(reduced_cost, u)
     @test isapprox(grad_fd[:], grad_adjoint, rtol=1e-4)
 end
 
-function test_line_flow_gradient(polar, device, MT)
-    u = ExaPF.initial(polar, Control())
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-    # solve power flow
-    conv = powerflow(polar, jx, cache, NewtonRaphson(tol=1e-12))
-
-    # Adjoint of flow_constraints()
-    nbus = length(cache.vmag)
-    m = ExaPF.size_constraint(polar, ExaPF.flow_constraints)
-    x = similar(u, 2 * nbus)
-    x[1:nbus] .= cache.vmag
-    x[1+nbus:2*nbus] .= cache.vang
-    bus_gen = polar.indexing.index_generators
-    VI = typeof(bus_gen)
-
-    ## Example with using sum as a sort of lumping of all constraints
-    function sum_constraints(x)
-        VT = typeof(x)
-        # Needed for ForwardDiff to have a cache with the right active type VT
-        adcache = ExaPF.PolarNetworkState{VI, VT}(
-            cache.vmag, cache.vang, cache.pnet, cache.qnet,
-            cache.pgen, cache.qgen, cache.pload, cache.qload, cache.balance, cache.dx, bus_gen,
-        )
-        adcache.vmag .= x[1:nbus]
-        adcache.vang .= x[1+nbus:2*nbus]
-        g = VT(undef, m) ; fill!(g, 0)
-        ExaPF.flow_constraints(polar, g, adcache)
-        return sum(g)
-    end
-    adgradg = []
-    CUDA.@allowscalar adgradg = ForwardDiff.gradient(sum_constraints,x)
-    fdgradg = FiniteDiff.finite_difference_jacobian(sum_constraints,x)
-    ## We pick sum() as the reduction function. This could be a mask function for active set or some log(x) for lumping.
-    m_flows = ExaPF.size_constraint(polar, ExaPF.flow_constraints)
-    weights = ones(m_flows) |> MT
-    gradg = similar(u, 2 * nbus)
-    ExaPF.flow_constraints_grad!(polar, gradg, cache, weights)
-    adgradg = adgradg |> Array ; gradg = gradg |> Array ; fdgradg = fdgradg |> Array
-    # Both tests fail with rtoldefault(...) = 1e-8
-    # The handwritten adjoint agrees with AutoDiff for rtol>=1e-7
-    @test isapprox(adgradg, gradg, rtol=1e-7)
-    # The finite difference gradient disagrees with AutoDiff for rtol>=1e-7
-    @test isapprox(gradg, fdgradg[:], rtol=1e-5)
-end
-
 function test_objective_adjoint(polar, device, MT)
     pf = polar.network
     nbus = pf.nbus
@@ -143,60 +104,3 @@ function test_objective_adjoint(polar, device, MT)
     @test myisapprox(∇f[1+nx:end], pbm.stack.∇fᵤ, rtol=1e-5)
     return
 end
-
-function test_objective_with_ramping_adjoint(polar, device, MT)
-    pf = polar.network
-    nbus = pf.nbus
-    ngen = pf.ngen
-    pv = pf.pv ; npv = length(pv)
-    pq = pf.pq ; npq = length(pq)
-    ref = pf.ref ; nref = length(ref)
-    pv2gen = polar.indexing.index_pv_to_gen
-    nx = ExaPF.get(polar, ExaPF.NumberOfState())
-
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
-
-    u = ExaPF.initial(polar, Control())
-
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-    conv = powerflow(polar, jx, cache, NewtonRaphson(tol=1e-12))
-    x = [cache.vang[pv] ; cache.vang[pq] ; cache.vmag[pq]]
-    u = [cache.vmag[ref]; cache.vmag[pv]; cache.pgen[pv2gen]]
-
-    # Intermediate
-    s = similar(u, ngen) ; fill!(s, 0.0)
-    σ = 1.0
-    ρ1 = 1.0
-    ρ2 = 1.0
-    τ = 1.0
-    p1 = similar(u, ngen) ; fill!(p1, 0.0)
-    p2 = similar(u, ngen) ; fill!(p2, 0.0)
-    p3 = similar(u, ngen) ; fill!(p3, 0.0)
-    λ1 = similar(u, ngen) ; fill!(λ1, 1.0)
-    λ2 = similar(u, ngen) ; fill!(λ2, 1.0)
-    # Evaluate gradient
-    pbm = ExaPF.pullback_ramping(polar, nothing)
-
-    for t in [0, 1, 2]
-        ExaPF.adjoint_penalty_ramping_constraints!(polar, pbm, cache, s, t, σ, τ, λ1, λ2, ρ1, ρ2, p1, p2, p3)
-
-        # Compare with finite diff
-        function test_objective_fd(z)
-            x_ = z[1:nx]
-            u_ = z[1+nx:end]
-            # Transfer control
-            ExaPF.transfer!(polar, cache, u_)
-            # Transfer state (manually)
-            cache.vang[pv] .= x_[1:npv]
-            cache.vang[pq] .= x_[npv+1:npv+npq]
-            cache.vmag[pq] .= x_[npv+npq+1:end]
-            return ExaPF.cost_penalty_ramping_constraints(polar, cache, s, t, σ, τ, λ1, λ2, ρ1, ρ2, p1, p2, p3)
-        end
-        ∇f = FiniteDiff.finite_difference_jacobian(test_objective_fd, [x; u])
-
-        @test myisapprox(∇f[1:nx], pbm.stack.∇fₓ, rtol=1e-5)
-        @test myisapprox(∇f[1+nx:end], pbm.stack.∇fᵤ, rtol=1e-5)
-    end
-end
-
diff --git a/test/Polar/hessian.jl b/test/Polar/hessian.jl
index 43c02fa2..3cd61415 100644
--- a/test/Polar/hessian.jl
+++ b/test/Polar/hessian.jl
@@ -68,97 +68,47 @@ function test_hessian_with_matpower(polar, device, AT; atol=1e-6, rtol=1e-6)
 end
 
 function test_hessian_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
-    pf = polar.network
-    pv = pf.pv ; npv = length(pv)
-    pq = pf.pq ; npq = length(pq)
-    ref = pf.ref ; nref = length(ref)
-    nbus = pf.nbus
-    ngen = get(polar, PS.NumberOfGenerators())
+    nx = length(polar.mapx)
+    nu = length(polar.mapu)
 
-    pv2gen = polar.indexing.index_pv_to_gen
-    ref2gen = polar.indexing.index_ref_to_gen
-    gen2bus = polar.indexing.index_generators
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
+    mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
 
-    xk = ExaPF.initial(polar, State())
-    u = ExaPF.initial(polar, Control())
-    nx = length(xk) ; nu = length(u)
+    stack = ExaPF.NetworkStack(polar)
+    # Solve power flow
+    conv = ExaPF.run_pf(polar, stack)
 
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-    ju = AutoDiff.Jacobian(polar, ExaPF.power_balance, Control())
-    ∂obj = ExaPF.AdjointStackObjective(polar)
+    constraints = [
+        ExaPF.VoltageMagnitudePQ(polar),
+        ExaPF.PowerGenerationBounds(polar),
+        ExaPF.LineFlows(polar),
+        ExaPF.PowerFlowBalance(polar),
+    ]
+    mycons = ExaPF.MultiExpressions(constraints)
 
     # Initiate state and control for FiniteDiff
-    x = [cache.vang[pv] ; cache.vang[pq] ; cache.vmag[pq]]
-    u = [cache.vmag[ref]; cache.vmag[pv]; cache.pgen[pv2gen]]
-
     # CONSTRAINTS
-    @testset "Compare with FiniteDiff Hessian ($constraints)" for constraints in [
-        ExaPF.power_balance,
-        ExaPF.active_power_constraints,
-        ExaPF.reactive_power_constraints,
-        ExaPF.flow_constraints,
-        ExaPF.bus_power_injection,
-    ]
-        ncons = ExaPF.size_constraint(polar, constraints)
-        μ = rand(ncons)
-
-        function jac_x(z)
-            x_ = z[1:nx]
-            u_ = z[1+nx:end]
-            # Transfer control
-            ExaPF.transfer!(polar, cache, u_)
-            # Transfer state (manually)
-            cache.vang[pv] .= x_[1:npv]
-            cache.vang[pq] .= x_[npv+1:npv+npq]
-            cache.vmag[pq] .= x_[npv+npq+1:end]
-            Jx = ExaPF.matpower_jacobian(polar, constraints, State(), cache)
-            Ju = ExaPF.matpower_jacobian(polar, constraints, Control(), cache)
-            return [Jx Ju]' * μ
-        end
-        H_fd = FiniteDiff.finite_difference_jacobian(jac_x, [x; u])
-
-        HessianAD = AutoDiff.Hessian(polar, constraints)
-        tgt = rand(nx + nu)
-        projp = zeros(nx + nu)
-        dev_tgt = MT(tgt)
-        dev_projp = MT(projp)
-        dev_μ = MT(μ)
-        AutoDiff.adj_hessian_prod!(polar, HessianAD, dev_projp, cache, dev_μ, dev_tgt)
-        projp = Array(dev_projp)
-        @test isapprox(projp, H_fd * tgt, rtol=rtol)
-    end
-
-    # OBJECTIVE
-    ncons = ExaPF.size_constraint(polar, ExaPF.cost_production)
-    μ = ones(ncons)
-
-    # Initiate on CPU for FiniteDiff
-    polar_cpu = PolarForm(polar.network, CPU())
-    cache_cpu = ExaPF.get(polar_cpu, ExaPF.PhysicalState())
-    x0 = [x; u] |> Array
-    function obj_fd(z)
-        x_ = z[1:nx]
-        u_ = z[1+nx:end]
-        # Transfer control
-        ExaPF.transfer!(polar_cpu, cache_cpu, u_)
-        # Transfer state (manually)
-        cache_cpu.vang[pv] .= x_[1:npv]
-        cache_cpu.vang[pq] .= x_[npv+1:npv+npq]
-        cache_cpu.vmag[pq] .= x_[npv+npq+1:end]
-        return ExaPF.cost_production(polar_cpu, cache_cpu)
-    end
-    H_fd = FiniteDiff.finite_difference_hessian(obj_fd, x0)
+    m = length(mycons)
+    μ = rand(m)
+    c = zeros(m)
 
-    HessianAD = AutoDiff.Hessian(polar, ExaPF.cost_production)
+    HessianAD = ExaPF.MyHessian(polar, mycons, mymap)
     tgt = rand(nx + nu)
     projp = zeros(nx + nu)
     dev_tgt = MT(tgt)
     dev_projp = MT(projp)
     dev_μ = MT(μ)
-    AutoDiff.adj_hessian_prod!(polar, HessianAD, dev_projp, cache, dev_μ, dev_tgt)
+    ExaPF.hprod!(HessianAD, dev_projp, stack, dev_μ, dev_tgt)
     projp = Array(dev_projp)
-    @test isapprox( projp, H_fd * tgt, rtol=rtol)
+
+    function lagr_x(z)
+        stack.input[mymap] .= z
+        ExaPF.forward_eval_intermediate(polar, stack)
+        mycons(c, stack)
+        return dot(μ, c)
+    end
+    x0 = stack.input[mymap]
+    H_fd = FiniteDiff.finite_difference_hessian(lagr_x, x0)
+
+    @test isapprox(projp, H_fd * tgt, rtol=rtol)
 end
 

From 5d5572b13fb0321bad94c7d547493e03f511fc76 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Sat, 8 Jan 2022 10:52:56 -0600
Subject: [PATCH 17/34] port new code on GPU

---
 src/Polar/first_order.jl |  5 +++--
 src/Polar/functions.jl   |  5 +++--
 src/Polar/kernels.jl     |  6 +++---
 src/Polar/newton.jl      | 17 +++++++++++++++++
 src/Polar/powerflow.jl   | 13 -------------
 src/architectures.jl     |  7 +++++++
 test/Polar/autodiff.jl   | 21 +++++++++++----------
 test/Polar/hessian.jl    |  6 +++---
 8 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/src/Polar/first_order.jl b/src/Polar/first_order.jl
index 3919cc58..774ac986 100644
--- a/src/Polar/first_order.jl
+++ b/src/Polar/first_order.jl
@@ -34,12 +34,12 @@ Base.size(jac::MyJacobian, n::Int) = size(jac.J, n)
 
 function my_map(polar::PolarForm, ::State)
     nbus = get(polar, PS.NumberOfBuses())
-    ref, pv, pq = index_buses_device(polar)
+    ref, pv, pq = index_buses_host(polar)
     return Int[nbus .+ pv; nbus .+ pq; pq]
 end
 function my_map(polar::PolarForm, ::Control)
     nbus = get(polar, PS.NumberOfBuses())
-    ref, pv, pq = index_buses_device(polar)
+    ref, pv, pq = index_buses_host(polar)
     pv2gen = polar.network.pv2gen
     return Int[ref; pv; 2*nbus .+ pv2gen]
 end
@@ -94,6 +94,7 @@ function MyJacobian(polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, m
     # Move the seeds over to the device, if necessary
     gput1sseeds = A{ForwardDiff.Partials{ncolor,Float64}}(t1sseeds)
     compressedJ = MT(zeros(Float64, ncolor, n_cons))
+    coloring = coloring |> VI
 
     return MyJacobian(
         polar, func, map_device, stack, compressedJ, coloring, gput1sseeds, t1sF, J,
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 53857c90..8bca8188 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -153,7 +153,7 @@ function (func::CostFunction)(state)
 end
 
 function (func::CostFunction)(output, state)
-    output[1] = func(state)
+    CUDA.@allowscalar output[1] = func(state)
     return
 end
 
@@ -270,7 +270,8 @@ function bounds(polar::PolarForm{T,VI,VT,MT}, func::PowerGenerationBounds) where
 end
 
 function (func::PowerGenerationBounds)(cons, state)
-    cons .= func.τ .+ func.M * state.ψ
+    cons .= func.τ
+    mul!(cons, func.M, state.ψ, 1.0, 1.0)
     return
 end
 
diff --git a/src/Polar/kernels.jl b/src/Polar/kernels.jl
index 229e5772..6a52e528 100644
--- a/src/Polar/kernels.jl
+++ b/src/Polar/kernels.jl
@@ -394,12 +394,12 @@ KA.@kernel function _reverse_transfer_kernel2!(
 end
 
 function reverse_transfer!(
-    polar::PolarForm,
+    polar::PolarForm{T, VI, VT, MT},
     output, ∂state,
-)
+) where {T, VI, VT, MT}
     nx = get(polar, ExaPF.NumberOfState())
     nu = get(polar, ExaPF.NumberOfControl())
-    map = [my_map(polar, State()); my_map(polar, Control())]
+    map = [my_map(polar, State()); my_map(polar, Control())] |> VI
     ev = _reverse_transfer_kernel2!(polar.device)(
         output, ∂state.input, map,
         ndrange=(nx+nu, size(output, 2)),
diff --git a/src/Polar/newton.jl b/src/Polar/newton.jl
index 04a11c31..c337e54b 100644
--- a/src/Polar/newton.jl
+++ b/src/Polar/newton.jl
@@ -5,6 +5,7 @@ struct NLBuffer{VT}
 end
 NLBuffer{VT}(n::Int) where VT = NLBuffer(VT(undef, n), VT(undef, n))
 
+
 function extract_values!(dest, src)
     @assert length(dest) == length(src)
     for i in eachindex(dest)
@@ -50,3 +51,19 @@ function nlsolve!(
     end
     return ConvergenceStatus(converged, iter, normF, sum(linsol_iters))
 end
+
+function run_pf(
+    polar::PolarForm, state::NetworkStack;
+    rtol=1e-8, max_iter=20,
+)
+    solver = NewtonRaphson(tol=rtol, maxiter=max_iter)
+    mapx = my_map(polar, State())
+
+    func = PowerFlowBalance(polar)
+    jac = MyJacobian(polar, func, mapx)
+
+    conv = nlsolve!(solver, jac, state)
+    return conv
+end
+
+
diff --git a/src/Polar/powerflow.jl b/src/Polar/powerflow.jl
index ca8ed577..cfa85117 100644
--- a/src/Polar/powerflow.jl
+++ b/src/Polar/powerflow.jl
@@ -213,16 +213,3 @@ function batch_powerflow(
     return ConvergenceStatus(converged, iter, sum(normF), 0)
 end
 
-function run_pf(
-    polar::PolarForm, state::NetworkStack;
-    rtol=1e-8, max_iter=20,
-)
-    solver = NewtonRaphson(tol=rtol, maxiter=max_iter)
-
-    func = PowerFlowBalance(polar)
-    jac = MyJacobian(polar, func, polar.mapx)
-
-    return nlsolve!(solver, jac, state)
-end
-
-
diff --git a/src/architectures.jl b/src/architectures.jl
index b3591ee7..b4316f0e 100644
--- a/src/architectures.jl
+++ b/src/architectures.jl
@@ -32,3 +32,10 @@ function get_batch_jacobian_types(::GPU)
     A = CUDA.CuArray
     return SMT, A
 end
+
+function Base.unsafe_wrap(Atype::Type{CUDA.CuArray{T, 1, CUDA.Mem.DeviceBuffer}},
+                          p::CUDA.CuPtr{T}, dim::Integer;
+                          own::Bool=false, ctx::CUDA.CuContext=CUDA.context()) where {T}
+    unsafe_wrap(CUDA.CuArray{T, 1}, p, (dim,); own, ctx)
+end
+
diff --git a/test/Polar/autodiff.jl b/test/Polar/autodiff.jl
index 8b9261f1..eba75c3d 100644
--- a/test/Polar/autodiff.jl
+++ b/test/Polar/autodiff.jl
@@ -38,7 +38,7 @@ function test_constraints_jacobian(polar, device, MT)
         function jac_fd_x(x)
             stack.input[mymap] .= x
             ExaPF.forward_eval_intermediate(polar, stack)
-            c = zeros(m)
+            c = zeros(m) |> MT
             constraint(c, stack)
             return c
         end
@@ -47,15 +47,17 @@ function test_constraints_jacobian(polar, device, MT)
         Jx = jac.J |> SparseMatrixCSC |> Array
 
         ## JACOBIAN VECTOR PRODUCT
-        tgt = rand(m) |> MT
+        tgt_h = rand(m)
+        tgt = tgt_h |> MT
         output = zeros(nx+nu) |> MT
         ExaPF.jacobian_transpose_product!(polar, pbm, output, stack, tgt)
 
+
         @test size(J) == (m, length(mymap))
-        @test isapprox(Jd, Jx, rtol=1e-5)
-        @test isapprox(Jmat, Jx, rtol=1e-5)
-        @test isapprox(Jmat, Jd, rtol=1e-5)
-        @test isapprox(∂stack.input[mymap], Jx' * tgt, rtol=1e-6)
+        @test myisapprox(Jd, Jx, rtol=1e-5)
+        @test myisapprox(Jmat, Jx, rtol=1e-5)
+        @test myisapprox(Jmat, Jd, rtol=1e-5)
+        @test myisapprox(∂stack.input[mymap], Jx' * tgt_h, rtol=1e-6)
     end
 end
 
@@ -98,7 +100,7 @@ function test_constraints_adjoint(polar, device, MT)
         adj_fd = FiniteDiff.finite_difference_jacobian(test_fd, x) |> Array
         # Loosen the tolerance to 1e-5 there (finite_difference_jacobian
         # is less accurate than finite_difference_gradient)
-        @test isapprox(∂stack.input[mymap], adj_fd[:], rtol=1e-5)
+        @test myisapprox(∂stack.input[mymap], adj_fd[:], rtol=1e-5)
     end
 end
 
@@ -122,16 +124,15 @@ function test_full_space_jacobian(polar, device, MT)
     jac = ExaPF.MyJacobian(polar, mycons, mymap)
     J = ExaPF.jacobian!(jac, stack)
 
-    y = zeros(m)
     function jac_fd_x(x)
         stack.input .= x
         ExaPF.forward_eval_intermediate(polar, stack)
-        c = zeros(m)
+        c = zeros(m) |> MT
         mycons(c, stack)
         return c
     end
     x = copy(stack.input)
     Jd = FiniteDiff.finite_difference_jacobian(jac_fd_x, x) |> Array
-    @test isapprox(Jd, J, rtol=1e-5)
+    @test myisapprox(Jd, J, rtol=1e-5)
 end
 
diff --git a/test/Polar/hessian.jl b/test/Polar/hessian.jl
index 3cd61415..5f07f8d1 100644
--- a/test/Polar/hessian.jl
+++ b/test/Polar/hessian.jl
@@ -88,8 +88,8 @@ function test_hessian_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
     # Initiate state and control for FiniteDiff
     # CONSTRAINTS
     m = length(mycons)
-    μ = rand(m)
-    c = zeros(m)
+    μ = rand(m) |> MT
+    c = zeros(m) |> MT
 
     HessianAD = ExaPF.MyHessian(polar, mycons, mymap)
     tgt = rand(nx + nu)
@@ -107,7 +107,7 @@ function test_hessian_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
         return dot(μ, c)
     end
     x0 = stack.input[mymap]
-    H_fd = FiniteDiff.finite_difference_hessian(lagr_x, x0)
+    H_fd = FiniteDiff.finite_difference_hessian(lagr_x, x0) |> Array
 
     @test isapprox(projp, H_fd * tgt, rtol=rtol)
 end

From 35cedbcb7c73f00ebe0eda5d990d6078cc24dfb7 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Sun, 9 Jan 2022 22:51:36 -0600
Subject: [PATCH 18/34] add expressions for Matpower Hessians

---
 src/Polar/legacy.jl | 128 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)

diff --git a/src/Polar/legacy.jl b/src/Polar/legacy.jl
index 3c670e77..2975e9a9 100644
--- a/src/Polar/legacy.jl
+++ b/src/Polar/legacy.jl
@@ -93,4 +93,132 @@ function matpower_jacobian(polar::PolarForm, func::MultiExpressions, V)
     return vcat([matpower_jacobian(polar, expr, V) for expr in func.exprs]...)
 end
 
+# Return full-space Hessian
+function matpower_hessian(polar::PolarForm, func::CostFunction, V, λ)
+    pf = polar.network
+    nbus = get(polar, PS.NumberOfBuses())
+    ngen = get(polar, PS.NumberOfGenerators())
+    ref = pf.ref
+    nref = length(ref)
+
+    H11 = spzeros(2* nbus, 2 * nbus + ngen)
+
+    H21 = spzeros(ngen, 2 * nbus)
+    H22 = spdiagm(2.0 .* func.c2)
+    H = [
+        H11;
+        H21 H22;
+    ]::SparseMatrixCSC{Float64, Int}
+
+    # pg_ref is implicit: add term corresponding to ∂pg_ref' * ∂pg_ref
+    Ybus = pf.Ybus
+    dSbus_dVm, dSbus_dVa = PS.matpower_residual_jacobian(V, Ybus)
+    j11 = real(dSbus_dVm[ref, :])
+    j12 = real(dSbus_dVa[ref, :])
+    j13 = spzeros(nref, ngen)
+    J = [j11 j12 j13]::SparseMatrixCSC{Float64, Int}
+
+    Href = J' * Diagonal(2 .* func.c2[ref]) * J
+
+    return H + Href
+end
+
+function matpower_hessian(polar::PolarForm, func::PowerFlowBalance, V, λ)
+    pf = polar.network
+    Ybus = pf.Ybus
+    nbus = get(polar, PS.NumberOfBuses())
+    ngen = get(polar, PS.NumberOfGenerators())
+    pq, pv = pf.pq, pf.pv
+    npq, npv = length(pq), length(pv)
+
+    yp = zeros(nbus)
+    yp[pv] .= λ[1:npv]
+    yp[pq] .= λ[1+npv:npv+npq]
+    Hpθθ, Hpvθ, Hpvv = PS._matpower_hessian(V, Ybus, yp)
+
+    yq = zeros(nbus)
+    yq[pq] .= λ[1+npv+npq:npv+2*npq]
+    Hqθθ, Hqvθ, Hqvv = PS._matpower_hessian(V, Ybus, yq)
+
+    H11 = real.(Hpvv) .+ imag.(Hqvv)
+    H12 = real.(Hpvθ) .+ imag.(Hqvθ)
+    H13 = spzeros(nbus, ngen)
+
+    H21 = real.(Hpvθ') .+ imag.(Hqvθ')
+    H22 = real.(Hpθθ) .+ imag.(Hqθθ)
+    H23 = spzeros(nbus, ngen)
+
+    H31 = spzeros(ngen, nbus)
+    H32 = spzeros(ngen, nbus)
+    H33 = spzeros(ngen, ngen)
+    return [
+        H11 H12 H13;
+        H21 H22 H23;
+        H31 H32 H33
+    ]::SparseMatrixCSC{Float64, Int}
+end
+
+function matpower_hessian(polar::PolarForm, func::PowerGenerationBounds, V, λ)
+    pf = polar.network
+    Ybus = pf.Ybus
+    nbus = get(polar, PS.NumberOfBuses())
+    ngen = get(polar, PS.NumberOfGenerators())
+    ref, pv = pf.ref, pf.pv
+    nref, npv = length(ref), length(pv)
+
+    yp = zeros(nbus)
+    yp[ref] .= λ[1:nref]
+    Hpθθ, Hpvθ, Hpvv = PS._matpower_hessian(V, Ybus, yp)
 
+    yq = zeros(nbus)
+    yq[pv] .= λ[nref+1:nref+npv]
+    Hqθθ, Hqvθ, Hqvv = PS._matpower_hessian(V, Ybus, yq)
+
+    H11 = real.(Hpvv) .+ imag.(Hqvv)
+    H12 = real.(Hpvθ) .+ imag.(Hqvθ)
+    H13 = spzeros(nbus, ngen)
+
+    H21 = real.(Hpvθ') .+ imag.(Hqvθ')
+    H22 = real.(Hpθθ) .+ imag.(Hqθθ)
+    H23 = spzeros(nbus, ngen)
+
+    H31 = spzeros(ngen, nbus)
+    H32 = spzeros(ngen, nbus)
+    H33 = spzeros(ngen, ngen)
+    return [
+        H11 H12 H13;
+        H21 H22 H23;
+        H31 H32 H33
+    ]::SparseMatrixCSC{Float64, Int}
+end
+
+function matpower_hessian(polar::PolarForm, func::VoltageMagnitudePQ, V, λ)
+    nbus = get(polar, PS.NumberOfBuses())
+    ngen = get(polar, PS.NumberOfGenerators())
+    n = 2*nbus + ngen
+    return spzeros(n, n)
+end
+
+# TODO: not implemented yet
+function matpower_hessian(polar::PolarForm, func::LineFlows, V, λ)
+    nbus = get(polar, PS.NumberOfBuses())
+    ngen = get(polar, PS.NumberOfGenerators())
+    n = 2*nbus + ngen
+    return spzeros(n, n)
+end
+
+function matpower_hessian(polar::PolarForm, func::MultiExpressions, V, λ)
+    nbus = get(polar, PS.NumberOfBuses())
+    ngen = get(polar, PS.NumberOfGenerators())
+    n = 2*nbus + ngen
+    H = spzeros(n, n)
+
+    k = 0
+    for expr in func.exprs
+        m = length(expr)
+        y = view(λ, k+1:k+m)
+        H += matpower_hessian(polar, expr, V, y)::SparseMatrixCSC{Float64, Int}
+        k += m
+    end
+    return H
+end

From a6929335fb545f50078e6941bd52e22223b29bd3 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Mon, 10 Jan 2022 09:30:08 -0600
Subject: [PATCH 19/34] implement FullHessian with autodiff

---
 src/Polar/first_order.jl  |  4 +-
 src/Polar/legacy.jl       | 11 ++++++
 src/Polar/second_order.jl | 82 +++++++++++++++++++++++++++++++++++++++
 src/autodiff.jl           | 15 +++++++
 test/Polar/autodiff.jl    |  2 -
 test/Polar/hessian.jl     | 37 +++++++++++++++++-
 6 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/src/Polar/first_order.jl b/src/Polar/first_order.jl
index 774ac986..b41a82ae 100644
--- a/src/Polar/first_order.jl
+++ b/src/Polar/first_order.jl
@@ -86,9 +86,11 @@ function MyJacobian(polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, m
 
     J = J_host |> SMT
 
-    # Seedings
+    # Structures
     stack = NetworkStack(nbus, ngen, nlines, VD)
     t1sF = zeros(Float64, n_cons) |> VD
+
+    # Seedings
     t1sseeds = AutoDiff.init_seed(coloring, ncolor, nmap)
 
     # Move the seeds over to the device, if necessary
diff --git a/src/Polar/legacy.jl b/src/Polar/legacy.jl
index 2975e9a9..533d384f 100644
--- a/src/Polar/legacy.jl
+++ b/src/Polar/legacy.jl
@@ -222,3 +222,14 @@ function matpower_hessian(polar::PolarForm, func::MultiExpressions, V, λ)
     end
     return H
 end
+
+function hessian_sparsity(polar::PolarForm, func)
+    m = length(func)
+    nbus = get(polar, PS.NumberOfBuses())
+    Vre = Float64[i for i in 1:nbus]
+    Vim = Float64[i for i in nbus+1:2*nbus]
+    V = Vre .+ im .* Vim
+    y = rand(m)
+    return matpower_hessian(polar, func, V, y)
+end
+
diff --git a/src/Polar/second_order.jl b/src/Polar/second_order.jl
index c2a0a2a8..cf24295b 100644
--- a/src/Polar/second_order.jl
+++ b/src/Polar/second_order.jl
@@ -71,3 +71,85 @@ function hprod!(
     return
 end
 
+
+struct FullHessian{Model, Func, VD, SMT, MT, VI, VP, Buff} <: AutoDiff.AbstractHessian
+    model::Model
+    func::Func
+    map::VI
+    state::NetworkStack{VD}
+    ∂state::NetworkStack{VD}
+    coloring::VI
+    t1sseeds::VP
+    t1sF::VD
+    ∂t1sF::VD
+    buffer::Buff
+    compressedH::MT
+    H::SMT
+end
+
+function get_hessian_colors(polar::PolarForm, func::AbstractExpression, map::Vector{Int})
+    H = hessian_sparsity(polar, func)::SparseMatrixCSC
+    Hsub = H[map, map] # reorder
+    colors = AutoDiff.SparseDiffTools.matrix_colors(Hsub)
+    return (Hsub, colors)
+end
+
+function FullHessian(polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, map::Vector{Int}) where {T, VI, VT, MT}
+    (SMT, A) = get_jacobian_types(polar.device)
+
+    pf = polar.network
+    nbus = PS.get(pf, PS.NumberOfBuses())
+    nlines = PS.get(pf, PS.NumberOfLines())
+    ngen = PS.get(pf, PS.NumberOfGenerators())
+
+    n_cons = length(func)
+
+    nmap = length(map)
+    map_device = map |> VI
+
+    H_host, coloring = get_hessian_colors(polar, func, map)
+    ncolor = length(unique(coloring))
+    VD = A{ForwardDiff.Dual{Nothing, Float64, ncolor}}
+
+    H = H_host |> SMT
+
+    # Structures
+    stack = NetworkStack(nbus, ngen, nlines, VD)
+    ∂stack = NetworkStack(nbus, ngen, nlines, VD)
+    t1sF = zeros(Float64, n_cons) |> VD
+    adj_t1sF = similar(t1sF)
+
+    # Seedings
+    t1sseeds = AutoDiff.init_seed(coloring, ncolor, nmap) |> A
+
+    compressedH = MT(undef, ncolor, nmap)
+    coloring = coloring |> VI
+
+    intermediate = _get_intermediate_stack(polar, network_basis, VD, 1)
+    return FullHessian(
+        polar, func, map_device, stack, ∂stack, coloring, t1sseeds, t1sF, adj_t1sF,
+        intermediate, compressedH, H,
+    )
+end
+
+function hessian!(
+    H::FullHessian, state, λ,
+)
+    # init
+    H.state.input .= state.input
+    empty!(H.∂state)
+    H.∂t1sF .= λ
+    # seed
+    myseed!(H.state, state, H.t1sseeds, H.map, H.model.device)
+    # forward pass
+    forward_eval_intermediate(H.model, H.state)
+    H.func(H.t1sF, H.state)
+    # forward-over-reverse pass
+    adjoint!(H.func, H.∂state, H.state, H.∂t1sF)
+    reverse_eval_intermediate(H.model, H.∂state, H.state, H.buffer)
+    # uncompress
+    AutoDiff.partials_hess!(H.compressedH, H.∂state.input, H.map, H.model.device)
+    AutoDiff.uncompress_kernel!(H.H, H.compressedH, H.coloring, H.model.device)
+    return H.H
+end
+
diff --git a/src/autodiff.jl b/src/autodiff.jl
index 876f88f6..8ce67a2e 100644
--- a/src/autodiff.jl
+++ b/src/autodiff.jl
@@ -211,6 +211,13 @@ end
     end
 end
 
+@kernel function getpartials_hess_kernel!(compressedH, @Const(duals), @Const(map))
+    i = @index(Global, Linear)
+    for j in eachindex(ForwardDiff.partials.(duals[map[i]]).values)
+        compressedH[j, i] = ForwardDiff.partials.(duals[map[i]]).values[j]
+    end
+end
+
 """
     getpartials_kernel!(compressedJ, t1sF)
 
@@ -231,6 +238,14 @@ function getpartials_kernel!(compressedJ::AbstractMatrix, t1sF, device)
     wait(ev)
 end
 
+function partials_hess!(compressedH::AbstractMatrix, duals, map, device)
+    ev = getpartials_hess_kernel!(device)(
+        compressedH, duals, map,
+        ndrange=length(map), dependencies=Event(device),
+    )
+    wait(ev)
+end
+
 
 # Uncompress kernels
 @kernel function uncompress_kernel_gpu!(@Const(J_rowPtr), @Const(J_colVal), J_nzVal, @Const(compressedJ), @Const(coloring))
diff --git a/test/Polar/autodiff.jl b/test/Polar/autodiff.jl
index eba75c3d..c49151b5 100644
--- a/test/Polar/autodiff.jl
+++ b/test/Polar/autodiff.jl
@@ -6,8 +6,6 @@ function test_constraints_jacobian(polar, device, MT)
     ∂stack = ExaPF.NetworkStack(polar)
 
     mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
-    # n = length(stack.input)
-    # mymap = collect(1:n)
 
     # Solve power flow
     conv = ExaPF.run_pf(polar, stack)
diff --git a/test/Polar/hessian.jl b/test/Polar/hessian.jl
index 5f07f8d1..4011d563 100644
--- a/test/Polar/hessian.jl
+++ b/test/Polar/hessian.jl
@@ -67,7 +67,7 @@ function test_hessian_with_matpower(polar, device, AT; atol=1e-6, rtol=1e-6)
     return nothing
 end
 
-function test_hessian_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
+function test_hessprod_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
     nx = length(polar.mapx)
     nu = length(polar.mapu)
 
@@ -77,6 +77,7 @@ function test_hessian_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
     # Solve power flow
     conv = ExaPF.run_pf(polar, stack)
 
+    # Test all expressions at once with MultiExpressions
     constraints = [
         ExaPF.VoltageMagnitudePQ(polar),
         ExaPF.PowerGenerationBounds(polar),
@@ -112,3 +113,37 @@ function test_hessian_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
     @test isapprox(projp, H_fd * tgt, rtol=rtol)
 end
 
+function test_full_space_hessian(polar, device, MT)
+    stack = ExaPF.NetworkStack(polar)
+    ExaPF.forward_eval_intermediate(polar, stack)
+
+    n = length(stack.input)
+    # Hessian / (x, u)
+    mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
+
+    constraints = [
+        ExaPF.CostFunction(polar),
+        ExaPF.PowerFlowBalance(polar),
+        ExaPF.PowerGenerationBounds(polar),
+        ExaPF.LineFlows(polar),
+    ]
+    mycons = ExaPF.MultiExpressions(constraints)
+
+    m = length(mycons)
+    y = rand(m) |> MT
+
+    hess = ExaPF.FullHessian(polar, mycons, mymap)
+    H = ExaPF.hessian!(hess, stack, y)
+
+    function hess_fd_x(x)
+        stack.input[mymap] .= x
+        ExaPF.forward_eval_intermediate(polar, stack)
+        c = zeros(m) |> MT
+        mycons(c, stack)
+        return dot(c, y)
+    end
+    x = stack.input[mymap]
+    Hd = FiniteDiff.finite_difference_hessian(hess_fd_x, x) |> Array
+    @test myisapprox(Hd, H, rtol=1e-5)
+end
+

From 1c14305f4f770ff57e95a3723c528f7ea3aff919 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Mon, 10 Jan 2022 10:12:11 -0600
Subject: [PATCH 20/34] fix FullHessian on GPU

---
 src/Polar/functions.jl |  4 ++--
 src/Polar/legacy.jl    |  5 +++--
 test/Polar/hessian.jl  | 16 ++++++++++------
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 8bca8188..ec03cbc5 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -147,7 +147,11 @@ Base.length(::CostFunction) = 1
 
 function (func::CostFunction)(state)
     costs = state.intermediate.c
-    state.pgen[func.gen_ref] .= func.M * state.ψ
+    # Indexing with `func.gen_ref` returns a copy, so `mul!` must write into a
+    # named buffer that is then scattered back into `state.pgen`.
+    pgen_ref = state.pgen[func.gen_ref]
+    mul!(pgen_ref, func.M, state.ψ)
+    state.pgen[func.gen_ref] .= pgen_ref
     costs .= func.c0 .+ func.c1 .* state.pgen .+ func.c2 .* state.pgen.^2
     return sum(costs)
 end
@@ -159,7 +163,7 @@ end
 
 function adjoint!(func::CostFunction, ∂state, state, ∂v)
     ∂state.pgen .+= ∂v .* (func.c1 .+ 2.0 .* func.c2 .* state.pgen)
-    ∂state.ψ .+= func.M' * ∂state.pgen[func.gen_ref]
+    mul!(∂state.ψ, func.M', ∂state.pgen[func.gen_ref], 1.0, 1.0)
     return
 end
 
diff --git a/src/Polar/legacy.jl b/src/Polar/legacy.jl
index 533d384f..5d440824 100644
--- a/src/Polar/legacy.jl
+++ b/src/Polar/legacy.jl
@@ -100,11 +100,12 @@ function matpower_hessian(polar::PolarForm, func::CostFunction, V, λ)
     ngen = get(polar, PS.NumberOfGenerators())
     ref = pf.ref
     nref = length(ref)
+    c2 = func.c2 |> Array
 
     H11 = spzeros(2* nbus, 2 * nbus + ngen)
 
     H21 = spzeros(ngen, 2 * nbus)
-    H22 = spdiagm(2.0 .* func.c2)
+    H22 = spdiagm(2.0 .* c2)
     H = [
         H11;
         H21 H22;
@@ -118,7 +119,7 @@ function matpower_hessian(polar::PolarForm, func::CostFunction, V, λ)
     j13 = spzeros(nref, ngen)
     J = [j11 j12 j13]::SparseMatrixCSC{Float64, Int}
 
-    Href = J' * Diagonal(2 .* func.c2[ref]) * J
+    Href = J' * Diagonal(2 .* c2[ref]) * J
 
     return H + Href
 end
diff --git a/test/Polar/hessian.jl b/test/Polar/hessian.jl
index 4011d563..c94a56c4 100644
--- a/test/Polar/hessian.jl
+++ b/test/Polar/hessian.jl
@@ -74,6 +74,7 @@ function test_hessprod_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
     mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
 
     stack = ExaPF.NetworkStack(polar)
+    ExaPF.forward_eval_intermediate(polar, stack)
     # Solve power flow
     conv = ExaPF.run_pf(polar, stack)
 
@@ -108,9 +109,11 @@ function test_hessprod_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
         return dot(μ, c)
     end
     x0 = stack.input[mymap]
-    H_fd = FiniteDiff.finite_difference_hessian(lagr_x, x0) |> Array
+    H_fd = FiniteDiff.finite_difference_hessian(lagr_x, x0)
+    proj_fd = similar(x0, nx+nu)
+    mul!(proj_fd, H_fd.data, dev_tgt, 1, 0)
 
-    @test isapprox(projp, H_fd * tgt, rtol=rtol)
+    @test isapprox(projp, Array(proj_fd), rtol=rtol)
 end
 
 function test_full_space_hessian(polar, device, MT)
@@ -122,7 +125,7 @@ function test_full_space_hessian(polar, device, MT)
     mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
 
     constraints = [
-        ExaPF.CostFunction(polar),
+        # ExaPF.CostFunction(polar),
         ExaPF.PowerFlowBalance(polar),
         ExaPF.PowerGenerationBounds(polar),
         ExaPF.LineFlows(polar),
@@ -134,16 +137,17 @@ function test_full_space_hessian(polar, device, MT)
 
     hess = ExaPF.FullHessian(polar, mycons, mymap)
     H = ExaPF.hessian!(hess, stack, y)
+    c = zeros(m) |> MT
 
     function hess_fd_x(x)
         stack.input[mymap] .= x
         ExaPF.forward_eval_intermediate(polar, stack)
-        c = zeros(m) |> MT
         mycons(c, stack)
         return dot(c, y)
     end
     x = stack.input[mymap]
-    Hd = FiniteDiff.finite_difference_hessian(hess_fd_x, x) |> Array
-    @test myisapprox(Hd, H, rtol=1e-5)
+    Hd = FiniteDiff.finite_difference_hessian(hess_fd_x, x)
+    @test myisapprox(Hd.data, H, rtol=1e-5)
+    return
 end
 

From 03231eba127621a5fc6448d6fbfeb94f164ffb3c Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Mon, 10 Jan 2022 16:27:34 -0600
Subject: [PATCH 21/34] clean definition of basis with ComposedExpressions

---
 src/Polar/first_order.jl  |   1 -
 src/Polar/functions.jl    | 151 ++++++++++++++++++++++++++++----------
 src/Polar/newton.jl       |   1 -
 src/Polar/polar.jl        |   4 +
 src/Polar/second_order.jl |   8 +-
 5 files changed, 118 insertions(+), 47 deletions(-)

diff --git a/src/Polar/first_order.jl b/src/Polar/first_order.jl
index b41a82ae..864d5434 100644
--- a/src/Polar/first_order.jl
+++ b/src/Polar/first_order.jl
@@ -127,7 +127,6 @@ function jacobian!(
     # seed
     myseed!(jac.stack, state, jac.t1sseeds, jac.map, jac.model.device)
     # forward pass
-    forward_eval_intermediate(jac.model, jac.stack)
     jac.func(jac.t1sF, jac.stack)
     # uncompress
     AutoDiff.getpartials_kernel!(jac.compressedJ, jac.t1sF, jac.model.device)
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index ec03cbc5..1cfb3f82 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -32,6 +32,10 @@ function NetworkStack(nbus, ngen, nlines, VT)
         sfq = VT(undef, nlines), # buffer for line-flow
         stp = VT(undef, nlines), # buffer for line-flow
         stq = VT(undef, nlines), # buffer for line-flow
+        ∂edge_vm_fr = VT(undef, nlines), # buffer for basis
+        ∂edge_vm_to = VT(undef, nlines), # buffer for basis
+        ∂edge_va_fr = VT(undef, nlines), # buffer for basis
+        ∂edge_va_to = VT(undef, nlines), # buffer for basis
     )
 
     return NetworkStack(input, vmag, vang, pgen, ψ, intermediate)
@@ -61,57 +65,94 @@ end
 voltage(buf::NetworkStack) = buf.vmag .* exp.(im .* buf.vang)
 
 
+#=
+    Generic expression
+=#
+
+abstract type AbstractExpression end
+
+
+#=
+    PolarBasis
+=#
+
+struct PolarBasis{VI, MT} <: AbstractExpression
+    nbus::Int
+    nlines::Int
+    f::VI
+    t::VI
+    Cf::MT
+    Ct::MT
+    device::KA.Device
+end
+
+function PolarBasis(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
+    SMT = default_sparse_matrix(polar.device)
+    # Assemble matrix
+    pf = polar.network
+    lines = pf.lines
+    f = lines.from_buses
+    t = lines.to_buses
+
+    nbus = pf.nbus
+    nlines = length(t)
+
+    Cf = sparse(f, 1:nlines, ones(nlines), nbus, nlines)
+    Ct = sparse(t, 1:nlines, ones(nlines), nbus, nlines)
+    Cf = Cf |> SMT
+    Ct = Cf |> SMT
+
+    return PolarBasis{VI, SMT}(nbus, nlines, f, t, Cf, Ct, polar.device)
+end
+
+Base.length(func::PolarBasis) = func.nbus + 2 * func.nlines
+
 # update basis
-function forward_eval_intermediate(polar::PolarForm, state::NetworkStack)
-    _network_basis(polar, state.ψ, state.vmag, state.vang)
+function (func::PolarBasis)(output, stack::NetworkStack)
+    ev = basis_kernel!(func.device)(
+        output, stack.vmag, stack.vang,
+        func.f, func.t, func.nlines, func.nbus,
+        ndrange=(length(func), 1), dependencies=Event(func.device)
+    )
+    wait(ev)
+    return
 end
 
-function reverse_eval_intermediate(polar::PolarForm, ∂state::NetworkStack, state::NetworkStack, intermediate)
-    nl = PS.get(polar.network, PS.NumberOfLines())
-    nb = PS.get(polar.network, PS.NumberOfBuses())
-    top = polar.topology
-    f = top.f_buses
-    t = top.t_buses
+function adjoint!(func::PolarBasis, ∂state::NetworkStack, state::NetworkStack, ∂v)
+    nl = func.nlines
+    nb = func.nbus
+    f = func.f
+    t = func.t
 
-    fill!(intermediate.∂edge_vm_fr , 0.0)
-    fill!(intermediate.∂edge_vm_to , 0.0)
-    fill!(intermediate.∂edge_va_fr , 0.0)
-    fill!(intermediate.∂edge_va_to , 0.0)
+    fill!(∂state.intermediate.∂edge_vm_fr , 0.0)
+    fill!(∂state.intermediate.∂edge_vm_to , 0.0)
+    fill!(∂state.intermediate.∂edge_va_fr , 0.0)
+    fill!(∂state.intermediate.∂edge_va_to , 0.0)
 
     # Accumulate on edges
-    ndrange = (nl+nb, size(∂state.vmag, 2))
-    ev = adj_basis_kernel!(polar.device)(
-        ∂state.ψ,
+    ndrange = (nl+nb, 1)
+    ev = adj_basis_kernel!(func.device)(
+        ∂v,
         ∂state.vmag,
-        intermediate.∂edge_vm_fr,
-        intermediate.∂edge_vm_to,
-        intermediate.∂edge_va_fr,
-        intermediate.∂edge_va_to,
+        ∂state.intermediate.∂edge_vm_fr,
+        ∂state.intermediate.∂edge_vm_to,
+        ∂state.intermediate.∂edge_va_fr,
+        ∂state.intermediate.∂edge_va_to,
         state.vmag, state.vang, f, t, nl, nb,
-        ndrange=ndrange, dependencies=Event(polar.device),
+        ndrange=ndrange, dependencies=Event(func.device),
     )
     wait(ev)
 
     # Accumulate on nodes
-    Cf = intermediate.Cf
-    Ct = intermediate.Ct
-    mul!(∂state.vmag, Cf, intermediate.∂edge_vm_fr, 1.0, 1.0)
-    mul!(∂state.vmag, Ct, intermediate.∂edge_vm_to, 1.0, 1.0)
-    mul!(∂state.vang, Cf, intermediate.∂edge_va_fr, 1.0, 1.0)
-    mul!(∂state.vang, Ct, intermediate.∂edge_va_to, 1.0, 1.0)
+    Cf = func.Cf
+    Ct = func.Ct
+    mul!(∂state.vmag, Cf, ∂state.intermediate.∂edge_vm_fr, 1.0, 1.0)
+    mul!(∂state.vmag, Ct, ∂state.intermediate.∂edge_vm_to, 1.0, 1.0)
+    mul!(∂state.vang, Cf, ∂state.intermediate.∂edge_va_fr, 1.0, 1.0)
+    mul!(∂state.vang, Ct, ∂state.intermediate.∂edge_va_to, 1.0, 1.0)
     return
 end
 
-#=
-    Generic expression
-=#
-
-abstract type AbstractExpression end
-
-
-include("first_order.jl")
-include("second_order.jl")
-
 
 #=
     CostFunction
@@ -343,7 +384,7 @@ function adjoint!(func::LineFlows, ∂state, state, ∂v)
     return
 end
 
-# Aggregate expressions together
+# Concatenate expressions together
 struct MultiExpressions <: AbstractExpression
     exprs::Vector{AbstractExpression}
 end
@@ -370,6 +411,38 @@ function adjoint!(func::MultiExpressions, ∂state, state, ∂v)
     end
 end
 
-include("newton.jl")
-include("legacy.jl")
+function bounds(polar::PolarForm{T, VI, VT, MT}, func::MultiExpressions) where {T, VI, VT, MT}
+    m = length(func)
+    g_min = zeros(m)
+    g_max = zeros(m)
+    k = 0
+    for expr in func.exprs
+        m = length(expr)
+        l, u = bounds(polar, expr)
+        g_min[k+1:k+m] .= l
+        g_max[k+1:k+m] .= u
+        k += m
+    end
+    return (
+        convert(VT, g_min),
+        convert(VT, g_max),
+    )
+end
+
+struct ComposedExpressions{Expr1<:PolarBasis, Expr2} <: AbstractExpression
+    inner::Expr1
+    outer::Expr2
+end
+
+function (func::ComposedExpressions)(output, state)
+    func.inner(state.ψ, state) # Evaluate basis
+    func.outer(output, state)   # Evaluate expression
+end
+
+function adjoint!(func::ComposedExpressions, ∂state, state, ∂v)
+    adjoint!(func.outer, ∂state, state, ∂v)
+    adjoint!(func.inner, ∂state, state, ∂state.ψ)
+end
 
+# Overload ∘ operator
+Base.ComposedFunction(g::AbstractExpression, f::PolarBasis) = ComposedExpressions(f, g)
diff --git a/src/Polar/newton.jl b/src/Polar/newton.jl
index c337e54b..dbc673eb 100644
--- a/src/Polar/newton.jl
+++ b/src/Polar/newton.jl
@@ -66,4 +66,3 @@ function run_pf(
     return conv
 end
 
-
diff --git a/src/Polar/polar.jl b/src/Polar/polar.jl
index ef4d6de6..e0ddb01e 100644
--- a/src/Polar/polar.jl
+++ b/src/Polar/polar.jl
@@ -43,6 +43,10 @@ include("powerflow.jl")
 include("objective.jl")
 include("batch.jl")
 include("functions.jl")
+include("first_order.jl")
+include("second_order.jl")
+include("newton.jl")
+include("legacy.jl")
 
 function PolarForm(pf::PS.PowerNetwork, device::KA.Device)
     if isa(device, KA.CPU)
diff --git a/src/Polar/second_order.jl b/src/Polar/second_order.jl
index cf24295b..71f37fe5 100644
--- a/src/Polar/second_order.jl
+++ b/src/Polar/second_order.jl
@@ -60,12 +60,10 @@ function hprod!(
     # Init seed
     _init_seed_hessian!(H.t1sseeds, H.host_t1sseeds, v, nmap)
     myseed!(H.state, state, H.t1sseeds, H.map, H.model.device)
-    forward_eval_intermediate(H.model, H.state)
+    # Forward
     H.func(H.t1sF, H.state)
-
-    # Reverse
+    # Forward-over-Reverse
     adjoint!(H.func, H.∂state, H.state, H.∂t1sF)
-    reverse_eval_intermediate(H.model, H.∂state, H.state, H.buffer)
 
     AutoDiff.getpartials_kernel!(hv, H.∂state.input, H.map, H.model.device)
     return
@@ -142,11 +140,9 @@ function hessian!(
     # seed
     myseed!(H.state, state, H.t1sseeds, H.map, H.model.device)
     # forward pass
-    forward_eval_intermediate(H.model, H.state)
     H.func(H.t1sF, H.state)
     # forward-over-reverse pass
     adjoint!(H.func, H.∂state, H.state, H.∂t1sF)
-    reverse_eval_intermediate(H.model, H.∂state, H.state, H.buffer)
     # uncompress
     AutoDiff.partials_hess!(H.compressedH, H.∂state.input, H.map, H.model.device)
     AutoDiff.uncompress_kernel!(H.H, H.compressedH, H.coloring, H.model.device)

From 99726ea01df06306f5111cf6794e08c5bf5ddeb0 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Mon, 10 Jan 2022 20:45:02 -0600
Subject: [PATCH 22/34] update tests

---
 src/Polar/first_order.jl  | 17 -----------------
 src/Polar/functions.jl    | 29 +++++++++++++++++------------
 src/Polar/legacy.jl       | 27 +++++++++++++++++++++++++++
 src/Polar/newton.jl       |  3 ++-
 src/Polar/second_order.jl |  4 ++--
 test/Polar/api.jl         | 10 +++++-----
 test/Polar/autodiff.jl    | 32 +++++++++++++++-----------------
 test/Polar/gradient.jl    | 22 ++++++++++++----------
 test/Polar/hessian.jl     | 11 +++++------
 9 files changed, 85 insertions(+), 70 deletions(-)

diff --git a/src/Polar/first_order.jl b/src/Polar/first_order.jl
index 864d5434..48cb48f2 100644
--- a/src/Polar/first_order.jl
+++ b/src/Polar/first_order.jl
@@ -1,20 +1,3 @@
-function get_tape(polar::PolarForm, expr::AbstractExpression, ∂stack::NetworkStack{VT, Buf}) where {VT, Buf}
-    # TODO
-    intermediate = _get_intermediate_stack(polar, ExaPF.network_basis, VT, 1)
-    return AutoDiff.TapeMemory(expr, ∂stack, intermediate)
-end
-
-function jacobian_transpose_product!(polar::PolarForm, pbm::AutoDiff.TapeMemory, jv, state, ∂v)
-    ∂state = pbm.stack
-    empty!(∂state)
-    adjoint!(pbm.func, ∂state, state, ∂v)
-    # Accumulate on vmag and vang
-    reverse_eval_intermediate(polar, ∂state, state, pbm.intermediate)
-    # Accumulate on x and u
-    reverse_transfer!(
-        polar, jv, ∂state,
-    )
-end
 
 struct MyJacobian{Model, Func, VD, SMT, MT, VI, VP}
     model::Model
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 1cfb3f82..998bd9cd 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -71,6 +71,12 @@ voltage(buf::NetworkStack) = buf.vmag .* exp.(im .* buf.vang)
 
 abstract type AbstractExpression end
 
+function (expr::AbstractExpression)(stack::AbstractStack)
+    m = length(expr)
+    output = similar(stack.input, m)
+    expr(output, stack)
+    return output
+end
 
 #=
     PolarBasis
@@ -88,19 +94,18 @@ end
 
 function PolarBasis(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
     SMT = default_sparse_matrix(polar.device)
+    nlines = PS.get(polar.network, PS.NumberOfLines())
     # Assemble matrix
     pf = polar.network
+    nbus = pf.nbus
     lines = pf.lines
     f = lines.from_buses
     t = lines.to_buses
 
-    nbus = pf.nbus
-    nlines = length(t)
-
     Cf = sparse(f, 1:nlines, ones(nlines), nbus, nlines)
     Ct = sparse(t, 1:nlines, ones(nlines), nbus, nlines)
     Cf = Cf |> SMT
-    Ct = Cf |> SMT
+    Ct = Ct |> SMT
 
     return PolarBasis{VI, SMT}(nbus, nlines, f, t, Cf, Ct, polar.device)
 end
@@ -186,15 +191,12 @@ end
 
 Base.length(::CostFunction) = 1
 
-function (func::CostFunction)(state)
+function (func::CostFunction)(output, state)
     costs = state.intermediate.c
-    mul!(state.pgen[func.gen_ref], func.M, state.ψ)
+    pg_ref = view(state.pgen, func.gen_ref)
+    mul!(pg_ref, func.M, state.ψ)
     costs .= func.c0 .+ func.c1 .* state.pgen .+ func.c2 .* state.pgen.^2
-    return sum(costs)
-end
-
-function (func::CostFunction)(output, state)
-    CUDA.@allowscalar output[1] = func(state)
+    CUDA.@allowscalar output[1] = sum(costs)
     return
 end
 
@@ -435,7 +437,7 @@ struct ComposedExpressions{Expr1<:PolarBasis, Expr2} <: AbstractExpression
 end
 
 function (func::ComposedExpressions)(output, state)
-    func.inner(state.ψ, state) # Evaluate basis
+    func.inner(state.ψ, state)  # Evaluate basis
     func.outer(output, state)   # Evaluate expression
 end
 
@@ -446,3 +448,6 @@ end
 
 # Overload ∘ operator
 Base.ComposedFunction(g::AbstractExpression, f::PolarBasis) = ComposedExpressions(f, g)
+Base.length(func::ComposedExpressions) = length(func.outer)
+bounds(polar, func::ComposedExpressions) = bounds(polar, func.outer)
+
diff --git a/src/Polar/legacy.jl b/src/Polar/legacy.jl
index 5d440824..ea929ae3 100644
--- a/src/Polar/legacy.jl
+++ b/src/Polar/legacy.jl
@@ -89,9 +89,35 @@ function matpower_jacobian(polar::PolarForm, func::LineFlows, V)
     ]::SparseMatrixCSC{Float64, Int}
 end
 
+function matpower_jacobian(polar::PolarForm, func::PolarBasis, V)
+    pf = polar.network
+    nbus = pf.nbus
+    ngen = pf.ngen
+    nlines = get(polar, PS.NumberOfLines())
+
+    dS_dVm, dS_dVa = PS._matpower_basis_jacobian(V, pf.lines)
+    dV2 = 2 * sparse(1:nbus, 1:nbus, abs.(V), nbus, nbus)
+
+    j11 = real(dS_dVm)
+    j12 = real(dS_dVa)
+    j13 = spzeros(nlines, ngen)
+    j21 = imag(dS_dVm)
+    j22 = imag(dS_dVa)
+    j23 = spzeros(nlines, ngen)
+    j31 = dV2
+    j32 = spzeros(nbus, nbus)
+    j33 = spzeros(nbus, ngen)
+    return [
+        j11 j12 j13;
+        j21 j22 j23;
+        j31 j32 j33
+    ]::SparseMatrixCSC{Float64, Int}
+end
+
 function matpower_jacobian(polar::PolarForm, func::MultiExpressions, V)
     return vcat([matpower_jacobian(polar, expr, V) for expr in func.exprs]...)
 end
+matpower_jacobian(polar::PolarForm, func::ComposedExpressions, V) = matpower_jacobian(polar, func.outer, V)
 
 # Return full-space Hessian
 function matpower_hessian(polar::PolarForm, func::CostFunction, V, λ)
@@ -223,6 +249,7 @@ function matpower_hessian(polar::PolarForm, func::MultiExpressions, V, λ)
     end
     return H
 end
+matpower_hessian(polar::PolarForm, func::ComposedExpressions, V, y) = matpower_hessian(polar, func.outer, V, y)
 
 function hessian_sparsity(polar::PolarForm, func)
     m = length(func)
diff --git a/src/Polar/newton.jl b/src/Polar/newton.jl
index dbc673eb..4d9a1354 100644
--- a/src/Polar/newton.jl
+++ b/src/Polar/newton.jl
@@ -59,7 +59,8 @@ function run_pf(
     solver = NewtonRaphson(tol=rtol, maxiter=max_iter)
     mapx = my_map(polar, State())
 
-    func = PowerFlowBalance(polar)
+    basis = PolarBasis(polar)
+    func = PowerFlowBalance(polar) ∘ basis
     jac = MyJacobian(polar, func, mapx)
 
     conv = nlsolve!(solver, jac, state)
diff --git a/src/Polar/second_order.jl b/src/Polar/second_order.jl
index 71f37fe5..f1d37b06 100644
--- a/src/Polar/second_order.jl
+++ b/src/Polar/second_order.jl
@@ -38,7 +38,7 @@ function MyHessian(polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, ma
     host_t1sseeds = Vector{ForwardDiff.Partials{1, Float64}}(undef, nmap)
     t1sseeds = A{ForwardDiff.Partials{1, Float64}}(undef, nmap)
 
-    intermediate = _get_intermediate_stack(polar, network_basis, VD, 1)
+    intermediate = nothing
     return MyHessian(
         polar, func, map_device, stack, ∂stack, host_t1sseeds, t1sseeds, t1sF, adj_t1sF,
         intermediate,
@@ -123,7 +123,7 @@ function FullHessian(polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression,
     compressedH = MT(undef, ncolor, nmap)
     coloring = coloring |> VI
 
-    intermediate = _get_intermediate_stack(polar, network_basis, VD, 1)
+    intermediate = nothing
     return FullHessian(
         polar, func, map_device, stack, ∂stack, coloring, t1sseeds, t1sF, adj_t1sF,
         intermediate, compressedH, H,
diff --git a/test/Polar/api.jl b/test/Polar/api.jl
index 6fce6d13..acbfaacd 100644
--- a/test/Polar/api.jl
+++ b/test/Polar/api.jl
@@ -48,8 +48,8 @@ function test_polar_api(polar, device, M)
     pf = polar.network
     tolerance = 1e-8
     stack = ExaPF.NetworkStack(polar)
-    ExaPF.forward_eval_intermediate(polar, stack)
-    power_balance = ExaPF.PowerFlowBalance(polar)
+    basis  = ExaPF.PolarBasis(polar)
+    power_balance = ExaPF.PowerFlowBalance(polar) ∘ basis
     # Test that values are matching
     @test myisapprox(pf.vbus, stack.vmag .* exp.(im .* stack.vang))
     xₖ = ExaPF.initial(polar, State())
@@ -68,20 +68,20 @@ function test_polar_api(polar, device, M)
 
     # Test callbacks
     ## Power Balance
-    ExaPF.forward_eval_intermediate(polar, stack)
     power_balance(cons, stack)
     # As we run powerflow before, the balance should be below tolerance
     @test ExaPF.xnorm_inf(cons) < tolerance
 
     ## Cost Production
     cost_production = ExaPF.CostFunction(polar)
-    c2 = cost_production(stack)
+    c2 = cost_production(stack)[1]
     @test isa(c2, Real)
     return nothing
 end
 
 function test_polar_constraints(polar, device, M)
     stack = ExaPF.NetworkStack(polar)
+    basis  = ExaPF.PolarBasis(polar)
 
     @testset "Expressions $expr" for expr in [
         ExaPF.VoltageMagnitudePQ,
@@ -90,7 +90,7 @@ function test_polar_constraints(polar, device, M)
         ExaPF.PowerFlowBalance,
     ]
         # Instantiate
-        constraints = expr(polar)
+        constraints = expr(polar) ∘ basis
         m = length(constraints)
         @test isa(m, Int)
         g = M{Float64, 1}(undef, m) # TODO: this signature is not great
diff --git a/test/Polar/autodiff.jl b/test/Polar/autodiff.jl
index c49151b5..ce2dda51 100644
--- a/test/Polar/autodiff.jl
+++ b/test/Polar/autodiff.jl
@@ -4,6 +4,7 @@ function test_constraints_jacobian(polar, device, MT)
 
     stack = ExaPF.NetworkStack(polar)
     ∂stack = ExaPF.NetworkStack(polar)
+    basis  = ExaPF.PolarBasis(polar)
 
     mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
 
@@ -14,14 +15,14 @@ function test_constraints_jacobian(polar, device, MT)
 
     # Test Jacobian w.r.t. State
     @testset "Jacobian $(expr)" for expr in [
+        ExaPF.PolarBasis,
         ExaPF.VoltageMagnitudePQ,
         ExaPF.PowerFlowBalance,
-        # ExaPF.PowerGenerationBounds, TODO
+        # ExaPF.PowerGenerationBounds,
         ExaPF.LineFlows,
     ]
-        constraint = expr(polar)
+        constraint = expr(polar) ∘ basis
         m = length(constraint)
-        pbm = ExaPF.get_tape(polar, constraint, ∂stack)
 
         # Allocation
 
@@ -35,7 +36,6 @@ function test_constraints_jacobian(polar, device, MT)
         # Compare with FiniteDiff
         function jac_fd_x(x)
             stack.input[mymap] .= x
-            ExaPF.forward_eval_intermediate(polar, stack)
             c = zeros(m) |> MT
             constraint(c, stack)
             return c
@@ -48,14 +48,14 @@ function test_constraints_jacobian(polar, device, MT)
         tgt_h = rand(m)
         tgt = tgt_h |> MT
         output = zeros(nx+nu) |> MT
-        ExaPF.jacobian_transpose_product!(polar, pbm, output, stack, tgt)
-
+        empty!(∂stack)
+        ExaPF.adjoint!(constraint, ∂stack, stack, tgt)
 
         @test size(J) == (m, length(mymap))
         @test myisapprox(Jd, Jx, rtol=1e-5)
         @test myisapprox(Jmat, Jx, rtol=1e-5)
         @test myisapprox(Jmat, Jd, rtol=1e-5)
-        @test myisapprox(∂stack.input[mymap], Jx' * tgt_h, rtol=1e-6)
+        @test isapprox(∂stack.input[mymap], Jx' * tgt_h, rtol=1e-6)
     end
 end
 
@@ -66,31 +66,30 @@ function test_constraints_adjoint(polar, device, MT)
 
     stack = ExaPF.NetworkStack(polar)
     ∂stack = ExaPF.NetworkStack(polar)
+    basis  = ExaPF.PolarBasis(polar)
 
     conv = ExaPF.run_pf(polar, stack)
 
-    ExaPF.forward_eval_intermediate(polar, stack)
-
     @testset "Adjoint $(expr)" for expr in [
+        ExaPF.PolarBasis,
         ExaPF.CostFunction,
         ExaPF.VoltageMagnitudePQ,
         ExaPF.PowerFlowBalance,
         ExaPF.PowerGenerationBounds,
         ExaPF.LineFlows,
     ]
-        constraint = expr(polar)
+        constraint = expr(polar) ∘ basis
         m = length(constraint)
-        pbm = ExaPF.get_tape(polar, constraint, ∂stack)
         tgt = rand(m) |> MT
         output = zeros(nx+nu) |> MT
 
         c = zeros(m) |> MT
         constraint(c, stack)
 
-        ExaPF.jacobian_transpose_product!(polar, pbm, output, stack, tgt)
+        empty!(∂stack)
+        ExaPF.adjoint!(constraint, ∂stack, stack, tgt)
         function test_fd(x)
             stack.input[mymap] .= x
-            ExaPF.forward_eval_intermediate(polar, stack)
             constraint(c, stack)
             return dot(c, tgt)
         end
@@ -98,13 +97,13 @@ function test_constraints_adjoint(polar, device, MT)
         adj_fd = FiniteDiff.finite_difference_jacobian(test_fd, x) |> Array
         # Loosen the tolerance to 1e-5 there (finite_difference_jacobian
         # is less accurate than finite_difference_gradient)
-        @test myisapprox(∂stack.input[mymap], adj_fd[:], rtol=1e-5)
+        @test isapprox(∂stack.input[mymap], adj_fd[:], rtol=1e-5)
     end
 end
 
 function test_full_space_jacobian(polar, device, MT)
     stack = ExaPF.NetworkStack(polar)
-    ExaPF.forward_eval_intermediate(polar, stack)
+    basis  = ExaPF.PolarBasis(polar)
 
     n = length(stack.input)
     mymap = collect(1:n)
@@ -115,7 +114,7 @@ function test_full_space_jacobian(polar, device, MT)
         ExaPF.LineFlows(polar),
         ExaPF.PowerFlowBalance(polar),
     ]
-    mycons = ExaPF.MultiExpressions(constraints)
+    mycons = ExaPF.MultiExpressions(constraints) ∘ basis
 
     m = length(mycons)
 
@@ -124,7 +123,6 @@ function test_full_space_jacobian(polar, device, MT)
 
     function jac_fd_x(x)
         stack.input .= x
-        ExaPF.forward_eval_intermediate(polar, stack)
         c = zeros(m) |> MT
         mycons(c, stack)
         return c
diff --git a/test/Polar/gradient.jl b/test/Polar/gradient.jl
index cd93a8ca..b8200e17 100644
--- a/test/Polar/gradient.jl
+++ b/test/Polar/gradient.jl
@@ -1,8 +1,9 @@
 function test_reduced_gradient(polar, device, MT)
     stack = ExaPF.NetworkStack(polar)
+    basis  = ExaPF.PolarBasis(polar)
     ∂stack = ExaPF.NetworkStack(polar)
 
-    power_balance = ExaPF.PowerFlowBalance(polar)
+    power_balance = ExaPF.PowerFlowBalance(polar) ∘ basis
 
     mapx = ExaPF.my_map(polar, State())
     mapu = ExaPF.my_map(polar, Control())
@@ -27,16 +28,17 @@ function test_reduced_gradient(polar, device, MT)
     @test isapprox(h∇gₓ, J[:, mapx])
     @test isapprox(h∇gᵤ, J[:, mapu])
 
-    cost_production = ExaPF.CostFunction(polar)
-    ExaPF.forward_eval_intermediate(polar, stack)
-    obj = cost_production(stack)
-    pbm = ExaPF.get_tape(polar, cost_production, ∂stack)
+    cost_production = ExaPF.CostFunction(polar) ∘ basis
+
+    c = zeros(1)
+    cost_production(c, stack)
 
     grad = similar(stack.input, nx+nu)
 
-    ExaPF.jacobian_transpose_product!(polar, pbm, grad, stack, 1.0)
-    ∇fₓ = grad[1:nx]
-    ∇fᵤ = grad[1+nx:nx+nu]
+    empty!(∂stack)
+    ExaPF.adjoint!(cost_production, ∂stack, stack, 1.0)
+    ∇fₓ = ∂stack.input[mapx]
+    ∇fᵤ = ∂stack.input[mapu]
 
     h∇fₓ = ∇fₓ |> Array
     h∇fᵤ = ∇fᵤ |> Array
@@ -53,8 +55,8 @@ function test_reduced_gradient(polar, device, MT)
     function reduced_cost(u_)
         stack.input[mapu] .= u_
         ExaPF.nlsolve!(solver, jx, stack)
-        ExaPF.forward_eval_intermediate(polar, stack)
-        return cost_production(stack)
+        cost_production(c, stack)
+        return c[1]
     end
 
     u = stack.input[mapu]
diff --git a/test/Polar/hessian.jl b/test/Polar/hessian.jl
index c94a56c4..2ad61025 100644
--- a/test/Polar/hessian.jl
+++ b/test/Polar/hessian.jl
@@ -74,7 +74,8 @@ function test_hessprod_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
     mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
 
     stack = ExaPF.NetworkStack(polar)
-    ExaPF.forward_eval_intermediate(polar, stack)
+    basis  = ExaPF.PolarBasis(polar)
+
     # Solve power flow
     conv = ExaPF.run_pf(polar, stack)
 
@@ -85,7 +86,7 @@ function test_hessprod_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
         ExaPF.LineFlows(polar),
         ExaPF.PowerFlowBalance(polar),
     ]
-    mycons = ExaPF.MultiExpressions(constraints)
+    mycons = ExaPF.MultiExpressions(constraints) ∘ basis
 
     # Initiate state and control for FiniteDiff
     # CONSTRAINTS
@@ -104,7 +105,6 @@ function test_hessprod_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
 
     function lagr_x(z)
         stack.input[mymap] .= z
-        ExaPF.forward_eval_intermediate(polar, stack)
         mycons(c, stack)
         return dot(μ, c)
     end
@@ -118,7 +118,7 @@ end
 
 function test_full_space_hessian(polar, device, MT)
     stack = ExaPF.NetworkStack(polar)
-    ExaPF.forward_eval_intermediate(polar, stack)
+    basis  = ExaPF.PolarBasis(polar)
 
     n = length(stack.input)
     # Hessian / (x, u)
@@ -130,7 +130,7 @@ function test_full_space_hessian(polar, device, MT)
         ExaPF.PowerGenerationBounds(polar),
         ExaPF.LineFlows(polar),
     ]
-    mycons = ExaPF.MultiExpressions(constraints)
+    mycons = ExaPF.MultiExpressions(constraints) ∘ basis
 
     m = length(mycons)
     y = rand(m) |> MT
@@ -141,7 +141,6 @@ function test_full_space_hessian(polar, device, MT)
 
     function hess_fd_x(x)
         stack.input[mymap] .= x
-        ExaPF.forward_eval_intermediate(polar, stack)
         mycons(c, stack)
         return dot(c, y)
     end

From 926665bf5df389863a3f315062421868def50480 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Mon, 10 Jan 2022 21:35:12 -0600
Subject: [PATCH 23/34] fix Jacobian for PowerGenerationBounds expression

---
 src/Polar/functions.jl | 21 +++++++++++++++++----
 src/Polar/legacy.jl    | 11 ++++++-----
 test/Polar/autodiff.jl |  5 +++--
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 998bd9cd..d3a1023a 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -176,7 +176,11 @@ function CostFunction(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
     SMT = default_sparse_matrix(polar.device)
     # Load indexing
     ref = polar.network.ref
-    ref_gen = polar.indexing.index_ref_to_gen
+    gen2bus = polar.network.gen2bus
+    if length(ref) > 1
+        error("Too many generators are affected to the slack nodes")
+    end
+    ref_gen = Int[findfirst(isequal(ref[1]), gen2bus)]
     # Assemble matrix
     M_tot = PS.get_basis_matrix(polar.network)
     M = -M_tot[ref, :] |> SMT
@@ -303,12 +307,21 @@ end
 Base.length(func::PowerGenerationBounds) = length(func.τ)
 
 function bounds(polar::PolarForm{T,VI,VT,MT}, func::PowerGenerationBounds) where {T,VI,VT,MT}
+    pf = polar.network
+    ngen = pf.ngen
+    nbus = pf.nbus
+    ref, pv = pf.ref, pf.pv
+    # Build incidence matrix
+    Cg = sparse(pf.gen2bus, 1:ngen, ones(ngen), nbus, ngen)
+    Cgp = Cg[ref, :]
+    Cgq = Cg[[ref ; pv], :]
+    # Get original bounds
     p_min, p_max = PS.bounds(polar.network, PS.Generators(), PS.ActivePower())
     q_min, q_max = PS.bounds(polar.network, PS.Generators(), PS.ReactivePower())
-    _, ref2gen, _ = index_generators_host(polar)
+    # Aggregate bounds on ref and pv nodes
     return (
-        convert(VT, [p_min[ref2gen]; q_min]),
-        convert(VT, [p_max[ref2gen]; q_max]),
+        convert(VT, [Cgp * p_min; Cgq * q_min]),
+        convert(VT, [Cgp * p_max; Cgq * q_max]),
     )
 end
 
diff --git a/src/Polar/legacy.jl b/src/Polar/legacy.jl
index ea929ae3..6bab3da1 100644
--- a/src/Polar/legacy.jl
+++ b/src/Polar/legacy.jl
@@ -13,12 +13,12 @@ function matpower_jacobian(polar::PolarForm, func::PowerFlowBalance, V)
 
     dSbus_dVm, dSbus_dVa = PS.matpower_residual_jacobian(V, Ybus)
 
-    Cg_tot = sparse(gen2bus, 1:ngen, -ones(ngen), nbus, ngen)
+    Cg_tot = sparse(gen2bus, 1:ngen, ones(ngen), nbus, ngen)
     Cg = Cg_tot[[pv; pq], :]
 
     j11 = real(dSbus_dVm[[pv; pq], :])
     j12 = real(dSbus_dVa[[pv; pq], :])
-    j13 = Cg #sparse(gen2bus, 1:ngen, -ones(ngen), npv + npq, ngen)
+    j13 = -Cg
     j21 = imag(dSbus_dVm[pq, :])
     j22 = imag(dSbus_dVa[pq, :])
     j23 = spzeros(npq, ngen)
@@ -57,8 +57,8 @@ function matpower_jacobian(polar::PolarForm, func::PowerGenerationBounds, V)
     j12 = real(dSbus_dVa[ref, :])
     j13 = spzeros(nref, ngen)
 
-    j21 = imag(dSbus_dVm[gen2bus, :])
-    j22 = imag(dSbus_dVa[gen2bus, :])
+    j21 = imag(dSbus_dVm[[ref; pv], :])
+    j22 = imag(dSbus_dVa[[ref; pv], :])
     j23 = spzeros(ngen, ngen)
     # w.r.t. control
     return [
@@ -198,7 +198,8 @@ function matpower_hessian(polar::PolarForm, func::PowerGenerationBounds, V, λ)
     Hpθθ, Hpvθ, Hpvv = PS._matpower_hessian(V, Ybus, yp)
 
     yq = zeros(nbus)
-    yq[pv] .= λ[nref+1:nref+npv]
+    yq[ref] .= λ[nref+1:2*nref]
+    yq[pv] .= λ[2*nref+1:2*nref+npv]
     Hqθθ, Hqvθ, Hqvv = PS._matpower_hessian(V, Ybus, yq)
 
     H11 = real.(Hpvv) .+ imag.(Hqvv)
diff --git a/test/Polar/autodiff.jl b/test/Polar/autodiff.jl
index ce2dda51..9cbc056d 100644
--- a/test/Polar/autodiff.jl
+++ b/test/Polar/autodiff.jl
@@ -18,7 +18,7 @@ function test_constraints_jacobian(polar, device, MT)
         ExaPF.PolarBasis,
         ExaPF.VoltageMagnitudePQ,
         ExaPF.PowerFlowBalance,
-        # ExaPF.PowerGenerationBounds,
+        ExaPF.PowerGenerationBounds,
         ExaPF.LineFlows,
     ]
         constraint = expr(polar) ∘ basis
@@ -56,6 +56,7 @@ function test_constraints_jacobian(polar, device, MT)
         @test myisapprox(Jmat, Jx, rtol=1e-5)
         @test myisapprox(Jmat, Jd, rtol=1e-5)
         @test isapprox(∂stack.input[mymap], Jx' * tgt_h, rtol=1e-6)
+        @test isapprox(∂stack.input[mymap], Jmat' * tgt_h, rtol=1e-6)
     end
 end
 
@@ -110,7 +111,7 @@ function test_full_space_jacobian(polar, device, MT)
 
     constraints = [
         ExaPF.VoltageMagnitudePQ(polar),
-        # ExaPF.PowerGenerationBounds(polar),
+        ExaPF.PowerGenerationBounds(polar),
         ExaPF.LineFlows(polar),
         ExaPF.PowerFlowBalance(polar),
     ]

From 3fdf7ff1baa3ffcaf25aba46fad405b56aee55d1 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Tue, 11 Jan 2022 21:13:44 -0600
Subject: [PATCH 24/34] minor fixes

---
 src/LinearSolvers/LinearSolvers.jl | 15 +++++++++++++++
 src/Polar/functions.jl             | 10 ++++++++++
 src/Polar/legacy.jl                |  2 +-
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/LinearSolvers/LinearSolvers.jl b/src/LinearSolvers/LinearSolvers.jl
index 4714a101..ca77ad2a 100644
--- a/src/LinearSolvers/LinearSolvers.jl
+++ b/src/LinearSolvers/LinearSolvers.jl
@@ -102,6 +102,21 @@ exa_factorize(J::Adjoint{T, SparseMatrixCSC{T, Int}}) where T = lu(J.parent)'
 DirectSolver(J; options...) = DirectSolver(exa_factorize(J))
 DirectSolver() = DirectSolver(nothing)
 
+function update!(s::DirectSolver, J::AbstractMatrix)
+    lu!(s.factorization, J) # Update factorization inplace
+end
+
+function lsolve!(s::DirectSolver, y::AbstractArray)
+    LinearAlgebra.ldiv!(s.factorization, y)
+end
+function lsolve!(s::DirectSolver, y::AbstractArray, x::AbstractArray)
+    LinearAlgebra.ldiv!(y, s.factorization, x)
+end
+
+function rsolve!(s::DirectSolver, y::AbstractArray, x::AbstractArray)
+    LinearAlgebra.ldiv!(y, s.factorization', x)
+end
+
 # Reuse factorization in update
 function ldiv!(s::DirectSolver{<:LinearAlgebra.Factorization}, y::AbstractVector, J::AbstractMatrix, x::AbstractVector)
     lu!(s.factorization, J) # Update factorization inplace
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index d3a1023a..4e956c59 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -62,6 +62,16 @@ function Base.empty!(state::NetworkStack)
     return
 end
 
+function init!(polar::PolarForm, stack::NetworkStack)
+    vmag = abs.(polar.network.vbus)
+    vang = angle.(polar.network.vbus)
+    pg = get(polar.network, PS.ActivePower())
+
+    copyto!(stack.vmag, vmag)
+    copyto!(stack.vang, vang)
+    copyto!(stack.pgen, pg)
+end
+
 voltage(buf::NetworkStack) = buf.vmag .* exp.(im .* buf.vang)
 
 
diff --git a/src/Polar/legacy.jl b/src/Polar/legacy.jl
index 6bab3da1..a46d3e88 100644
--- a/src/Polar/legacy.jl
+++ b/src/Polar/legacy.jl
@@ -145,7 +145,7 @@ function matpower_hessian(polar::PolarForm, func::CostFunction, V, λ)
     j13 = spzeros(nref, ngen)
     J = [j11 j12 j13]::SparseMatrixCSC{Float64, Int}
 
-    Href = J' * Diagonal(2 .* c2[ref]) * J
+    Href = J' * Diagonal(2 .* c2[func.gen_ref]) * J
 
     return H + Href
 end

From bd62ac636f61554efdcaa3505d499cf3c53be8c7 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Tue, 11 Jan 2022 22:08:56 -0600
Subject: [PATCH 25/34] remove code

---
 src/ExaPF.jl                               |   8 -
 src/Polar/Constraints/active_power.jl      |  96 ---
 src/Polar/Constraints/basis.jl             | 100 ---
 src/Polar/Constraints/constraints.jl       | 147 ----
 src/Polar/Constraints/line_flow.jl         | 122 ---
 src/Polar/Constraints/network_operation.jl | 271 -------
 src/Polar/Constraints/power_balance.jl     | 128 ----
 src/Polar/Constraints/power_injection.jl   | 175 -----
 src/Polar/Constraints/ramping_rate.jl      | 204 -----
 src/Polar/Constraints/reactive_power.jl    | 146 ----
 src/Polar/Constraints/voltage_magnitude.jl |  73 --
 src/Polar/batch.jl                         | 234 ------
 src/Polar/caches.jl                        | 138 ----
 src/Polar/derivatives.jl                   | 418 -----------
 src/Polar/first_order.jl                   |  21 +-
 src/Polar/functions.jl                     | 115 ++-
 src/Polar/kernels.jl                       | 817 ---------------------
 src/Polar/newton.jl                        |  60 ++
 src/Polar/objective.jl                     | 176 -----
 src/Polar/polar.jl                         | 235 +-----
 src/Polar/powerflow.jl                     | 215 ------
 src/Polar/second_order.jl                  |  15 +
 src/architectures.jl                       |   4 +-
 src/autodiff.jl                            | 103 ---
 src/models.jl                              | 312 +-------
 src/utils.jl                               | 142 ----
 test/Polar/api.jl                          |   4 +-
 test/Polar/autodiff.jl                     |   8 +-
 test/Polar/hessian.jl                      |  73 +-
 test/Polar/matpower.jl                     | 113 ++-
 30 files changed, 264 insertions(+), 4409 deletions(-)
 delete mode 100644 src/Polar/Constraints/active_power.jl
 delete mode 100644 src/Polar/Constraints/basis.jl
 delete mode 100644 src/Polar/Constraints/constraints.jl
 delete mode 100644 src/Polar/Constraints/line_flow.jl
 delete mode 100644 src/Polar/Constraints/network_operation.jl
 delete mode 100644 src/Polar/Constraints/power_balance.jl
 delete mode 100644 src/Polar/Constraints/power_injection.jl
 delete mode 100644 src/Polar/Constraints/ramping_rate.jl
 delete mode 100644 src/Polar/Constraints/reactive_power.jl
 delete mode 100644 src/Polar/Constraints/voltage_magnitude.jl
 delete mode 100644 src/Polar/batch.jl
 delete mode 100644 src/Polar/caches.jl
 delete mode 100644 src/Polar/derivatives.jl
 delete mode 100644 src/Polar/kernels.jl
 delete mode 100644 src/Polar/objective.jl
 delete mode 100644 src/Polar/powerflow.jl
 delete mode 100644 src/utils.jl

diff --git a/src/ExaPF.jl b/src/ExaPF.jl
index a2aa875d..2db39d94 100644
--- a/src/ExaPF.jl
+++ b/src/ExaPF.jl
@@ -13,17 +13,9 @@ import CUDA.CUSOLVER
 import ForwardDiff
 using KernelAbstractions
 const KA = KernelAbstractions
-using TimerOutputs: @timeit, TimerOutput
 
 import Base: show, get
 
-const VERBOSE_LEVEL_HIGH = 3
-const VERBOSE_LEVEL_MEDIUM = 2
-const VERBOSE_LEVEL_LOW = 1
-const VERBOSE_LEVEL_NONE = 0
-const TIMER = TimerOutput()
-
-include("utils.jl")
 include("architectures.jl")
 
 # Templates
diff --git a/src/Polar/Constraints/active_power.jl b/src/Polar/Constraints/active_power.jl
deleted file mode 100644
index 19c593f0..00000000
--- a/src/Polar/Constraints/active_power.jl
+++ /dev/null
@@ -1,96 +0,0 @@
-is_constraint(::typeof(active_power_constraints)) = true
-
-# Function for AutoDiff
-function active_power_constraints(polar::PolarForm, cons, vmag, vang, pnet, qnet, pd, qd)
-    ref, _, _ = index_buses_device(polar)
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    transperm = polar.topology.sortperm
-    ndrange = length(ref)
-    ev = active_power_slack!(polar.device)(cons, vmag, vang, ref, pd,
-        ybus_re.nzval, ybus_re.colptr, ybus_re.rowval, ybus_im.nzval,
-        transperm, ndrange=ndrange,
-    )
-    wait(ev)
-end
-
-function active_power_constraints(polar::PolarForm, cons, buffer)
-    active_power_constraints(polar, cons, buffer.vmag, buffer.vang, buffer.pnet, buffer.qnet, buffer.pload, buffer.qload)
-    return
-end
-
-function size_constraint(polar::PolarForm{T, IT, VT, MT}, ::typeof(active_power_constraints)) where {T, IT, VT, MT}
-    return PS.get(polar.network, PS.NumberOfSlackBuses())
-end
-
-function bounds(polar::PolarForm{T, IT, VT, MT}, ::typeof(active_power_constraints)) where {T, IT, VT, MT}
-    # Get all bounds (lengths of p_min, p_max, q_min, q_max equal to ngen)
-    p_min, p_max = PS.bounds(polar.network, PS.Generators(), PS.ActivePower())
-    _, ref2gen, _ = index_generators_host(polar)
-    pq_min = p_min[ref2gen]
-    pq_max = p_max[ref2gen]
-    return convert(VT, pq_min), convert(VT, pq_max)
-end
-
-# Adjoint
-function adjoint!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory{F, S, I},
-    pg, ∂pg,
-    vm, ∂vm,
-    va, ∂va,
-    pnet, ∂pnet,
-    pload, qload,
-) where {F<:typeof(active_power_constraints), S, I}
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    nref = PS.get(polar.network, PS.NumberOfSlackBuses())
-    ref, _, _ = index_buses_device(polar)
-
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    transperm = polar.topology.sortperm
-    ndrange = nref
-    ev = adj_active_power_slack!(polar.device)(vm, va, ∂vm, ∂va, ∂pg, ref,
-        ybus_re.nzval, ybus_re.colptr, ybus_re.rowval, ybus_im.nzval, transperm,
-        ndrange=ndrange,
-    )
-    wait(ev)
-    return
-end
-
-# MATPOWER Jacobian
-function matpower_jacobian(polar::PolarForm, X::Union{State,Control}, ::typeof(active_power_constraints), V)
-    nbus = get(polar, PS.NumberOfBuses())
-    pf = polar.network
-    ref, pv, pq = index_buses_host(polar)
-    nref = length(ref)
-    npv = length(pv)
-    npq = length(pq)
-    Ybus = pf.Ybus
-
-    dSbus_dVm, dSbus_dVa = PS.matpower_residual_jacobian(V, Ybus)
-    # w.r.t. state
-    if isa(X, State)
-        j11 = real(dSbus_dVa[ref, [pv; pq]])
-        j12 = real(dSbus_dVm[ref, pq])
-    # w.r.t. control
-    elseif isa(X, Control)
-        j11 = real(dSbus_dVm[ref, [ref; pv]])
-        j12 = spzeros(length(ref), npv)
-    end
-    return [j11 j12]::SparseMatrixCSC{Float64, Int}
-end
-
-function matpower_hessian(polar::PolarForm, ::typeof(active_power_constraints), buffer, λ)
-    ref, pv, pq = index_buses_host(polar)
-    # Check consistency: currently only support a single slack node
-    @assert length(λ) == 1
-    V = voltage_host(buffer)
-    hxx, hxu, huu = PS.active_power_hessian(V, polar.network.Ybus, pv, pq, ref)
-
-    λₚ = sum(λ)  # TODO
-    return FullSpaceHessian(
-        λₚ .* hxx,
-        λₚ .* hxu,
-        λₚ .* huu,
-    )
-end
-
diff --git a/src/Polar/Constraints/basis.jl b/src/Polar/Constraints/basis.jl
deleted file mode 100644
index 54f2ca4c..00000000
--- a/src/Polar/Constraints/basis.jl
+++ /dev/null
@@ -1,100 +0,0 @@
-function network_basis end
-is_constraint(::typeof(network_basis)) = true
-
-function size_constraint(polar::PolarForm, ::typeof(network_basis))
-    return PS.get(polar.network, PS.NumberOfBuses()) + 2 * PS.get(polar.network, PS.NumberOfLines())
-end
-
-# We add constraint only on vmag_pq
-function _network_basis(polar::PolarForm, cons, vmag, vang)
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    nlines = PS.get(polar.network, PS.NumberOfLines())
-
-    ev = basis_kernel!(polar.device)(
-        cons, vmag, vang,
-        polar.topology.f_buses, polar.topology.t_buses, nlines, nbus,
-        ndrange=(2 * nlines+nbus, size(cons, 2)),
-        dependencies=Event(polar.device)
-    )
-    wait(ev)
-    return
-end
-
-function network_basis(polar::PolarForm, cons, vmag, vang, pnet, qnet, pload, qload)
-    _network_basis(polar, cons, vmag, vang)
-end
-function network_basis(polar::PolarForm, cons, buffer)
-    _network_basis(polar, cons, buffer.vmag, buffer.vang)
-end
-
-function adjoint!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory{F, S, I},
-    cons, ∂cons,
-    vmag, ∂vmag,
-    vang, ∂vang,
-    pnet, ∂pnet,
-    pload, qload,
-) where {F<:typeof(network_basis), S, I}
-    nl = PS.get(polar.network, PS.NumberOfLines())
-    nb = PS.get(polar.network, PS.NumberOfBuses())
-    top = polar.topology
-    f = top.f_buses
-    t = top.t_buses
-
-    fill!(pbm.intermediate.∂edge_vm_fr , 0.0)
-    fill!(pbm.intermediate.∂edge_vm_to , 0.0)
-    fill!(pbm.intermediate.∂edge_va_fr , 0.0)
-    fill!(pbm.intermediate.∂edge_va_to , 0.0)
-    ndrange = (nl+nb, size(∂cons, 2))
-    ev = adj_basis_kernel!(polar.device)(
-        ∂cons,
-        ∂vmag,
-        pbm.intermediate.∂edge_vm_fr,
-        pbm.intermediate.∂edge_vm_to,
-        pbm.intermediate.∂edge_va_fr,
-        pbm.intermediate.∂edge_va_to,
-        vmag, vang, f, t, nl, nb,
-        ndrange=ndrange, dependencies=Event(polar.device),
-    )
-    wait(ev)
-
-    Cf = pbm.intermediate.Cf
-    Ct = pbm.intermediate.Ct
-    mul!(∂vmag, Cf, pbm.intermediate.∂edge_vm_fr, 1.0, 1.0)
-    mul!(∂vmag, Ct, pbm.intermediate.∂edge_vm_to, 1.0, 1.0)
-    mul!(∂vang, Cf, pbm.intermediate.∂edge_va_fr, 1.0, 1.0)
-    mul!(∂vang, Ct, pbm.intermediate.∂edge_va_to, 1.0, 1.0)
-    return
-end
-
-function matpower_jacobian(polar::PolarForm, X::Union{State, Control}, ::typeof(network_basis), V)
-    nbus = get(polar, PS.NumberOfBuses())
-    nlines = get(polar, PS.NumberOfLines())
-    pf = polar.network
-    ref, pv, pq = index_buses_host(polar)
-    nref = length(ref)
-    npv = length(pv)
-    npq = length(pq)
-
-    dS_dVm, dS_dVa = PS._matpower_basis_jacobian(V, pf.lines)
-    dV2 = 2 * sparse(1:nbus, 1:nbus, abs.(V), nbus, nbus)
-
-    if isa(X, State)
-        j11 = real(dS_dVa[:, [pv; pq]])
-        j12 = real(dS_dVm[:, pq])
-        j21 = imag(dS_dVa[:, [pv; pq]])
-        j22 = imag(dS_dVm[:, pq])
-        j31 = spzeros(nbus, npv + npq)
-        j32 = dV2[:, pq]
-        return [j11 j12; j21 j22; j31 j32]::SparseMatrixCSC{Float64, Int}
-    elseif isa(X, Control)
-        j11 = real(dS_dVm[:, [ref; pv]])
-        j12 = spzeros(nlines, npv)
-        j21 = imag(dS_dVm[:, [ref; pv]])
-        j22 = spzeros(nlines, npv)
-        j31 = dV2[:, [ref; pv]]
-        j32 = spzeros(nbus, npv)
-        return [j11 j12; j21 j22; j31 j32]::SparseMatrixCSC{Float64, Int}
-    end
-end
diff --git a/src/Polar/Constraints/constraints.jl b/src/Polar/Constraints/constraints.jl
deleted file mode 100644
index e261f6db..00000000
--- a/src/Polar/Constraints/constraints.jl
+++ /dev/null
@@ -1,147 +0,0 @@
-
-# By default, generic Julia functions are not considered as constraint:
-is_constraint(::Function) = false
-
-# Is the function linear in the polar formulation?
-is_linear(polar::PolarForm, ::Function) = false
-
-
-include("power_balance.jl")
-include("power_injection.jl")
-include("voltage_magnitude.jl")
-include("active_power.jl")
-include("reactive_power.jl")
-include("line_flow.jl")
-include("ramping_rate.jl")
-include("network_operation.jl")
-include("basis.jl")
-
-# By default, function does not have any intermediate state
-_get_intermediate_stack(polar::PolarForm, func::Function, VT, nbatch) = nothing
-
-function _get_intermediate_stack(
-    polar::PolarForm, func::F, VT, nbatch
-) where {F <: Union{typeof(reactive_power_constraints), typeof(flow_constraints), typeof(power_balance), typeof(bus_power_injection), typeof(network_basis)}}
-    nlines = PS.get(polar.network, PS.NumberOfLines())
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    # Take care that flow_constraints needs a buffer with a different size
-    nnz = if isa(func, typeof(flow_constraints))  || isa(func, typeof(network_basis))
-        nlines
-    else
-        length(polar.topology.ybus_im.nzval)
-    end
-
-    Cf = nothing
-    Ct = nothing
-    if isa(func, typeof(network_basis)) || isa(func, typeof(flow_constraints))
-        SMT, _ = get_jacobian_types(polar.device)
-        Cf = sparse(polar.network.lines.from_buses, 1:nlines, ones(nlines), nbus, nlines) |> SMT
-        Ct = sparse(polar.network.lines.to_buses, 1:nlines, ones(nlines), nbus, nlines) |> SMT
-    end
-
-    # Return a NamedTuple storing all the intermediate states
-    if nbatch == 1
-        return (
-            Cf=Cf, Ct=Ct,
-            ∂edge_vm_fr = VT(undef, nnz),
-            ∂edge_va_fr = VT(undef, nnz),
-            ∂edge_vm_to = VT(undef, nnz),
-            ∂edge_va_to = VT(undef, nnz),
-        )
-    else
-        return (
-            Cf=Cf, Ct=Ct,
-            ∂edge_vm_fr = VT(undef, nnz, nbatch),
-            ∂edge_va_fr = VT(undef, nnz, nbatch),
-            ∂edge_vm_to = VT(undef, nnz, nbatch),
-            ∂edge_va_to = VT(undef, nnz, nbatch),
-        )
-    end
-end
-
-# Generic functions
-function AutoDiff.TapeMemory(
-    polar::PolarForm, func::Function, VT; with_stack=true, nbatch=1,
-)
-    @assert is_constraint(func)
-    stack = (with_stack) ? AdjointPolar(polar) : nothing
-    intermediate = _get_intermediate_stack(polar, func, VT, nbatch)
-    return AutoDiff.TapeMemory(func, stack, intermediate)
-end
-
-## Adjoint
-function adjoint!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory,
-    ∂cons, cons, buffer,
-)
-    stack = pbm.stack
-    reset!(stack)
-    adjoint!(
-        polar, pbm,
-        cons, ∂cons,
-        buffer.vmag, stack.∂vm,
-        buffer.vang, stack.∂va,
-        buffer.pnet, stack.∂pinj,
-        buffer.pload, buffer.qload,
-    )
-end
-
-## Jacobian-transpose vector product
-function jacobian_transpose_product!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory,
-    buffer::PolarNetworkState,
-    v::AbstractVector,
-)
-    stack = pbm.stack
-    reset!(stack)
-    cons = buffer.balance ; fill!(cons, 0.0) # TODO
-    adjoint!(
-        polar, pbm,
-        cons, v,
-        buffer.vmag, stack.∂vm,
-        buffer.vang, stack.∂va,
-        buffer.pnet, stack.∂pinj,
-        buffer.pload, buffer.qload,
-    )
-    adjoint_transfer!(
-        polar,
-        stack.∂u, stack.∂x,
-        stack.∂vm, stack.∂va, stack.∂pinj,
-    )
-end
-
-## Sparsity detection
-function jacobian_sparsity(polar::PolarForm, func, xx::AbstractVariable)
-    nbus = get(polar, PS.NumberOfBuses())
-    Vre = Float64[i for i in 1:nbus]
-    Vim = Float64[i for i in nbus+1:2*nbus]
-    V = Vre .+ im .* Vim
-    return matpower_jacobian(polar, xx, func, V)
-end
-
-function matpower_jacobian(polar::PolarForm, func::Function, X::AbstractVariable, buffer::PolarNetworkState)
-    V = voltage_host(buffer)
-    return matpower_jacobian(polar, X, func, V)
-end
-
-# Utilities for AutoDiff
-function _build_jacobian(polar::PolarForm, cons::Function, X::Union{State, Control})
-    if is_linear(polar, cons)
-        return AutoDiff.ConstantJacobian(polar, cons, X)
-    else
-        return AutoDiff.Jacobian(polar, cons, X)
-    end
-end
-
-function FullSpaceJacobian(
-    polar::PolarForm{T, VI, VT, MT},
-    cons::Function,
-) where {T, VI, VT, MT}
-    @assert is_constraint(cons)
-    Jx = _build_jacobian(polar, cons, State())
-    Ju = _build_jacobian(polar, cons, Control())
-    return FullSpaceJacobian(Jx, Ju)
-end
-
diff --git a/src/Polar/Constraints/line_flow.jl b/src/Polar/Constraints/line_flow.jl
deleted file mode 100644
index b27ff684..00000000
--- a/src/Polar/Constraints/line_flow.jl
+++ /dev/null
@@ -1,122 +0,0 @@
-is_constraint(::typeof(flow_constraints)) = true
-
-# Branch flow constraints
-function _flow_constraints(polar::PolarForm, cons, vmag, vang)
-    nlines = PS.get(polar.network, PS.NumberOfLines())
-    ev = branch_flow_kernel!(polar.device)(
-        cons, vmag, vang,
-        polar.topology.yff_re, polar.topology.yft_re, polar.topology.ytf_re, polar.topology.ytt_re,
-        polar.topology.yff_im, polar.topology.yft_im, polar.topology.ytf_im, polar.topology.ytt_im,
-        polar.topology.f_buses, polar.topology.t_buses, nlines,
-        ndrange=(nlines, size(cons, 2)),
-        dependencies=Event(polar.device)
-    )
-    wait(ev)
-    return
-end
-
-function flow_constraints(polar::PolarForm, cons, buffer::PolarNetworkState)
-    _flow_constraints(polar, cons, buffer.vmag, buffer.vang)
-end
-
-# Specialized function for AD with ForwardDiff
-function flow_constraints(polar::PolarForm, cons, vmag, vang, pnet, qnet, pload, qload)
-    _flow_constraints(polar, cons, vmag, vang)
-end
-
-function flow_constraints_grad!(polar::PolarForm, cons_grad, buffer, weights)
-    nlines = PS.get(polar.network, PS.NumberOfLines())
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    PT = polar.topology
-    fill!(cons_grad, 0)
-    adj_vmag = @view cons_grad[1:nbus]
-    adj_vang = @view cons_grad[nbus+1:2*nbus]
-    ∂edge_vm_fr = similar(cons_grad, nlines)
-    ∂edge_vm_to = similar(cons_grad, nlines)
-    ∂edge_va_fr = similar(cons_grad, nlines)
-    ∂edge_va_to = similar(cons_grad, nlines)
-    fill!(∂edge_vm_fr, 0)
-    fill!(∂edge_vm_to, 0)
-    fill!(∂edge_va_fr, 0)
-    fill!(∂edge_va_to, 0)
-    SMT, _ = get_jacobian_types(polar.device)
-    Cf = sparse(polar.network.lines.from_buses, 1:nlines, ones(nlines), nbus, nlines) |> SMT
-    Ct = sparse(polar.network.lines.to_buses, 1:nlines, ones(nlines), nbus, nlines) |> SMT
-    adj_branch_flow!(weights, buffer.vmag, adj_vmag,
-            buffer.vang, adj_vang,
-            ∂edge_vm_fr, ∂edge_vm_to,
-            ∂edge_va_fr, ∂edge_va_to,
-            PT.yff_re, PT.yft_re, PT.ytf_re, PT.ytt_re,
-            PT.yff_im, PT.yft_im, PT.ytf_im, PT.ytt_im,
-            PT.f_buses, PT.t_buses, Cf, Ct, nlines, polar.device
-    )
-    return cons_grad
-end
-
-function size_constraint(polar::PolarForm{T, IT, VT, MT}, ::typeof(flow_constraints)) where {T, IT, VT, MT}
-    return 2 * PS.get(polar.network, PS.NumberOfLines())
-end
-
-function bounds(polar::PolarForm{T, IT, VT, MT}, ::typeof(flow_constraints)) where {T, IT, VT, MT}
-    f_min, f_max = PS.bounds(polar.network, PS.Lines(), PS.ActivePower())
-    return convert(VT, [f_min; f_min]), convert(VT, [f_max; f_max])
-end
-
-function adjoint!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory{F, S, I},
-    cons, ∂cons,
-    vmag, ∂vmag,
-    vang, ∂vang,
-    pnet, ∂pnet,
-    pload, qload,
-) where {F<:typeof(flow_constraints), S, I}
-    nlines = PS.get(polar.network, PS.NumberOfLines())
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    top = polar.topology
-
-    fill!(pbm.intermediate.∂edge_vm_fr , 0.0)
-    fill!(pbm.intermediate.∂edge_vm_to , 0.0)
-    fill!(pbm.intermediate.∂edge_va_fr , 0.0)
-    fill!(pbm.intermediate.∂edge_va_to , 0.0)
-
-    adj_branch_flow!(
-        ∂cons,
-        vmag, ∂vmag,
-        vang, ∂vang,
-        pbm.intermediate.∂edge_vm_fr,
-        pbm.intermediate.∂edge_vm_to,
-        pbm.intermediate.∂edge_va_fr,
-        pbm.intermediate.∂edge_va_to,
-        top.yff_re, top.yft_re, top.ytf_re, top.ytt_re,
-        top.yff_im, top.yft_im, top.ytf_im, top.ytt_im,
-        top.f_buses, top.t_buses,
-        pbm.intermediate.Cf,
-        pbm.intermediate.Ct,
-        nlines, polar.device
-    )
-end
-
-function matpower_jacobian(polar::PolarForm, X::Union{State,Control}, ::typeof(flow_constraints), V)
-    nbus = get(polar, PS.NumberOfBuses())
-    nlines = get(polar, PS.NumberOfLines())
-    pf = polar.network
-    ref, pv, pq = index_buses_host(polar)
-    nref = length(ref)
-    npv  = length(pv)
-    npq  = length(pq)
-    lines = pf.lines
-
-    dSl_dVm, dSl_dVa = PS.matpower_lineflow_power_jacobian(V, lines)
-
-    if isa(X, State)
-        j11 = dSl_dVa[:, [pv; pq]]
-        j12 = dSl_dVm[:, pq]
-        return [j11 j12]::SparseMatrixCSC{Float64, Int}
-    elseif isa(X, Control)
-        j11 = dSl_dVm[:, [ref; pv]]
-        j12 = spzeros(2 * nlines, npv)
-        return [j11 j12]::SparseMatrixCSC{Float64, Int}
-    end
-end
-
diff --git a/src/Polar/Constraints/network_operation.jl b/src/Polar/Constraints/network_operation.jl
deleted file mode 100644
index 64817cb2..00000000
--- a/src/Polar/Constraints/network_operation.jl
+++ /dev/null
@@ -1,271 +0,0 @@
-
-# Lagrangian
-is_constraint(::typeof(network_operations)) = true
-size_constraint(polar::PolarForm, ::typeof(network_operations)) = 2 * get(polar, PS.NumberOfBuses()) + 1
-
-KA.@kernel function _bus_operation_kernel!(
-    cons, pnet,
-    @Const(pinj), @Const(qinj), @Const(pload), @Const(qload),
-    @Const(pv), @Const(pq), @Const(ref), @Const(pv_to_gen), @Const(ref_to_gen),
-)
-    i, j = @index(Global, NTuple)
-
-    npv = length(pv)
-    npq = length(pq)
-    nref = length(ref)
-    nbus = npv + npq + nref
-
-    #= PQ NODE =#
-    if i <= npq
-        bus = pq[i]
-        # Balance
-        cons[i+npv, j]     = pinj[bus, j] + pload[bus]
-        cons[i+npv+npq, j] = qinj[bus, j] + qload[bus]
-
-    #= PV NODE =#
-    elseif i <= npq + npv
-        i_ = i - npq
-        bus = pv[i_]
-        i_gen = pv_to_gen[i_]
-        # Balance
-        cons[i_, j] = pinj[bus, j] - pnet[bus, j] + pload[bus]
-        # Reactive power generation
-        shift = npv + 2 * npq + nref
-        cons[i_gen + shift, j] = qinj[bus, j] + qload[bus]
-
-    #= REF NODE =#
-    elseif i <= npq + npv + nref
-        i_ = i - npv - npq
-        bus = ref[i_]
-        i_gen = ref_to_gen[i_]
-
-        # Active power generation
-        shift = npv + 2 * npq
-        pg = pinj[bus, j] + pload[bus]
-        cons[i_ + shift, j] = pg
-        pnet[bus, j] = pg
-        # Reactive power generation
-        shift = npv + 2 * npq + nref
-        cons[i_gen + shift, j] = qinj[bus, j] + qload[bus]
-    end
-end
-
-KA.@kernel function _cost_kernel!(
-    costs, @Const(pnet), @Const(coefs),
-    @Const(pv), @Const(ref), @Const(pv_to_gen), @Const(ref_to_gen),
-)
-    i, j = @index(Global, NTuple)
-    npv = length(pv)
-    nref = length(ref)
-    # Evaluate active power at PV nodes
-    if i <= npv
-        bus = pv[i]
-        i_gen = pv_to_gen[i]
-    # Evaluate active power at slack nodes
-    elseif i <= npv + nref
-        i_ = i - npv
-        bus = ref[i_]
-        i_gen = ref_to_gen[i_]
-    end
-
-    pg = pnet[bus, j]
-    c0 = coefs[i_gen, 2]
-    c1 = coefs[i_gen, 3]
-    c2 = coefs[i_gen, 4]
-    costs[i_gen, j] = quadratic_cost(pg, c0, c1, c2)
-end
-
-function network_operations(
-    polar::PolarForm{T, VI, VT, MT}, cons, vmag, vang, pnet, qnet, pd, qd
-) where {T, VI, VT, MT}
-    nbus = get(polar, PS.NumberOfBuses())
-    ngen = get(polar, PS.NumberOfGenerators())
-    pv = polar.indexing.index_pv
-    pq = polar.indexing.index_pq
-    ref = polar.indexing.index_ref
-    pv_to_gen = polar.indexing.index_pv_to_gen
-    ref_to_gen = polar.indexing.index_ref_to_gen
-
-    nbatch = size(cons, 2)
-
-    fill!(cons, 0.0)
-    injection = MT(undef, 2 * nbus, nbatch)
-    fill!(injection, 0.0)
-
-    # Compute injection
-    bus_power_injection(polar, injection, vmag, vang, pnet, qnet, pd, qd)
-
-    pinj = view(injection, 1:nbus, :)
-    qinj = view(injection, 1+nbus:2*nbus, :)
-
-    # Compute operations
-    ndrange = (nbus, nbatch)
-    # Constraints
-    ev = _bus_operation_kernel!(polar.device)(
-        cons, pnet, pinj, qinj, pd, qd,
-        pv, pq, ref, pv_to_gen, ref_to_gen,
-        ndrange=ndrange, dependencies=Event(polar.device),
-    )
-    wait(ev)
-
-    # Objective
-    coefs = polar.costs_coefficients
-    costs = similar(cons, ngen, nbatch)
-    ev = _cost_kernel!(polar.device)(
-        costs, pnet, coefs, pv, ref, pv_to_gen, ref_to_gen,
-        ndrange=(ngen, nbatch), dependencies=Event(polar.device),
-    )
-    wait(ev)
-
-    cons[end, :] .= sum(costs)
-    return
-end
-
-function network_operations(polar::PolarForm, cons::AbstractVector, buffer::PolarNetworkState)
-    network_operations(polar, cons, buffer.vmag, buffer.vang, buffer.pnet, buffer.qnet, buffer.pload, buffer.qload)
-end
-
-function AutoDiff.TapeMemory(
-    polar::PolarForm, func::typeof(network_operations), VT; with_stack=true, nbatch=1,
-)
-    nnz = length(polar.topology.ybus_im.nzval)
-    nx = get(polar, NumberOfState())
-    nbus = get(polar, PS.NumberOfBuses())
-    # Intermediate state
-    intermediate = if nbatch == 1
-        (
-            ∂inj = VT(undef, 2*nbus),
-            ∂edge_vm_fr = VT(undef, nnz),
-            ∂edge_va_fr = VT(undef, nnz),
-            ∂edge_vm_to = VT(undef, nnz),
-            ∂edge_va_to = VT(undef, nnz),
-        )
-    else
-        (
-            ∂inj = VT(undef, 2*nbus, nbatch),
-            ∂edge_vm_fr = VT(undef, nnz, nbatch),
-            ∂edge_va_fr = VT(undef, nnz, nbatch),
-            ∂edge_vm_to = VT(undef, nnz, nbatch),
-            ∂edge_va_to = VT(undef, nnz, nbatch),
-        )
-    end
-    return AutoDiff.TapeMemory(
-        network_operations,
-        (with_stack) ? AdjointPolar(polar) : nothing,
-        intermediate,
-    )
-end
-
-KA.@kernel function _adjoint_bus_operation_kernel!(
-    adj_inj, adj_pnet,
-    @Const(adj_op), @Const(vmag), @Const(vang), @Const(pnet), @Const(pload),
-    @Const(coefs),
-    @Const(pv), @Const(pq), @Const(ref), @Const(pv_to_gen), @Const(ref_to_gen),
-    @Const(ybus_re_nzval), @Const(ybus_re_colptr), @Const(ybus_re_rowval), @Const(ybus_im_nzval),
-    @Const(transperm),
-)
-    i, j = @index(Global, NTuple)
-    npv = length(pv)
-    npq = length(pq)
-    nref = length(ref)
-    nbus = npv + npq + nref
-
-    @inbounds begin
-        #= PQ NODE =#
-        if i <= npq
-            bus = pq[i]
-            # Injection
-            adj_inj[bus     , j] = adj_op[i+npv]      # wrt P
-            adj_inj[bus+nbus, j] = adj_op[i+npv+npq]  # wrt Q
-
-        #= PV NODE =#
-        elseif i <= npq + npv
-            i_ = i - npq
-            bus = pv[i_]
-            i_gen = pv_to_gen[i_]
-            # Generation
-            pg = pnet[bus, j]
-
-            c0 = coefs[i_gen, 2]
-            c1 = coefs[i_gen, 3]
-            c2 = coefs[i_gen, 4]
-            adj_pnet[bus, j] = adj_op[end] * adj_quadratic_cost(pg, c0, c1, c2)
-            # Active injection
-            adj_inj[bus, j] = adj_op[i_]  # wrt P
-            # Reactive injection
-            shift = npv + 2 * npq + nref
-            adj_inj[bus + nbus, j] = adj_op[i_gen + shift]  # wrt Q
-
-        #= REF NODE =#
-        elseif i <= npq + npv + nref
-            i_ = i - npv - npq
-            bus = ref[i_]
-            i_gen = ref_to_gen[i_]
-
-            inj = bus_injection(bus, j, vmag, vang, ybus_re_colptr, ybus_re_rowval, ybus_re_nzval, ybus_im_nzval, transperm)
-            pg = inj + pload[bus]
-
-            c0 = coefs[i_gen, 2]
-            c1 = coefs[i_gen, 3]
-            c2 = coefs[i_gen, 4]
-            adj_pg = adj_op[end] * adj_quadratic_cost(pg, c0, c1, c2)
-            adj_pnet[bus, j] = adj_pg
-
-            shift = npv + 2 * npq
-            adj_inj[bus, j] = adj_op[i_+shift] + adj_pg
-
-            shift = npv + 2 * npq + nref
-            adj_inj[bus + nbus, j] = adj_op[i_gen+shift]
-        end
-    end
-end
-
-function _adjoint_network_operations(polar::PolarForm, ∂inj, ∂pnet, ∂cons, vmag, vang, pnet, pload)
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    pq = polar.indexing.index_pq
-    pv = polar.indexing.index_pv
-    ref = polar.indexing.index_ref
-    pv2gen = polar.indexing.index_pv_to_gen
-    ref2gen = polar.indexing.index_ref_to_gen
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    transperm = polar.topology.sortperm
-    coefs = polar.costs_coefficients
-
-    ndrange = (nbus, size(pnet, 2))
-    ev = _adjoint_bus_operation_kernel!(polar.device)(
-        ∂inj, ∂pnet, ∂cons, vmag, vang, pnet, pload, coefs, pv, pq, ref, pv2gen, ref2gen,
-        ybus_re.nzval, ybus_re.colptr, ybus_re.rowval, ybus_im.nzval, transperm,
-        ndrange=ndrange, dependencies=Event(polar.device),
-    )
-    wait(ev)
-end
-
-function adjoint!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory{F, S, I},
-    cons, ∂cons,
-    vm, ∂vm,
-    va, ∂va,
-    pnet, ∂pnet,
-    pload, qload,
-) where {F<:typeof(network_operations), S, I}
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    pv = polar.indexing.index_pv
-    ref = polar.indexing.index_ref
-    pv2gen = polar.indexing.index_pv_to_gen
-    ref2gen = polar.indexing.index_ref_to_gen
-
-    # Intermediate state
-    ∂inj = pbm.intermediate.∂inj
-
-    fill!(∂vm, 0.0)
-    fill!(∂va, 0.0)
-    fill!(∂pnet, 0.0)
-
-    # Seed adjoint of injection
-    _adjoint_network_operations(polar, ∂inj, ∂pnet, ∂cons, vm, va, pnet, pload)
-    # Backpropagate through the power injection to get ∂vm and ∂va
-    _adjoint_bus_power_injection!(polar, pbm, ∂inj, vm, ∂vm, va, ∂va)
-    return
-end
-
diff --git a/src/Polar/Constraints/power_balance.jl b/src/Polar/Constraints/power_balance.jl
deleted file mode 100644
index 08cd3790..00000000
--- a/src/Polar/Constraints/power_balance.jl
+++ /dev/null
@@ -1,128 +0,0 @@
-is_constraint(::typeof(power_balance)) = true
-
-function _power_balance!(
-    F, vmag, vang, pnet, pload, qload, ybus_re, ybus_im, transposeperm, pv, pq, ref, nbus, device
-)
-    npv = length(pv)
-    npq = length(pq)
-    kernel! = residual_kernel!(device)
-    ndrange = (npv+npq, size(F, 2))
-    ev = kernel!(
-        F, vmag, vang,
-        ybus_re.colptr, ybus_re.rowval,
-        ybus_re.nzval, ybus_im.nzval, transposeperm,
-        pnet, pload, qload, pv, pq, nbus,
-        ndrange=ndrange,
-        dependencies=Event(device)
-    )
-    wait(ev)
-end
-
-function power_balance(polar::PolarForm, cons, vmag, vang, pnet, qnet, pload, qload)
-    nbus = get(polar, PS.NumberOfBuses())
-    ref, pv, pq = index_buses_device(polar)
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    transposeperm = polar.topology.sortperm
-
-    fill!(cons, 0.0)
-    _power_balance!(
-        cons, vmag, vang, pnet, pload, qload,
-        ybus_re, ybus_im, transposeperm,
-        pv, pq, ref, nbus, polar.device
-    )
-end
-
-function power_balance(polar::PolarForm, cons, buffer::PolarNetworkState)
-    power_balance(
-        polar, cons,
-        buffer.vmag, buffer.vang,
-        buffer.pnet, buffer.qnet,
-        buffer.pload, buffer.qload,
-    )
-end
-
-function size_constraint(polar::PolarForm, ::typeof(power_balance))
-    npv = PS.get(polar.network, PS.NumberOfPVBuses())
-    npq = PS.get(polar.network, PS.NumberOfPQBuses())
-    return 2 * npq + npv
-end
-
-function bounds(polar::PolarForm{T, IT, VT, MT}, ::typeof(power_balance)) where {T, IT, VT, MT}
-    m = size_constraint(polar, power_balance)
-    return (fill!(VT(undef, m), zero(T)) , fill!(VT(undef, m), zero(T)))
-end
-
-# Adjoint
-function adjoint!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory{F, S, I},
-    cons, ∂cons,
-    vmag, ∂vmag,
-    vang, ∂vang,
-    pnet, ∂pnet,
-    pload, qload,
-) where {F<:typeof(power_balance), S, I}
-    nbus = get(polar, PS.NumberOfBuses())
-    ref, pv, pq = index_buses_device(polar)
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-
-    fill!(pbm.intermediate.∂edge_vm_fr , 0.0)
-    fill!(pbm.intermediate.∂edge_vm_to , 0.0)
-    fill!(pbm.intermediate.∂edge_va_fr , 0.0)
-    fill!(pbm.intermediate.∂edge_va_to , 0.0)
-
-    adj_residual_polar!(
-        cons, ∂cons,
-        vmag, ∂vmag,
-        vang, ∂vang,
-        ybus_re, ybus_im, polar.topology.sortperm,
-        pnet, ∂pnet, pload, qload,
-        pbm.intermediate.∂edge_vm_fr,
-        pbm.intermediate.∂edge_vm_to,
-        pbm.intermediate.∂edge_va_fr,
-        pbm.intermediate.∂edge_va_to,
-        pv, pq, nbus,
-        polar.device
-    )
-end
-
-function matpower_jacobian(polar::PolarForm, X::Union{State, Control}, ::typeof(power_balance), V)
-    nbus = get(polar, PS.NumberOfBuses())
-    pf = polar.network
-    ref, pv, pq = index_buses_host(polar)
-    nref = length(ref)
-    npv = length(pv)
-    npq = length(pq)
-    Ybus = pf.Ybus
-
-    dSbus_dVm, dSbus_dVa = PS.matpower_residual_jacobian(V, Ybus)
-
-    if isa(X, State)
-        j11 = real(dSbus_dVa[[pv; pq], [pv; pq]])
-        j12 = real(dSbus_dVm[[pv; pq], pq])
-        j21 = imag(dSbus_dVa[pq, [pv; pq]])
-        j22 = imag(dSbus_dVm[pq, pq])
-        return [j11 j12; j21 j22]::SparseMatrixCSC{Float64, Int}
-    elseif isa(X, Control)
-        j11 = real(dSbus_dVm[[pv; pq], [ref; pv]])
-        j12 = sparse(I, npv + npq, npv)
-        j21 = imag(dSbus_dVm[pq, [ref; pv]])
-        j22 = spzeros(npq, npv)
-        return [j11 -j12; j21 j22]::SparseMatrixCSC{Float64, Int}
-    end
-end
-
-# Hessian
-function matpower_hessian(
-    polar::PolarForm,
-    ::typeof(power_balance),
-    buffer::PolarNetworkState,
-    λ::AbstractVector,
-)
-    ref, pv, pq = index_buses_host(polar)
-    λ_host = λ |> Array
-    V = voltage_host(buffer)
-    hxx, hxu, huu = PS.residual_hessian(V, polar.network.Ybus, λ_host, pv, pq, ref)
-    return FullSpaceHessian(hxx, hxu, huu)
-end
-
diff --git a/src/Polar/Constraints/power_injection.jl b/src/Polar/Constraints/power_injection.jl
deleted file mode 100644
index 0d37147d..00000000
--- a/src/Polar/Constraints/power_injection.jl
+++ /dev/null
@@ -1,175 +0,0 @@
-is_constraint(::typeof(bus_power_injection)) = true
-size_constraint(polar::PolarForm, ::typeof(bus_power_injection)) = 2 * get(polar, PS.NumberOfBuses())
-
-KA.@kernel function bus_power_injection_kernel!(
-    inj, @Const(vmag), @Const(vang),
-    @Const(colptr), @Const(rowval),
-    @Const(ybus_re_nzval), @Const(ybus_im_nzval), nbus,
-)
-    bus, j = @index(Global, NTuple)
-
-    @inbounds for c in colptr[bus]:colptr[bus+1]-1
-        to = rowval[c]
-        aij = vang[bus, j] - vang[to, j]
-        # f_re = a * cos + b * sin
-        # f_im = a * sin - b * cos
-        coef_cos = vmag[bus, j]*vmag[to, j]*ybus_re_nzval[c]
-        coef_sin = vmag[bus, j]*vmag[to, j]*ybus_im_nzval[c]
-        cos_val = cos(aij)
-        sin_val = sin(aij)
-
-        inj[bus, j] += coef_cos * cos_val + coef_sin * sin_val
-        inj[bus+nbus, j] += coef_cos * sin_val - coef_sin * cos_val
-    end
-end
-
-KA.@kernel function adj_bus_power_injection_kernel!(
-    edge_vm_from, edge_vm_to,
-    edge_va_from, edge_va_to,
-    @Const(adj_inj), @Const(vmag), @Const(vang),
-    @Const(colptr), @Const(rowval),
-    @Const(ybus_re_nzval), @Const(ybus_im_nzval), nbus,
-)
-    bus, j = @index(Global, NTuple)
-
-    @inbounds for c in colptr[bus]:colptr[bus+1]-1
-        # Forward loop
-        to = rowval[c]
-        aij = vang[bus, j] - vang[to, j]
-        v_fr = vmag[bus, j]
-        v_to = vmag[to,  j]
-        y_re = ybus_re_nzval[c]
-        y_im = ybus_im_nzval[c]
-        # f_re = a * cos + b * sin
-        # f_im = a * sin - b * cos
-        coef_cos = v_fr*v_to*y_re
-        coef_sin = v_fr*v_to*y_im
-
-        cos_val = cos(aij)
-        sin_val = sin(aij)
-
-        adj_coef_cos = cos_val  * adj_inj[bus, j]
-        adj_coef_sin = sin_val  * adj_inj[bus, j]
-        adj_cos_val  = coef_cos * adj_inj[bus, j]
-        adj_sin_val  = coef_sin * adj_inj[bus, j]
-
-        adj_coef_cos +=  sin_val  * adj_inj[bus+nbus, j]
-        adj_coef_sin += -cos_val  * adj_inj[bus+nbus, j]
-        adj_cos_val  += -coef_sin * adj_inj[bus+nbus, j]
-        adj_sin_val  +=  coef_cos * adj_inj[bus+nbus, j]
-
-        adj_aij =   cos_val * adj_sin_val
-        adj_aij += -sin_val * adj_cos_val
-
-        edge_vm_from[c, j] += v_to * y_im * adj_coef_sin
-        edge_vm_to[c, j]   += v_fr * y_im * adj_coef_sin
-        edge_vm_from[c, j] += v_to * y_re * adj_coef_cos
-        edge_vm_to[c, j]   += v_fr * y_re * adj_coef_cos
-
-        edge_va_from[c, j] += adj_aij
-        edge_va_to[c, j]   -= adj_aij
-    end
-end
-
-function bus_power_injection(polar::PolarForm, cons, vmag, vang, pnet, qnet, pload, qload)
-    nbus = get(polar, PS.NumberOfBuses())
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    fill!(cons, 0)
-    ndrange = (nbus, size(cons, 2))
-    ev = bus_power_injection_kernel!(polar.device)(
-        cons, vmag, vang,
-        ybus_re.colptr, ybus_re.rowval, ybus_re.nzval, ybus_im.nzval, nbus,
-        ndrange=ndrange, dependencies=Event(polar.device),
-    )
-    wait(ev)
-end
-
-function bus_power_injection(polar::PolarForm, cons, buffer::PolarNetworkState)
-    bus_power_injection(
-        polar, cons,
-        buffer.vmag, buffer.vang,
-        buffer.pnet, buffer.qnet,
-        buffer.pload, buffer.qload,
-    )
-end
-
-# Adjoint with standardized interface
-function _adjoint_bus_power_injection!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory,
-    ∂cons, vmag, ∂vmag, vang, ∂vang,
-)
-    nbus = get(polar, PS.NumberOfBuses())
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-
-    fill!(pbm.intermediate.∂edge_vm_fr , 0.0)
-    fill!(pbm.intermediate.∂edge_vm_to , 0.0)
-    fill!(pbm.intermediate.∂edge_va_fr , 0.0)
-    fill!(pbm.intermediate.∂edge_va_to , 0.0)
-
-    ndrange = (nbus, size(∂vmag, 2))
-    # ADJOINT WRT EDGES
-    ev = adj_bus_power_injection_kernel!(polar.device)(
-        pbm.intermediate.∂edge_vm_fr,
-        pbm.intermediate.∂edge_vm_to,
-        pbm.intermediate.∂edge_va_fr,
-        pbm.intermediate.∂edge_va_to,
-        ∂cons,
-        vmag, vang,
-        ybus_re.colptr, ybus_re.rowval, ybus_re.nzval, ybus_im.nzval, nbus,
-        ndrange=ndrange, dependencies=Event(polar.device),
-    )
-    wait(ev)
-
-    # ADJOINT WRT NODES
-    ev = gpu_adj_node_kernel!(polar.device)(
-        ∂vmag, ∂vang,
-        ybus_re.colptr, ybus_re.rowval,
-        pbm.intermediate.∂edge_vm_fr, pbm.intermediate.∂edge_vm_to,
-        pbm.intermediate.∂edge_va_fr, pbm.intermediate.∂edge_va_to,
-        polar.topology.sortperm,
-        ndrange=ndrange, dependencies=Event(polar.device)
-    )
-    wait(ev)
-end
-
-function adjoint!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory{F, S, I},
-    cons, ∂cons,
-    vm, ∂vm,
-    va, ∂va,
-    pnet, ∂pnet,
-    pload, qload,
-) where {F<:typeof(bus_power_injection), S, I}
-    fill!(∂vm, 0)
-    fill!(∂va, 0)
-    _adjoint_bus_power_injection!(polar, pbm, ∂cons, vm, ∂vm, va, ∂va)
-    return
-end
-
-function matpower_jacobian(polar::PolarForm, X::Union{State, Control}, ::typeof(bus_power_injection), V)
-    nbus = get(polar, PS.NumberOfBuses())
-    pf = polar.network
-    ref = pf.ref ; nref = length(ref)
-    pv = pf.pv ; npv = length(pv)
-    pq = pf.pq ; npq = length(pq)
-    Ybus = pf.Ybus
-
-    dSbus_dVm, dSbus_dVa = PS.matpower_residual_jacobian(V, Ybus)
-
-    if isa(X, State)
-        j11 = real(dSbus_dVa[:, [pv; pq]])
-        j12 = real(dSbus_dVm[:, pq])
-        j21 = imag(dSbus_dVa[:, [pv; pq]])
-        j22 = imag(dSbus_dVm[:, pq])
-        return [j11 j12; j21 j22]
-    elseif isa(X, Control)
-        j11 = real(dSbus_dVm[:, [ref; pv]])
-        j12 = spzeros(nbus, npv)
-        j21 = imag(dSbus_dVm[:, [ref; pv]])
-        j22 = spzeros(nbus, npv)
-        return [j11 j12; j21 j22]
-    end
-end
-
diff --git a/src/Polar/Constraints/ramping_rate.jl b/src/Polar/Constraints/ramping_rate.jl
deleted file mode 100644
index 10536d1c..00000000
--- a/src/Polar/Constraints/ramping_rate.jl
+++ /dev/null
@@ -1,204 +0,0 @@
-is_constraint(::typeof(cost_penalty_ramping_constraints)) = true
-size_constraint(polar::PolarForm, ::typeof(cost_penalty_ramping_constraints)) = 1
-
-function pullback_ramping(polar::PolarForm, intermediate)
-    return AutoDiff.TapeMemory(
-        cost_penalty_ramping_constraints,
-        AdjointStackObjective(polar),
-        intermediate,
-    )
-end
-
-@inline function _cost_ramping(pg, s, c0, c1, c2, σ, t, τ, λf, λt, ρf, ρt, p1, p2, p3)
-    obj = σ * quadratic_cost(pg, c0, c1, c2)
-    penalty = 0.5 * τ * (pg - p2)^2
-    if t != 0
-        penalty += λf * (p1 - pg + s) + 0.5 * ρf * (p1 - pg + s)^2
-    end
-    if t != 1
-        penalty += λt * (pg - p3) + 0.5 * ρt * (pg - p3)^2
-    end
-    return obj + penalty
-end
-
-@inline function _adjoint_cost_ramping(pg, s, c0, c1, c2, σ, t, τ, λf, λt, ρf, ρt, p1, p2, p3)
-    ∂c = σ * adj_quadratic_cost(pg, c0, c1, c2)
-    ∂c += τ * (pg - p2)
-    if t != 0
-       ∂c -= λf + ρf * (p1 - pg + s)
-    end
-    if t != 1
-       ∂c += λt + ρt * (pg - p3)
-    end
-    return ∂c
-end
-
-KA.@kernel function cost_ramping_kernel!(
-    costs, pg, @Const(vmag), @Const(vang), @Const(pinj), @Const(pload), @Const(s),
-    @Const(c0), @Const(c1), @Const(c2),
-    @Const(σ), @Const(t), @Const(τ), @Const(λf), @Const(λt), @Const(ρf), @Const(ρt), @Const(p1), @Const(p2), @Const(p3),
-    @Const(pv), @Const(ref), @Const(pv_to_gen), @Const(ref_to_gen),
-    @Const(ybus_re_nzval), @Const(ybus_re_colptr), @Const(ybus_re_rowval),
-    @Const(ybus_im_nzval), @Const(transperm),
-)
-    i, j = @index(Global, NTuple)
-    npv = length(pv)
-    nref = length(ref)
-    # Evaluate active power at PV nodes
-    if i <= npv
-        bus = pv[i]
-        i_gen = pv_to_gen[i]
-        pg[i_gen, j] = pinj[bus, j]
-    # Evaluate active power at slack nodes
-    elseif i <= npv + nref
-        i_ = i - npv
-        bus = ref[i_]
-        i_gen = ref_to_gen[i_]
-        inj = bus_injection(bus, j, vmag, vang, ybus_re_colptr, ybus_re_rowval, ybus_re_nzval, ybus_im_nzval, transperm)
-        pg[i_gen, j] = inj + pload[bus]
-    end
-
-    costs[i_gen, j] = _cost_ramping(
-        pg[i_gen, j], s[i_gen, j], c0[i_gen], c1[i_gen], c2[i_gen],
-        σ, t, τ, λf[i_gen], λt[i_gen], ρf, ρt, p1[i_gen], p2[i_gen], p3[i_gen],
-    )
-end
-
-KA.@kernel function adj_cost_ramping_kernel!(
-    adj_costs,
-    @Const(vmag), adj_vmag, @Const(vang), adj_vang, @Const(pinj), adj_pinj, @Const(pload),
-    @Const(s),
-    @Const(c0), @Const(c1), @Const(c2),
-    @Const(σ), @Const(t), @Const(τ), @Const(λf), @Const(λt), @Const(ρf), @Const(ρt), @Const(p1), @Const(p2), @Const(p3),
-    @Const(pv), @Const(ref), @Const(pv_to_gen), @Const(ref_to_gen),
-    @Const(ybus_re_nzval), @Const(ybus_re_colptr), @Const(ybus_re_rowval), @Const(ybus_im_nzval),
-    @Const(transperm),
-)
-    i, j = @index(Global, NTuple)
-    npv = length(pv)
-    nref = length(ref)
-    if i <= npv
-        bus = pv[i]
-        i_gen = pv_to_gen[i]
-        pg = pinj[bus, j]
-        adj_pinj[bus, j] = adj_costs[1] * _adjoint_cost_ramping(
-            pg, s[i_gen], c0[i_gen], c1[i_gen], c2[i_gen],
-            σ, t, τ, λf[i_gen], λt[i_gen], ρf, ρt, p1[i_gen], p2[i_gen], p3[i_gen],
-        )
-    # Evaluate active power at slack nodes
-    elseif i <= npv + nref
-        i_ = i - npv
-        fr = ref[i_]
-        i_gen = ref_to_gen[i_]
-
-        inj = bus_injection(fr, j, vmag, vang, ybus_re_colptr, ybus_re_rowval, ybus_re_nzval, ybus_im_nzval, transperm)
-        pg = inj + pload[fr]
-
-        adj_inj = adj_costs[1] * _adjoint_cost_ramping(
-            pg, s[i_gen], c0[i_gen], c1[i_gen], c2[i_gen],
-            σ, t, τ, λf[i_gen], λt[i_gen], ρf, ρt, p1[i_gen], p2[i_gen], p3[i_gen],
-        )
-        adj_pinj[fr, j] = adj_inj
-        # Update adj_vmag, adj_vang
-        adjoint_bus_injection!(
-            fr, j, adj_inj, adj_vmag, adj_vang, vmag, vang,
-            ybus_re_colptr, ybus_re_rowval, ybus_re_nzval, ybus_im_nzval, transperm,
-        )
-    end
-end
-
-function cost_penalty_ramping_constraints(
-    polar::PolarForm, buffer::PolarNetworkState,
-    s, t, σ, τ, λf, λt, ρf, ρt, p1, p2, p3,
-)
-    pv = polar.indexing.index_pv
-    pq = polar.indexing.index_pq
-    ref = polar.indexing.index_ref
-    pv2gen = polar.indexing.index_pv_to_gen
-    ref2gen = polar.indexing.index_ref_to_gen
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    transperm = polar.topology.sortperm
-
-    ngen = PS.get(polar, PS.NumberOfGenerators())
-    coefs = polar.costs_coefficients
-    c0 = @view coefs[:, 2]
-    c1 = @view coefs[:, 3]
-    c2 = @view coefs[:, 4]
-    costs = similar(buffer.pgen)
-
-    ev = cost_ramping_kernel!(polar.device)(
-        costs, buffer.pgen,
-        buffer.vmag, buffer.vang, buffer.pnet, buffer.pload, s,
-        c0, c1, c2,
-        σ, t, τ, λf, λt, ρf, ρt, p1, p2, p3,
-        pv, ref, pv2gen, ref2gen,
-        ybus_re.nzval, ybus_re.colptr, ybus_re.rowval, ybus_im.nzval,
-        transperm,
-        ndrange=(ngen, size(buffer.pgen, 2)),
-        dependencies=Event(polar.device),
-    )
-    wait(ev)
-    return sum(costs)
-end
-
-function adjoint_penalty_ramping_constraints!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory{F, S, I},
-    buffer,
-    s, t, σ, τ, λf, λt, ρf, ρt, p1, p2, p3,
-) where {F<:typeof(cost_penalty_ramping_constraints), S, I}
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    nref = PS.get(polar.network, PS.NumberOfSlackBuses())
-    index_pv = polar.indexing.index_pv
-    index_ref = polar.indexing.index_ref
-    pv2gen = polar.indexing.index_pv_to_gen
-    ref2gen = polar.indexing.index_ref_to_gen
-
-    coefs = polar.costs_coefficients
-    c0 = @view coefs[:, 2]
-    c1 = @view coefs[:, 3]
-    c2 = @view coefs[:, 4]
-
-    ∂vm = pbm.stack.∂vm
-    ∂va = pbm.stack.∂va
-    ∂pinj = pbm.stack.∂pinj
-    fill!(∂vm, 0.0)
-    fill!(∂va, 0.0)
-    fill!(∂pinj, 0.0)
-
-    ngen = get(polar, PS.NumberOfGenerators())
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    transperm = polar.topology.sortperm
-
-    ev = adj_cost_ramping_kernel!(polar.device)(
-        1.0,
-        buffer.vmag, ∂vm,
-        buffer.vang, ∂va,
-        buffer.pnet, ∂pinj, buffer.pload, s,
-        c0, c1, c2,
-        σ,
-        t,
-        τ,
-        λf,
-        λt,
-        ρf,
-        ρt,
-        p1,
-        p2,
-        p3,
-        index_pv, index_ref, pv2gen, ref2gen,
-        ybus_re.nzval, ybus_re.colptr, ybus_re.rowval, ybus_im.nzval,
-        transperm,
-        ndrange=(ngen, size(∂vm, 2)),
-        dependencies=Event(polar.device)
-    )
-    wait(ev)
-
-    adj_x = pbm.stack.∇fₓ
-    adj_u = pbm.stack.∇fᵤ
-    fill!(adj_x, 0.0)
-    fill!(adj_u, 0.0)
-    adjoint_transfer!(polar, adj_u, adj_x, ∂vm, ∂va, ∂pinj)
-    return
-end
-
diff --git a/src/Polar/Constraints/reactive_power.jl b/src/Polar/Constraints/reactive_power.jl
deleted file mode 100644
index 86097723..00000000
--- a/src/Polar/Constraints/reactive_power.jl
+++ /dev/null
@@ -1,146 +0,0 @@
-
-is_constraint(::typeof(reactive_power_constraints)) = true
-
-# Here, the power constraints are ordered as:
-# g = [qg_gen]
-function _reactive_power_constraints(
-    qg, vmag, vang, pnet, qnet, qload,
-    ybus_re, ybus_im, transperm, pv, pq, ref, pv_to_gen, ref_to_gen, nbus, device
-)
-    kernel! = reactive_power_kernel!(device)
-    range_ = length(pv) + length(ref)
-    ndrange = (length(pv) + length(ref), size(qg, 2))
-    ev = kernel!(
-        qg,
-        vmag, vang, pnet,
-        pv, ref, pv_to_gen, ref_to_gen,
-        ybus_re.nzval, ybus_re.colptr, ybus_re.rowval,
-        ybus_im.nzval, transperm, qload,
-        ndrange=ndrange,
-        dependencies=Event(device)
-    )
-    wait(ev)
-end
-
-function reactive_power_constraints(polar::PolarForm, cons, buffer)
-    kernel! = reactive_power_kernel!(polar.device)
-    pv = polar.indexing.index_pv
-    pq = polar.indexing.index_pq
-    ref = polar.indexing.index_ref
-    pv_to_gen = polar.indexing.index_pv_to_gen
-    ref_to_gen = polar.indexing.index_ref_to_gen
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    transperm = polar.topology.sortperm
-
-    ndrange = (length(pv) + length(ref), size(buffer.qgen, 2))
-    ev = kernel!(
-        buffer.qgen,
-        buffer.vmag, buffer.vang, buffer.pnet,
-        pv, ref, pv_to_gen, ref_to_gen,
-        ybus_re.nzval, ybus_re.colptr, ybus_re.rowval,
-        ybus_im.nzval, transperm, buffer.qload,
-        ndrange=ndrange,
-        dependencies=Event(polar.device)
-    )
-    wait(ev)
-    # Constraint on Q_ref (generator) (Q_inj = Q_g - Q_load)
-    copyto!(cons, buffer.qgen)
-    return
-end
-
-# Function for AD with ForwardDiff
-function reactive_power_constraints(polar::PolarForm, cons, vmag, vang, pnet, qnet, pd, qd)
-    nbus = length(vmag)
-    ref, pv, pq = index_buses_device(polar)
-    _, ref_to_gen, pv_to_gen = index_generators_device(polar)
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    transperm = polar.topology.sortperm
-    _reactive_power_constraints(
-        cons, vmag, vang, pnet, qnet, qd,
-        ybus_re, ybus_im, transperm, pv, pq, ref, pv_to_gen, ref_to_gen, nbus, polar.device
-    )
-end
-
-function size_constraint(polar::PolarForm, ::typeof(reactive_power_constraints))
-    return PS.get(polar.network, PS.NumberOfGenerators())
-end
-
-function bounds(polar::PolarForm{T, IT, VT, MT}, ::typeof(reactive_power_constraints)) where {T, IT, VT, MT}
-    q_min, q_max = PS.bounds(polar.network, PS.Generators(), PS.ReactivePower())
-    return convert(VT, q_min), convert(VT, q_max)
-end
-
-# Adjoint
-function adjoint!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory{F, S, I},
-    cons, ∂cons,
-    vmag, ∂vmag,
-    vang, ∂vang,
-    pnet, ∂pnet,
-    pload, qload,
-) where {F<:typeof(reactive_power_constraints), S, I}
-    nbus = get(polar, PS.NumberOfBuses())
-    ref, pv, pq = index_buses_device(polar)
-    _, ref_to_gen, pv_to_gen = index_generators_device(polar)
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-
-    fill!(pbm.intermediate.∂edge_vm_fr , 0.0)
-    fill!(pbm.intermediate.∂edge_vm_to , 0.0)
-    fill!(pbm.intermediate.∂edge_va_fr , 0.0)
-    fill!(pbm.intermediate.∂edge_va_to , 0.0)
-
-    adj_reactive_power!(
-        cons, ∂cons,
-        vmag, ∂vmag,
-        vang, ∂vang,
-        ybus_re, ybus_im, polar.topology.sortperm,
-        pnet, ∂pnet,
-        pbm.intermediate.∂edge_vm_fr,
-        pbm.intermediate.∂edge_vm_to,
-        pbm.intermediate.∂edge_va_fr,
-        pbm.intermediate.∂edge_va_to,
-        qload,
-        pv, pq, ref, pv_to_gen, ref_to_gen, nbus,
-        polar.device
-    )
-end
-
-function matpower_jacobian(polar::PolarForm, X::Union{State,Control}, ::typeof(reactive_power_constraints), V)
-    nbus = get(polar, PS.NumberOfBuses())
-    pf = polar.network
-    ref, pv, pq = index_buses_host(polar)
-    gen2bus, _, _ = index_generators_host(polar)
-    Ybus = pf.Ybus
-
-    dSbus_dVm, dSbus_dVa = PS.matpower_residual_jacobian(V, Ybus)
-
-    if isa(X, State)
-        j11 = imag(dSbus_dVa[gen2bus, [pv; pq]])
-        j12 = imag(dSbus_dVm[gen2bus, pq])
-        return [j11 j12]::SparseMatrixCSC{Float64, Int}
-    elseif isa(X, Control)
-        j11 = imag(dSbus_dVm[gen2bus, [ref; pv]])
-        j12 = spzeros(length(gen2bus), length(pv))
-        return [j11 j12]::SparseMatrixCSC{Float64, Int}
-    end
-end
-
-function matpower_hessian(polar::PolarForm, ::typeof(reactive_power_constraints), buffer, λ)
-    nbus = get(polar, PS.NumberOfBuses())
-    ref, pv, pq = index_buses_host(polar)
-    gen2bus, _, _ = index_generators_host(polar)
-    # Check consistency
-    @assert length(λ) == length(gen2bus)
-
-    λq = zeros(nbus)
-    # Select only buses with generators
-    λq[gen2bus] .= λ
-
-    V = voltage_host(buffer)
-    hxx, hxu, huu = PS.reactive_power_hessian(V, polar.network.Ybus, λq, pv, pq, ref)
-    return FullSpaceHessian(
-        hxx, hxu, huu,
-    )
-end
-
diff --git a/src/Polar/Constraints/voltage_magnitude.jl b/src/Polar/Constraints/voltage_magnitude.jl
deleted file mode 100644
index 90a77979..00000000
--- a/src/Polar/Constraints/voltage_magnitude.jl
+++ /dev/null
@@ -1,73 +0,0 @@
-is_constraint(::typeof(voltage_magnitude_constraints)) = true
-
-is_linear(polar::PolarForm, ::typeof(voltage_magnitude_constraints)) = true
-
-# We add constraint only on vmag_pq
-function voltage_magnitude_constraints(polar::PolarForm, cons, vmag, vang, pnet, qnet, pload, qload)
-    _, _, pq = index_buses_device(polar)
-    cons .= @view vmag[pq]
-    return
-end
-function voltage_magnitude_constraints(polar::PolarForm, cons, buffer)
-    _, _, pq = index_buses_device(polar)
-    cons .= @view buffer.vmag[pq]
-    return
-end
-
-function size_constraint(polar::PolarForm, ::typeof(voltage_magnitude_constraints))
-    return PS.get(polar.network, PS.NumberOfPQBuses())
-end
-
-function bounds(polar::PolarForm, ::typeof(voltage_magnitude_constraints))
-    npv = PS.get(polar.network, PS.NumberOfPVBuses())
-    npq = PS.get(polar.network, PS.NumberOfPQBuses())
-    fr_ = npq + npv + 1
-    to_ = 2*npq + npv
-    return polar.x_min[fr_:to_], polar.x_max[fr_:to_]
-end
-
-function adjoint!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory{F, S, I},
-    cons, ∂cons,
-    vmag, ∂vmag,
-    vang, ∂vang,
-    pnet, ∂pnet,
-    pload, qload,
-) where {F<:typeof(voltage_magnitude_constraints), S, I}
-    _, _, pq = index_buses_device(polar)
-    ∂vmag[pq] .= ∂cons
-end
-
-function matpower_jacobian(
-    polar::PolarForm,
-    X::Union{State,Control},
-    cons::typeof(voltage_magnitude_constraints),
-    V,
-)
-    m = size_constraint(polar, cons)
-    nᵤ = get(polar, NumberOfControl())
-    nₓ = get(polar, NumberOfState())
-    npv = PS.get(polar.network, PS.NumberOfPVBuses())
-    npq = PS.get(polar.network, PS.NumberOfPQBuses())
-    shift = npq + npv
-
-    I = 1:m
-    J = (shift+1):(shift+npq)
-    V = ones(m)
-    if isa(X, State)
-        return sparse(I, J, V, m, nₓ)
-    elseif isa(X, Control)
-        return spzeros(m, nᵤ)
-    end
-end
-
-function matpower_hessian(polar::PolarForm, ::typeof(voltage_magnitude_constraints), buffer, λ)
-    nu = get(polar, NumberOfControl())
-    nx = get(polar, NumberOfState())
-    return FullSpaceHessian(
-        spzeros(nx, nx),
-        spzeros(nu, nx),
-        spzeros(nu, nu),
-    )
-end
diff --git a/src/Polar/batch.jl b/src/Polar/batch.jl
deleted file mode 100644
index 3ed9efa9..00000000
--- a/src/Polar/batch.jl
+++ /dev/null
@@ -1,234 +0,0 @@
-
-function BatchHessian(polar::PolarForm{T, VI, VT, MT}, func, nbatch) where {T, VI, VT, MT}
-    @assert is_constraint(func)
-
-    if isa(polar.device, CPU)
-        A = Vector
-        MMT = Matrix
-    elseif isa(polar.device, GPU)
-        A = CUDA.CuVector
-        MMT = CUDA.CuMatrix
-    end
-
-    pf = polar.network
-    nbus = PS.get(pf, PS.NumberOfBuses())
-    n_cons = size_constraint(polar, func)
-
-    map = VI(polar.hessianstructure.map)
-    nmap = length(map)
-
-    x = VT(zeros(Float64, 3*nbus))
-
-    t1s{N} = ForwardDiff.Dual{Nothing,Float64, N} where N
-    t1sx = MMT{t1s{1}}(zeros(Float64, 3*nbus, nbatch))
-    t1sF = MMT{t1s{1}}(zeros(Float64, n_cons, nbatch))
-    host_t1sseeds = MMT{ForwardDiff.Partials{1,Float64}}(undef, nmap, nbatch)
-    t1sseeds = MMT{ForwardDiff.Partials{1,Float64}}(undef, nmap, nbatch)
-    varx = view(x, map)
-    t1svarx = view(t1sx, map, :)
-    VHP = typeof(host_t1sseeds)
-    VP = typeof(t1sseeds)
-    VD = typeof(t1sx)
-    adj_t1sx = MMT{t1s{1}}(zeros(Float64, 3 * nbus, nbatch))
-    adj_t1sF = A{t1s{1}}(zeros(Float64, n_cons))
-    buffer = AutoDiff.TapeMemory(polar, func, typeof(adj_t1sx); with_stack=false, nbatch=nbatch)
-    return AutoDiff.Hessian(
-        func, host_t1sseeds, t1sseeds, x, t1sF, adj_t1sF, t1sx, adj_t1sx, map, varx, t1svarx, buffer,
-    )
-end
-
-# Batch buffers
-function batch_buffer(polar::PolarForm{T, VI, VT, MT}, nbatch::Int) where {T, VI, VT, MT}
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    ngen = PS.get(polar.network, PS.NumberOfGenerators())
-    nstates = get(polar, NumberOfState())
-    gen2bus, _, _ = index_generators_device(polar)
-    buffer =  PolarNetworkState{VI,MT}(
-        MT(undef, nbus, nbatch),
-        MT(undef, nbus, nbatch),
-        MT(undef, nbus, nbatch),
-        MT(undef, nbus, nbatch),
-        MT(undef, ngen, nbatch),
-        MT(undef, ngen, nbatch),
-        MT(undef, nbus, nbatch),
-        MT(undef, nbus, nbatch),
-        MT(undef, nstates, nbatch),
-        MT(undef, nstates, nbatch),
-        gen2bus,
-    )
-
-    # Init
-    pbus = zeros(nbus)
-    qbus = zeros(nbus)
-    vmag = abs.(polar.network.vbus)
-    vang = angle.(polar.network.vbus)
-    pd = get(polar.network, PS.ActiveLoad())
-    qd = get(polar.network, PS.ReactiveLoad())
-    pg = get(polar.network, PS.ActivePower())
-    qg = get(polar.network, PS.ReactivePower())
-    h_gen2bus, _, _ = index_generators_host(polar)
-    pbus[h_gen2bus] .= pg
-    qbus[h_gen2bus] .= qg
-
-    for i in 1:nbatch
-        copyto!(buffer.vmag, nbus * (i-1) + 1, vmag, 1, nbus)
-        copyto!(buffer.vang, nbus * (i-1) + 1, vang, 1, nbus)
-        copyto!(buffer.pnet, nbus * (i-1) + 1, pbus, 1, nbus)
-        copyto!(buffer.qnet, nbus * (i-1) + 1, qbus, 1, nbus)
-        copyto!(buffer.pgen,   ngen * (i-1) + 1,   pg, 1, ngen)
-        copyto!(buffer.qgen,   ngen * (i-1) + 1,   qg, 1, ngen)
-        copyto!(buffer.pload,  nbus * (i-1) + 1,   pd, 1, nbus)
-        copyto!(buffer.qload,  nbus * (i-1) + 1,   qd, 1, nbus)
-    end
-
-    return buffer
-end
-
-function batch_adj_hessian_prod!(
-    polar, H::AutoDiff.Hessian, hv, buffer, λ, v,
-)
-    @assert length(hv) == length(v)
-    device = polar.device
-    nbus = get(polar, PS.NumberOfBuses())
-    x = H.x
-    ntgt = length(v)
-    t1sx = H.t1sx
-    adj_t1sx = H.∂t1sx
-    t1sF = H.t1sF
-    adj_t1sF = H.∂t1sF
-    nbatch = size(adj_t1sx, 2)
-    # Init dual variables
-    adj_t1sx .= 0.0
-    t1sF .= 0.0
-    copyto!(adj_t1sF, 1, λ, 1, length(λ))
-    # Seeding
-    nmap = length(H.map)
-
-    # Init seed
-    AutoDiff.batch_init_seed_hessian!(H.t1sseeds, H.host_t1sseeds, v, nmap, device)
-    AutoDiff.batch_seed_hessian!(H.t1sseeds, H.varx, H.t1svarx, device)
-
-    adjoint!(
-        polar, H.buffer,
-        t1sF, adj_t1sF,
-        view(t1sx, 1:nbus, :), view(adj_t1sx, 1:nbus, :),                   # vmag
-        view(t1sx, nbus+1:2*nbus, :), view(adj_t1sx, nbus+1:2*nbus, :),     # vang
-        view(t1sx, 2*nbus+1:3*nbus, :), view(adj_t1sx, 2*nbus+1:3*nbus, :), # pinj
-        buffer.pload, buffer.qload,
-    )
-
-    AutoDiff.batch_partials_hessian!(hv, adj_t1sx, H.map, device)
-    return nothing
-end
-
-function BatchJacobian(
-    polar::PolarForm{T, VI, VT, MT}, func, variable, nbatch,
-) where {T, VI, VT, MT}
-    @assert is_constraint(func)
-    device = polar.device
-    (SMT, A) = get_batch_jacobian_types(device)
-
-    # Tensor type
-    TT = A{T, 3}
-
-    pf = polar.network
-    nbus = PS.get(pf, PS.NumberOfBuses())
-    map = get_map(polar, variable) |> VI
-
-    nmap = length(map)
-
-    # Sparsity pattern
-    J = jacobian_sparsity(polar, func, variable)
-
-    # Coloring
-    coloring = AutoDiff.SparseDiffTools.matrix_colors(J)
-    ncolor = size(unique(coloring),1)
-
-    nx = 2 * nbus
-    x = MT(zeros(Float64, nx, nbatch))
-    m = size(J, 1)
-
-    # Move Jacobian to the GPU
-    if isa(polar.device, CPU)
-        Js = SMT[J for i in 1:nbatch]
-    else
-        Js = BatchCuSparseMatrixCSR(J, nbatch)
-    end
-
-    # Seedings
-    t1s{N} = ForwardDiff.Dual{Nothing,Float64, N} where N
-    t1sx = A{t1s{ncolor}}(x)
-    t1sF = A{t1s{ncolor}}(zeros(Float64, m, nbatch))
-    t1sseeds = AutoDiff.init_seed(coloring, ncolor, nmap)
-
-    # Move the seeds over to the device, if necessary
-    gput1sseeds = A{ForwardDiff.Partials{ncolor,Float64}}(t1sseeds)
-    compressedJ = TT(zeros(Float64, ncolor, m, nbatch))
-
-    # Views
-    varx = view(x, map, :)
-    t1svarx = view(t1sx, map, :)
-
-    return AutoDiff.Jacobian{typeof(func), VI, MT, TT, typeof(Js), typeof(gput1sseeds), typeof(t1sx), typeof(varx), typeof(t1svarx), typeof(variable)}(
-        func, Js, compressedJ, coloring,
-        gput1sseeds, t1sF, x, t1sx, map, varx, t1svarx
-    )
-end
-
-function batch_jacobian!(polar::PolarForm,
-    jac::AutoDiff.Jacobian{Func, VI, VT, MT, SMT, VP, VD, SubT, SubD, State},
-    buffer
-) where {Func, VI, VT, MT, SMT, VP, VD, SubT, SubD}
-    device = polar.device
-    nbus = get(polar, PS.NumberOfBuses())
-    nbatch = size(jac.x, 2)
-    for i in 1:nbatch
-        f = (i-1) * nbus
-        copyto!(jac.x, 1 + 2 * f, buffer.vmag, 1 + f, nbus)
-        copyto!(jac.x, nbus + 2 * f + 1, buffer.vang, 1 + f, nbus)
-    end
-    jac.t1sx .= jac.x
-    jac.t1sF .= 0.0
-
-    AutoDiff.batch_seed_jacobian!(jac.t1sseeds, jac.varx, jac.t1svarx, device)
-
-    jac.func(
-        polar,
-        jac.t1sF,
-        view(jac.t1sx, 1:nbus, :),
-        view(jac.t1sx, nbus+1:2*nbus, :),
-        buffer.pnet, buffer.qnet,
-        buffer.pload, buffer.qload,
-    )
-
-    AutoDiff.batch_partials_jacobian!(jac.compressedJ, jac.t1sF, device)
-    AutoDiff.batch_uncompress!(jac.J, jac.compressedJ, jac.coloring, device)
-    return jac.J
-end
-
-function batch_jacobian!(polar::PolarForm,
-    jac::AutoDiff.Jacobian{Func, VI, VT, MT, SMT, VP, VD, SubT, SubD, Control},
-    buffer
-) where {Func, VI, VT, MT, SMT, VP, VD, SubT, SubD}
-    device = polar.device
-    nbus = get(polar, PS.NumberOfBuses())
-    copyto!(jac.x, 1, buffer.vmag, 1, nbus)
-    copyto!(jac.x, nbus+1, buffer.pnet, 1, nbus)
-    jac.t1sx .= jac.x
-    jac.t1sF .= 0.0
-
-    AutoDiff.batch_seed_jacobian!(jac.t1sseeds, jac.varx, jac.t1svarx, device)
-
-    jac.func(
-        polar,
-        jac.t1sF,
-        view(jac.t1sx, 1:nbus, :),
-        buffer.vang,
-        view(jac.t1sx, nbus+1:2*nbus, :), buffer.qnet,
-        buffer.pload, buffer.qload,
-    )
-
-    AutoDiff.batch_partials_jacobian!(jac.compressedJ, jac.t1sF, device)
-    AutoDiff.batch_uncompress!(jac.J, jac.compressedJ, jac.coloring, device)
-    return jac.J
-end
diff --git a/src/Polar/caches.jl b/src/Polar/caches.jl
deleted file mode 100644
index 6bd2bbe7..00000000
--- a/src/Polar/caches.jl
+++ /dev/null
@@ -1,138 +0,0 @@
-abstract type AbstractBuffer end
-abstract type AbstractNetworkBuffer <: AbstractBuffer end
-
-"Store indexing on target device"
-struct IndexingCache{IVT} <: AbstractBuffer
-    index_pv::IVT
-    index_pq::IVT
-    index_ref::IVT
-    index_generators::IVT
-    index_pv_to_gen::IVT
-    index_ref_to_gen::IVT
-end
-index_buses(idx::IndexingCache) = (idx.index_ref, idx.index_pv, idx.index_pq)
-index_generators(idx::IndexingCache) = (idx.index_generators, idx.index_ref_to_gen, idx.index_pv_to_gen)
-
-"""
-    PolarNetworkState{VI, VT} <: AbstractNetworkBuffer
-
-Buffer to store current values of all the variables describing
-the network, in polar formulation. Attributes are:
-
-- `vmag` (length: nbus): voltage magnitude at each bus
-- `vang` (length: nbus): voltage angle at each bus
-- `pnet` (length: nbus): power generation RHS. Equal to `Cg * Pg`
-- `qnet` (length: nbus): power generation RHS. Equal to `Cg * Qg`
-- `pgen`   (length: ngen): active power of generators
-- `qgen`   (length: ngen): reactive power of generators
-- `pload`  (length: nbus): active loads
-- `qload`  (length: nbus): reactive loads
-- `dx`   (length: nstates): cache the difference between two consecutive states (used in power flow resolution)
-- `balance` (length: nstates): cache for current power imbalance (used in power flow resolution)
-- `bus_gen` (length: ngen): generator-bus incidence matrix `Cg`
-
-"""
-struct PolarNetworkState{VI,VT} <: AbstractNetworkBuffer
-    vmag::VT
-    vang::VT
-    pnet::VT
-    qnet::VT
-    pgen::VT
-    qgen::VT
-    pload::VT
-    qload::VT
-    balance::VT
-    dx::VT
-    bus_gen::VI   # Generator-Bus incidence matrix
-end
-
-setvalues!(buf::PolarNetworkState, ::PS.VoltageMagnitude, values) = copyto!(buf.vmag, values)
-setvalues!(buf::PolarNetworkState, ::PS.VoltageAngle, values) = copyto!(buf.vang, values)
-function setvalues!(buf::PolarNetworkState, ::PS.ActivePower, values)
-    pgenbus = view(buf.pnet, buf.bus_gen)
-    pgenbus .= values
-    copyto!(buf.pgen, values)
-end
-function setvalues!(buf::PolarNetworkState, ::PS.ReactivePower, values)
-    qgenbus = view(buf.qnet, buf.bus_gen)
-    qgenbus .= values
-    copyto!(buf.qgen, values)
-end
-function setvalues!(buf::PolarNetworkState, ::PS.ActiveLoad, values)
-    copyto!(buf.pload, values)
-end
-function setvalues!(buf::PolarNetworkState, ::PS.ReactiveLoad, values)
-    copyto!(buf.qload, values)
-end
-
-function Base.iszero(buf::PolarNetworkState)
-    return iszero(buf.pnet) &&
-        iszero(buf.qnet) &&
-        iszero(buf.vmag) &&
-        iszero(buf.vang) &&
-        iszero(buf.pgen) &&
-        iszero(buf.qgen) &&
-        iszero(buf.pload) &&
-        iszero(buf.qload) &&
-        iszero(buf.balance) &&
-        iszero(buf.dx)
-end
-
-voltage(buf::PolarNetworkState) = buf.vmag .* exp.(im .* buf.vang)
-voltage_host(buf) = voltage(buf) |> Array
-
-"Store topology of the network on target device."
-struct NetworkTopology{VTI, VTD}
-    # Bus admittance matrix
-    ybus_re::Spmat{VTI, VTD} # nb x nb
-    ybus_im::Spmat{VTI, VTD} # nb x nb
-    # Branches admittance matrix
-    ## Real part
-    yff_re::VTD # nl
-    yft_re::VTD # nl
-    ytf_re::VTD # nl
-    ytt_re::VTD # nl
-    ## Imag part
-    yff_im::VTD # nl
-    yft_im::VTD # nl
-    ytf_im::VTD # nl
-    ytt_im::VTD # nl
-    # Correspondence
-    f_buses::VTI # nl
-    t_buses::VTI # nl
-    sortperm::VTI # nnz
-end
-
-function NetworkTopology(pf::PS.PowerNetwork, ::Type{VTI}, ::Type{VTD}) where {VTI, VTD}
-    Y = pf.Ybus
-    ybus_re, ybus_im = Spmat{VTI, VTD}(Y)
-    lines = pf.lines
-    yff_re = real.(lines.Yff) |> VTD
-    yft_re = real.(lines.Yft) |> VTD
-    ytf_re = real.(lines.Ytf) |> VTD
-    ytt_re = real.(lines.Ytt) |> VTD
-
-    yff_im = imag.(lines.Yff) |> VTD
-    yft_im = imag.(lines.Yft) |> VTD
-    ytf_im = imag.(lines.Ytf) |> VTD
-    ytt_im = imag.(lines.Ytt) |> VTD
-
-    f = lines.from_buses |> VTI
-    t = lines.to_buses   |> VTI
-    i, j, _ = findnz(Y)
-    sp = sortperm(i) |> VTI
-
-    return NetworkTopology(
-        ybus_re, ybus_im,
-        yff_re, yft_re, ytf_re, ytt_re,
-        yff_im, yft_im, ytf_im, ytt_im,
-        f, t, sp,
-    )
-end
-
-get(net::NetworkTopology, ::PS.BusAdmittanceMatrix) = (net.ybus_re, net.ybus_im)
-
-struct HessianStructure{IT} <: AbstractStructure where {IT}
-    map::IT
-end
-
diff --git a/src/Polar/derivatives.jl b/src/Polar/derivatives.jl
deleted file mode 100644
index c3ee030c..00000000
--- a/src/Polar/derivatives.jl
+++ /dev/null
@@ -1,418 +0,0 @@
-get_map(polar,::State) = polar.mapx
-get_map(polar,::Control) = polar.mapu
-
-"""
-    AutoDiff.Jacobian(polar, func::Function, variable::AbstractVariable)
-
-Instantiate a Jacobian AD factory for constraint function
-`func`, w.r.t. state ``x`` (if `variable=State()`) or control
-``u`` (if `variable=Control()`).
-
-The coloring is done using Jacobian's expressions from MATPOWER.
-
-### Examples
-
-```julia
-julia> Jacx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-```
-"""
-function AutoDiff.Jacobian(
-    polar::PolarForm{T, VI, VT, MT}, func, variable,
-) where {T, VI, VT, MT}
-    @assert is_constraint(func)
-    (SMT, A) = get_jacobian_types(polar.device)
-
-    pf = polar.network
-    nbus = PS.get(pf, PS.NumberOfBuses())
-    map = VI(get_map(polar, variable))
-
-    nmap = length(map)
-
-    # Sparsity pattern
-    J = jacobian_sparsity(polar, func, variable)
-
-    # Coloring
-    coloring = AutoDiff.SparseDiffTools.matrix_colors(J)
-    ncolor = size(unique(coloring),1)
-
-    # TODO: clean
-    nx = 2 * nbus
-    x = VT(zeros(Float64, nx))
-    m = size(J, 1)
-
-    # Move Jacobian to the GPU
-    J = convert(SMT, J)
-
-    # Seedings
-    t1s{N} = ForwardDiff.Dual{Nothing,Float64, N} where N
-    t1sx = A{t1s{ncolor}}(x)
-    t1sF = A{t1s{ncolor}}(zeros(Float64, m))
-    t1sseeds = AutoDiff.init_seed(coloring, ncolor, nmap)
-
-    # Move the seeds over to the device, if necessary
-    gput1sseeds = A{ForwardDiff.Partials{ncolor,Float64}}(t1sseeds)
-    compressedJ = MT(zeros(Float64, ncolor, m))
-
-    # Views
-    varx = view(x, map)
-    t1svarx = view(t1sx, map)
-
-    return AutoDiff.Jacobian{typeof(func), VI, VT, MT, SMT, typeof(gput1sseeds), typeof(t1sx), typeof(varx), typeof(t1svarx), typeof(variable)}(
-        func, J, compressedJ, coloring,
-        gput1sseeds, t1sF, x, t1sx, map, varx, t1svarx
-    )
-end
-
-"""
-    AutoDiff.jacobian!(polar::PolarForm, jac::AutoDiff.Jacobian, buffer)
-
-Update the sparse Jacobian entries `jacobian.J` using AutoDiff.
-No allocations are taking place in this function.
-
-* `polar::PolarForm`: polar formulation, stores all parameters.
-* `jac::AutoDiff.Jacobian`: AutoDiff Factory with Jacobian to update.
-* `buffer::PolarNetworkState`: store current values for network's variables.
-
-"""
-function AutoDiff.jacobian!(
-    polar::PolarForm,
-    jac::AutoDiff.Jacobian{Func, VI, VT, MT, SMT, VP, VD, SubT, SubD, State},
-    buffer
-) where {Func, VI, VT, MT, SMT, VP, VD, SubT, SubD}
-    nbus = get(polar, PS.NumberOfBuses())
-    copyto!(jac.x, 1, buffer.vmag, 1, nbus)
-    copyto!(jac.x, nbus+1, buffer.vang, 1, nbus)
-    jac.t1sx .= jac.x
-    jac.t1sF .= 0.0
-
-    AutoDiff.seed!(jac.t1sseeds, jac.varx, jac.t1svarx, polar.device)
-
-    jac.func(
-        polar,
-        jac.t1sF,
-        view(jac.t1sx, 1:nbus),
-        view(jac.t1sx, nbus+1:2*nbus),
-        buffer.pnet,
-        buffer.qnet,
-        buffer.pload,
-        buffer.qload,
-    )
-
-    AutoDiff.getpartials_kernel!(jac.compressedJ, jac.t1sF, polar.device)
-    AutoDiff.uncompress_kernel!(jac.J, jac.compressedJ, jac.coloring, polar.device)
-    return jac.J
-end
-
-function AutoDiff.jacobian!(
-    polar::PolarForm,
-    jac::AutoDiff.Jacobian{Func, VI, VT, MT, SMT, VP, VD, SubT, SubD, Control},
-    buffer
-) where {Func, VI, VT, MT, SMT, VP, VD, SubT, SubD}
-    nbus = get(polar, PS.NumberOfBuses())
-    copyto!(jac.x, 1, buffer.vmag, 1, nbus)
-    copyto!(jac.x, nbus+1, buffer.pnet, 1, nbus)
-    jac.t1sx .= jac.x
-    jac.t1sF .= 0.0
-
-    AutoDiff.seed!(jac.t1sseeds, jac.varx, jac.t1svarx, polar.device)
-
-    jac.func(
-        polar,
-        jac.t1sF,
-        view(jac.t1sx, 1:nbus),
-        buffer.vang,
-        view(jac.t1sx, nbus+1:2*nbus),
-        buffer.qnet,
-        buffer.pload,
-        buffer.qload,
-    )
-
-    AutoDiff.getpartials_kernel!(jac.compressedJ, jac.t1sF, polar.device)
-    AutoDiff.uncompress_kernel!(jac.J, jac.compressedJ, jac.coloring, polar.device)
-    return jac.J
-end
-
-# Handle properly constant Jacobian case
-function AutoDiff.ConstantJacobian(polar::PolarForm, func::Function, variable::Union{State,Control})
-    @assert is_constraint(func)
-
-    if isa(polar.device, CPU)
-        SMT = SparseMatrixCSC{Float64,Int}
-    elseif isa(polar.device, GPU)
-        SMT = CUSPARSE.CuSparseMatrixCSR{Float64}
-    end
-
-    nbus = get(polar, PS.NumberOfBuses())
-    vmag = ones(nbus)
-    vang = ones(nbus)
-    V = vmag .* exp.(im .* vang)
-    # Evaluate Jacobian with MATPOWER
-    J = matpower_jacobian(polar, variable, func, V)
-    # Move Jacobian to the GPU
-    if isa(polar.device, GPU) && iszero(J)
-        # CUSPARSE does not support zero matrix. Return nothing instead.
-        J = nothing
-    else
-        J = convert(SMT, J)
-    end
-    return AutoDiff.ConstantJacobian(J)
-end
-
-function AutoDiff.jacobian!(polar::PolarForm, jac::AutoDiff.ConstantJacobian, buffer)
-    return jac.J
-end
-
-
-function AutoDiff.Hessian(polar::PolarForm{T, VI, VT, MT}, func; tape=nothing) where {T, VI, VT, MT}
-    @assert is_constraint(func)
-
-    if isa(polar.device, CPU)
-        A = Vector
-    elseif isa(polar.device, GPU)
-        A = CUDA.CuVector
-    end
-
-    pf = polar.network
-    nbus = PS.get(pf, PS.NumberOfBuses())
-    n_cons = size_constraint(polar, func)
-
-    map = VI(polar.hessianstructure.map)
-    nmap = length(map)
-
-    x = VT(zeros(Float64, 3*nbus))
-
-    t1s{N} = ForwardDiff.Dual{Nothing,Float64, N} where N
-    t1sx = A{t1s{1}}(x)
-    t1sF = A{t1s{1}}(zeros(Float64, n_cons))
-    host_t1sseeds = Vector{ForwardDiff.Partials{1,Float64}}(undef, nmap)
-    t1sseeds = A{ForwardDiff.Partials{1,Float64}}(undef, nmap)
-    varx = view(x, map)
-    t1svarx = view(t1sx, map)
-    VHP = typeof(host_t1sseeds)
-    VP = typeof(t1sseeds)
-    VD = typeof(t1sx)
-    adj_t1sx = similar(t1sx)
-    adj_t1sF = similar(t1sF)
-    if isnothing(tape)
-        buffer = AutoDiff.TapeMemory(polar, func, VD; with_stack=false)
-    else
-        buffer = tape
-    end
-    return AutoDiff.Hessian(
-        func, host_t1sseeds, t1sseeds, x, t1sF, adj_t1sF, t1sx, adj_t1sx, map, varx, t1svarx, buffer,
-    )
-end
-
-function _init_seed_hessian!(dest, tmp, v::AbstractArray, nmap)
-    @inbounds for i in 1:nmap
-        dest[i] = ForwardDiff.Partials{1, Float64}(NTuple{1, Float64}(v[i]))
-    end
-    return
-end
-function _init_seed_hessian!(dest, tmp, v::CUDA.CuArray, nmap)
-    hostv = Array(v)
-    @inbounds Threads.@threads for i in 1:nmap
-        tmp[i] = ForwardDiff.Partials{1, Float64}(NTuple{1, Float64}(hostv[i]))
-    end
-    copyto!(dest, tmp)
-    return
-end
-
-function update_hessian!(polar::PolarForm, H::AutoDiff.Hessian, buffer)
-    nbatch = size(H.t1sx, 2)
-    nbus = get(polar, PS.NumberOfBuses())
-
-    # Move data
-    copyto!(H.x,        1, buffer.vmag, 1, nbus)
-    copyto!(H.x,   nbus+1, buffer.vang, 1, nbus)
-    copyto!(H.x, 2*nbus+1, buffer.pnet, 1, nbus)
-    @inbounds for i in 1:nbatch
-        H.t1sx[:, i] .= H.x
-    end
-    return
-end
-
-# λ' * H * v
-function AutoDiff.adj_hessian_prod!(
-    polar, H::AutoDiff.Hessian, hv, buffer, λ, v,
-)
-    @assert length(hv) == length(v)
-    nbus = get(polar, PS.NumberOfBuses())
-    x = H.x
-    ntgt = length(v)
-    t1sx = H.t1sx
-    adj_t1sx = H.∂t1sx
-    t1sF = H.t1sF
-    adj_t1sF = H.∂t1sF
-    # Move data
-    copyto!(x, 1, buffer.vmag, 1, nbus)
-    copyto!(x, nbus+1, buffer.vang, 1, nbus)
-    copyto!(x, 2*nbus+1, buffer.pnet, 1, nbus)
-    # Init dual variables
-    t1sx .= H.x
-    adj_t1sx .= 0.0
-    adj_t1sF .= λ
-    # Seeding
-    nmap = length(H.map)
-
-    # Init seed
-    _init_seed_hessian!(H.t1sseeds, H.host_t1sseeds, v, nmap)
-    AutoDiff.seed!(H.t1sseeds, H.varx, H.t1svarx, polar.device)
-
-    adjoint!(
-        polar, H.buffer,
-        t1sF, adj_t1sF,
-        view(t1sx, 1:nbus), view(adj_t1sx, 1:nbus),                   # vmag
-        view(t1sx, nbus+1:2*nbus), view(adj_t1sx, nbus+1:2*nbus),     # vang
-        view(t1sx, 2*nbus+1:3*nbus), view(adj_t1sx, 2*nbus+1:3*nbus), # pnet
-        buffer.pload, buffer.qload,
-    )
-
-    AutoDiff.getpartials_kernel!(hv, adj_t1sx, H.map, polar.device)
-    return nothing
-end
-
-# Adjoint's structure
-"""
-    AdjointStackObjective{VT}
-
-An object for storing the adjoint stack in the adjoint objective computation.
-
-"""
-struct AdjointStackObjective{VT<:AbstractVector}
-    ∇fₓ::VT
-    ∇fᵤ::VT
-    ∂pg::VT
-    ∂vm::VT
-    ∂va::VT
-    ∂pinj::VT
-    jvₓ::VT
-    jvᵤ::VT
-end
-
-function AdjointStackObjective(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
-    nbus = get(polar, PS.NumberOfBuses())
-    return AdjointStackObjective{VT}(
-        fill!(VT(undef, get(polar, NumberOfState())), zero(T)),
-        fill!(VT(undef, get(polar, NumberOfControl())), zero(T)),
-        fill!(VT(undef, get(polar, PS.NumberOfGenerators())), zero(T)),
-        fill!(VT(undef, nbus), zero(T)),
-        fill!(VT(undef, nbus), zero(T)),
-        fill!(VT(undef, nbus), zero(T)),
-        fill!(VT(undef, get(polar, NumberOfState())), zero(T)),
-        fill!(VT(undef, get(polar, NumberOfControl())), zero(T)),
-    )
-end
-
-# Adjoint's stack for Polar
-struct AdjointPolar{VT} <: AutoDiff.AbstractAdjointStack{VT}
-    ∂vm::VT
-    ∂va::VT
-    ∂pinj::VT
-    ∂qinj::VT
-    ∂x::VT
-    ∂u::VT
-end
-
-function AdjointPolar{VT}(nx::Int, nu::Int, nbus::Int) where {VT}
-    return AdjointPolar{VT}(
-        VT(undef, nbus),
-        VT(undef, nbus),
-        VT(undef, nbus),
-        VT(undef, nbus),
-        VT(undef, nx),
-        VT(undef, nu),
-    )
-end
-
-function reset!(adj::AdjointPolar)
-    adj.∂vm .= 0.0
-    adj.∂va .= 0.0
-    adj.∂pinj .= 0.0
-    adj.∂x .= 0.0
-    adj.∂u .= 0.0
-end
-
-# Stack constructor for each constraint
-function AdjointPolar(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
-    nbus = get(polar, PS.NumberOfBuses())
-    nx = get(polar, NumberOfState())
-    nu = get(polar, NumberOfControl())
-    return AdjointPolar{VT}(nx, nu, nbus)
-end
-
-
-struct FullSpaceJacobian{Jacx,Jacu}
-    x::Jacx
-    u::Jacu
-end
-
-struct FullSpaceHessian{SpMT}
-    xx::SpMT
-    xu::SpMT
-    uu::SpMT
-end
-
-#=
-    JACOBIAN
-=#
-struct ConstraintsJacobianStorage{SpMT}
-    Jx::SpMT
-    Ju::SpMT
-    constraints_ad::Vector{FullSpaceJacobian}
-end
-
-function ConstraintsJacobianStorage(polar::PolarForm{T, VI, VT, MT}, constraints::Vector{Function}) where {T, VI, VT, MT}
-    if isa(polar.device, CPU)
-        SpMT = SparseMatrixCSC{Float64, Int}
-    elseif isa(polar.device, GPU)
-        SpMT = CUSPARSE.CuSparseMatrixCSR{Float64}
-    end
-
-    SparseCPU = SparseMatrixCSC{Float64, Int}
-
-    Jx = SparseCPU[]
-    Ju = SparseCPU[]
-    # Build global Jacobian on the CPU
-    for cons in constraints
-        push!(Jx, jacobian_sparsity(polar, cons, State()))
-        push!(Ju, jacobian_sparsity(polar, cons, Control()))
-    end
-    gJx = convert(SpMT, vcat(Jx...))
-    gJu = convert(SpMT, vcat(Ju...))
-
-    # Build AD
-    cons_ad = FullSpaceJacobian[]
-    for cons in constraints
-        jac_ad_x = _build_jacobian(polar, cons, State())
-        jac_ad_u = _build_jacobian(polar, cons, Control())
-        push!(cons_ad, FullSpaceJacobian(jac_ad_x, jac_ad_u))
-    end
-
-    return ConstraintsJacobianStorage{SpMT}(
-        gJx,
-        gJu,
-        cons_ad,
-    )
-end
-
-function update_full_jacobian!(
-    polar::PolarForm,
-    cons_jac::ConstraintsJacobianStorage{SpMT},
-    buffer::PolarNetworkState
-) where {SpMT}
-    shift = 0
-    for ad in cons_jac.constraints_ad
-        # Update Jacobian
-        Jx = AutoDiff.jacobian!(polar, ad.x, buffer)::SpMT
-        Ju = AutoDiff.jacobian!(polar, ad.u, buffer)::Union{Nothing, SpMT}
-        # Copy back results
-        _transfer_sparse!(cons_jac.Jx, Jx, shift, polar.device)
-        if !isnothing(Ju)
-            _transfer_sparse!(cons_jac.Ju, Ju, shift, polar.device)
-        end
-
-        shift += size(Jx, 1)
-    end
-    return
-end
diff --git a/src/Polar/first_order.jl b/src/Polar/first_order.jl
index 48cb48f2..6bdf2cf7 100644
--- a/src/Polar/first_order.jl
+++ b/src/Polar/first_order.jl
@@ -11,24 +11,13 @@ struct MyJacobian{Model, Func, VD, SMT, MT, VI, VP}
     J::SMT
 end
 
-Base.size(jac::MyJacobian, n::Int) = size(jac.J, n)
-
-# Ordering: [vmag, vang, pgen]
-
-function my_map(polar::PolarForm, ::State)
-    nbus = get(polar, PS.NumberOfBuses())
-    ref, pv, pq = index_buses_host(polar)
-    return Int[nbus .+ pv; nbus .+ pq; pq]
-end
-function my_map(polar::PolarForm, ::Control)
-    nbus = get(polar, PS.NumberOfBuses())
-    ref, pv, pq = index_buses_host(polar)
-    pv2gen = polar.network.pv2gen
-    return Int[ref; pv; 2*nbus .+ pv2gen]
+function Base.show(io::IO, jacobian::MyJacobian)  # text summary of a `MyJacobian`, written to `io`
+    println(io, "A AutoDiff Jacobian for $(typeof(jacobian.func))")
+    ncolor = size(jacobian.compressedJ, 1)  # first dimension of `compressedJ`, reported as the color count
+    print(io, "Number of Jacobian colors: ", ncolor)
 end
 
-number(polar::PolarForm, ::State) = get(polar, NumberOfState())
-number(polar::PolarForm, ::Control) = get(polar, NumberOfControl())
+Base.size(jac::MyJacobian, n::Int) = size(jac.J, n)  # size along dimension `n` of the wrapped matrix `jac.J`
 
 # Coloring
 function jacobian_sparsity(polar::PolarForm, func::AbstractExpression)
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 4e956c59..6473a327 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -73,6 +73,7 @@ function init!(polar::PolarForm, stack::NetworkStack)
 end
 
 voltage(buf::NetworkStack) = buf.vmag .* exp.(im .* buf.vang)
+voltage_host(buf::NetworkStack) = Array(voltage(buf))  # `voltage(buf)` converted to a plain `Array`
 
 
 #=
@@ -123,6 +124,33 @@ end
 Base.length(func::PolarBasis) = func.nbus + 2 * func.nlines
 
 # update basis
+@kernel function basis_kernel!(  # fills `cons` with per-line cos/sin products and per-bus squared magnitudes
+    cons, @Const(vmag), @Const(vang), @Const(f), @Const(t), nlines, nbus,
+)
+    i, j = @index(Global, NTuple)  # i: output row, j: column shared by `cons`, `vmag`, `vang`
+
+    @inbounds begin
+        if i <= nlines  # rows 1:nlines -> vmag[fr] * vmag[to] * cos(Δθ)
+            ℓ = i
+            fr_bus = f[ℓ]  # "from" bus of line ℓ
+            to_bus = t[ℓ]  # "to" bus of line ℓ
+            Δθ = vang[fr_bus, j] - vang[to_bus, j]
+            cosθ = cos(Δθ)
+            cons[i, j] = vmag[fr_bus, j] * vmag[to_bus, j] * cosθ
+        elseif i <= 2 * nlines  # rows nlines+1:2nlines -> vmag[fr] * vmag[to] * sin(Δθ)
+            ℓ = i - nlines
+            fr_bus = f[ℓ]
+            to_bus = t[ℓ]
+            Δθ = vang[fr_bus, j] - vang[to_bus, j]
+            sinθ = sin(Δθ)
+            cons[i, j] = vmag[fr_bus, j] * vmag[to_bus, j] * sinθ
+        elseif i <= 2 * nlines + nbus  # rows 2nlines+1:2nlines+nbus -> vmag[b]^2
+            b = i - 2 * nlines
+            cons[i, j] = vmag[b, j] * vmag[b, j]
+        end  # rows beyond 2 * nlines + nbus are left untouched
+    end
+end
+
 function (func::PolarBasis)(output, stack::NetworkStack)
     ev = basis_kernel!(func.device)(
         output, stack.vmag, stack.vang,
@@ -133,6 +161,39 @@ function (func::PolarBasis)(output, stack::NetworkStack)
     return
 end
 
+@kernel function adj_basis_kernel!(  # reverse-mode adjoint of `basis_kernel!`
+    ∂cons, adj_vmag, adj_vmag_fr, adj_vmag_to,
+    adj_vang_fr, adj_vang_to,
+    @Const(vmag), @Const(vang), @Const(f), @Const(t), nlines, nbus,
+)
+    i, j = @index(Global, NTuple)
+    # Rows 1:nlines each handle one line; the next nbus rows each handle one bus.
+    @inbounds begin
+        if i <= nlines
+            ℓ = i
+            fr_bus = f[ℓ]
+            to_bus = t[ℓ]
+            Δθ = vang[fr_bus, j] - vang[to_bus, j]
+            cosθ = cos(Δθ)
+            sinθ = sin(Δθ)
+            # Angle adjoints per edge. NOTE(review): edge buffers drop `j`; confirm single-column use.
+            adj_vang_fr[i] += -vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
+            adj_vang_fr[i] +=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
+            adj_vang_to[i] +=  vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
+            adj_vang_to[i] -=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
+            # Magnitude adjoint at the "from" end: cos row (ℓ) plus sin row (ℓ + nlines).
+            adj_vmag_fr[i] +=  vmag[to_bus, j] * cosθ * ∂cons[ℓ, j]
+            adj_vmag_fr[i] += vmag[to_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
+            # Magnitude adjoint at the "to" end: cos row (ℓ) plus sin row (ℓ + nlines).
+            adj_vmag_to[i] +=  vmag[fr_bus, j] * cosθ * ∂cons[ℓ, j]
+            adj_vmag_to[i] += vmag[fr_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
+        elseif i <= nlines + nbus  # was a bare `else`: now skips work-items past the last bus
+            b = i - nlines
+            adj_vmag[b, j] += 2.0 * vmag[b, j] * ∂cons[b+2*nlines, j]  # d(vmag[b]^2)/d vmag[b]
+        end
+    end
+end
+
 function adjoint!(func::PolarBasis, ∂state::NetworkStack, state::NetworkStack, ∂v)
     nl = func.nlines
     nb = func.nbus
@@ -196,7 +257,7 @@ function CostFunction(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
     M = -M_tot[ref, :] |> SMT
 
     # coefficients
-    coefs = polar.costs_coefficients
+    coefs = PS.get_costs_coefficients(polar.network)
     c0 = @view coefs[:, 2]
     c1 = @view coefs[:, 3]
     c2 = @view coefs[:, 4]
@@ -221,6 +282,29 @@ function adjoint!(func::CostFunction, ∂state, state, ∂v)
 end
 
 
+@doc raw"""
+    PowerFlowBalance
+
+Subset of the power injection in the network
+corresponding to ``(p_{inj}^{pv}, p_{inj}^{pq}, q_{inj}^{pq})``.
+They are associated with the function
+
+```math
+g(x, u) = 0 .
+```
+introduced in the documentation.
+
+In detail, the function encodes the active balance equations at
+PV and PQ nodes, and the reactive balance equations at PQ nodes:
+```math
+\begin{aligned}
+    p_i &= v_i \sum_{j}^{n} v_j (g_{ij}\cos{(\theta_i - \theta_j)} + b_{ij}\sin{(\theta_i - \theta_j)}) \,, &
+    ∀ i ∈ \{PV, PQ\} \\
+    q_i &= v_i \sum_{j}^{n} v_j (g_{ij}\sin{(\theta_i - \theta_j)} - b_{ij}\cos{(\theta_i - \theta_j)}) \,. &
+    ∀ i ∈ \{PQ\}
+\end{aligned}
+```
+"""
 struct PowerFlowBalance{VT, MT} <: AbstractExpression
     M::MT
     Cg::MT
@@ -272,6 +356,19 @@ function adjoint!(func::PowerFlowBalance, ∂state, state, ∂v)
 end
 
 
+"""
+    VoltageMagnitudePQ
+
+Bounds the voltage magnitudes at PQ nodes:
+```math
+v_{pq}^♭ ≤ v_{pq} ≤ v_{pq}^♯ .
+```
+
+## Note
+The constraints on the voltage magnitudes at PV nodes ``v_{pv}``
+are taken into account when bounding the control ``u``.
+
+"""
 struct VoltageMagnitudePQ <: AbstractExpression
     pq::Vector{Int}
 
@@ -293,7 +390,17 @@ function adjoint!(func::VoltageMagnitudePQ, ∂state, state, ∂v)
     ∂state.vmag[func.pq] .+= ∂v
 end
 
+"""
+    PowerGenerationBounds
 
+Constraints on the **active power production**
+and on the **reactive power production** at the generators
+that are not already taken into account in the bound constraints.
+```math
+p_g^♭ ≤ p_g ≤ p_g^♯  ;
+q_g^♭ ≤ q_g ≤ q_g^♯  .
+```
+"""
 struct PowerGenerationBounds{VT, MT} <: AbstractExpression
     M::MT
     τ::VT
@@ -347,6 +454,12 @@ function adjoint!(func::PowerGenerationBounds, ∂state, state, ∂v)
 end
 
 
+"""
+    LineFlows
+
+Thermal limit constraints applying to the lines of the network.
+
+"""
 struct LineFlows{VT, MT} <: AbstractExpression
     nlines::Int
     Lfp::MT
diff --git a/src/Polar/kernels.jl b/src/Polar/kernels.jl
deleted file mode 100644
index 6a52e528..00000000
--- a/src/Polar/kernels.jl
+++ /dev/null
@@ -1,817 +0,0 @@
-import KernelAbstractions: @index
-
-# Implement kernels for polar formulation
-
-"""
-    function residual_kernel!(F, vmag, vang,
-                              colptr, rowval,
-                              ybus_re_nzval, ybus_im_nzval,
-                              pnet, qload, pv, pq, nbus)
-
-The residual CPU/GPU kernel of the powerflow residual.
-"""
-KA.@kernel function residual_kernel!(
-    F, @Const(vmag), @Const(vang),
-    @Const(colptr), @Const(rowval),
-    @Const(ybus_re_nzval), @Const(ybus_im_nzval), @Const(transperm),
-    @Const(pnet), @Const(pload), @Const(qload), @Const(pv), @Const(pq), nbus
-)
-
-    npv = size(pv, 1)
-    npq = size(pq, 1)
-
-    i, j = @index(Global, NTuple)
-    # REAL PV: 1:npv
-    # REAL PQ: (npv+1:npv+npq)
-    # IMAG PQ: (npv+npq+1:npv+2npq)
-    fr = (i <= npv) ? pv[i] : pq[i - npv]
-    F[i, j] = -(pnet[fr, j] - pload[fr, j])
-    if i > npv
-        F[i + npq, j] = qload[fr, j]
-    end
-    @inbounds for c in colptr[fr]:colptr[fr+1]-1
-        to = rowval[c]
-        aij = vang[fr, j] - vang[to, j]
-        # f_re = a * cos + b * sin
-        # f_im = a * sin - b * cos
-        coef_cos = vmag[fr, j]*vmag[to, j]*ybus_re_nzval[transperm[c]]
-        coef_sin = vmag[fr, j]*vmag[to, j]*ybus_im_nzval[transperm[c]]
-        cos_val = cos(aij)
-        sin_val = sin(aij)
-        F[i, j] += coef_cos * cos_val + coef_sin * sin_val
-        if i > npv
-            F[npq + i, j] += coef_cos * sin_val - coef_sin * cos_val
-        end
-    end
-end
-
-"""
-    function adj_residual_edge_kernel!(
-        F, adj_F, vmag, adj_vm, vang, adj_va,
-        colptr, rowval,
-        ybus_re_nzval, ybus_im_nzval,
-        edge_vm_from, edge_vm_to,
-        edge_va_from, edge_va_to,
-        pnet, adj_pnet, qnet, pv, pq
-    )
-
-This kernel computes the adjoint of the voltage magnitude `adj_vm`
-and `adj_va` with respect to the residual `F` and the adjoint `adj_F`.
-
-To avoid a race condition, each thread sums its contribution on the edge of the network graph.
-"""
-KA.@kernel function adj_residual_edge_kernel!(
-    F, @Const(adj_F), @Const(vmag), adj_vm, vang, adj_va,
-    @Const(colptr), @Const(rowval),
-    @Const(ybus_re_nzval), @Const(ybus_im_nzval), @Const(transperm),
-    edge_vm_from, edge_vm_to,
-    edge_va_from, edge_va_to,
-    @Const(pnet), adj_pnet, @Const(pload), @Const(qload), @Const(pv), @Const(pq)
-)
-
-    npv = size(pv, 1)
-    npq = size(pq, 1)
-
-    i, j = @index(Global, NTuple)
-    # REAL PV: 1:npv
-    # REAL PQ: (npv+1:npv+npq)
-    # IMAG PQ: (npv+npq+1:npv+2npq)
-    fr = (i <= npv) ? pv[i] : pq[i - npv]
-    F[i, j] = -(pnet[fr] - pload[fr])
-    if i > npv
-        F[i + npq, j] = qload[fr]
-    end
-    @inbounds for c in colptr[fr]:colptr[fr+1]-1
-        # Forward loop
-        to = rowval[c]
-        aij = vang[fr, j] - vang[to, j]
-
-        yre = ybus_re_nzval[transperm[c]]
-        yim = ybus_im_nzval[transperm[c]]
-        # f_re = a * cos + b * sin
-        # f_im = a * sin - b * cos
-        coef_cos = vmag[fr, j]*vmag[to, j]*yre
-        coef_sin = vmag[fr, j]*vmag[to, j]*yim
-
-        cos_val = cos(aij)
-        sin_val = sin(aij)
-        F[i, j] += coef_cos * cos_val + coef_sin * sin_val
-        if i > npv
-            F[npq + i, j] += coef_cos * sin_val - coef_sin * cos_val
-        end
-
-        adj_coef_cos =  cos_val  * adj_F[i]
-        adj_coef_sin =  sin_val  * adj_F[i]
-        adj_cos_val  =  coef_cos * adj_F[i]
-        adj_sin_val  =  coef_sin * adj_F[i]
-
-        if i > npv
-            adj_coef_cos +=  sin_val  * adj_F[npq + i]
-            adj_coef_sin += -cos_val  * adj_F[npq + i]
-            adj_cos_val  += -coef_sin * adj_F[npq + i]
-            adj_sin_val  +=  coef_cos * adj_F[npq + i]
-        end
-
-        adj_aij =   cos_val*adj_sin_val
-        adj_aij += -sin_val*adj_cos_val
-
-        edge_vm_from[c, j] += vmag[to, j]*yim*adj_coef_sin
-        edge_vm_to[c, j]   += vmag[fr, j]*yim*adj_coef_sin
-        edge_vm_from[c, j] += vmag[to, j]*yre*adj_coef_cos
-        edge_vm_to[c, j]   += vmag[fr, j]*yre*adj_coef_cos
-
-        edge_va_from[c, j] += adj_aij
-        edge_va_to[c, j]   -= adj_aij
-    end
-    # qnet is not active
-    # if i > npv
-    #     adj_qnet[fr] -= adj_F[i + npq]
-    # end
-    adj_pnet[fr, j] -= adj_F[i]
-end
-
-"""
-    function cpu_adj_node_kernel!(F, adj_F, vmag, adj_vm, vang, adj_va,
-                                  colptr, rowval,
-                                  edge_vm_from, edge_vm_to,
-                                  edge_va_from, edge_va_to
-    )
-
-This kernel accumulates the adjoint of the voltage magnitude `adj_vm`
-and `adj_va` from the edges of the graph stored as CSC matrices.
-"""
-function cpu_adj_node_kernel!(
-    adj_vm, adj_va,
-    colptr, rowval,
-    edge_vm_from, edge_vm_to,
-    edge_va_from, edge_va_to, dest,
-)
-    for i in 1:size(adj_vm, 1), j in 1:size(adj_vm, 2)
-        @inbounds for c in colptr[i]:colptr[i+1]-1
-            adj_vm[i, j] += edge_vm_from[c, j]
-            adj_vm[i, j] += edge_vm_to[dest[c], j]
-            adj_va[i, j] += edge_va_from[c, j]
-            adj_va[i, j] += edge_va_to[dest[c], j]
-        end
-    end
-end
-
-"""
-    function gpu_adj_node_kernel!(adj_vm, adj_va,
-                                  colptr, rowval,
-                                  edge_vm_from, edge_va_from,
-                                  edge_vm_to, edge_va_to, perm,
-    )
-
-This kernel accumulates the adjoint of the voltage magnitude `adj_vm`
-and `adj_va` from the edges of the graph. For the `to` edges a COO matrix
-was used to compute the transposed of the graph to add them to the `from` edges.
-The permutation corresponding to the transpose operation is stored inplace,
-in vector `perm`.
-
-"""
-KA.@kernel function gpu_adj_node_kernel!(
-    adj_vm, adj_va,
-    @Const(colptr), @Const(rowval),
-    @Const(edge_vm_from), @Const(edge_vm_to),
-    @Const(edge_va_from), @Const(edge_va_to), @Const(dest)
-)
-    i, j = @index(Global, NTuple)
-    @inbounds for c in colptr[i]:colptr[i+1]-1
-        to = dest[c]
-        adj_vm[i, j] += edge_vm_from[c, j]
-        adj_vm[i, j] += edge_vm_to[to, j]
-        adj_va[i, j] += edge_va_from[c, j]
-        adj_va[i, j] += edge_va_to[to, j]
-    end
-end
-
-"""
-    function adj_residual_polar!(
-        F, adj_F, vmag, adj_vm, vang, adj_va,
-        ybus_re, ybus_im,
-        pnet, adj_pnet, qnet,
-        edge_vm_from, edge_vm_to, edge_va_from, edge_va_to,
-        pv, pq, nbus
-    ) where {T}
-
-This is the wrapper of the adjoint kernel that computes the adjoint of
-the voltage magnitude `adj_vm` and `adj_va` with respect to the residual `F`
-and the adjoint `adj_F`.
-"""
-function adj_residual_polar!(
-    F, adj_F, vmag, adj_vm, vang, adj_va,
-    ybus_re, ybus_im, transpose_perm,
-    pnet, adj_pnet, pload, qload,
-    edge_vm_from, edge_vm_to, edge_va_from, edge_va_to,
-    pv, pq, nbus, device
-)
-    npv = length(pv)
-    npq = length(pq)
-    nvbus = size(vmag, 1)
-    nnz = length(ybus_re.nzval)
-    colptr = ybus_re.colptr
-    rowval = ybus_re.rowval
-
-    kernel_edge! = adj_residual_edge_kernel!(device)
-    ev = kernel_edge!(F, adj_F, vmag, adj_vm, vang, adj_va,
-                 ybus_re.colptr, ybus_re.rowval,
-                 ybus_re.nzval, ybus_im.nzval, transpose_perm,
-                 edge_vm_from, edge_vm_to,
-                 edge_va_from, edge_va_to,
-                 pnet, adj_pnet, pload, qload, pv, pq,
-                 ndrange=(npv+npq, size(F, 2)),
-                 dependencies = Event(device)
-    )
-    wait(ev)
-
-    # The permutation corresponding to the transpose of Ybus.
-    # is given in transpose_perm
-    if isa(device, CPU)
-        cpu_adj_node_kernel!(
-            adj_vm, adj_va,
-            ybus_re.colptr, ybus_re.rowval,
-            edge_vm_from, edge_vm_to,
-            edge_va_from, edge_va_to, transpose_perm,
-        )
-    else
-        ev = gpu_adj_node_kernel!(device)(
-            adj_vm, adj_va,
-            ybus_re.colptr, ybus_re.rowval,
-            edge_vm_from, edge_vm_to,
-            edge_va_from, edge_va_to, transpose_perm,
-            ndrange=(nvbus, size(adj_vm, 2)),
-            dependencies = Event(device)
-        )
-        wait(ev)
-    end
-end
-
-@inline function bus_injection(
-    bus, j, vmag, vang, ybus_re_colptr, ybus_re_rowval, ybus_re_nzval, ybus_im_nzval, transperm,
-)
-    inj = 0.0
-    @inbounds for c in ybus_re_colptr[bus]:ybus_re_colptr[bus+1]-1
-        to = ybus_re_rowval[c]
-        aij = vang[bus, j] - vang[to, j]
-        coef_cos = vmag[bus, j]*vmag[to, j]*ybus_re_nzval[transperm[c]]
-        coef_sin = vmag[bus, j]*vmag[to, j]*ybus_im_nzval[transperm[c]]
-        cos_val = cos(aij)
-        sin_val = sin(aij)
-        inj += coef_cos * cos_val + coef_sin * sin_val
-    end
-    return inj
-end
-
-@inline function adjoint_bus_injection!(
-    fr, j, adj_inj, adj_vmag, adj_vang, vmag, vang,
-    ybus_re_colptr, ybus_re_rowval, ybus_re_nzval, ybus_im_nzval, transperm,
-)
-    @inbounds for c in ybus_re_colptr[fr]:ybus_re_colptr[fr+1]-1
-        to = ybus_re_rowval[c]
-        aij = vang[fr, j] - vang[to, j]
-        # f_re = a * cos + b * sin
-        # f_im = a * sin - b * cos
-        yre = ybus_re_nzval[transperm[c]]
-        yim = ybus_im_nzval[transperm[c]]
-        coef_cos = vmag[fr, j]*vmag[to, j]*yre
-        coef_sin = vmag[fr, j]*vmag[to, j]*yim
-        cosθ = cos(aij)
-        sinθ = sin(aij)
-
-        adj_coef_cos = cosθ  * adj_inj
-        adj_cos_val  = coef_cos * adj_inj
-        adj_coef_sin = sinθ  * adj_inj
-        adj_sin_val  = coef_sin * adj_inj
-
-        adj_aij =   cosθ * adj_sin_val
-        adj_aij -=  sinθ * adj_cos_val
-
-        adj_vmag[fr, j] += vmag[to, j] * yre * adj_coef_cos
-        adj_vmag[to, j] += vmag[fr, j] * yre * adj_coef_cos
-        adj_vmag[fr, j] += vmag[to, j] * yim * adj_coef_sin
-        adj_vmag[to, j] += vmag[fr, j] * yim * adj_coef_sin
-
-        adj_vang[fr, j] += adj_aij
-        adj_vang[to, j] -= adj_aij
-    end
-end
-
-KA.@kernel function transfer_kernel!(
-    vmag, vang, pnet, qnet, @Const(u), @Const(pv), @Const(pq), @Const(ref), @Const(pload), @Const(qload)
-)
-    i, j = @index(Global, NTuple)
-    npv = length(pv)
-    npq = length(pq)
-    nref = length(ref)
-
-    # PV bus
-    if i <= npv
-        bus = pv[i]
-        vmag[bus, j] = u[nref + i, j]
-        pnet[bus, j] = u[nref + npv + i, j]
-    # REF bus
-    else
-        i_ref = i - npv
-        bus = ref[i_ref]
-        vmag[bus, j] = u[i_ref, j]
-    end
-end
-
-# Transfer values in (x, u) to buffer
-function transfer!(polar::PolarForm, buffer::PolarNetworkState, u)
-    kernel! = transfer_kernel!(polar.device)
-    nbus = length(buffer.vmag)
-    pv = polar.indexing.index_pv
-    pq = polar.indexing.index_pq
-    ref = polar.indexing.index_ref
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    ndrange = (length(pv)+length(ref), size(u, 2))
-    ev = kernel!(
-        buffer.vmag, buffer.vang, buffer.pnet, buffer.qnet,
-        u,
-        pv, pq, ref,
-        buffer.pload, buffer.qload,
-        ndrange=ndrange,
-        dependencies = Event(polar.device)
-    )
-    wait(ev)
-end
-
-KA.@kernel function adj_transfer_kernel!(
-    adj_u, adj_x, @Const(adj_vmag), @Const(adj_vang), @Const(adj_pnet), @Const(pv), @Const(pq), @Const(ref),
-)
-    i, j = @index(Global, NTuple)
-    npv = length(pv)
-    npq = length(pq)
-    nref = length(ref)
-
-    # PQ buses
-    if i <= npq
-        bus = pq[i]
-        adj_x[npv+i, j] =  adj_vang[bus, j]
-        adj_x[npv+npq+i, j] = adj_vmag[bus, j]
-    # PV buses
-    elseif i <= npq + npv
-        i_ = i - npq
-        bus = pv[i_]
-        adj_u[nref + i_, j] = adj_vmag[bus, j]
-        adj_u[nref + npv + i_, j] = adj_pnet[bus, j]
-        adj_x[i_, j] = adj_vang[bus, j]
-    # SLACK buses
-    elseif i <= npq + npv + nref
-        i_ = i - npq - npv
-        bus = ref[i_]
-        adj_u[i_, j] = adj_vmag[bus, j]
-    end
-end
-
-# Transfer values in (x, u) to buffer
-function adjoint_transfer!(
-    polar::PolarForm,
-    ∂u, ∂x,
-    ∂vmag, ∂vang, ∂pnet,
-)
-    nbus = get(polar, PS.NumberOfBuses())
-    pv = polar.indexing.index_pv
-    pq = polar.indexing.index_pq
-    ref = polar.indexing.index_ref
-    ev = adj_transfer_kernel!(polar.device)(
-        ∂u, ∂x,
-        ∂vmag, ∂vang, ∂pnet,
-        pv, pq, ref;
-        ndrange=(nbus, size(∂u, 2)),
-        dependencies=Event(polar.device)
-    )
-    wait(ev)
-end
-
-KA.@kernel function _reverse_transfer_kernel2!(
-    dest, @Const(src), @Const(map),
-)
-    i, j = @index(Global, NTuple)
-    dest[i, j] = src[map[i], j]
-end
-
-function reverse_transfer!(
-    polar::PolarForm{T, VI, VT, MT},
-    output, ∂state,
-) where {T, VI, VT, MT}
-    nx = get(polar, ExaPF.NumberOfState())
-    nu = get(polar, ExaPF.NumberOfControl())
-    map = [my_map(polar, State()); my_map(polar, Control())] |> VI
-    ev = _reverse_transfer_kernel2!(polar.device)(
-        output, ∂state.input, map,
-        ndrange=(nx+nu, size(output, 2)),
-        dependencies=Event(polar.device)
-    )
-    wait(ev)
-end
-
-KA.@kernel function active_power_slack!(
-    cons, vmag, vang, ref, pd,
-    @Const(ybus_re_nzval), @Const(ybus_re_colptr), @Const(ybus_re_rowval), @Const(ybus_im_nzval),
-    @Const(transperm),
-)
-    i = @index(Global, Linear)
-    bus = ref[i]
-    inj = 0.0
-    @inbounds for c in ybus_re_colptr[bus]:ybus_re_colptr[bus+1]-1
-        to = ybus_re_rowval[c]
-        aij = vang[bus] - vang[to]
-        # f_re = a * cos + b * sin
-        # f_im = a * sin - b * cos
-        coef_cos = vmag[bus]*vmag[to]*ybus_re_nzval[transperm[c]]
-        coef_sin = vmag[bus]*vmag[to]*ybus_im_nzval[transperm[c]]
-        cos_val = cos(aij)
-        sin_val = sin(aij)
-        inj += coef_cos * cos_val + coef_sin * sin_val
-    end
-    cons[i] = inj + pd[bus]
-end
-
-KA.@kernel function adj_active_power_slack!(
-    v_m, v_a, adj_v_m, adj_v_a, adj_P, ref,
-    @Const(ybus_re_nzval), @Const(ybus_re_colptr), @Const(ybus_re_rowval), @Const(ybus_im_nzval),
-    @Const(transperm),
-)
-    i = @index(Global, Linear)
-    fr = ref[i]
-    @inbounds for c in ybus_re_colptr[fr]:ybus_re_colptr[fr+1]-1
-        to = ybus_re_rowval[c]
-        aij = v_a[fr] - v_a[to]
-        cosθ = cos(aij)
-        sinθ = sin(aij)
-
-        yre = ybus_re_nzval[transperm[c]]
-        yim = ybus_im_nzval[transperm[c]]
-
-        cθ = yre*cosθ
-        sθ = yim*sinθ
-        adj_v_m[fr] += v_m[to] * (cθ + sθ) * adj_P[i]
-        adj_v_m[to] += v_m[fr] * (cθ + sθ) * adj_P[i]
-
-        adj_aij = -(v_m[fr]*v_m[to]*(yre*sinθ))
-        adj_aij += v_m[fr]*v_m[to]*(yim*cosθ)
-        adj_aij *= adj_P[i]
-        adj_v_a[to] += -adj_aij
-        adj_v_a[fr] += adj_aij
-    end
-end
-
-KA.@kernel function reactive_power_kernel!(
-    qg, @Const(vmag), @Const(vang), @Const(pnet),
-    @Const(pv), @Const(ref), @Const(pv_to_gen), @Const(ref_to_gen),
-    @Const(ybus_re_nzval), @Const(ybus_re_colptr), @Const(ybus_re_rowval),
-    @Const(ybus_im_nzval), @Const(transperm), @Const(qload)
-)
-    i, j = @index(Global, NTuple)
-    npv = length(pv)
-    nref = length(ref)
-    # Evaluate reactive power at PV nodes
-    if i <= npv
-        bus = pv[i]
-        i_gen = pv_to_gen[i]
-    # Evaluate reactive power at slack nodes
-    elseif i <= npv + nref
-        i_ = i - npv
-        bus = ref[i_]
-        i_gen = ref_to_gen[i_]
-    end
-    inj = 0.0
-    @inbounds for c in ybus_re_colptr[bus]:ybus_re_colptr[bus+1]-1
-        to = ybus_re_rowval[c]
-        aij = vang[bus, j] - vang[to, j]
-        # f_re = a * cos + b * sin
-        # f_im = a * sin - b * cos
-        coef_cos = vmag[bus, j]*vmag[to, j]*ybus_re_nzval[transperm[c]]
-        coef_sin = vmag[bus, j]*vmag[to, j]*ybus_im_nzval[transperm[c]]
-        cos_val = cos(aij)
-        sin_val = sin(aij)
-        inj += coef_cos * sin_val - coef_sin * cos_val
-    end
-    qg[i_gen, j] = inj + qload[bus]
-end
-
-KA.@kernel function adj_reactive_power_edge_kernel!(
-    qg, adj_qg,
-    @Const(vmag), adj_vmag, @Const(vang), adj_vang,
-    @Const(pnet), adj_pnet,
-    @Const(pv), @Const(ref), @Const(pv_to_gen), @Const(ref_to_gen),
-    edge_vmag_bus, edge_vmag_to,
-    edge_vang_bus, edge_vang_to,
-    @Const(ybus_re_nzval), @Const(ybus_re_colptr), @Const(ybus_re_rowval),
-    @Const(ybus_im_nzval), @Const(transperm), @Const(qload)
-)
-    i, j = @index(Global, NTuple)
-    npv = length(pv)
-    nref = length(ref)
-    # Evaluate reactive power at PV nodes
-    if i <= npv
-        bus = pv[i]
-        i_gen = pv_to_gen[i]
-    # Evaluate reactive power at slack nodes
-    elseif i <= npv + nref
-        i_ = i - npv
-        bus = ref[i_]
-        i_gen = ref_to_gen[i_]
-    end
-    inj = 0.0
-    @inbounds for c in ybus_re_colptr[bus]:ybus_re_colptr[bus+1]-1
-        to = ybus_re_rowval[c]
-        aij = vang[bus, j] - vang[to, j]
-        # f_re = a * cos + b * sin
-        # f_im = a * sin - b * cos
-        coef_cos = vmag[bus, j]*vmag[to, j]*ybus_re_nzval[transperm[c]]
-        coef_sin = vmag[bus, j]*vmag[to, j]*ybus_im_nzval[transperm[c]]
-        cos_val = cos(aij)
-        sin_val = sin(aij)
-        inj += coef_cos * sin_val - coef_sin * cos_val
-    end
-    qg[i_gen, j] = inj + qload[bus]
-
-    # Reverse run
-    adj_inj = adj_qg[i_gen, j]
-    @inbounds for c in ybus_re_colptr[bus]:ybus_re_colptr[bus+1]-1
-        to = ybus_re_rowval[c]
-        aij = vang[bus, j] - vang[to, j]
-        # f_re = a * cos + b * sin
-        # f_im = a * sin - b * cos
-        yre = ybus_re_nzval[transperm[c]]
-        yim = ybus_im_nzval[transperm[c]]
-        coef_cos = vmag[bus, j]*vmag[to, j]*yre
-        coef_sin = vmag[bus, j]*vmag[to, j]*yim
-        cos_val = cos(aij)
-        sin_val = sin(aij)
-
-        adj_coef_cos = sin_val  * adj_inj
-        adj_sin_val  = coef_cos * adj_inj
-        adj_coef_sin = -cos_val  * adj_inj
-        adj_cos_val  = -coef_sin * adj_inj
-
-        adj_aij =   coef_cos * cos_val * adj_inj
-        adj_aij +=  coef_sin * sin_val * adj_inj
-
-        edge_vmag_bus[c, j] += vmag[to, j] *yre*adj_coef_cos
-        edge_vmag_to[c, j]  += vmag[bus, j]*yre*adj_coef_cos
-        edge_vmag_bus[c, j] += vmag[to, j] *yim*adj_coef_sin
-        edge_vmag_to[c, j]  += vmag[bus, j]*yim*adj_coef_sin
-
-        edge_vang_bus[c, j] += adj_aij
-        edge_vang_to[c, j]  -= adj_aij
-    end
-end
-
-function adj_reactive_power!(
-    F, adj_F, vmag, adj_vm, vang, adj_va,
-    ybus_re, ybus_im, transpose_perm,
-    pnet, adj_pnet,
-    edge_vm_from, edge_vm_to, edge_va_from, edge_va_to,
-    reactive_load,
-    pv, pq, ref, pv_to_gen, ref_to_gen, nbus, device
-)
-    npv = length(pv)
-    npq = length(pq)
-    nvbus = length(vmag)
-    nnz = length(ybus_re.nzval)
-
-    colptr = ybus_re.colptr
-    rowval = ybus_re.rowval
-
-    kernel_edge! = adj_reactive_power_edge_kernel!(device)
-
-    ndrange = (length(pv) + length(ref), size(adj_F, 2))
-
-    ev = kernel_edge!(
-        F, adj_F,
-        vmag, adj_vm,
-        vang, adj_va,
-        pnet, adj_pnet,
-        pv, ref, pv_to_gen, ref_to_gen,
-        edge_vm_from, edge_vm_to,
-        edge_va_from, edge_va_to,
-        ybus_re.nzval, ybus_re.colptr, ybus_re.rowval,
-        ybus_im.nzval, transpose_perm, reactive_load,
-        ndrange=ndrange,
-        dependencies=Event(device)
-    )
-    wait(ev)
-
-    if isa(device, CPU)
-        cpu_adj_node_kernel!(
-            adj_vm, adj_va,
-            ybus_re.colptr, ybus_re.rowval,
-            edge_vm_from, edge_vm_to,
-            edge_va_from, edge_va_to, transpose_perm,
-        )
-    else
-        ev = gpu_adj_node_kernel!(device)(
-            adj_vm, adj_va,
-            ybus_re.colptr, ybus_re.rowval,
-            edge_vm_from, edge_vm_to,
-            edge_va_from, edge_va_to, transpose_perm,
-            ndrange=(nvbus, size(adj_vm, 2)),
-            dependencies=Event(device)
-        )
-        wait(ev)
-    end
-
-end
-
-KA.@kernel function branch_flow_kernel!(
-    slines, @Const(vmag), @Const(vang),
-    @Const(yff_re), @Const(yft_re), @Const(ytf_re), @Const(ytt_re),
-    @Const(yff_im), @Const(yft_im), @Const(ytf_im), @Const(ytt_im),
-    @Const(f), @Const(t), nlines,
-)
-    ℓ, j = @index(Global, NTuple)
-    fr_bus = f[ℓ]
-    to_bus = t[ℓ]
-
-    Δθ = vang[fr_bus, j] - vang[to_bus, j]
-    cosθ = cos(Δθ)
-    sinθ = sin(Δθ)
-
-    # branch apparent power limits - from bus
-    yff_abs = yff_re[ℓ]^2 + yff_im[ℓ]^2
-    yft_abs = yft_re[ℓ]^2 + yft_im[ℓ]^2
-    yre_fr =   yff_re[ℓ] * yft_re[ℓ] + yff_im[ℓ] * yft_im[ℓ]
-    yim_fr = - yff_re[ℓ] * yft_im[ℓ] + yff_im[ℓ] * yft_re[ℓ]
-
-    fr_flow = vmag[fr_bus, j]^2 * (
-        yff_abs * vmag[fr_bus, j]^2 + yft_abs * vmag[to_bus, j]^2 +
-        2.0 * vmag[fr_bus, j] * vmag[to_bus, j] * (yre_fr * cosθ - yim_fr * sinθ)
-    )
-    slines[ℓ, j] = fr_flow
-
-    # branch apparent power limits - to bus
-    ytf_abs = ytf_re[ℓ]^2 + ytf_im[ℓ]^2
-    ytt_abs = ytt_re[ℓ]^2 + ytt_im[ℓ]^2
-    yre_to =   ytf_re[ℓ] * ytt_re[ℓ] + ytf_im[ℓ] * ytt_im[ℓ]
-    yim_to = - ytf_re[ℓ] * ytt_im[ℓ] + ytf_im[ℓ] * ytt_re[ℓ]
-
-    to_flow = vmag[to_bus, j]^2 * (
-        ytf_abs * vmag[fr_bus, j]^2 + ytt_abs * vmag[to_bus, j]^2 +
-        2.0 * vmag[fr_bus, j] * vmag[to_bus, j] * (yre_to * cosθ - yim_to * sinθ)
-    )
-    slines[ℓ + nlines, j] = to_flow
-end
-
-KA.@kernel function adj_branch_flow_edge_kernel!(
-    @Const(adj_slines), @Const(vmag), @Const(adj_vmag), @Const(vang), @Const(adj_vang),
-    adj_va_to_lines, adj_va_from_lines, adj_vm_to_lines, adj_vm_from_lines,
-    @Const(yff_re), @Const(yft_re), @Const(ytf_re), @Const(ytt_re),
-    @Const(yff_im), @Const(yft_im), @Const(ytf_im), @Const(ytt_im),
-    @Const(f), @Const(t), nlines,
-)
-    ℓ, j = @index(Global, NTuple)
-    fr_bus = f[ℓ]
-    to_bus = t[ℓ]
-
-    Δθ = vang[fr_bus, j] - vang[to_bus, j]
-    cosθ = cos(Δθ)
-    sinθ = sin(Δθ)
-    dcosθ = -sinθ
-    dsinθ = cosθ
-
-    # branch apparent power limits - from bus
-    yff_abs = yff_re[ℓ]^2 + yff_im[ℓ]^2
-    yft_abs = yft_re[ℓ]^2 + yft_im[ℓ]^2
-    yre_fr =   yff_re[ℓ] * yft_re[ℓ] + yff_im[ℓ] * yft_im[ℓ]
-    yim_fr = - yff_re[ℓ] * yft_im[ℓ] + yff_im[ℓ] * yft_re[ℓ]
-
-    # not needed in the reverse run
-    # fr_flow = vmag[fr_bus]^2 * (
-    #     yff_abs * vmag[fr_bus]^2 + yft_abs * vmag[to_bus]^2 +
-    #     2 * vmag[fr_bus] * vmag[to_bus] * (yre_fr * cosθ - yim_fr * sinθ)
-    # )
-    # slines[ℓ] = fr_flow
-
-    # branch apparent power limits - to bus
-    ytf_abs = ytf_re[ℓ]^2 + ytf_im[ℓ]^2
-    ytt_abs = ytt_re[ℓ]^2 + ytt_im[ℓ]^2
-    yre_to =   ytf_re[ℓ] * ytt_re[ℓ] + ytf_im[ℓ] * ytt_im[ℓ]
-    yim_to = - ytf_re[ℓ] * ytt_im[ℓ] + ytf_im[ℓ] * ytt_re[ℓ]
-
-    # not needed in the reverse run
-    # to_flow = vmag[to_bus]^2 * (
-    #     ytf_abs * vmag[fr_bus]^2 + ytt_abs * vmag[to_bus]^2 +
-    #     2 * vmag[fr_bus] * vmag[to_bus] * (yre_to * cosθ - yim_to * sinθ)
-    # )
-    # slines[ℓ + nlines] = to_flow
-
-    adj_to_flow = adj_slines[ℓ + nlines, j]
-    adj_vm_to_lines[ℓ, j] += (2.0 * vmag[to_bus, j] * ytf_abs * vmag[fr_bus, j]^2
-                      + 4.0 * vmag[to_bus, j]^3 * ytt_abs
-                      + 6.0 * vmag[fr_bus, j] * vmag[to_bus, j]^2 * (yre_to * cosθ - yim_to * sinθ)
-                       ) * adj_to_flow
-    adj_vm_from_lines[ℓ, j] += (2.0 * vmag[to_bus, j]^2 * vmag[fr_bus, j] * ytf_abs
-                      + 2.0 * vmag[to_bus, j]^3 * (yre_to * cosθ - yim_to * sinθ)
-                        ) * adj_to_flow
-    adj_cosθ = 2.0 * vmag[to_bus, j]^3 * vmag[fr_bus, j] *   yre_to  * adj_to_flow
-    adj_sinθ = 2.0 * vmag[to_bus, j]^3 * vmag[fr_bus, j] * (-yim_to) * adj_to_flow
-
-    adj_from_flow = adj_slines[ℓ, j]
-    adj_vm_from_lines[ℓ, j] += (4.0 * yff_abs * vmag[fr_bus, j]^3
-                      + 2.0 * vmag[to_bus, j]^2 * vmag[fr_bus, j] * yft_abs
-                      + 6.0 * vmag[fr_bus, j]^2 * vmag[to_bus, j] * (yre_fr * cosθ - yim_fr * sinθ)
-                       ) * adj_from_flow
-    adj_vm_to_lines[ℓ, j] += (2.0 * yft_abs * vmag[fr_bus, j]^2 * vmag[to_bus, j]
-                       + 2.0 * vmag[fr_bus, j]^3 * (yre_fr * cosθ - yim_fr * sinθ)
-                        ) * adj_from_flow
-    adj_cosθ += 2.0 * vmag[to_bus, j] * vmag[fr_bus, j]^3 *   yre_fr  * adj_from_flow
-    adj_sinθ += 2.0 * vmag[to_bus, j] * vmag[fr_bus, j]^3 * (-yim_fr) * adj_from_flow
-
-    adj_Δθ =   dsinθ * adj_sinθ
-    adj_Δθ +=  dcosθ * adj_cosθ
-    adj_va_from_lines[ℓ, j] += adj_Δθ
-    adj_va_to_lines[ℓ, j] -= adj_Δθ
-end
-
-function adj_branch_flow!(
-        adj_slines, vmag, adj_vm, vang, adj_va,
-        adj_vm_from_lines, adj_va_from_lines, adj_vm_to_lines, adj_va_to_lines,
-        yff_re, yft_re, ytf_re, ytt_re,
-        yff_im, yft_im, ytf_im, ytt_im,
-        f, t, Cf, Ct, nlines, device
-    )
-    nvbus = length(vang)
-
-    ev = adj_branch_flow_edge_kernel!(device)(
-            adj_slines, vmag, adj_vm, vang, adj_va,
-            adj_va_to_lines, adj_va_from_lines, adj_vm_to_lines, adj_vm_from_lines,
-            yff_re, yft_re, ytf_re, ytt_re,
-            yff_im, yft_im, ytf_im, ytt_im,
-            f, t, nlines, ndrange = (nlines, size(adj_slines, 2)),
-            dependencies=Event(device)
-    )
-    wait(ev)
-
-    # Aggregate the adjoints on the nodes using the bus-node adjacency matrices.
-    # mul! should be overloaded on the GPU to work with dual numbers
-    # (needed to evaluate the Hessian using forward over reverse)
-    mul!(adj_vm, Cf, adj_vm_from_lines, 1.0, 1.0)
-    mul!(adj_vm, Ct, adj_vm_to_lines, 1.0, 1.0)
-    mul!(adj_va, Cf, adj_va_from_lines, 1.0, 1.0)
-    mul!(adj_va, Ct, adj_va_to_lines, 1.0, 1.0)
-end
-
-KA.@kernel function basis_kernel!(
-    cons, @Const(vmag), @Const(vang), @Const(f), @Const(t), nlines, nbus,
-)
-    i, j = @index(Global, NTuple)
-
-    @inbounds begin
-        if i <= nlines
-            ℓ = i
-            fr_bus = f[ℓ]
-            to_bus = t[ℓ]
-            Δθ = vang[fr_bus, j] - vang[to_bus, j]
-            cosθ = cos(Δθ)
-            cons[i, j] = vmag[fr_bus, j] * vmag[to_bus, j] * cosθ
-        elseif i <= 2 * nlines
-            ℓ = i - nlines
-            fr_bus = f[ℓ]
-            to_bus = t[ℓ]
-            Δθ = vang[fr_bus, j] - vang[to_bus, j]
-            sinθ = sin(Δθ)
-            cons[i, j] = vmag[fr_bus, j] * vmag[to_bus, j] * sinθ
-        elseif i <= 2 * nlines + nbus
-            b = i - 2 * nlines
-            cons[i, j] = vmag[b, j] * vmag[b, j]
-        end
-    end
-end
-
-KA.@kernel function adj_basis_kernel!(
-    ∂cons, adj_vmag, adj_vmag_fr, adj_vmag_to,
-    adj_vang_fr, adj_vang_to,
-    @Const(vmag), @Const(vang), @Const(f), @Const(t), nlines, nbus,
-)
-    i, j = @index(Global, NTuple)
-
-    @inbounds begin
-        if i <= nlines
-            ℓ = i
-            fr_bus = f[ℓ]
-            to_bus = t[ℓ]
-            Δθ = vang[fr_bus, j] - vang[to_bus, j]
-            cosθ = cos(Δθ)
-            sinθ = sin(Δθ)
-
-            adj_vang_fr[i] += -vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
-            adj_vang_fr[i] +=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
-            adj_vang_to[i] +=  vmag[fr_bus, j] * vmag[to_bus, j] * sinθ * ∂cons[ℓ, j]
-            adj_vang_to[i] -=  vmag[fr_bus, j] * vmag[to_bus, j] * cosθ * ∂cons[ℓ+nlines, j]
-
-            adj_vmag_fr[i] +=  vmag[to_bus, j] * cosθ * ∂cons[ℓ, j]
-            adj_vmag_fr[i] += vmag[to_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
-
-            adj_vmag_to[i] +=  vmag[fr_bus, j] * cosθ * ∂cons[ℓ, j]
-            adj_vmag_to[i] += vmag[fr_bus, j] * sinθ * ∂cons[ℓ+nlines, j]
-        else i <= nlines + nbus
-            b = i - nlines
-            adj_vmag[b, j] += 2.0 * vmag[b, j] * ∂cons[b+2*nlines, j]
-        end
-    end
-end
diff --git a/src/Polar/newton.jl b/src/Polar/newton.jl
index 4d9a1354..c97cb309 100644
--- a/src/Polar/newton.jl
+++ b/src/Polar/newton.jl
@@ -1,3 +1,44 @@
+export NewtonRaphson
+
+abstract type AbstractNonLinearSolver end
+
+"""
+    NewtonRaphson <: AbstractNonLinearSolver
+
+Newton-Raphson algorithm. Used to solve the non-linear equation
+``g(x, u) = 0``, at a fixed control ``u``.
+
+### Attributes
+- `maxiter::Int` (default 20): maximum number of iterations
+- `tol::Float64` (default `1e-8`): tolerance of the algorithm
+- `verbose::Int` (default `0`): verbosity level
+
+"""
+struct NewtonRaphson <: AbstractNonLinearSolver
+    maxiter::Int
+    tol::Float64
+    verbose::Int
+end
+NewtonRaphson(; maxiter=20, tol=1e-8, verbose=0) = NewtonRaphson(maxiter, tol, verbose)
+
+"""
+    ConvergenceStatus
+
+Convergence status returned by a non-linear algorithm.
+
+### Attributes
+- `has_converged::Bool`: states whether the algorithm has converged.
+- `n_iterations::Int`: total number of iterations of the non-linear algorithm.
+- `norm_residuals::Float64`: final residual.
+- `n_linear_solves::Int`: number of linear systems ``Ax = b`` solved during the run.
+
+"""
+struct ConvergenceStatus
+    has_converged::Bool
+    n_iterations::Int
+    norm_residuals::Float64
+    n_linear_solves::Int
+end
 
 struct NLBuffer{VT}
     x::VT
@@ -52,6 +93,25 @@ function nlsolve!(
     return ConvergenceStatus(converged, iter, normF, sum(linsol_iters))
 end
 
+"""
+    run_pf(polar::PolarForm, stack::NetworkStack;
+           rtol=1e-8, max_iter=20,
+    )
+
+Solve the power flow equations ``g(x, u) = 0`` w.r.t. the state ``x``,
+using the [`NewtonRaphson`](@ref) algorithm.
+The initial state ``x`` is specified inside
+`stack`. The object `stack` is modified in place by the function.
+
+The algorithm stops when a tolerance `rtol` or a maximum number of
+iterations `max_iter` is reached.
+
+## Arguments
+
+* `polar::PolarForm`: formulation of the power flow equations
+* `stack::NetworkStack`: initial values in the network
+
+"""
 function run_pf(
     polar::PolarForm, state::NetworkStack;
     rtol=1e-8, max_iter=20,
diff --git a/src/Polar/objective.jl b/src/Polar/objective.jl
deleted file mode 100644
index 77a7c3bd..00000000
--- a/src/Polar/objective.jl
+++ /dev/null
@@ -1,176 +0,0 @@
-is_constraint(::typeof(cost_production)) = true
-size_constraint(polar::PolarForm, ::typeof(cost_production)) = 1
-
-function pullback_objective(polar::PolarForm)
-    return AutoDiff.TapeMemory(
-        cost_production,
-        AdjointStackObjective(polar),
-        nothing,
-    )
-end
-
-@inline quadratic_cost(pg, c0, c1, c2) = c0 + c1 * pg + c2 * pg^2
-@inline adj_quadratic_cost(pg, c0, c1, c2) = c1 + 2.0 * c2 * pg
-
-KA.@kernel function cost_production_kernel!(
-    costs, pg, @Const(vmag), @Const(vang), pnet, @Const(pload),
-    @Const(c0), @Const(c1), @Const(c2),
-    @Const(pv), @Const(ref), @Const(pv_to_gen), @Const(ref_to_gen),
-    @Const(ybus_re_nzval), @Const(ybus_re_colptr), @Const(ybus_re_rowval),
-    @Const(ybus_im_nzval), @Const(transperm),
-)
-    i, j = @index(Global, NTuple)
-    npv = length(pv)
-    nref = length(ref)
-    # Evaluate active power at PV nodes
-    if i <= npv
-        bus = pv[i]
-        i_gen = pv_to_gen[i]
-        pg[i_gen, j] = pnet[bus, j]
-    # Evaluate active power at slack nodes
-    elseif i <= npv + nref
-        i_ = i - npv
-        bus = ref[i_]
-        i_gen = ref_to_gen[i_]
-        inj = bus_injection(bus, j, vmag, vang, ybus_re_colptr, ybus_re_rowval, ybus_re_nzval, ybus_im_nzval, transperm)
-        pg[i_gen, j] = inj + pload[bus]
-        pnet[bus, j] = inj + pload[bus]
-    end
-
-    costs[i_gen, j] = quadratic_cost(pg[i_gen, j], c0[i_gen], c1[i_gen], c2[i_gen])
-end
-
-KA.@kernel function adj_cost_production_kernel!(
-    adj_costs,
-    @Const(vmag), adj_vmag, @Const(vang), adj_vang, @Const(pnet), adj_pnet, @Const(pload),
-    @Const(c0), @Const(c1), @Const(c2),
-    @Const(pv), @Const(ref), @Const(pv_to_gen), @Const(ref_to_gen),
-    @Const(ybus_re_nzval), @Const(ybus_re_colptr), @Const(ybus_re_rowval), @Const(ybus_im_nzval),
-    @Const(transperm),
-)
-    i, j = @index(Global, NTuple)
-    npv = length(pv)
-    nref = length(ref)
-    if i <= npv
-        bus = pv[i]
-        i_gen = pv_to_gen[i]
-        pg = pnet[bus, j]
-        adj_pnet[bus, j] = adj_costs[1] * adj_quadratic_cost(pg, c0[i_gen], c1[i_gen], c2[i_gen])
-    # Evaluate active power at slack nodes
-    elseif i <= npv + nref
-        i_ = i - npv
-        bus = ref[i_]
-        i_gen = ref_to_gen[i_]
-
-        inj = bus_injection(bus, j, vmag, vang, ybus_re_colptr, ybus_re_rowval, ybus_re_nzval, ybus_im_nzval, transperm)
-        pg = inj + pload[bus]
-
-        adj_net = adj_costs[1] * adj_quadratic_cost(pg, c0[i_gen], c1[i_gen], c2[i_gen])
-        adj_pnet[bus, j] = adj_net
-        adjoint_bus_injection!(
-            bus, j, adj_net, adj_vmag, adj_vang, vmag, vang,
-            ybus_re_colptr, ybus_re_rowval, ybus_re_nzval, ybus_im_nzval, transperm,
-        )
-    end
-end
-
-function cost_production(polar::PolarForm, buffer::PolarNetworkState)
-    pv = polar.indexing.index_pv
-    pq = polar.indexing.index_pq
-    ref = polar.indexing.index_ref
-    pv2gen = polar.indexing.index_pv_to_gen
-    ref2gen = polar.indexing.index_ref_to_gen
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    transperm = polar.topology.sortperm
-
-    ngen = PS.get(polar, PS.NumberOfGenerators())
-    coefs = polar.costs_coefficients
-    c0 = @view coefs[:, 2]
-    c1 = @view coefs[:, 3]
-    c2 = @view coefs[:, 4]
-    costs = similar(buffer.pgen)
-
-    ev = cost_production_kernel!(polar.device)(
-        costs, buffer.pgen,
-        buffer.vmag, buffer.vang, buffer.pnet, buffer.pload,
-        c0, c1, c2,
-        pv, ref, pv2gen, ref2gen,
-        ybus_re.nzval, ybus_re.colptr, ybus_re.rowval, ybus_im.nzval, transperm,
-        ndrange=(ngen, size(buffer.pgen, 2)),
-        dependencies=Event(polar.device)
-    )
-    wait(ev)
-    # TODO: supports batch
-    return sum(costs)
-end
-
-function adjoint!(
-    polar::PolarForm,
-    pbm::AutoDiff.TapeMemory{F, S, I},
-    pg, ∂cost,
-    vm, ∂vm,
-    va, ∂va,
-    pnet, ∂pnet,
-    pload, qload,
-) where {F<:typeof(cost_production), S, I}
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    nref = PS.get(polar.network, PS.NumberOfSlackBuses())
-    index_pv = polar.indexing.index_pv
-    index_ref = polar.indexing.index_ref
-    pv2gen = polar.indexing.index_pv_to_gen
-    ref2gen = polar.indexing.index_ref_to_gen
-
-    coefs = polar.costs_coefficients
-    c0 = @view coefs[:, 2]
-    c1 = @view coefs[:, 3]
-    c2 = @view coefs[:, 4]
-
-    ngen = get(polar, PS.NumberOfGenerators())
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-    transperm = polar.topology.sortperm
-
-    fill!(∂vm, 0.0)
-    fill!(∂va, 0.0)
-    fill!(∂pnet, 0.0)
-    ev = adj_cost_production_kernel!(polar.device)(
-        ∂cost,
-        vm, ∂vm,
-        va, ∂va,
-        pnet, ∂pnet, pload,
-        c0, c1, c2,
-        index_pv, index_ref, pv2gen, ref2gen,
-        ybus_re.nzval, ybus_re.colptr, ybus_re.rowval, ybus_im.nzval,
-        transperm,
-        ndrange=(ngen, size(∂vm, 2)),
-        dependencies=Event(polar.device)
-    )
-    wait(ev)
-    return
-end
-
-function gradient_objective!(polar::PolarForm, ∂obj::AutoDiff.TapeMemory, buffer::PolarNetworkState)
-    ∂pg = ∂obj.stack.∂pg
-    obj_autodiff = ∂obj.stack
-    adj_pg = obj_autodiff.∂pg
-    adj_x = obj_autodiff.∇fₓ
-    adj_u = obj_autodiff.∇fᵤ
-    adj_vmag = obj_autodiff.∂vm
-    adj_vang = obj_autodiff.∂va
-    adj_pinj = obj_autodiff.∂pinj
-
-    # Adjoint of active power generation
-    adjoint!(polar, ∂obj,
-        buffer.pgen, 1.0,
-        buffer.vmag, adj_vmag,
-        buffer.vang, adj_vang,
-        buffer.pnet, adj_pinj,
-        buffer.pload, buffer.qload,
-    )
-
-    # Adjoint w.r.t. x and u
-    fill!(adj_x, 0.0)
-    fill!(adj_u, 0.0)
-    adjoint_transfer!(polar, adj_u, adj_x, adj_vmag, adj_vang, adj_pinj)
-    return
-end
-
diff --git a/src/Polar/polar.jl b/src/Polar/polar.jl
index e0ddb01e..43f9f15a 100644
--- a/src/Polar/polar.jl
+++ b/src/Polar/polar.jl
@@ -1,8 +1,4 @@
 # Polar formulation
-#
-#
-#
-include("caches.jl")
 
 """
     PolarForm{T, IT, VT, MT}
@@ -18,30 +14,8 @@ or directly on the device `CUDADevice()`.
 struct PolarForm{T, IT, VT, MT} <: AbstractFormulation where {T, IT, VT, MT}
     network::PS.PowerNetwork
     device::KA.Device
-    # bounds
-    x_min::VT
-    x_max::VT
-    u_min::VT
-    u_max::VT
-    # costs
-    costs_coefficients::MT
-    # Indexing of the PV, PQ and slack buses
-    indexing::IndexingCache{IT}
-    # struct
-    topology::NetworkTopology{IT, VT}
-    # Jacobian indexing
-    mapx::IT
-    mapu::IT
-    # Hessian structures and indexing
-    hessianstructure::HessianStructure
 end
 
-include("kernels.jl")
-include("derivatives.jl")
-include("Constraints/constraints.jl")
-include("powerflow.jl")
-include("objective.jl")
-include("batch.jl")
 include("functions.jl")
 include("first_order.jl")
 include("second_order.jl")
@@ -61,213 +35,38 @@ function PolarForm(pf::PS.PowerNetwork, device::KA.Device)
         AT = CUDA.CuArray
     end
 
-    nbus = PS.get(pf, PS.NumberOfBuses())
-    npv = PS.get(pf, PS.NumberOfPVBuses())
-    npq = PS.get(pf, PS.NumberOfPQBuses())
-    nref = PS.get(pf, PS.NumberOfSlackBuses())
-    ngens = PS.get(pf, PS.NumberOfGenerators())
-
-    topology = NetworkTopology(pf, IT, VT)
-    # Get coefficients penalizing the generation of the generators
-    coefs = convert(AT{Float64, 2}, PS.get_costs_coefficients(pf))
-
-    # Move the indexing to the target device
-    idx_gen = PS.get(pf, PS.GeneratorIndexes())
-    idx_ref = PS.get(pf, PS.SlackIndexes())
-    idx_pv = PS.get(pf, PS.PVIndexes())
-    idx_pq = PS.get(pf, PS.PQIndexes())
-    # Build-up reverse index for performance
-    pv_to_gen = PS.get(pf, PS.PVToGeneratorsIndex())
-    ref_to_gen = PS.get(pf, PS.SlackToGeneratorsIndex())
-
-    gidx_gen = convert(IT, idx_gen)
-    gidx_ref = convert(IT, idx_ref)
-    gidx_pv = convert(IT, idx_pv)
-    gidx_pq = convert(IT, idx_pq)
-    gref_to_gen = convert(IT, ref_to_gen)
-    gpv_to_gen = convert(IT, pv_to_gen)
-
-    # Bounds
-    ## Get bounds on active power
-    p_min, p_max = PS.bounds(pf, PS.Generators(), PS.ActivePower())
-    p_min = convert(VT, p_min)
-    p_max = convert(VT, p_max)
-    ## Get bounds on voltage magnitude
-    v_min, v_max = PS.bounds(pf, PS.Buses(), PS.VoltageMagnitude())
-    v_min = convert(VT, v_min)
-    v_max = convert(VT, v_max)
-    ## Instantiate arrays
-    nᵤ = nref + 2*npv
-    nₓ = npv + 2*npq
-    u_min = convert(VT, fill(-Inf, nᵤ))
-    u_max = convert(VT, fill( Inf, nᵤ))
-    x_min = convert(VT, fill(-Inf, nₓ))
-    x_max = convert(VT, fill( Inf, nₓ))
-    ## Bounds on v_pq
-    x_min[npv+npq+1:end] .= v_min[gidx_pq]
-    x_max[npv+npq+1:end] .= v_max[gidx_pq]
-    ## Bounds on v_pv
-    u_min[nref+1:nref+npv] .= v_min[gidx_pv]
-    u_max[nref+1:nref+npv] .= v_max[gidx_pv]
-    ## Bounds on v_ref
-    u_min[1:nref] .= v_min[gidx_ref]
-    u_max[1:nref] .= v_max[gidx_ref]
-    ## Bounds on p_pv
-    u_min[nref+npv+1:nref+2*npv] .= p_min[gpv_to_gen]
-    u_max[nref+npv+1:nref+2*npv] .= p_max[gpv_to_gen]
-
-    indexing = IndexingCache(gidx_pv, gidx_pq, gidx_ref, gidx_gen, gpv_to_gen, gref_to_gen)
-    mappv = [i + nbus for i in idx_pv]
-    mappq = [i + nbus for i in idx_pq]
-    # Ordering for x is (θ_pv, θ_pq, v_pq)
-    statemap = vcat(mappv, mappq, idx_pq)
-    controlmap = vcat(idx_ref, idx_pv, idx_pv .+ nbus)
-    hessianmap = vcat(statemap, idx_ref, idx_pv, idx_pv .+ 2*nbus)
-    hessianstructure = HessianStructure(IT(hessianmap))
     return PolarForm{Float64, IT, VT, AT{Float64,  2}}(
         pf, device,
-        x_min, x_max, u_min, u_max,
-        coefs,
-        indexing,
-        topology,
-        statemap, controlmap,
-        hessianstructure
     )
 end
 # Convenient constructor
 PolarForm(datafile::String, device) = PolarForm(PS.PowerNetwork(datafile), device)
 
-# Getters
-function get(polar::PolarForm, ::NumberOfState)
-    npv = PS.get(polar.network, PS.NumberOfPVBuses())
-    npq = PS.get(polar.network, PS.NumberOfPQBuses())
-    return 2*npq + npv
-end
 
-function get(polar::PolarForm, ::NumberOfControl)
-    nref = PS.get(polar.network, PS.NumberOfSlackBuses())
-    npv = PS.get(polar.network, PS.NumberOfPVBuses())
-    return nref + 2*npv
-end
-
-get(polar::PolarForm, attr::PS.AbstractNetworkAttribute) = get(polar.network, attr)
-
-index_buses_host(polar) = PS.get(polar.network, PS.AllBusesIndex())
-index_buses_device(polar) = index_buses(polar.indexing)
+# Ordering: [vmag, vang, pgen]
 
-index_generators_host(polar) = PS.get(polar.network, PS.AllGeneratorsIndex())
-index_generators_device(polar) = index_generators(polar.indexing)
-
-## Bounds
-function bounds(polar::PolarForm{T, IT, VT, MT}, ::State) where {T, IT, VT, MT}
-    return polar.x_min, polar.x_max
-end
-
-function bounds(polar::PolarForm{T, IT, VT, MT}, ::Control) where {T, IT, VT, MT}
-    return polar.u_min, polar.u_max
-end
-
-# Initial position
-function initial(polar::PolarForm{T, IT, VT, MT}, X::Union{State,Control}) where {T, IT, VT, MT}
-    ref, pv, pq = index_buses_host(polar)
-    _, _, pv2gen = index_generators_host(polar)
-    # Load data from PowerNetwork
-    vmag = abs.(polar.network.vbus)
-    vang = angle.(polar.network.vbus)
-    pg = get(polar.network, PS.ActivePower())
-
-    if isa(X, State)
-        # build vector x
-        return [vang[pv] ; vang[pq] ; vmag[pq]] |> VT
-    elseif isa(X, Control)
-        return [vmag[ref] ; vmag[pv] ; pg[pv2gen]] |> VT
-    end
-end
-
-function get(form::PolarForm{T, VI, VT, MT}, ::PhysicalState) where {T, VI, VT, MT}
-    nbus = PS.get(form.network, PS.NumberOfBuses())
-    ngen = PS.get(form.network, PS.NumberOfGenerators())
-    n_state = get(form, NumberOfState())
-    gen2bus = form.indexing.index_generators
-    # Bus variables
-    pnet = zeros(nbus) |> VT
-    qnet = zeros(nbus) |> VT
-    vmag = zeros(nbus) |> VT
-    vang = zeros(nbus) |> VT
-    # Generators variables
-    pgen = zeros(ngen) |> VT
-    qgen = zeros(ngen) |> VT
-    # Loads
-    pload = zeros(nbus) |> VT
-    qload = zeros(nbus) |> VT
-    # Buffers
-    balance = zeros(n_state) |> VT
-    dx = zeros(n_state) |> VT
-    return PolarNetworkState(pnet, qnet, vmag, vang, pgen, qgen, pload, qload, balance, dx, gen2bus)
-end
-
-function get!(
-    polar::PolarForm{T, IT, VT, MT},
-    ::State,
-    x::AbstractVector,
-    buffer::PolarNetworkState
-) where {T, IT, VT, MT}
-    npv = get(polar, PS.NumberOfPVBuses())
-    npq = get(polar, PS.NumberOfPQBuses())
-    nref = get(polar, PS.NumberOfSlackBuses())
+function my_map(polar::PolarForm, ::State)
+    nbus = get(polar, PS.NumberOfBuses())
     ref, pv, pq = index_buses_host(polar)
-    # Copy values of vang and vmag into x
-    # NB: this leads to 3 memory allocation on the GPU
-    #     we use indexing on the CPU, as for some reason
-    #     we get better performance than with the indexing on the GPU
-    #     stored in the buffer polar.indexing.
-    x[1:npv] .= @view buffer.vang[pv]
-    x[npv+1:npv+npq] .= @view buffer.vang[pq]
-    x[npv+npq+1:npv+2*npq] .= @view buffer.vmag[pq]
+    return Int[nbus .+ pv; nbus .+ pq; pq]
 end
-
-function get!(
-    polar::PolarForm{T, IT, VT, MT},
-    ::Control,
-    u::AbstractVector,
-    buffer::PolarNetworkState,
-) where {T, IT, VT, MT}
-    npv = get(polar, PS.NumberOfPVBuses())
-    npq = get(polar, PS.NumberOfPQBuses())
-    nref = get(polar, PS.NumberOfSlackBuses())
+function my_map(polar::PolarForm, ::Control)
+    nbus = get(polar, PS.NumberOfBuses())
     ref, pv, pq = index_buses_host(polar)
-    _, _, pv2gen = index_generators_host(polar)
-    # build vector u
-    nᵤ = get(polar, NumberOfControl())
-    u[1:nref] .= @view buffer.vmag[ref]
-    u[nref + 1:nref + npv] .= @view buffer.vmag[pv]
-    u[nref + npv + 1:nref + 2*npv] .= @view buffer.pgen[pv2gen]
-    return u
+    pv2gen = polar.network.pv2gen
+    return Int[ref; pv; 2*nbus .+ pv2gen]
 end
 
-function init_buffer!(form::PolarForm{T, IT, VT, MT}, buffer::PolarNetworkState) where {T, IT, VT, MT}
-    # FIXME: add proper getters in PowerSystem
-    vmag = abs.(form.network.vbus)
-    vang = angle.(form.network.vbus)
-    pd = PS.get(form.network, PS.ActiveLoad())
-    qd = PS.get(form.network, PS.ReactiveLoad())
+number(polar::PolarForm, v::AbstractVariable) = length(my_map(polar, v))
 
-    pg = get(form.network, PS.ActivePower())
-    qg = get(form.network, PS.ReactivePower())
+# Getters
+get(polar::PolarForm, attr::PS.AbstractNetworkAttribute) = get(polar.network, attr)
 
-    copyto!(buffer.vmag, vmag)
-    copyto!(buffer.vang, vang)
-    copyto!(buffer.pgen, pg)
-    copyto!(buffer.qgen, qg)
-    copyto!(buffer.pload, pd)
-    copyto!(buffer.qload, qd)
+index_buses_host(polar) = PS.get(polar.network, PS.AllBusesIndex())
+index_buses_device(polar) = index_buses(polar.indexing)
 
-    fill!(buffer.pnet, 0.0)
-    fill!(buffer.qnet, 0.0)
-    copyto!(view(buffer.pnet, form.indexing.index_generators), pg)
-    copyto!(view(buffer.qnet, form.indexing.index_generators), qg)
-    return
-end
+index_generators_host(polar) = PS.get(polar.network, PS.AllGeneratorsIndex())
+index_generators_device(polar) = index_generators(polar.indexing)
 
 # Power flow linear solvers
 function powerflow_jacobian(polar)
@@ -291,8 +90,8 @@ function Base.show(io::IO, polar::PolarForm)
     ngen = PS.get(polar.network, PS.NumberOfGenerators())
     nlines = PS.get(polar.network, PS.NumberOfLines())
     # Polar formulation characteristics
-    n_states = get(polar, NumberOfState())
-    n_controls = get(polar, NumberOfControl())
+    n_states = 2*npq + npv
+    n_controls = nref + npv + ngen - 1
     print(io,   "Polar formulation model")
     println(io, " (instantiated on device $(polar.device))")
     println(io, "Network characteristics:")
diff --git a/src/Polar/powerflow.jl b/src/Polar/powerflow.jl
deleted file mode 100644
index cfa85117..00000000
--- a/src/Polar/powerflow.jl
+++ /dev/null
@@ -1,215 +0,0 @@
-
-function powerflow(
-    polar::PolarForm,
-    algo::AbstractNonLinearSolver;
-    linear_solver=DirectSolver(),
-)
-    buffer = get(polar, PhysicalState())
-    init_buffer!(polar, buffer)
-    Jₓ = AutoDiff.Jacobian(polar, power_balance, State())
-    return powerflow(polar, Jₓ, buffer, algo; linear_solver=linear_solver)
-end
-
-function powerflow(
-    polar::PolarForm{T, IT, VT, MT},
-    jacobian::AutoDiff.Jacobian,
-    buffer::PolarNetworkState{IT,VT},
-    algo::NewtonRaphson;
-    linear_solver=DirectSolver(),
-) where {T, IT, VT, MT}
-    # Retrieve parameter and initial voltage guess
-    Vm, Va = buffer.vmag, buffer.vang
-
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    ngen = PS.get(polar.network, PS.NumberOfGenerators())
-    npv = PS.get(polar.network, PS.NumberOfPVBuses())
-    npq = PS.get(polar.network, PS.NumberOfPQBuses())
-    n_states = get(polar, NumberOfState())
-    nvbus = length(polar.network.vbus)
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-
-    ref, pv, pq = index_buses_device(polar)
-
-    # iteration variables
-    iter = 0
-    converged = false
-
-    # indices
-    j1 = 1
-    j2 = npv
-    j3 = j2 + 1
-    j4 = j2 + npq
-    j5 = j4 + 1
-    j6 = j4 + npq
-
-    # form residual function directly on target device
-    F = buffer.balance
-    dx = buffer.dx
-    fill!(F, zero(T))
-    fill!(dx, zero(T))
-
-    # Evaluate residual function
-    power_balance(polar, F, buffer)
-
-    # check for convergence
-    normF = xnorm(F)
-    if algo.verbose >= VERBOSE_LEVEL_LOW
-        @printf("Iteration %d. Residual norm: %g.\n", iter, normF)
-    end
-    if normF < algo.tol
-        converged = true
-    end
-
-    linsol_iters = Int[]
-    Vapv = view(Va, pv)
-    Vapq = view(Va, pq)
-    Vmpq = view(Vm, pq)
-    dx12 = view(dx, j5:j6) # Vmqp
-    dx34 = view(dx, j3:j4) # Vapq
-    dx56 = view(dx, j1:j2) # Vapv
-
-    @timeit TIMER "Newton" while ((!converged) && (iter < algo.maxiter))
-
-        iter += 1
-
-        @timeit TIMER "Jacobian" begin
-            J = AutoDiff.jacobian!(polar, jacobian, buffer)
-        end
-
-        # Find descent direction
-        if isa(linear_solver, LinearSolvers.AbstractIterativeLinearSolver)
-            @timeit TIMER "Preconditioner" LS.update_preconditioner!(linear_solver, J, polar.device)
-        end
-        @timeit TIMER "Linear Solver" n_iters = LS.ldiv!(linear_solver, dx, J, F)
-        push!(linsol_iters, n_iters)
-
-        # update voltage
-        @timeit TIMER "Update voltage" begin
-            # Sometimes it is better to move backward
-            if (npv != 0)
-                # Va[pv] .= Va[pv] .+ dx[j5:j6]
-                Vapv .= Vapv .- dx56
-            end
-            if (npq != 0)
-                # Vm[pq] .= Vm[pq] .+ dx[j1:j2]
-                Vmpq .= Vmpq .- dx12
-                # Va[pq] .= Va[pq] .+ dx[j3:j4]
-                Vapq .= Vapq .- dx34
-            end
-        end
-
-        fill!(F, zero(T))
-        @timeit TIMER "Residual function" begin
-            power_balance(polar, F, buffer)
-        end
-
-        @timeit TIMER "Norm" normF = xnorm(F)
-        if algo.verbose >= VERBOSE_LEVEL_LOW
-            @printf("Iteration %d. Residual norm: %g.\n", iter, normF)
-        end
-
-        if normF < algo.tol
-            converged = true
-        end
-    end
-
-    if algo.verbose >= VERBOSE_LEVEL_HIGH
-        if converged
-            @printf("N-R converged in %d iterations.\n", iter)
-        else
-            @printf("N-R did not converge.\n")
-        end
-    end
-
-    # Timer outputs display
-    if algo.verbose >= VERBOSE_LEVEL_MEDIUM
-        show(TIMER)
-        println("")
-    end
-    return ConvergenceStatus(converged, iter, normF, sum(linsol_iters))
-end
-
-function batch_powerflow(
-    polar::PolarForm{T, IT, VT, MT},
-    jacobian::AutoDiff.Jacobian,
-    buffer::PolarNetworkState{IT,MT},
-    algo::NewtonRaphson,
-    linear_solver::LS.DirectSolver;
-) where {T, IT, VT, MT}
-    # Retrieve parameter and initial voltage guess
-    Vm, Va = buffer.vmag, buffer.vang
-    nbatch = size(Vm, 2)
-
-    nbus = PS.get(polar.network, PS.NumberOfBuses())
-    ngen = PS.get(polar.network, PS.NumberOfGenerators())
-    npv = PS.get(polar.network, PS.NumberOfPVBuses())
-    npq = PS.get(polar.network, PS.NumberOfPQBuses())
-    n_states = get(polar, NumberOfState())
-    nvbus = length(polar.network.vbus)
-    ybus_re, ybus_im = get(polar.topology, PS.BusAdmittanceMatrix())
-
-    ref, pv, pq = index_buses_device(polar)
-
-    # iteration variables
-    iter = 0
-    converged = false
-
-    # indices
-    j1 = 1
-    j2 = npv
-    j3 = j2 + 1
-    j4 = j2 + npq
-    j5 = j4 + 1
-    j6 = j4 + npq
-
-    # form residual function directly on target device
-    F = buffer.balance
-    dx = buffer.dx
-    fill!(F, zero(T))
-    fill!(dx, zero(T))
-
-    # Evaluate residual function
-    power_balance(polar, F, buffer)
-
-    # check for convergence
-    normF = Float64[xnorm(view(F, :, i)) for i in 1:nbatch]
-    if algo.verbose >= VERBOSE_LEVEL_LOW
-        @printf("Iteration %d. Residual norm: %g.\n", iter, sum(normF))
-    end
-    if all(normF .< algo.tol)
-        converged = true
-    end
-
-    Vapv = view(Va, pv, :)
-    Vapq = view(Va, pq, :)
-    Vmpq = view(Vm, pq, :)
-    dx12 = view(dx, j5:j6, :) # Vmqp
-    dx34 = view(dx, j3:j4, :) # Vapq
-    dx56 = view(dx, j1:j2, :) # Vapv
-
-    while ((!converged) && (iter < algo.maxiter))
-        iter += 1
-
-        J = batch_jacobian!(polar, jacobian, buffer)
-        LS.batch_ldiv!(linear_solver, dx, J, F)
-        # x+ = x - J \ F
-        Vapv .= Vapv .- dx56
-        Vmpq .= Vmpq .- dx12
-        Vapq .= Vapq .- dx34
-
-        fill!(F, zero(T))
-        power_balance(polar, F, buffer)
-
-        normF = Float64[xnorm(view(F, :, i)) for i in 1:nbatch]
-
-        if algo.verbose >= VERBOSE_LEVEL_LOW
-            @printf("Iteration %d. Residual norm: %g.\n", iter, sum(normF))
-        end
-
-        if all(normF .< algo.tol)
-            converged = true
-        end
-    end
-    return ConvergenceStatus(converged, iter, sum(normF), 0)
-end
-
diff --git a/src/Polar/second_order.jl b/src/Polar/second_order.jl
index f1d37b06..b9184281 100644
--- a/src/Polar/second_order.jl
+++ b/src/Polar/second_order.jl
@@ -45,6 +45,21 @@ function MyHessian(polar::PolarForm{T, VI, VT, MT}, func::AbstractExpression, ma
     )
 end
 
+function _init_seed_hessian!(dest, tmp, v::AbstractArray, nmap)  # generic-array method: seed dest[1:nmap] from v; `tmp` is unused here
+    @inbounds for i in 1:nmap  # bounds checks are disabled: dest and v must each hold at least nmap entries
+        dest[i] = ForwardDiff.Partials{1, Float64}(NTuple{1, Float64}(v[i]))  # one-component Float64 partial carrying v[i]
+    end
+    return
+end
+function _init_seed_hessian!(dest, tmp, v::CUDA.CuArray, nmap)  # CuArray method: build the seeds in `tmp`, then copy them into `dest`
+    hostv = Array(v)  # materialize v as a plain Array so the loop below can index it element by element
+    @inbounds Threads.@threads for i in 1:nmap  # iterations are independent: each one writes only tmp[i]
+        tmp[i] = ForwardDiff.Partials{1, Float64}(NTuple{1, Float64}(hostv[i]))  # one-component Float64 partial carrying hostv[i]
+    end
+    copyto!(dest, tmp)  # NOTE(review): presumably tmp is a host staging buffer and dest lives on the GPU — confirm at the call site
+    return
+end
+
 function hprod!(
     H::MyHessian, hv, state, λ, v,
 )
diff --git a/src/architectures.jl b/src/architectures.jl
index b4316f0e..29c6de53 100644
--- a/src/architectures.jl
+++ b/src/architectures.jl
@@ -12,7 +12,7 @@ default_sparse_matrix(::CPU) = SparseMatrixCSC{Float64,Int}
 function get_jacobian_types(::CPU)
     SMT = SparseMatrixCSC{Float64,Int}
     A = Vector
-	return SMT, A
+    return SMT, A
 end
 
 function get_jacobian_types(::GPU)
@@ -24,7 +24,7 @@ end
 function get_batch_jacobian_types(::CPU)
     SMT = SparseMatrixCSC{Float64,Int}
     A = Array
-	return SMT, A
+    return SMT, A
 end
 
 function get_batch_jacobian_types(::GPU)
diff --git a/src/autodiff.jl b/src/autodiff.jl
index 8ce67a2e..d0aa6537 100644
--- a/src/autodiff.jl
+++ b/src/autodiff.jl
@@ -31,109 +31,6 @@ any nonlinear constraint ``h(x)``.
 """
 abstract type AbstractHessian end
 
-abstract type AbstractAdjointStack{VT} end
-
-"""
-    jacobian!(form::AbstractFormulation, jac::AutoDiff.AbstractJacobian, x)
-
-Update inplace the Jacobian ``J`` stored inside `jac` at a new point `x`.
-"""
-function jacobian! end
-
-"""
-    adj_hessian_prod!(form::AbstractFormulation, H::AutoDiff.AbstractHessian, hv, x, λ, v)
-
-Compute the adjoint-Hessian-vector product ``λ^⊤ H v`` at a given
-point `x`, and store the result inplace in vector `hv`.
-
-"""
-function adj_hessian_prod! end
-
-"""
-    AutoDiff.Jacobian <: AbstractJacobian
-
-Creates an object to compute the Jacobian with ForwardDiff.
-
-### Attributes
-
-* `func::Func`: base function to differentiate
-* `var::Union{State,Control}`: specify whether we are differentiating w.r.t. the state or the control.
-* `J::SMT`: Sparse uncompressed Jacobian to be used by linear solver. This is either of type `SparseMatrixCSC` or `CuSparseMatrixCSR`.
-* `compressedJ::MT`: Dense compressed Jacobian used for updating values through AD either of type `Matrix` or `CuMatrix`.
-* `coloring::VI`: Row coloring of the Jacobian.
-* `t1sseeds::VP`: The seeding vector for AD built based on the coloring.
-* `t1sF::VD`: Output array of active (AD) type.
-* `x::VT`: Input array of passive type. This includes both state and control.
-* `t1sx::VD`: Input array of active type.
-* `map::VI`: State and control mapping to array `x`
-* `varx::SubT`: View of `map` on `x`
-* `t1svarx::SubD`: Active (AD) view of `map` on `x`
-
-"""
-struct Jacobian{Func, VI, VT, MT, SMT, VP, VD, SubT, SubD, JacT} <: AbstractJacobian
-    func::Func
-    J::SMT
-    compressedJ::MT
-    coloring::VI
-    t1sseeds::VP
-    t1sF::VD
-    x::VT
-    t1sx::VD
-    map::VI
-    # Cache views on x and its dual vector to avoid reallocating on the GPU
-    varx::SubT
-    t1svarx::SubD
-end
-
-function Base.show(io::IO, jacobian::Jacobian)
-    println(io, "A AutoDiff Jacobian for $(jacobian.func)")
-    ncolor = size(jacobian.compressedJ, 1)
-    print(io, "Number of Jacobian colors: ", ncolor)
-end
-
-"""
-    AutoDiff.ConstantJacobian <: AbstractJacobian
-
-Creates a constant Jacobian object for a linear function ``h(x)``.
-Using a `ConstantJacobian` object allows to avoid computing
-the full Jacobian with AutoDiff when it is not necessary.
-"""
-struct ConstantJacobian{SMT} <: AbstractJacobian
-    J::SMT
-end
-
-"""
-    AutoDiff.Hessian
-
-Creates an object for computing Hessian adjoint tangent projections.
-
-* `func::Func`: base function to differentiate.
-* `host_t1sseeds::VHP`: Seeding vector for seeding on the host.
-* `t1sseeds::VP`: The seeding vector for AD built based on the coloring.
-* `x::VT`: Input array of passive type. This includes both state and control.
-* `t1sF::VD`: Output array of active (AD) type.
-* `∂t1sF::VD`: Adjoint of the output array.
-* `t1sx::VD`: Input array of active type.
-* `∂t1sx::VD`: Adjoint of the input array.
-* `map::VI`: State and control mapping to array `x`
-* `varx::SubT`: View of `map` on `x`
-* `t1svarx::SubD`: Active (AD) view of `map` on `x`
-* `buffer::Buff`: cache for computing the adjoint (could be `Nothing`)
-"""
-struct Hessian{Func, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, Buff} <: AbstractHessian
-    func::Func
-    host_t1sseeds::T1 # Needed because seeds have to be created on the host
-    t1sseeds::T2
-    x::T3
-    t1sF::T4
-    ∂t1sF::T5
-    t1sx::T6
-    ∂t1sx::T7
-    map::T8
-    varx::T9
-    t1svarx::T10
-    buffer::Buff
-end
 
 # Cache for adjoint
 """
diff --git a/src/models.jl b/src/models.jl
index c54717cf..d2c0c036 100644
--- a/src/models.jl
+++ b/src/models.jl
@@ -1,14 +1,6 @@
 export PolarForm, bounds, powerflow
 export State, Control, Parameters, NumberOfState, NumberOfControl
 
-"""
-    AbstractStructure
-
-The user may specify a mapping to the single input vector `x` for AD.
-
-"""
-abstract type AbstractStructure end
-
 """
     AbstractFormulation
 
@@ -19,19 +11,6 @@ third layer (implementing the callbacks for the optimization solver).
 """
 abstract type AbstractFormulation end
 
-"""
-    AbstractFormAttribute
-
-Attributes attached to an `AbstractFormulation`.
-"""
-abstract type AbstractFormAttribute end
-
-"Number of states attached to a particular formulation."
-struct NumberOfState <: AbstractFormAttribute end
-
-"Number of controls attached to a particular formulation."
-struct NumberOfControl <: AbstractFormAttribute end
-
 """
     AbstractVariable
 
@@ -57,57 +36,13 @@ formulation.
 """
 struct Control <: AbstractVariable end
 
-"""
-    PhysicalState <: AbstractVariable
-
-All physical variables describing the current physical state
-of the underlying network.
-
-`PhysicalState` variables are encoded in a `AbstractNetworkBuffer`,
-storing all the physical values needed to describe the current
-state of the network.
-
-"""
-struct PhysicalState <: AbstractVariable end
-
-# Templates
-"""
-    get(form::AbstractFormulation, attr::AbstractFormAttribute)
-
-Return value of attribute `attr` attached to the particular
-formulation `form`.
-
-## Examples
-
-```julia
-get(form, NumberOfState())
-get(form, NumberOfControl())
-
-```
-"""
-function get end
-
-"""
-    setvalues!(form::AbstractFormulation, attr::PS.AbstractNetworkAttribute, values)
-
-Update inplace the attribute's values specified by `attr`.
-
-## Examples
-
-```julia
-setvalues!(form, ActiveLoad(), new_ploads)
-setvalues!(form, ReactiveLoad(), new_qloads)
-
-```
-"""
-function setvalues! end
 
 """
     bounds(form::AbstractFormulation, var::AbstractVariable)
 
 Return the bounds attached to the variable `var`.
 
-    bounds(form::AbstractFormulation, func::Function)
+    bounds(form::AbstractFormulation, func::AbstractExpression)
 
 Return a tuple of vectors `(lb, ub)` specifying the admissible range
 of the constraints specified by the function `cons_func`.
@@ -122,248 +57,3 @@ h_min, h_max = bounds(form, reactive_power_constraints)
 """
 function bounds end
 
-"""
-    initial(form::AbstractFormulation, var::AbstractVariable)
-
-Return an initial position for the variable `var`.
-
-## Examples
-
-```julia
-u₀ = initial(form, Control())
-x₀ = initial(form, State())
-
-```
-"""
-function initial end
-
-"""
-    powerflow(form::AbstractFormulation,
-              algo::AbstractNonLinearSolver;
-              kwargs...)
-
-    powerflow(form::AbstractFormulation,
-              jacobian::AutoDiff.Jacobian,
-              buffer::AbstractNetworkBuffer,
-              algo::AbstractNonLinearSolver;
-              kwargs...) where VT <: AbstractVector
-
-Solve the power flow equations ``g(x, u) = 0`` w.r.t. the state ``x``,
-using the algorithm specified in `algo` ([`NewtonRaphson`](@ref) by default).
-The initial state ``x`` is specified inside
-`buffer`. The object `buffer` is modified inplace in the function.
-
-The algorithm stops when a tolerance `tol` or a maximum number of
-iterations `maxiter` is reached (these parameters being specified
-in the object `algo`).
-
-## Notes
-If only the arguments `form` and `algo` are specified to the function,
-then the Jacobian `jacobian` and the cache `buffer` are inferred
-from the object `form`.
-
-## Arguments
-
-* `form::AbstractFormulation`: formulation of the power flow equation
-* `jacobian::AutoDiff.Jacobian`: Jacobian
-* `buffer::AbstractNetworkBuffer`: buffer storing current state `x` and control `u`
-* `algo::AbstractNonLinearSolver`: non-linear solver. Currently only `NewtonRaphson` is being implemented.
-
-## Optional arguments
-
-* `linear_solver::AbstractLinearSolver` (default `DirectSolver()`): solver to solve the linear systems ``J x = y`` arising at each iteration of the Newton-Raphson algorithm.
-
-"""
-function powerflow end
-
-# Cost function
-"""
-    cost_production(form::AbstractFormulation, buffer::AbstractNetworkBuffer)::Float64
-
-Get operational cost.
-"""
-function cost_production end
-
-"""
-    cost_penalty_ramping_constraints(form::AbstractFormulation, buffer::AbstractNetworkBuffer, params...)::Float64
-
-Get operational cost, including a quadratic penalty penalizing the ramping
-constraints w.r.t. a given reference.
-"""
-function cost_penalty_ramping_constraints end
-
-
-# Generic constraints
-
-"""
-    voltage_magnitude_constraints(form::AbstractFormulation, cons::AbstractVector, buffer::AbstractNetworkBuffer)
-
-Bounds the voltage magnitudes at PQ nodes:
-```math
-v_{pq}^♭ ≤ v_{pq} ≤ v_{pq}^♯ .
-```
-The result is stored inplace, inside `cons`.
-
-## Note
-The constraints on the voltage magnitudes at PV nodes ``v_{pv}``
-are taken into account when bounding the control ``u``.
-"""
-function voltage_magnitude_constraints end
-
-"""
-    active_power_constraints(form::AbstractFormulation, cons::AbstractVector, buffer::AbstractNetworkBuffer)
-
-Evaluate the constraints on the **active power production** at the generators
-that are not already taken into account in the bound constraints.
-```math
-p_g^♭ ≤ p_g ≤ p_g^♯  .
-```
-
-The result is stored inplace, inside the vector `cons`.
-"""
-function active_power_constraints end
-
-"""
-    reactive_power_constraints(form::AbstractFormulation, cons::AbstractVector, buffer::AbstractNetworkBuffer)
-
-Evaluate the constraints on the **reactive power production** at the generators:
-```math
-q_g^♭ ≤ q_g ≤ q_g^♯  .
-```
-The result is stored inplace, inside the vector `cons`.
-"""
-function reactive_power_constraints end
-
-"""
-    flow_constraints(form::AbstractFormulation, cons::AbstractVector, buffer::AbstractNetworkBuffer)
-
-Evaluate the thermal limit constraints porting on the lines of the network.
-The result is stored inplace, inside the vector `cons`.
-"""
-function flow_constraints end
-
-@doc raw"""
-    power_balance(form::AbstractFormulation, cons::AbstractVector, buffer::AbstractNetworkBuffer)
-
-Evaluate a subset of the power injection in the network.
-The function `power_balance` corresponds to the function
-```math
-g(x, u) = 0 .
-```
-introduced in the documentation.
-
-In detail, the function encodes the active balance equations at
-PV and PQ nodes, and the reactive balance equations at PQ nodes:
-```math
-\begin{aligned}
-    p_i &= v_i \sum_{j}^{n} v_j (g_{ij}\cos{(\theta_i - \theta_j)} + b_{ij}\sin{(\theta_i - \theta_j})) \,, &
-    ∀ i ∈ \{PV, PQ\} \\
-    q_i &= v_i \sum_{j}^{n} v_j (g_{ij}\sin{(\theta_i - \theta_j)} - b_{ij}\cos{(\theta_i - \theta_j})) \,. &
-    ∀ i ∈ \{PQ\}
-\end{aligned}
-```
-
-The result is stored inplace, inside the vector `cons`.
-"""
-function power_balance end
-
-@doc raw"""
-    bus_power_injection(form::AbstractFormulation, cons::AbstractVector, buffer::AbstractNetworkBuffer)
-
-Evaluate *all* the power injection in the network.
-
-In detail, the function encodes the active balance equations and
-the reactive balance equations at all buses in the network:
-```math
-\begin{aligned}
-    p_i &= v_i \sum_{j}^{n} v_j (g_{ij}\cos{(\theta_i - \theta_j)} + b_{ij}\sin{(\theta_i - \theta_j})) \,, &
-    ∀ i = 1, ⋯, n_b \\
-    q_i &= v_i \sum_{j}^{n} v_j (g_{ij}\sin{(\theta_i - \theta_j)} - b_{ij}\cos{(\theta_i - \theta_j})) \,. &
-    ∀ i = 1, ⋯,  n_b
-\end{aligned}
-```
-
-The result is stored inplace, inside the vector `cons`.
-"""
-function bus_power_injection end
-
-@doc raw"""
-    network_operations(form::AbstractFormulation, cons::AbstractVector, buffer::AbstractNetworkBuffer)
-
-This function calls [`bus_power_injection`](@ref) and uses the
-result to evaluate all the operational cosntraints as well as
-the corresponding operational cost. The final result is stored
-inplace inside `cons`, with the following ordering:
-- `1:npv+2npq`: power balance at PV and PQ nodes (same result as in [`bus_power_injection`](@ref)).
-- `npv+2npq+1:npv+2npq+1`: active power generation at slack nodes
-- `npv+2npq+1:npv+2npq+1+ngen`: reactive power generation at generators
-- `npv+2npq+1+ngen+1`: operational costs
-
-"""
-function network_operations end
-
-# Interface for the constraints
-"""
-    size_constraint(cons_func::Function)::Bool
-Return whether the function `cons_func` is a supported constraint
-in the powerflow model.
-"""
-function is_constraint end
-
-"""
-    size_constraint(form::AbstractFormulation, cons_func::Function)::Int
-
-Get number of constraints specified by the function `cons_func`
-in the formulation `form`.
-"""
-function size_constraint end
-
-"""
-    adjoint!(form::AbstractFormulation, pbm::AutoDiff.TapeMemory, adj_h, h, buffer)
-
-Return the adjoint w.r.t. the variables of the network (voltage magnitudes
-and angles, power injection) for the constraint stored inside the [`AutoDiff.TapeMemory`](@ref)
-object `pbm`. The results are stored directly inside the stack stored
-inside `pbm`.
-"""
-function adjoint! end
-
-"""
-    jacobian_transpose_product!(form::AbstractFormulation, pbm::AutoDiff.TapeMemory, buffer, v)
-
-Return the two transpose-Jacobian vector product ``(Jᵤ^⊤ v, Jₓ^⊤ v)``  w.r.t. the
-control ``u`` and the state ``x``. Store the two resulting vectors directly inside
-the [`AutoDiff.TapeMemory`](@ref) `pbm`.
-
-"""
-function jacobian_transpose_product! end
-
-"""
-    matpower_jacobian(form::AbstractFormulation, X::Union{State,Control}, cons_func::Function, V::Vector{Complex})
-    matpower_jacobian(form::AbstractFormulation, X::Union{State,Control}, cons_func::Function, buffer::AbstractNetworkBuffer)
-
-For the constraint `cons_func`, return the expression of the Jacobian ``J``
-w.r.t. the state or the control (depending on the argument `X`),
-as given by MATPOWER.
-"""
-function matpower_jacobian end
-
-@doc raw"""
-    matpower_hessian(form::AbstractFormulation, cons_func::Function, buffer::AbstractNetworkBuffer, λ::AbstractVector)
-
-For constraint `cons_func`, return the three matrices ``(λ^⊤ H_{xx},
-λ^⊤ H_{xu},λ^⊤ H_{uu})`` storing the product of the Hessian tensor ``H`` with the vector ``\lambda``.
-The expressions of the Hessian matrices are given by MATPOWER.
-
-"""
-function matpower_hessian end
-
-"""
-    jacobian_sparsity(form::AbstractFormulation, cons_func::Function, X::Union{State,Control})
-
-For the constraint `cons_func`, return the sparsity pattern of the Jacobian ``J``
-w.r.t. the state or the control (depending on the argument `X`).
-
-"""
-function jacobian_sparsity end
-
diff --git a/src/utils.jl b/src/utils.jl
deleted file mode 100644
index c842c12e..00000000
--- a/src/utils.jl
+++ /dev/null
@@ -1,142 +0,0 @@
-export NewtonRaphson
-
-abstract type AbstractNonLinearSolver end
-
-"""
-    NewtonRaphson <: AbstractNonLinearSolver
-
-Newton-Raphson algorithm. Used to solve the non-linear equation
-``g(x, u) = 0``, at a fixed control ``u``.
-
-### Attributes
-- `maxiter::Int` (default 20): maximum number of iterations
-- `tol::Float64` (default `1e-8`): tolerance of the algorithm
-- `verbose::Int` (default `NONE`): verbosity level
-
-"""
-struct NewtonRaphson <: AbstractNonLinearSolver
-    maxiter::Int
-    tol::Float64
-    verbose::Int
-end
-NewtonRaphson(; maxiter=20, tol=1e-8, verbose=VERBOSE_LEVEL_NONE) = NewtonRaphson(maxiter, tol, verbose)
-
-"""
-    ConvergenceStatus
-
-Convergence status returned by a non-linear algorithm.
-
-### Attributes
-- `has_converged::Bool`: states whether the algorithm has converged.
-- `n_iterations::Int`: total number of iterations of the non-linear algorithm.
-- `norm_residuals::Float64`: final residual.
-- `n_linear_solves::Int`: number of linear systems ``Ax = b`` resolved during the run.
-
-"""
-struct ConvergenceStatus
-    has_converged::Bool
-    n_iterations::Int
-    norm_residuals::Float64
-    n_linear_solves::Int
-end
-
-# Sparse utilities
-mutable struct Spmat{VTI<:AbstractVector, VTF<:AbstractVector}
-    colptr::VTI
-    rowval::VTI
-    nzval::VTF
-
-    # create 2 Spmats from complex matrix
-    function Spmat{VTI, VTF}(mat::SparseMatrixCSC{Complex{Float64}, Int}) where {VTI, VTF}
-        matreal = new(VTI(mat.colptr), VTI(mat.rowval), VTF(real.(mat.nzval)))
-        matimag = new(VTI(mat.colptr), VTI(mat.rowval), VTF(imag.(mat.nzval)))
-        return matreal, matimag
-    end
-end
-
-mutable struct BatchCuSparseMatrixCSR{Tv}
-    rowPtr::CUDA.CuArray{Cint, 1, CUDA.Mem.DeviceBuffer}
-    colVal::CUDA.CuArray{Cint, 1, CUDA.Mem.DeviceBuffer}
-    nzVal::CUDA.CuArray{Tv, 2, CUDA.Mem.DeviceBuffer}
-    dims::NTuple{2,Int}
-    nnz::Cint
-    nbatch::Int
-
-    function BatchCuSparseMatrixCSR{Tv}(rowPtr::CUDA.CuArray{<:Integer, 1, CUDA.Mem.DeviceBuffer}, colVal::CUDA.CuArray{<:Integer, 1, CUDA.Mem.DeviceBuffer},
-                                   nzVal::CUDA.CuMatrix, dims::NTuple{2,<:Integer}, nnzJ::Int, nbatch::Int) where Tv
-        new(rowPtr, colVal, nzVal, dims, nnzJ, nbatch)
-    end
-end
-
-Base.size(J::BatchCuSparseMatrixCSR) = J.dims
-Base.size(J::BatchCuSparseMatrixCSR, dim::Int) = J.dims[dim]
-function BatchCuSparseMatrixCSR(J::SparseMatrixCSC{Tv, Int}, nbatch) where Tv
-    dims = size(J)
-    nnzJ = nnz(J)
-    d_J = CUSPARSE.CuSparseMatrixCSR(J)
-    nzVal = CUDA.zeros(Tv, nnzJ, nbatch)
-    for i in 1:nbatch
-        copyto!(nzVal, nnzJ * (i-1) + 1, J.nzval, 1, nnzJ)
-    end
-    return BatchCuSparseMatrixCSR{Tv}(d_J.rowPtr, d_J.colVal, nzVal, dims, nnzJ, nbatch)
-end
-
-function CUDA.unsafe_free!(xs::BatchCuSparseMatrixCSR)
-    unsafe_free!(xs.rowPtr)
-    unsafe_free!(xs.colVal)
-    unsafe_free!(xs.nzVal)
-    return
-end
-
-function _copy_csc!(J_dest, J_src, shift)
-    @inbounds for i in 1:size(J_src, 2)
-        for j in J_src.colptr[i]:J_src.colptr[i+1]-1
-            row = J_src.rowval[j]
-            @inbounds J_dest[row+shift, i] = J_src.nzval[j]
-        end
-    end
-end
-
-function _transfer_sparse!(J_dest::SparseMatrixCSC, J_src::SparseMatrixCSC, shift, device)
-    _copy_csc!(J_dest, J_src, shift)
-end
-
-@kernel function _copy_sparse_matric_csr!(Jdest, Jsrc, rowptr, shift, nnz_)
-    i = @index(Global, Linear)
-    nnz_start = rowptr[shift+1] - 1
-    # Jnnz = @view J_dest.nzVal[nnz_start:nnz_start+nnz_-1]
-    Jdest[i+nnz_start] = Jsrc[i]
-end
-
-function _transfer_sparse!(J_dest::CUSPARSE.CuSparseMatrixCSR, J_src::CUSPARSE.CuSparseMatrixCSR, shift, device)
-    nnz_ = nnz(J_src)
-    _copy_sparse_matric_csr!(device)(
-        J_dest.nzVal, J_src.nzVal, J_dest.rowPtr, shift, nnz_,
-        ndrange=nnz_,
-    )
-end
-
-# Utils function to solve transposed linear system  A' x = y
-# Source code taken from:
-# https://github.com/JuliaGPU/CUDA.jl/blob/master/lib/cusolver/wrappers.jl#L78L111
-function csclsvqr!(A::CUSPARSE.CuSparseMatrixCSC{Float64},
-                    b::CUDA.CuArray{Float64, 1, CUDA.Mem.DeviceBuffer},
-                    x::CUDA.CuArray{Float64, 1, CUDA.Mem.DeviceBuffer},
-                    tol::Float64,
-                    reorder::Cint,
-                    inda::Char)
-    n = size(A,1)
-    desca = CUSPARSE.CuMatrixDescriptor(
-        CUSPARSE.CUSPARSE_MATRIX_TYPE_GENERAL,
-        CUSPARSE.CUSPARSE_FILL_MODE_LOWER,
-        CUSPARSE.CUSPARSE_DIAG_TYPE_NON_UNIT, inda)
-    singularity = Ref{Cint}(1)
-    CUSOLVER.cusolverSpDcsrlsvqr(CUSOLVER.sparse_handle(), n, A.nnz, desca, A.nzVal, A.colPtr, A.rowVal, b, tol, reorder, x, singularity)
-
-    if singularity[] != -1
-        throw(SingularException(singularity[]))
-    end
-
-    x
-end
-
diff --git a/test/Polar/api.jl b/test/Polar/api.jl
index acbfaacd..a5da6d27 100644
--- a/test/Polar/api.jl
+++ b/test/Polar/api.jl
@@ -47,18 +47,18 @@ end
 function test_polar_api(polar, device, M)
     pf = polar.network
     tolerance = 1e-8
+    nx = ExaPF.number(polar, State())
     stack = ExaPF.NetworkStack(polar)
     basis  = ExaPF.PolarBasis(polar)
     power_balance = ExaPF.PowerFlowBalance(polar) ∘ basis
     # Test that values are matching
     @test myisapprox(pf.vbus, stack.vmag .* exp.(im .* stack.vang))
-    xₖ = ExaPF.initial(polar, State())
 
     # Check that initial residual is correct
     mis = pf.vbus .* conj.(pf.Ybus * pf.vbus) .- pf.sbus
     f_mat = [real(mis[[pf.pv; pf.pq]]); imag(mis[pf.pq])];
 
-    cons = similar(xₖ)
+    cons = similar(stack.input, nx)
     power_balance(cons, stack)
     @test myisapprox(cons, f_mat)
 
diff --git a/test/Polar/autodiff.jl b/test/Polar/autodiff.jl
index 9cbc056d..9d5ef8f8 100644
--- a/test/Polar/autodiff.jl
+++ b/test/Polar/autodiff.jl
@@ -1,6 +1,6 @@
 function test_constraints_jacobian(polar, device, MT)
-    nx = length(polar.mapx)
-    nu = length(polar.mapu)
+    nx = ExaPF.number(polar, State())
+    nu = ExaPF.number(polar, Control())
 
     stack = ExaPF.NetworkStack(polar)
     ∂stack = ExaPF.NetworkStack(polar)
@@ -61,8 +61,8 @@ function test_constraints_jacobian(polar, device, MT)
 end
 
 function test_constraints_adjoint(polar, device, MT)
-    nx = length(polar.mapx)
-    nu = length(polar.mapu)
+    nx = ExaPF.number(polar, State())
+    nu = ExaPF.number(polar, Control())
     mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
 
     stack = ExaPF.NetworkStack(polar)
diff --git a/test/Polar/hessian.jl b/test/Polar/hessian.jl
index 2ad61025..09ebb551 100644
--- a/test/Polar/hessian.jl
+++ b/test/Polar/hessian.jl
@@ -1,75 +1,6 @@
-function test_hessian_with_matpower(polar, device, AT; atol=1e-6, rtol=1e-6)
-    pf = polar.network
-    pv = pf.pv ; npv = length(pv)
-    pq = pf.pq ; npq = length(pq)
-    ref = pf.ref ; nref = length(ref)
-    nbus = pf.nbus
-    # Cache
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
-    # Jacobian AD
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-    ju = AutoDiff.Jacobian(polar, ExaPF.power_balance, Control())
-    ∂obj = ExaPF.AdjointStackObjective(polar)
-
-    conv = powerflow(polar, jx, cache, NewtonRaphson())
-
-    ##################################################
-    # Computation of Hessians
-    ##################################################
-    @testset "Compare with Matpower's Hessian ($constraints)" for constraints in [
-        ExaPF.power_balance,
-        ExaPF.active_power_constraints,
-        ExaPF.reactive_power_constraints,
-    ]
-        ncons = ExaPF.size_constraint(polar, constraints)
-        hλ = rand(ncons)
-        λ = hλ |> AT
-        # Evaluate Hessian-vector product (full ∇²g is a 3rd dimension tensor)
-        ∇²gλ = ExaPF.matpower_hessian(polar, constraints, cache, hλ)
-        nx = size(∇²gλ.xx, 1)
-        nu = size(∇²gλ.uu, 1)
-
-        # Hessian-vector product using forward over adjoint AD
-        HessianAD = AutoDiff.Hessian(polar, constraints)
-
-        projp = zeros(nx + nu) |> AT
-
-        host_tgt = rand(nx + nu)
-        tgt = host_tgt |> AT
-        tgt[nx+1:end] .= 0.0
-        AutoDiff.adj_hessian_prod!(polar, HessianAD, projp, cache, λ, tgt)
-        host_projp = projp |> Array
-        @test isapprox(host_projp[1:nx], ∇²gλ.xx * host_tgt[1:nx])
-
-        host_tgt = rand(nx + nu)
-        tgt = host_tgt |> AT
-        # set tangents only for u direction
-        tgt[1:nx] .= 0.0
-        AutoDiff.adj_hessian_prod!(polar, HessianAD, projp, cache, λ, tgt)
-        host_projp = projp |> Array
-        # (we use absolute tolerance as Huu is equal to 0 for case9)
-        @test isapprox(host_projp[nx+1:end], ∇²gλ.uu * host_tgt[nx+1:end], atol=atol)
-
-        # check cross terms ux
-        host_tgt = rand(nx + nu)
-        tgt = host_tgt |> AT
-        # Build full Hessian
-        H = [
-            ∇²gλ.xx ∇²gλ.xu';
-            ∇²gλ.xu ∇²gλ.uu
-        ]
-        AutoDiff.adj_hessian_prod!(polar, HessianAD, projp, cache, λ, tgt)
-        host_projp = projp |> Array
-        @test isapprox(host_projp, H * host_tgt)
-    end
-
-    return nothing
-end
-
 function test_hessprod_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
-    nx = length(polar.mapx)
-    nu = length(polar.mapu)
+    nx = ExaPF.number(polar, State())
+    nu = ExaPF.number(polar, Control())
 
     mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
 
diff --git a/test/Polar/matpower.jl b/test/Polar/matpower.jl
index e8a29c58..7901f006 100644
--- a/test/Polar/matpower.jl
+++ b/test/Polar/matpower.jl
@@ -3,7 +3,7 @@ using KernelAbstractions
 using Test
 using ExaPF
 
-import ExaPF: PowerSystem, AutoDiff
+import ExaPF: PowerSystem
 
 const PS = PowerSystem
 const INSTANCES_DIR = joinpath(artifact"ExaData", "ExaData")
@@ -23,14 +23,11 @@ const INSTANCES_DIR = joinpath(artifact"ExaData", "ExaData")
     @test isapprox(S, pf.sbus)
 
     polar = PolarForm(pf, CPU())
-    x = ExaPF.initial(polar, State())
-
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
+    mapx = ExaPF.my_map(polar, State())
+    stack = ExaPF.NetworkStack(polar)
     # solve power flow
-    convergence = ExaPF.powerflow(polar, jx, cache, NewtonRaphson())
-    ExaPF.get!(polar, State(), x, cache)
+    convergence = ExaPF.run_pf(polar, stack)
+    x = stack.input[mapx]
 
     x_sol = [0.16875136481876485, 0.0832709533581424,
              -0.04200385129447893, -0.07011446830092488,
@@ -44,94 +41,68 @@ const INSTANCES_DIR = joinpath(artifact"ExaData", "ExaData")
     @test isapprox(x_sol, x, atol=1e-7)
 end
 
+@testset "Power flow 9 bus case (phase shift) (see Issue #184)" begin
+    datafile = joinpath(INSTANCES_DIR, "case9phaseshift.m")
+    pf = PS.PowerNetwork(datafile)
+
+    polar = PolarForm(pf, CPU())
+    stack = ExaPF.NetworkStack(polar)
+    # Load buffer
+    # Check buffer
+    @test pf.vbus ≈ stack.vmag .* exp.(im .* stack.vang)
+
+    # solve power flow
+
+    convergence = ExaPF.run_pf(polar, stack)
+    @test convergence.n_iterations == 4
+    # Compare with MATPOWER's solution
+    vang_matpower = [
+        0.0, 0.172591688402360, 0.082724964598100, -0.040264015269618, -0.068037592808220,
+        0.034807882121366, 0.012281528715204, 0.068398625299755, -0.074633227364866,
+    ]
+    vmag_matpower = [
+       1.000000000000000, 1.000000000000000, 1.000000000000000, 0.987010583420980,
+       0.975589804364444, 1.003384229977194, 0.985648588730233, 0.996151111467108, 0.957437967505498,
+    ]
+
+    @test stack.vmag ≈ vmag_matpower
+    @test stack.vang ≈ vang_matpower
+end
+
 @testset "Power flow 14 bus case" begin
     datafile = joinpath(INSTANCES_DIR, "case14.m")
     polar = PolarForm(datafile, CPU())
-
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
+    stack = ExaPF.NetworkStack(polar)
     # solve power flow
-    conv = ExaPF.powerflow(polar, jx, cache, NewtonRaphson())
-
+    conv = ExaPF.run_pf(polar, stack)
     @test conv.n_iterations == 2
-    @test isapprox(norm(cache.balance, Inf), 1.3158e-10, rtol=1e-4)
 
-    vmag = cache.vmag
     vmag_matpower = [1.060000,1.045000,1.010000,1.017671,1.019514,1.070000,1.061520,1.090000,1.055932,1.050985,1.056907,1.055189,1.050382,1.035530]
 
-    @test isapprox(vmag, vmag_matpower, rtol=1e-6)
+    @test isapprox(stack.vmag, vmag_matpower, rtol=1e-6)
 end
 
 @testset "Power flow 30 bus case" begin
     datafile = joinpath(INSTANCES_DIR, "case30.m")
     polar = PolarForm(datafile, CPU())
+    stack = ExaPF.NetworkStack(polar)
 
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-    # solve power flow
-    conv = ExaPF.powerflow(polar, jx, cache, NewtonRaphson())
-
+    conv = ExaPF.run_pf(polar, stack)
     @test conv.n_iterations == 3
-    @test isapprox(norm(cache.balance, Inf), 9.56998e-10, rtol=1e-4)
 
     vmag_matpower = [1.000000,1.000000,0.983138,0.980093,0.982406,0.973184,0.967355,0.960624,0.980506,0.984404,0.980506,0.985468,1.000000,0.976677,0.980229,0.977396,0.976865,0.968440,0.965287,0.969166,0.993383,1.000000,1.000000,0.988566,0.990215,0.972194,1.000000,0.974715,0.979597,0.967883]
 
-    @test isapprox(cache.vmag, vmag_matpower, rtol=1e-6)
+    @test isapprox(stack.vmag, vmag_matpower, rtol=1e-6)
 end
 
 @testset "Power flow 300 bus case" begin
     datafile = joinpath(INSTANCES_DIR, "case300.m")
     polar = PolarForm(datafile, CPU())
-
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-    # solve power flow
-    conv = ExaPF.powerflow(polar, jx, cache, NewtonRaphson())
-
+    stack = ExaPF.NetworkStack(polar)
+    conv = ExaPF.run_pf(polar, stack)
     @test conv.n_iterations == 5
-    @test isapprox(norm(cache.balance, Inf), 0.0, atol=1e-10)
 
     vmag_matpower = [1.028420,1.035340,0.997099,1.030812,1.019109,1.031196,0.993408,1.015300,1.003386,1.020500,1.005657,0.997373,0.997674,0.999170,1.034391,1.031587,1.064906,0.981924,1.001000,0.975168,0.996270,1.050098,1.005656,1.023354,0.998577,0.975035,1.024565,1.041441,0.975688,1.001170,1.020158,1.020312,1.053611,1.021655,1.029283,1.044944,1.000732,1.008735,1.021646,1.034515,0.977877,1.001958,1.047499,1.025388,0.998003,0.996035,1.005135,1.015146,1.033490,0.991822,0.978860,1.024716,0.990654,1.016040,0.958300,0.947956,0.962698,0.951318,0.979391,0.969614,0.977610,0.996488,0.963200,0.983787,0.990023,0.982012,0.987242,1.034127,1.025000,0.987112,0.990818,0.991954,1.015248,1.031724,1.027231,1.052000,1.052000,0.992945,1.018224,1.000000,0.989358,1.005986,1.000708,1.028759,0.995737,1.022267,1.009461,0.990000,0.975245,0.973213,0.974473,0.970155,0.976812,0.960282,1.024861,0.934829,0.929853,1.043500,0.958437,0.987111,0.972796,1.000588,1.023300,1.010300,0.997795,1.000129,1.002406,1.002825,1.019136,0.986142,1.004551,1.001998,1.022076,1.019337,1.047586,1.047088,1.055000,1.011709,1.042991,1.051000,1.015510,1.043500,1.016107,1.008106,1.052800,1.052800,1.057719,1.073500,0.986926,1.004833,1.053500,1.043500,0.966417,1.017724,0.963000,0.984473,0.998709,0.986644,0.999801,1.036082,0.991820,1.041011,0.983914,1.000211,0.997254,0.971492,1.002431,0.987864,0.929000,0.982900,1.024466,0.983654,1.062214,0.973081,1.052200,1.007700,0.939796,0.969910,0.979330,1.051824,1.044628,0.971645,1.038589,1.052200,1.065000,1.065000,1.053282,1.002757,1.055100,1.043500,0.937458,0.998236,1.048984,1.035903,0.973993,0.992473,1.015000,0.954313,0.956174,0.974032,0.990839,1.003359,0.966709,0.985554,1.003768,1.018555,0.999440,1.004774,0.980462,1.001820,1.013262,1.010000,0.991863,0.986632,0.975110,1.021525,1.007547,1.055420,1.008000,1.000000,1.050000,0.996551,1.000254,0.945276,1.018005,1.000000,1.042356,1.049552,1.040000,1.053541,1.041466,1.000000,1.038706,1.009515,1.016500,1.055850,1.010000,1.000000,1.023776,1.05000
0,0.993000,1.010000,0.992178,0.971140,0.965191,0.969095,0.976999,0.976227,1.020532,1.025125,1.015209,1.014590,1.000433,0.980890,0.974945,0.942873,0.972387,0.960470,1.000921,0.977728,0.958325,1.031028,1.012876,1.024438,1.012197,0.969485,1.050700,1.050700,1.032300,1.014500,1.050700,1.050700,1.050700,1.029000,1.050000,1.014500,1.050700,0.996700,1.021200,1.014500,1.001700,0.989300,1.050700,1.050700,1.014500,1.011774,0.994500,0.983335,0.976825,1.011711,1.002924,0.991387,1.002280,0.988722,0.964884,0.974704,0.970504,0.964756,0.965606,0.931742,0.944074,0.928799,0.997240,0.950422,0.959699,0.957027,0.939160,0.963555,0.950267,0.964683,0.979007,1.000000,0.978627,1.000000,1.000000,1.000000,0.975431,0.980460,0.979888,1.040517]
 
-    @test isapprox(cache.vmag, vmag_matpower, rtol=1e-6)
-end
-
-@testset "Power flow 9 bus case (phase shift) (see Issue #184)" begin
-    datafile = joinpath(INSTANCES_DIR, "case9phaseshift.m")
-    pf = PS.PowerNetwork(datafile)
-
-    polar = PolarForm(pf, CPU())
-    # Load buffer
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
-    # Check buffer
-    @test pf.vbus ≈ cache.vmag .* exp.(im .* cache.vang)
-    @test pf.sbus ≈ (cache.pnet .- cache.pload) .+ im .* (cache.qnet .- cache.qload)
-    # Check initial residual
-    mis = pf.vbus .* conj.(pf.Ybus * pf.vbus) .- pf.sbus
-    f_mat = [real(mis[[pf.pv; pf.pq]]); imag(mis[pf.pq])];
-
-    ExaPF.power_balance(polar, cache.balance, cache)
-    @test cache.balance ≈ f_mat
-
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-    # solve power flow
-    convergence = ExaPF.powerflow(polar, jx, cache, NewtonRaphson())
-
-    @test convergence.n_iterations == 4
-    # Compare with MATPOWER's solution
-    vang_matpower = [
-        0.0, 0.172591688402360, 0.082724964598100, -0.040264015269618, -0.068037592808220,
-        0.034807882121366, 0.012281528715204, 0.068398625299755, -0.074633227364866,
-    ]
-    vmag_matpower = [
-       1.000000000000000, 1.000000000000000, 1.000000000000000, 0.987010583420980,
-       0.975589804364444, 1.003384229977194, 0.985648588730233, 0.996151111467108, 0.957437967505498,
-    ]
-
-    @test cache.vmag ≈ vmag_matpower
-    @test cache.vang ≈ vang_matpower
+    @test isapprox(stack.vmag, vmag_matpower, rtol=1e-6)
 end

From 05b8708a7588078916301b9b56e2c7774ab539bb Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Wed, 12 Jan 2022 09:14:22 -0600
Subject: [PATCH 26/34] fix tests on CPU

---
 benchmark/benchmarks.jl            | 60 +++++++++++--------------
 src/ExaPF.jl                       |  2 +
 src/LinearSolvers/LinearSolvers.jl | 10 +++--
 src/Polar/legacy.jl                |  8 ++--
 src/Polar/newton.jl                |  3 +-
 src/Polar/polar.jl                 | 71 ++++++++----------------------
 src/architectures.jl               | 12 -----
 test/Polar/TestPolarForm.jl        | 21 ++-------
 test/Polar/api.jl                  | 28 ++++++++----
 test/Polar/gradient.jl             | 42 ------------------
 test/powersystem.jl                |  1 -
 test/quickstart.jl                 | 70 ++++++++++++++++-------------
 12 files changed, 121 insertions(+), 207 deletions(-)

diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index caab5da8..0f900ba0 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -40,40 +40,37 @@ function run_benchmark(datafile, device, linsolver)
     ntol = 1e-6
     pf = PowerSystem.PowerNetwork(datafile)
     polar = PolarForm(pf, device)
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
+    mapx = ExaPF.my_map(polar, State())
+    nx = length(mapx)
+    stack = ExaPF.NetworkStack(polar)
+
+    basis = ExaPF.PolarBasis(polar)
+    pflow = ExaPF.PowerFlowBalance(polar)
+    jx = ExaPF.MyJacobian(polar, pflow ∘ basis, mapx)
     J = jx.J
     npartitions = ceil(Int64,(size(jx.J,1)/64))
     if npartitions < 2
         npartitions = 2
     end
     precond = ExaPF.LinearSolvers.BlockJacobiPreconditioner(J, npartitions, device)
-    # Retrieve initial state of network
-    u0 = ExaPF.initial(polar, Control())
 
     algo = linsolver(J; P=precond)
     powerflow_solver = NewtonRaphson(tol=ntol)
+    VT = typeof(stack.input)
+    pf_buffer = ExaPF.NLBuffer{VT}(nx)
 
-    # Init variables
-    buffer = get(polar, ExaPF.PhysicalState())
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-
-    # Warmstart
-    ExaPF.init_buffer!(polar, buffer)
-    ExaPF.powerflow(polar, jx, buffer, powerflow_solver; linear_solver=algo)
+    # Warm-up
+    ExaPF.nlsolve!(
+        powerflow_solver, jx, stack; linear_solver=algo, nl_buffer=pf_buffer,
+    )
 
-    TimerOutputs.reset_timer!(ExaPF.TIMER)
-    ExaPF.init_buffer!(polar, buffer)
-    convergence = ExaPF.powerflow(polar, jx, buffer, powerflow_solver; linear_solver=algo)
+    ExaPF.init!(polar, stack)
+    res = @timed ExaPF.nlsolve!(
+        powerflow_solver, jx, stack; linear_solver=algo, nl_buffer=pf_buffer,
+    )
+    convergence = res.value
 
-    # Make sure we are converged
-    @assert(convergence.has_converged)
-
-    # Output
-    prettytime = TimerOutputs.prettytime
-    timers = ExaPF.TIMER.inner_timers
-    inner_timer = timers["Newton"]
-    return convergence.has_converged, timers, inner_timer
+    return convergence.has_converged, res.time
 end
 
 function main()
@@ -81,20 +78,13 @@ function main()
     device = eval(Meta.parse("$(ARGS[2])()"))
     datafile = joinpath(dirname(@__FILE__), ARGS[3])
 
-    has_converged, timers, inner_timer = run_benchmark(datafile, device, linsolver)
-
-    if ARGS[1] == "DirectSolver"
-        println("$(ARGS[1]), $(ARGS[2]), $(ARGS[3]),",
-                printtimer(timers, "Newton"),
-                ", $(has_converged)")
-    else
-        println("$(ARGS[1]), $(ARGS[2]), $(ARGS[3]),",
-                printtimer(timers, "Newton"),",",
-                printtimer(inner_timer, "Jacobian"),",",
-                printtimer(inner_timer, "Linear Solver"),
-                ", $(has_converged)")
-    end
+    has_converged, timer = run_benchmark(datafile, device, linsolver)
     @test has_converged
+
+    println("$(ARGS[1]), $(ARGS[2]), $(ARGS[3]),",
+            timer,
+            ", $(has_converged)")
+
 end
 
 main()
diff --git a/src/ExaPF.jl b/src/ExaPF.jl
index 2db39d94..1db5f123 100644
--- a/src/ExaPF.jl
+++ b/src/ExaPF.jl
@@ -16,6 +16,8 @@ const KA = KernelAbstractions
 
 import Base: show, get
 
+export run_pf
+
 include("architectures.jl")
 
 # Templates
diff --git a/src/LinearSolvers/LinearSolvers.jl b/src/LinearSolvers/LinearSolvers.jl
index ca77ad2a..28ed9ae3 100644
--- a/src/LinearSolvers/LinearSolvers.jl
+++ b/src/LinearSolvers/LinearSolvers.jl
@@ -15,7 +15,7 @@ import Krylov
 import LightGraphs
 import Metis
 
-import ..ExaPF: xnorm, csclsvqr!
+import ..ExaPF: xnorm
 
 const KA = KernelAbstractions
 
@@ -119,7 +119,6 @@ end
 
 # Reuse factorization in update
 function ldiv!(s::DirectSolver{<:LinearAlgebra.Factorization}, y::AbstractVector, J::AbstractMatrix, x::AbstractVector)
-    lu!(s.factorization, J) # Update factorization inplace
     LinearAlgebra.ldiv!(y, s.factorization, x) # Forward-backward solve
     return 0
 end
@@ -178,8 +177,11 @@ function rdiv!(s::DirectSolver{<:LinearAlgebra.Factorization}, y::CUDA.CuVector,
     return 0
 end
 
-function update_preconditioner!(solver::AbstractIterativeLinearSolver, J, device)
-    update(solver.precond, J, device)
+function update!(solver::AbstractIterativeLinearSolver, J::SparseMatrixCSC)
+    update(solver.precond, J, CPU())
+end
+function update!(solver::AbstractIterativeLinearSolver, J::CUSPARSE.CuSparseMatrixCSR)
+    update(solver.precond, J, CUDADevice())
 end
 
 """
diff --git a/src/Polar/legacy.jl b/src/Polar/legacy.jl
index a46d3e88..960f3d76 100644
--- a/src/Polar/legacy.jl
+++ b/src/Polar/legacy.jl
@@ -4,7 +4,7 @@ function matpower_jacobian(polar::PolarForm, func::PowerFlowBalance, V)
     pf = polar.network
     nbus = pf.nbus
     ngen = pf.ngen
-    ref, pv, pq = index_buses_host(polar)
+    ref, pv, pq = pf.ref, pf.pv, pf.pq
     gen2bus = pf.gen2bus
     nref = length(ref)
     npv = length(pv)
@@ -33,7 +33,7 @@ function matpower_jacobian(polar::PolarForm, func::VoltageMagnitudePQ, V)
     pf = polar.network
     ngen = pf.ngen
     nbus = pf.nbus
-    ref, pv, pq = index_buses_host(polar)
+    ref, pv, pq = pf.ref, pf.pv, pf.pq
     npq = length(pq)
 
     j11 = sparse(1:npq, pq, ones(npq), npq, nbus)
@@ -46,7 +46,7 @@ function matpower_jacobian(polar::PolarForm, func::PowerGenerationBounds, V)
     nbus = pf.nbus
     ngen = pf.ngen
     gen2bus = pf.gen2bus
-    ref, pv, pq = index_buses_host(polar)
+    ref, pv, pq = pf.ref, pf.pv, pf.pq
     nref = length(ref)
     npv = length(pv)
     npq = length(pq)
@@ -71,7 +71,7 @@ function matpower_jacobian(polar::PolarForm, func::LineFlows, V)
     nbus = get(polar, PS.NumberOfBuses())
     nlines = get(polar, PS.NumberOfLines())
     pf = polar.network
-    ref, pv, pq = index_buses_host(polar)
+    ref, pv, pq = pf.ref, pf.pv, pf.pq
     ngen = pf.ngen
     nref = length(ref)
     npv  = length(pv)
diff --git a/src/Polar/newton.jl b/src/Polar/newton.jl
index c97cb309..40e752ce 100644
--- a/src/Polar/newton.jl
+++ b/src/Polar/newton.jl
@@ -58,7 +58,7 @@ function nlsolve!(
     algo::NewtonRaphson,
     jac::MyJacobian,
     state::NetworkStack{VT,Buf};
-    linear_solver=DirectSolver(),
+    linear_solver=DirectSolver(jac.J),
     nl_buffer=NLBuffer{VT}(size(jac, 2)),
 ) where {VT, Buf}
     iter = 0
@@ -83,6 +83,7 @@ function nlsolve!(
         end
 
         # Update
+        LS.update!(linear_solver, J)
         n_iters = LS.ldiv!(linear_solver, Δx, J, residual)
         x .= x .- Δx
 
diff --git a/src/Polar/polar.jl b/src/Polar/polar.jl
index 43f9f15a..94bf2a98 100644
--- a/src/Polar/polar.jl
+++ b/src/Polar/polar.jl
@@ -3,56 +3,36 @@
 """
     PolarForm{T, IT, VT, MT}
 
-Takes as input a [`PS.PowerNetwork`](@ref) network and
-implement the polar formulation model associated to this network.
-The structure `PolarForm` stores the topology of the network, as
-well as the complete indexing used in the polar formulation.
+Wrap a [`PS.PowerNetwork`](@ref) network to move the data to
+the target device (`CPU()` and `CUDADevice()` are currently supported).
 
-A `PolarForm` structure can be instantiated both on the host `CPU()`
-or directly on the device `CUDADevice()`.
 """
 struct PolarForm{T, IT, VT, MT} <: AbstractFormulation where {T, IT, VT, MT}
     network::PS.PowerNetwork
     device::KA.Device
 end
 
-include("functions.jl")
-include("first_order.jl")
-include("second_order.jl")
-include("newton.jl")
-include("legacy.jl")
-
-function PolarForm(pf::PS.PowerNetwork, device::KA.Device)
-    if isa(device, KA.CPU)
-        IT = Vector{Int}
-        VT = Vector{Float64}
-        M = SparseMatrixCSC
-        AT = Array
-    elseif isa(device, KA.GPU)
-        IT = CUDA.CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}
-        VT = CUDA.CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}
-        M = CUSPARSE.CuSparseMatrixCSR
-        AT = CUDA.CuArray
-    end
-
-    return PolarForm{Float64, IT, VT, AT{Float64,  2}}(
-        pf, device,
-    )
+function PolarForm(pf::PS.PowerNetwork, device::KA.CPU)
+    return PolarForm{Float64, Vector{Int}, Vector{Float64}, Matrix{Float64}}(pf, device)
+end
+function PolarForm(pf::PS.PowerNetwork, device::KA.GPU)
+    return PolarForm{Float64, CuVector{Int}, CuVector{Float64}, CuMatrix{Float64}}(pf, device)
 end
+
 # Convenient constructor
 PolarForm(datafile::String, device) = PolarForm(PS.PowerNetwork(datafile), device)
 
-
-# Ordering: [vmag, vang, pgen]
-
+# Default ordering: [vmag, vang, pgen]
 function my_map(polar::PolarForm, ::State)
+    pf = polar.network
     nbus = get(polar, PS.NumberOfBuses())
-    ref, pv, pq = index_buses_host(polar)
+    ref, pv, pq = pf.ref, pf.pv, pf.pq
     return Int[nbus .+ pv; nbus .+ pq; pq]
 end
 function my_map(polar::PolarForm, ::Control)
+    pf = polar.network
     nbus = get(polar, PS.NumberOfBuses())
-    ref, pv, pq = index_buses_host(polar)
+    ref, pv, pq = pf.ref, pf.pv, pf.pq
     pv2gen = polar.network.pv2gen
     return Int[ref; pv; 2*nbus .+ pv2gen]
 end
@@ -62,25 +42,6 @@ number(polar::PolarForm, v::AbstractVariable) = length(my_map(polar, v))
 # Getters
 get(polar::PolarForm, attr::PS.AbstractNetworkAttribute) = get(polar.network, attr)
 
-index_buses_host(polar) = PS.get(polar.network, PS.AllBusesIndex())
-index_buses_device(polar) = index_buses(polar.indexing)
-
-index_generators_host(polar) = PS.get(polar.network, PS.AllGeneratorsIndex())
-index_generators_device(polar) = index_generators(polar.indexing)
-
-# Power flow linear solvers
-function powerflow_jacobian(polar)
-    nbus = get(polar, PS.NumberOfBuses())
-    v0 = polar.network.vbus .+ 0.01 .* rand(ComplexF64, nbus)
-    return matpower_jacobian(polar, State(), power_balance, v0)
-end
-
-function powerflow_jacobian_device(polar)
-    SpMT = default_sparse_matrix(polar.device)
-    J = powerflow_jacobian(polar)
-    return J |> SpMT
-end
-
 function Base.show(io::IO, polar::PolarForm)
     # Network characteristics
     nbus = PS.get(polar.network, PS.NumberOfBuses())
@@ -103,3 +64,9 @@ function Base.show(io::IO, polar::PolarForm)
     print(io,   "    #states  :   ", n_states)
 end
 
+include("functions.jl")
+include("first_order.jl")
+include("second_order.jl")
+include("newton.jl")
+include("legacy.jl")
+
diff --git a/src/architectures.jl b/src/architectures.jl
index 29c6de53..75ed2c01 100644
--- a/src/architectures.jl
+++ b/src/architectures.jl
@@ -21,18 +21,6 @@ function get_jacobian_types(::GPU)
     return SMT, A
 end
 
-function get_batch_jacobian_types(::CPU)
-    SMT = SparseMatrixCSC{Float64,Int}
-    A = Array
-    return SMT, A
-end
-
-function get_batch_jacobian_types(::GPU)
-    SMT = CUSPARSE.CuSparseMatrixCSR{Float64}
-    A = CUDA.CuArray
-    return SMT, A
-end
-
 function Base.unsafe_wrap(Atype::Type{CUDA.CuArray{T, 1, CUDA.Mem.DeviceBuffer}},
                           p::CUDA.CuPtr{T}, dim::Integer;
                           own::Bool=false, ctx::CUDA.CuContext=CUDA.context()) where {T}
diff --git a/test/Polar/TestPolarForm.jl b/test/Polar/TestPolarForm.jl
index 8ffb32b4..d1047e0e 100644
--- a/test/Polar/TestPolarForm.jl
+++ b/test/Polar/TestPolarForm.jl
@@ -22,7 +22,6 @@ include("api.jl")
 include("autodiff.jl")
 include("gradient.jl")
 include("hessian.jl")
-include("batch.jl")
 
 function myisless(a, b)
     h_a = a |> Array
@@ -43,33 +42,21 @@ function runtests(datafile, device, AT)
     println(devnull, polar)
 
     @testset "PolarForm API" begin
-        test_polar_network_cache(polar, device, AT)
         test_polar_api(polar, device, AT)
         test_polar_constraints(polar, device, AT)
         test_polar_powerflow(polar, device, AT)
     end
 
-    @testset "PolarForm AutoDiff" begin
+    @testset "PolarForm AutoDiff (first-order)" begin
         test_constraints_jacobian(polar, device, AT)
         test_constraints_adjoint(polar, device, AT)
         test_full_space_jacobian(polar, device, AT)
-    end
-
-    @testset "PolarForm Gradient" begin
-        test_objective_adjoint(polar, device, AT)
-        test_objective_with_ramping_adjoint(polar, device, AT)
         test_reduced_gradient(polar, device, AT)
-        test_line_flow_gradient(polar, device, AT)
-    end
-
-    @testset "PolarForm Hessians" begin
-        test_hessian_with_matpower(polar, device, AT)
-        test_hessian_with_finitediff(polar, device, AT)
     end
 
-    @testset "Batch algorithms" begin
-        test_batch_powerflow(polar, device, AT)
-        test_batch_hessian(polar, device, AT)
+    @testset "PolarForm AutoDiff (second-order)" begin
+        test_hessprod_with_finitediff(polar, device, AT)
+        test_full_space_hessian(polar, device, AT)
     end
 end
 
diff --git a/test/Polar/api.jl b/test/Polar/api.jl
index a5da6d27..9b20a376 100644
--- a/test/Polar/api.jl
+++ b/test/Polar/api.jl
@@ -108,26 +108,38 @@ function test_polar_constraints(polar, device, M)
 end
 
 function test_polar_powerflow(polar, device, M)
+    SMT = ExaPF.default_sparse_matrix(polar.device)
+    # Init structures
+    stack = ExaPF.NetworkStack(polar)
+    mapx = ExaPF.my_map(polar, State())
     pf_solver = NewtonRaphson(tol=1e-6)
     npartitions = 8
+
+    basis = ExaPF.PolarBasis(polar)
+    pflow = ExaPF.PowerFlowBalance(polar)
+    n = length(pflow)
+
     # Get reduced space Jacobian on the CPU
-    J = ExaPF.powerflow_jacobian(polar)
-    n = size(J, 1)
+    J = ExaPF.jacobian_sparsity(polar, pflow)
+    J = J[:, mapx]
+
+    @test n == size(J, 1) == length(mapx)
+
     # Build preconditioner
     precond = LS.BlockJacobiPreconditioner(J, npartitions, device)
 
-    J_gpu = ExaPF.powerflow_jacobian_device(polar)
+    J_gpu = J |> SMT
 
     # Init AD
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-    # Init buffer
-    buffer = get(polar, ExaPF.PhysicalState())
+    jx = ExaPF.MyJacobian(polar, pflow ∘ basis, mapx)
 
     @testset "Powerflow solver $(LinSolver)" for LinSolver in ExaPF.list_solvers(device)
         algo = LinSolver(J_gpu; P=precond)
-        ExaPF.init_buffer!(polar, buffer)
-        convergence = ExaPF.powerflow(polar, jx, buffer, pf_solver; linear_solver=algo)
+        ExaPF.init!(polar, stack)
+        convergence = ExaPF.nlsolve!(
+            pf_solver, jx, stack; linear_solver=algo)
         @test convergence.has_converged
         @test convergence.norm_residuals < pf_solver.tol
     end
 end
+
diff --git a/test/Polar/gradient.jl b/test/Polar/gradient.jl
index b8200e17..e1affed2 100644
--- a/test/Polar/gradient.jl
+++ b/test/Polar/gradient.jl
@@ -64,45 +64,3 @@ function test_reduced_gradient(polar, device, MT)
     @test isapprox(grad_fd[:], grad_adjoint, rtol=1e-4)
 end
 
-function test_objective_adjoint(polar, device, MT)
-    pf = polar.network
-    nbus = pf.nbus
-    pv = pf.pv ; npv = length(pv)
-    pq = pf.pq ; npq = length(pq)
-    ref = pf.ref ; nref = length(ref)
-    pv2gen = polar.indexing.index_pv_to_gen
-    nx = ExaPF.get(polar, ExaPF.NumberOfState())
-
-    cache = ExaPF.get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, cache)
-
-    u = ExaPF.initial(polar, Control())
-
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-    conv = powerflow(polar, jx, cache, NewtonRaphson(tol=1e-12))
-
-    # Evaluate gradient
-    pbm = ExaPF.pullback_objective(polar)
-    ExaPF.gradient_objective!(polar, pbm, cache)
-
-    # Compare with finite diff
-    x = [cache.vang[pv] ; cache.vang[pq] ; cache.vmag[pq]]
-    u = [cache.vmag[ref]; cache.vmag[pv]; cache.pgen[pv2gen]]
-
-    function test_objective_fd(z)
-        x_ = z[1:nx]
-        u_ = z[1+nx:end]
-        # Transfer control
-        ExaPF.transfer!(polar, cache, u_)
-        # Transfer state (manually)
-        cache.vang[pv] .= x_[1:npv]
-        cache.vang[pq] .= x_[npv+1:npv+npq]
-        cache.vmag[pq] .= x_[npv+npq+1:end]
-        return ExaPF.cost_production(polar, cache)
-    end
-    ∇f = FiniteDiff.finite_difference_jacobian(test_objective_fd, [x; u])
-
-    @test myisapprox(∇f[1:nx], pbm.stack.∇fₓ, rtol=1e-5)
-    @test myisapprox(∇f[1+nx:end], pbm.stack.∇fᵤ, rtol=1e-5)
-    return
-end
diff --git a/test/powersystem.jl b/test/powersystem.jl
index abd7f353..eb8c1250 100644
--- a/test/powersystem.jl
+++ b/test/powersystem.jl
@@ -61,7 +61,6 @@ const INSTANCES_DIR = joinpath(artifact"ExaData", "ExaData")
     nbus = size(bus, 1)
     ngen = size(gen, 1)
 
-    ybus_re, ybus_im = ExaPF.Spmat{T{Int}, T{Float64}}(Ybus)
     SBASE = data["baseMVA"][1]
     Sbus, Sload = PS.assembleSbus(gen, bus, SBASE, bus_to_indexes)
     pbus = real(Sbus)
diff --git a/test/quickstart.jl b/test/quickstart.jl
index a481f468..294550cb 100644
--- a/test/quickstart.jl
+++ b/test/quickstart.jl
@@ -17,11 +17,12 @@ const INSTANCES_DIR = joinpath(artifact"ExaData", "ExaData")
 
     # Short version
     polar = ExaPF.PolarForm(datafile, CPU())
-    pf_algo = NewtonRaphson(; verbose=0, tol=1e-10)
-    convergence = ExaPF.powerflow(polar, pf_algo)
+    # Initial values
+    stack = ExaPF.NetworkStack(polar)
+    convergence = run_pf(polar, stack; rtol=1e-10)
     @test convergence.has_converged
     @test convergence.n_iterations == 5
-    @test convergence.norm_residuals <= pf_algo.tol
+    @test convergence.norm_residuals <= 1e-10
 
     # Long version
     pf = PS.PowerNetwork(datafile)
@@ -34,22 +35,28 @@ const INSTANCES_DIR = joinpath(artifact"ExaData", "ExaData")
 
     # Build-up PolarForm object
     polar = ExaPF.PolarForm(pf, CPU())
-    physical_state = get(polar, ExaPF.PhysicalState())
-    ExaPF.init_buffer!(polar, physical_state)
-    jx = AutoDiff.Jacobian(polar, ExaPF.power_balance, State())
-
-    linear_solver = LS.DirectSolver()
-    convergence = ExaPF.powerflow(
-        polar, jx, physical_state, pf_algo;
-        linear_solver=linear_solver
+    stack = ExaPF.NetworkStack(polar)
+    basis = ExaPF.PolarBasis(polar)
+    # Powerflow function
+    pflow = ExaPF.PowerFlowBalance(polar) ∘ basis
+    mapx = ExaPF.my_map(polar, State())
+    # AD for Jacobian
+    jx = ExaPF.MyJacobian(polar, pflow, mapx)
+    # Linear solver
+    linear_solver = LS.DirectSolver(jx.J)
+    # Powerflow solver
+    pf_solver = NewtonRaphson(tol=1e-10)
+
+    convergence = ExaPF.nlsolve!(
+        pf_solver, jx, stack; linear_solver=linear_solver,
     )
 
     @test convergence.has_converged
     @test convergence.n_iterations == 5
-    @test convergence.norm_residuals <= pf_algo.tol
+    @test convergence.norm_residuals <= pf_solver.tol
 
     # Reinit buffer
-    ExaPF.init_buffer!(polar, physical_state)
+    ExaPF.init!(polar, stack)
     npartitions = 8
     jac = jx.J
     precond = LS.BlockJacobiPreconditioner(jac, npartitions, CPU())
@@ -60,9 +67,8 @@ const INSTANCES_DIR = joinpath(artifact"ExaData", "ExaData")
     # Build powerflow algorithm
     pf_algo = NewtonRaphson(; verbose=0, tol=1e-7)
 
-    convergence = ExaPF.powerflow(
-        polar, jx, physical_state, pf_algo;
-        linear_solver=iterative_linear_solver
+    convergence = ExaPF.nlsolve!(
+        pf_solver, jx, stack; linear_solver=iterative_linear_solver,
     )
 
     @test convergence.has_converged
@@ -71,35 +77,37 @@ const INSTANCES_DIR = joinpath(artifact"ExaData", "ExaData")
 
     if CUDA.has_cuda_gpu()
         polar_gpu = ExaPF.PolarForm(pf, CUDADevice())
-        jx_gpu = AutoDiff.Jacobian(polar_gpu, ExaPF.power_balance, State())
-        physical_state_gpu = get(polar_gpu, ExaPF.PhysicalState())
-        ExaPF.init_buffer!(polar_gpu, physical_state_gpu)
-        linear_solver = LS.DirectSolver()
-        convergence = ExaPF.powerflow(
-            polar_gpu, jx_gpu, physical_state_gpu, pf_algo;
-            linear_solver=linear_solver
+        stack_gpu = ExaPF.NetworkStack(polar_gpu)
+
+        basis_gpu = ExaPF.PolarBasis(polar_gpu)  # built from polar_gpu, not the CPU `polar`
+        pflow_gpu = ExaPF.PowerFlowBalance(polar_gpu) ∘ basis_gpu
+        jx_gpu = ExaPF.MyJacobian(polar_gpu, pflow_gpu, mapx)
+
+        linear_solver = LS.DirectSolver(jx_gpu.J)
+
+        convergence = ExaPF.nlsolve!(
+            pf_solver, jx_gpu, stack_gpu; linear_solver=linear_solver,
         )
 
         @test convergence.has_converged
         @test convergence.n_iterations == 5
-        @test convergence.norm_residuals <= pf_algo.tol
+        @test convergence.norm_residuals <= pf_solver.tol
 
         npartitions = 8
         jac = jx_gpu.J # we need to take the Jacobian on the CPU for partitioning!
         precond = LS.BlockJacobiPreconditioner(jac, npartitions, CUDADevice())
 
         # Reinit buffer
-        ExaPF.init_buffer!(polar_gpu, physical_state_gpu)
+        ExaPF.init!(polar_gpu, stack_gpu)
+
+        iterative_linear_solver = ExaPF.KrylovBICGSTAB(jac; P=precond)
 
-        linear_solver = ExaPF.KrylovBICGSTAB(jac; P=precond)
-        pf_algo = NewtonRaphson(; verbose=0, tol=1e-7)
-        convergence = ExaPF.powerflow(
-            polar_gpu, jx_gpu, physical_state_gpu, pf_algo;
-            linear_solver=linear_solver
+        convergence = ExaPF.nlsolve!(
+            pf_solver, jx_gpu, stack_gpu; linear_solver=iterative_linear_solver,
         )
 
         @test convergence.has_converged
         @test convergence.n_iterations == 5
-        @test convergence.norm_residuals <= pf_algo.tol
+        @test convergence.norm_residuals <= pf_solver.tol
     end
 end

From 6c78a5be8137b49768823c15a9452032b20834da Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Wed, 12 Jan 2022 13:41:18 -0600
Subject: [PATCH 27/34] [skip ci] clean CUDA deps

---
 Project.toml         |  1 +
 src/ExaPF.jl         |  8 ++++---
 src/Polar/polar.jl   |  4 ----
 src/architectures.jl | 13 ------------
 src/autodiff.jl      |  1 -
 src/cuda_wrapper.jl  | 50 ++++++++++++++++++++++++++++++++++++++++++++
 test/gpu.jl          | 34 ------------------------------
 test/runtests.jl     |  3 ---
 8 files changed, 56 insertions(+), 58 deletions(-)
 create mode 100644 src/cuda_wrapper.jl
 delete mode 100644 test/gpu.jl

diff --git a/Project.toml b/Project.toml
index d3773b88..b059b564 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "0.6.0"
 
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
diff --git a/src/ExaPF.jl b/src/ExaPF.jl
index 1db5f123..3535a3ec 100644
--- a/src/ExaPF.jl
+++ b/src/ExaPF.jl
@@ -6,9 +6,6 @@ using LinearAlgebra
 using SparseArrays
 
 import CUDA
-import CUDA.CUBLAS
-import CUDA.CUSPARSE
-import CUDA.CUSOLVER
 
 import ForwardDiff
 using KernelAbstractions
@@ -37,4 +34,9 @@ const LS = LinearSolvers
 # Polar formulation
 include("Polar/polar.jl")
 
+# CUDA extension
+if CUDA.has_cuda()
+    include("cuda_wrapper.jl")
+end
+
 end
diff --git a/src/Polar/polar.jl b/src/Polar/polar.jl
index 94bf2a98..950d081b 100644
--- a/src/Polar/polar.jl
+++ b/src/Polar/polar.jl
@@ -15,10 +15,6 @@ end
 function PolarForm(pf::PS.PowerNetwork, device::KA.CPU)
     return PolarForm{Float64, Vector{Int}, Vector{Float64}, Matrix{Float64}}(pf, device)
 end
-function PolarForm(pf::PS.PowerNetwork, device::KA.GPU)
-    return PolarForm{Float64, CuVector{Int}, CuVector{Float64}, CuMatrix{Float64}}(pf, device)
-end
-
 # Convenient constructor
 PolarForm(datafile::String, device) = PolarForm(PS.PowerNetwork(datafile), device)
 
diff --git a/src/architectures.jl b/src/architectures.jl
index 75ed2c01..1fe008ca 100644
--- a/src/architectures.jl
+++ b/src/architectures.jl
@@ -3,7 +3,6 @@ abstract type AbstractArchitecture end
 
 # norm
 xnorm(x::AbstractVector) = norm(x, 2)
-xnorm(x::CUDA.CuVector) = CUBLAS.nrm2(x)
 
 xnorm_inf(a) = maximum(abs.(a))
 
@@ -15,15 +14,3 @@ function get_jacobian_types(::CPU)
     return SMT, A
 end
 
-function get_jacobian_types(::GPU)
-    SMT = CUSPARSE.CuSparseMatrixCSR{Float64}
-    A = CUDA.CuVector
-    return SMT, A
-end
-
-function Base.unsafe_wrap(Atype::Type{CUDA.CuArray{T, 1, CUDA.Mem.DeviceBuffer}},
-                          p::CUDA.CuPtr{T}, dim::Integer;
-                          own::Bool=false, ctx::CUDA.CuContext=CUDA.context()) where {T}
-    unsafe_wrap(CUDA.CuArray{T, 1}, p, (dim,); own, ctx)
-end
-
diff --git a/src/autodiff.jl b/src/autodiff.jl
index d0aa6537..de559c8b 100644
--- a/src/autodiff.jl
+++ b/src/autodiff.jl
@@ -4,7 +4,6 @@ module AutoDiff
 using SparseArrays
 
 using CUDA
-import CUDA.CUSPARSE
 import ForwardDiff
 import SparseDiffTools
 using KernelAbstractions
diff --git a/src/cuda_wrapper.jl b/src/cuda_wrapper.jl
new file mode 100644
index 00000000..0b2804c4
--- /dev/null
+++ b/src/cuda_wrapper.jl
@@ -0,0 +1,50 @@
+
+import CUDA.CUBLAS
+import CUDA.CUSPARSE
+import CUDA.CUSOLVER
+
+using CUDAKernels
+
+function PolarForm(pf::PS.PowerNetwork, device::CUDADevice)
+    return PolarForm{Float64, CuVector{Int}, CuVector{Float64}, CuMatrix{Float64}}(pf, device)
+end
+
+default_sparse_matrix(::CUDADevice) = CuSparseMatrixCSR{Float64, Int}
+xnorm(x::CUDA.CuVector) = CUBLAS.nrm2(x)
+
+function get_jacobian_types(::CUDADevice)
+    SMT = default_sparse_matrix(CUDADevice())
+    A = CUDA.CuVector
+    return SMT, A
+end
+
+function Base.unsafe_wrap(Atype::Type{CUDA.CuArray{T, 1, CUDA.Mem.DeviceBuffer}},
+                          p::CUDA.CuPtr{T}, dim::Integer;
+                          own::Bool=false, ctx::CUDA.CuContext=CUDA.context()) where {T}
+    unsafe_wrap(CUDA.CuArray{T, 1}, p, (dim,); own, ctx)
+end
+
+# Differentiable LinearAlgebra.mul! for ForwardDiff
+@kernel function _spmm_kernel!(Y, X, colVal, rowPtr, nzVal, alpha, beta, n, m)
+    i, k = @index(Global, NTuple)
+    Y[i, k] *= beta
+    @inbounds for c in rowPtr[i]:rowPtr[i+1]-1
+        j = colVal[c]
+        Y[i, k] += alpha * nzVal[c] * X[j, k]
+    end
+end
+
+function LinearAlgebra.mul!(Y::AbstractArray{T, 2}, A::CUSPARSE.CuSparseMatrixCSR, X::AbstractArray{T, 2}, alpha::Number, beta::Number) where {T <: ForwardDiff.Dual}
+    n, m = size(A)
+    p = size(X, 2)
+    @assert size(Y, 1) == n
+    @assert size(X, 1) == m
+    @assert size(X, 2) == size(Y, 2)
+
+    ndrange = (n, p)
+    ev = _spmm_kernel!(CUDADevice())(
+        Y, X, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m,
+        ndrange=ndrange,
+    )
+    wait(ev)
+end
diff --git a/test/gpu.jl b/test/gpu.jl
deleted file mode 100644
index 4162f48b..00000000
--- a/test/gpu.jl
+++ /dev/null
@@ -1,34 +0,0 @@
-using LinearAlgebra
-using CUDAKernels
-using CUDA.CUSPARSE
-
-CUDA_ARCH = (CUDADevice(), CuArray, CuSparseMatrixCSR)
-push!(ARCHS, CUDA_ARCH)
-
-# Default sparse matrix on CUDA GPU
-ExaPF.default_sparse_matrix(::CUDADevice) = CuSparseMatrixCSR
-
-# Differentiable LinearAlgebra.mul! for ForwardDiff
-@kernel function _spmm_kernel!(Y, X, colVal, rowPtr, nzVal, alpha, beta, n, m)
-    i, k = @index(Global, NTuple)
-    Y[i, k] *= beta
-    @inbounds for c in rowPtr[i]:rowPtr[i+1]-1
-        j = colVal[c]
-        Y[i, k] += alpha * nzVal[c] * X[j, k]
-    end
-end
-
-function LinearAlgebra.mul!(Y::AbstractArray{T, 2}, A::CuSparseMatrixCSR, X::AbstractArray{T, 2}, alpha::Number, beta::Number) where {T <: ForwardDiff.Dual}
-    n, m = size(A)
-    p = size(X, 2)
-    @assert size(Y, 1) == n
-    @assert size(X, 1) == m
-    @assert size(X, 2) == size(Y, 2)
-
-    ndrange = (n, p)
-    ev = _spmm_kernel!(CUDADevice())(
-        Y, X, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m,
-        ndrange=ndrange,
-    )
-    wait(ev)
-end
diff --git a/test/runtests.jl b/test/runtests.jl
index abd974d0..8608d4b4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -19,9 +19,6 @@ const BENCHMARK_DIR = joinpath(dirname(@__FILE__), "..", "benchmark")
 const CASES = ["case9.m", "case30.m"]
 
 ARCHS = Any[(CPU(), Array, SparseMatrixCSC)]
-if has_cuda_gpu()
-    include("gpu.jl")
-end
 
 # Load test modules
 @isdefined(TestLinearSolvers)    || include("TestLinearSolvers.jl")

From 81ad1721f499ab0d621c3d63c823bf498c3577c8 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Wed, 12 Jan 2022 16:30:59 -0600
Subject: [PATCH 28/34] fix tests on GPU

---
 src/ExaPF.jl                       |   1 +
 src/LinearSolvers/LinearSolvers.jl |  13 +---
 src/Polar/functions.jl             |   5 +-
 src/cuda_wrapper.jl                | 120 +++++++++++++++++++++++++++--
 test/Polar/api.jl                  |   2 +-
 test/Polar/autodiff.jl             |   6 +-
 test/Polar/hessian.jl              |   2 +-
 test/quickstart.jl                 |   5 +-
 test/runtests.jl                   |   7 ++
 9 files changed, 135 insertions(+), 26 deletions(-)

diff --git a/src/ExaPF.jl b/src/ExaPF.jl
index 3535a3ec..3b10eb40 100644
--- a/src/ExaPF.jl
+++ b/src/ExaPF.jl
@@ -6,6 +6,7 @@ using LinearAlgebra
 using SparseArrays
 
 import CUDA
+import CUDA.CUSPARSE
 
 import ForwardDiff
 using KernelAbstractions
diff --git a/src/LinearSolvers/LinearSolvers.jl b/src/LinearSolvers/LinearSolvers.jl
index 28ed9ae3..5b26e711 100644
--- a/src/LinearSolvers/LinearSolvers.jl
+++ b/src/LinearSolvers/LinearSolvers.jl
@@ -8,6 +8,7 @@ import Base: show
 
 using CUDA
 using KernelAbstractions
+using CUDAKernels
 import CUDA.CUBLAS
 import CUDA.CUSOLVER
 import CUDA.CUSPARSE
@@ -157,18 +158,6 @@ function batch_ldiv!(s::DirectSolver{<:LinearAlgebra.Factorization}, Y, Js::Vect
     end
 end
 
-function ldiv!(::DirectSolver{Nothing},
-    y::CUDA.CuVector, J::CUSPARSE.CuSparseMatrixCSR, x::CUDA.CuVector,
-)
-    CUSOLVER.csrlsvqr!(J, x, y, 1e-8, one(Cint), 'O')
-    return 0
-end
-function ldiv!(::DirectSolver{Nothing},
-    y::CUDA.CuVector, J::CUSPARSE.CuSparseMatrixCSC, x::CUDA.CuVector,
-)
-    csclsvqr!(J, x, y, 1e-8, one(Cint), 'O')
-    return 0
-end
 get_transpose(::DirectSolver, M::CUSPARSE.CuSparseMatrixCSR) = CUSPARSE.CuSparseMatrixCSC(M)
 
 function rdiv!(s::DirectSolver{<:LinearAlgebra.Factorization}, y::CUDA.CuVector, J::CUSPARSE.CuSparseMatrixCSR, x::CUDA.CuVector)
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 6473a327..5ca47330 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -268,8 +268,11 @@ Base.length(::CostFunction) = 1
 
 function (func::CostFunction)(output, state)
     costs = state.intermediate.c
+    # pg_ref = view(state.pgen, func.gen_ref)
+    res = similar(costs, length(func.gen_ref))
     pg_ref = view(state.pgen, func.gen_ref)
-    mul!(pg_ref, func.M, state.ψ)
+    mul!(res, func.M, state.ψ)
+    pg_ref .= res
     costs .= func.c0 .+ func.c1 .* state.pgen .+ func.c2 .* state.pgen.^2
     CUDA.@allowscalar output[1] = sum(costs)
     return
diff --git a/src/cuda_wrapper.jl b/src/cuda_wrapper.jl
index 0b2804c4..f1f89841 100644
--- a/src/cuda_wrapper.jl
+++ b/src/cuda_wrapper.jl
@@ -1,6 +1,7 @@
 
+using CUDA
 import CUDA.CUBLAS
-import CUDA.CUSPARSE
+import CUDA.CUSPARSE: CuSparseMatrixCSR, CuSparseMatrixCSC
 import CUDA.CUSOLVER
 
 using CUDAKernels
@@ -9,11 +10,11 @@ function PolarForm(pf::PS.PowerNetwork, device::CUDADevice)
     return PolarForm{Float64, CuVector{Int}, CuVector{Float64}, CuMatrix{Float64}}(pf, device)
 end
 
-default_sparse_matrix(::CUDADevice) = CuSparseMatrixCSR{Float64, Int}
+default_sparse_matrix(::CUDADevice) = CuSparseMatrixCSR
 xnorm(x::CUDA.CuVector) = CUBLAS.nrm2(x)
 
 function get_jacobian_types(::CUDADevice)
-    SMT = default_sparse_matrix(CUDADevice())
+    SMT = CuSparseMatrixCSR
     A = CUDA.CuVector
     return SMT, A
 end
@@ -24,6 +25,64 @@ function Base.unsafe_wrap(Atype::Type{CUDA.CuArray{T, 1, CUDA.Mem.DeviceBuffer}}
     unsafe_wrap(CUDA.CuArray{T, 1}, p, (dim,); own, ctx)
 end
 
+#=
+    LinearSolvers
+=#
+function csclsvqr!(A::CUSPARSE.CuSparseMatrixCSC{Float64},
+                    b::CUDA.CuArray{Float64, 1, CUDA.Mem.DeviceBuffer},
+                    x::CUDA.CuArray{Float64, 1, CUDA.Mem.DeviceBuffer},
+                    tol::Float64,
+                    reorder::Cint,
+                    inda::Char)
+    n = size(A,1)
+    desca = CUSPARSE.CuMatrixDescriptor(
+        CUSPARSE.CUSPARSE_MATRIX_TYPE_GENERAL,
+        CUSPARSE.CUSPARSE_FILL_MODE_LOWER,
+        CUSPARSE.CUSPARSE_DIAG_TYPE_NON_UNIT, inda)
+    singularity = Ref{Cint}(1)
+    CUSOLVER.cusolverSpDcsrlsvqr(CUSOLVER.sparse_handle(), n, A.nnz, desca, A.nzVal, A.colPtr, A.rowVal, b, tol, reorder, x, singularity)
+
+    if singularity[] != -1
+        throw(SingularException(singularity[]))
+    end
+
+    x
+end
+
+# By default, no factorization routine is available
+LinearSolvers.update!(s::DirectSolver{Nothing}, J::CuSparseMatrixCSR) = nothing
+function LinearSolvers.ldiv!(::DirectSolver{Nothing},
+    y::CuVector, J::CuSparseMatrixCSR, x::CuVector,
+)
+    CUSOLVER.csrlsvqr!(J, x, y, 1e-8, one(Cint), 'O')
+    return 0
+end
+function LinearSolvers.ldiv!(::DirectSolver{Nothing},
+    y::CUDA.CuVector, J::CUSPARSE.CuSparseMatrixCSC, x::CUDA.CuVector,
+)
+    csclsvqr!(J, x, y, 1e-8, one(Cint), 'O')
+    return 0
+end
+
+#=
+    Autodiff
+=#
+
+@kernel function _extract_values_kernel(dest, src)
+    i = @index(Global, Linear)
+    dest[i] = src[i].value
+end
+
+function extract_values!(dest::CuArray, src::CuArray)
+    ndrange = (length(dest),)
+    ev = _extract_values_kernel(CUDADevice())(dest, src, ndrange=ndrange)
+    wait(ev)
+end
+
+#=
+    Generic SpMV for CuSparseMatrixCSR
+=#
+
 # Differentiable LinearAlgebra.mul! for ForwardDiff
 @kernel function _spmm_kernel!(Y, X, colVal, rowPtr, nzVal, alpha, beta, n, m)
     i, k = @index(Global, NTuple)
@@ -34,12 +93,11 @@ end
     end
 end
 
-function LinearAlgebra.mul!(Y::AbstractArray{T, 2}, A::CUSPARSE.CuSparseMatrixCSR, X::AbstractArray{T, 2}, alpha::Number, beta::Number) where {T <: ForwardDiff.Dual}
+function LinearAlgebra.mul!(Y::AbstractArray{T, 1}, A::CUSPARSE.CuSparseMatrixCSR, X::AbstractArray{T, 1}, alpha::Number, beta::Number) where {T <: ForwardDiff.Dual}
     n, m = size(A)
-    p = size(X, 2)
+    p = 1
     @assert size(Y, 1) == n
     @assert size(X, 1) == m
-    @assert size(X, 2) == size(Y, 2)
 
     ndrange = (n, p)
     ev = _spmm_kernel!(CUDADevice())(
@@ -48,3 +106,53 @@ function LinearAlgebra.mul!(Y::AbstractArray{T, 2}, A::CUSPARSE.CuSparseMatrixCS
     )
     wait(ev)
 end
+
+
+#=
+    Generic SpMV for CuSparseMatrixCSC
+=#
+
+# Write a CUDA kernel directly, as KernelAbstractions does not
+# support atomic_add.
+function _spmm_csc_kernel_T!(Y, X, colPtr, rowVal, nzVal, alpha, beta, m, p)
+    I = threadIdx().x + (blockDim().x * (blockIdx().x - 1))
+    J = threadIdx().y + (blockDim().y * (blockIdx().y - 1))
+    if I <= m && J <= p
+        @inbounds for c in colPtr[I]:colPtr[I+1]-1
+            j = rowVal[c]
+            CUDA.@atomic Y[J, j] += alpha * nzVal[c] * X[J, I]
+        end
+    end
+end
+
+function LinearAlgebra.mul!(
+    Y::AbstractArray{D, 1},
+    A::Adjoint{T, CuSparseMatrixCSR{T, I}},
+    X::AbstractArray{D, 1},
+    alpha::Number, beta::Number,
+) where {N, I, T, S, D <: ForwardDiff.Dual{S, T, N}}
+    n, m = size(A)
+    p = N + 1
+    @assert size(Y, 1) == n
+    @assert size(X, 1) == m
+
+    B = A.parent
+
+    nthreads = 32
+    threads_y = p
+    threads_x = div(nthreads, threads_y)
+    threads = (threads_x, threads_y)
+
+    blocks = ceil.(Int, (m, p) ./ threads)
+
+    # Reinterpret duals as double.
+    # (Needed to work with atomic_add)
+    Ys = reshape(reinterpret(Float64, Y), p, n)
+    Xs = reshape(reinterpret(Float64, X), p, m)
+
+    Ys .*= beta
+    @cuda threads=threads blocks=blocks _spmm_csc_kernel_T!(
+        Ys, Xs, B.rowPtr, B.colVal, B.nzVal, alpha, beta, m, p,
+    )
+end
+
diff --git a/test/Polar/api.jl b/test/Polar/api.jl
index 9b20a376..c03d1e9a 100644
--- a/test/Polar/api.jl
+++ b/test/Polar/api.jl
@@ -74,7 +74,7 @@ function test_polar_api(polar, device, M)
 
     ## Cost Production
     cost_production = ExaPF.CostFunction(polar)
-    c2 = cost_production(stack)[1]
+    c2 = CUDA.@allowscalar cost_production(stack)[1]
     @test isa(c2, Real)
     return nothing
 end
diff --git a/test/Polar/autodiff.jl b/test/Polar/autodiff.jl
index 9d5ef8f8..62f66f7b 100644
--- a/test/Polar/autodiff.jl
+++ b/test/Polar/autodiff.jl
@@ -55,8 +55,8 @@ function test_constraints_jacobian(polar, device, MT)
         @test myisapprox(Jd, Jx, rtol=1e-5)
         @test myisapprox(Jmat, Jx, rtol=1e-5)
         @test myisapprox(Jmat, Jd, rtol=1e-5)
-        @test isapprox(∂stack.input[mymap], Jx' * tgt_h, rtol=1e-6)
-        @test isapprox(∂stack.input[mymap], Jmat' * tgt_h, rtol=1e-6)
+        @test myisapprox(∂stack.input[mymap], Jx' * tgt_h, rtol=1e-6)
+        @test myisapprox(∂stack.input[mymap], Jmat' * tgt_h, rtol=1e-6)
     end
 end
 
@@ -98,7 +98,7 @@ function test_constraints_adjoint(polar, device, MT)
         adj_fd = FiniteDiff.finite_difference_jacobian(test_fd, x) |> Array
         # Loosen the tolerance to 1e-5 there (finite_difference_jacobian
         # is less accurate than finite_difference_gradient)
-        @test isapprox(∂stack.input[mymap], adj_fd[:], rtol=1e-5)
+        @test myisapprox(∂stack.input[mymap], adj_fd[:], rtol=1e-5)
     end
 end
 
diff --git a/test/Polar/hessian.jl b/test/Polar/hessian.jl
index 09ebb551..1530472e 100644
--- a/test/Polar/hessian.jl
+++ b/test/Polar/hessian.jl
@@ -44,7 +44,7 @@ function test_hessprod_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
     proj_fd = similar(x0, nx+nu)
     mul!(proj_fd, H_fd.data, dev_tgt, 1, 0)
 
-    @test isapprox(projp, Array(proj_fd), rtol=rtol)
+    @test myisapprox(projp, Array(proj_fd), rtol=rtol)
 end
 
 function test_full_space_hessian(polar, device, MT)
diff --git a/test/quickstart.jl b/test/quickstart.jl
index 294550cb..8bba8a9e 100644
--- a/test/quickstart.jl
+++ b/test/quickstart.jl
@@ -1,6 +1,7 @@
 using Test
 using CUDA
 using KernelAbstractions
+using CUDAKernels
 
 using ExaPF
 import ExaPF: AutoDiff
@@ -79,8 +80,8 @@ const INSTANCES_DIR = joinpath(artifact"ExaData", "ExaData")
         polar_gpu = ExaPF.PolarForm(pf, CUDADevice())
         stack_gpu = ExaPF.NetworkStack(polar_gpu)
 
-        basis_gpu = ExaPF.PolarBasis(polar)
-        pflow_gpu = ExaPF.PowerFlowBalance(polar) ∘ basis
+        basis_gpu = ExaPF.PolarBasis(polar_gpu)
+        pflow_gpu = ExaPF.PowerFlowBalance(polar_gpu) ∘ basis_gpu
         jx_gpu = ExaPF.MyJacobian(polar_gpu, pflow_gpu, mapx)
 
         linear_solver = LS.DirectSolver(jx_gpu.J)
diff --git a/test/runtests.jl b/test/runtests.jl
index 8608d4b4..6082b57a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -19,6 +19,13 @@ const BENCHMARK_DIR = joinpath(dirname(@__FILE__), "..", "benchmark")
 const CASES = ["case9.m", "case30.m"]
 
 ARCHS = Any[(CPU(), Array, SparseMatrixCSC)]
+if CUDA.has_cuda()
+    using CUDAKernels
+    using CUDA.CUSPARSE
+    CUDA.allowscalar(false)
+    CUDA_ARCH = (CUDADevice(), CuArray, CuSparseMatrixCSR)
+    push!(ARCHS, CUDA_ARCH)
+end
 
 # Load test modules
 @isdefined(TestLinearSolvers)    || include("TestLinearSolvers.jl")

From 9878023d0774d5738d31ddf39da27dd59f3066b3 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Thu, 13 Jan 2022 21:01:05 -0600
Subject: [PATCH 29/34] various performance fixes

---
 src/Polar/first_order.jl |   2 +-
 src/Polar/functions.jl   |  26 +++---
 src/autodiff.jl          | 186 ++++++---------------------------------
 src/cuda_wrapper.jl      |  36 +++++---
 test/Polar/hessian.jl    |  32 ++++---
 5 files changed, 93 insertions(+), 189 deletions(-)

diff --git a/src/Polar/first_order.jl b/src/Polar/first_order.jl
index 6bdf2cf7..f069bf5c 100644
--- a/src/Polar/first_order.jl
+++ b/src/Polar/first_order.jl
@@ -101,7 +101,7 @@ function jacobian!(
     # forward pass
     jac.func(jac.t1sF, jac.stack)
     # uncompress
-    AutoDiff.getpartials_kernel!(jac.compressedJ, jac.t1sF, jac.model.device)
+    AutoDiff.partials_jac!(jac.compressedJ, jac.t1sF, jac.model.device)
     AutoDiff.uncompress_kernel!(jac.J, jac.compressedJ, jac.coloring, jac.model.device)
     return jac.J
 end
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index 5ca47330..e080bc31 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -243,6 +243,7 @@ struct CostFunction{VT, MT} <: AbstractExpression
 end
 
 function CostFunction(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
+    nbus = get(polar, PS.NumberOfBuses())
     ngen = get(polar, PS.NumberOfGenerators())
     SMT = default_sparse_matrix(polar.device)
     # Load indexing
@@ -252,9 +253,12 @@ function CostFunction(polar::PolarForm{T, VI, VT, MT}) where {T, VI, VT, MT}
         error("Too many generators are affected to the slack nodes")
     end
     ref_gen = Int[findfirst(isequal(ref[1]), gen2bus)]
+
+    # Gen-bus incidence matrix
+    Cg = sparse(ref_gen, ref, ones(1), ngen, 2 * nbus)
     # Assemble matrix
     M_tot = PS.get_basis_matrix(polar.network)
-    M = -M_tot[ref, :] |> SMT
+    M = - Cg * M_tot |> SMT
 
     # coefficients
     coefs = PS.get_costs_coefficients(polar.network)
@@ -268,11 +272,9 @@ Base.length(::CostFunction) = 1
 
 function (func::CostFunction)(output, state)
     costs = state.intermediate.c
-    # pg_ref = view(state.pgen, func.gen_ref)
-    res = similar(costs, length(func.gen_ref))
-    pg_ref = view(state.pgen, func.gen_ref)
-    mul!(res, func.M, state.ψ)
-    pg_ref .= res
+    # Update pgen_ref
+    state.pgen[func.gen_ref] .= 0.0
+    mul!(state.pgen, func.M, state.ψ, 1.0, 1.0)
     costs .= func.c0 .+ func.c1 .* state.pgen .+ func.c2 .* state.pgen.^2
     CUDA.@allowscalar output[1] = sum(costs)
     return
@@ -280,7 +282,7 @@ end
 
 function adjoint!(func::CostFunction, ∂state, state, ∂v)
     ∂state.pgen .+= ∂v .* (func.c1 .+ 2.0 .* func.c2 .* state.pgen)
-    mul!(∂state.ψ, func.M', ∂state.pgen[func.gen_ref], 1.0, 1.0)
+    mul!(∂state.ψ, func.M', ∂state.pgen, 1.0, 1.0)
     return
 end
 
@@ -511,10 +513,12 @@ function adjoint!(func::LineFlows, ∂state, state, ∂v)
     mul!(stp, func.Ltp, state.ψ)
     mul!(stq, func.Ltq, state.ψ)
 
-    sfp .*= ∂v[1:nlines]
-    sfq .*= ∂v[1:nlines]
-    stp .*= ∂v[1+nlines:2*nlines]
-    stq .*= ∂v[1+nlines:2*nlines]
+    @views begin
+        sfp .*= ∂v[1:nlines]
+        sfq .*= ∂v[1:nlines]
+        stp .*= ∂v[1+nlines:2*nlines]
+        stq .*= ∂v[1+nlines:2*nlines]
+    end
 
     # Accumulate adjoint
     mul!(∂state.ψ, func.Lfp', sfp, 2.0, 1.0)
diff --git a/src/autodiff.jl b/src/autodiff.jl
index de559c8b..3ad6a5bf 100644
--- a/src/autodiff.jl
+++ b/src/autodiff.jl
@@ -31,23 +31,6 @@ any nonlinear constraint ``h(x)``.
 abstract type AbstractHessian end
 
 
-# Cache for adjoint
-"""
-    TapeMemory{F, S, I}
-
-This object is used as a buffer to compute the adjoint of a given function
-``h(x)``. It stores internally all intermediate values necessary
-to compute the adjoint, and cache the stack used in the backward pass.
-
-## Note
-This structure is largely inspired from [ChainRulesCore.jl](https://juliadiff.org/ChainRulesCore.jl/stable/design/changing_the_primal.html#The-Journey-to-rrule).
-"""
-struct TapeMemory{F, S, I}
-    func::F
-    stack::S
-    intermediate::I
-end
-
 # Seeding
 function _init_seed!(t1sseeds, t1sseedvecs, coloring, ncolor, nmap)
     for i in 1:nmap
@@ -92,11 +75,9 @@ end
 
 
 # Get partials
-@kernel function getpartials_kernel!(compressedJ, @Const(t1sF))
-    i = @index(Global, Linear)
-    for j in eachindex(ForwardDiff.partials.(t1sF[i]).values)
-        @inbounds compressedJ[j, i] = ForwardDiff.partials.(t1sF[i]).values[j]
-    end
+@kernel function getpartials_jac_kernel!(compressedJ, @Const(duals))
+    i, j = @index(Global, NTuple)
+    compressedJ[j, i] = duals[j+1, i]
 end
 
 # Get partials for Hessian projection
@@ -108,10 +89,8 @@ end
 end
 
 @kernel function getpartials_hess_kernel!(compressedH, @Const(duals), @Const(map))
-    i = @index(Global, Linear)
-    for j in eachindex(ForwardDiff.partials.(duals[map[i]]).values)
-        compressedH[j, i] = ForwardDiff.partials.(duals[map[i]]).values[j]
-    end
+    i, j = @index(Global, NTuple)
+    compressedH[j, i] = duals[j+1, map[i]]
 end
 
 """
@@ -128,16 +107,34 @@ function getpartials_kernel!(hv::AbstractVector, adj_t1sx, map, device)
     wait(ev)
 end
 
-function getpartials_kernel!(compressedJ::AbstractMatrix, t1sF, device)
-    kernel! = getpartials_kernel!(device)
-    ev = kernel!(compressedJ, t1sF, ndrange=length(t1sF), dependencies=Event(device))
+function partials_jac!(
+    compressedJ::AbstractMatrix{T},
+    duals::AbstractVector{ForwardDiff.Dual{Nothing, T, N}},
+    device,
+) where {T, N}
+    n = length(duals)
+    @assert size(compressedJ) == (N, n)
+    duals_ = reshape(reinterpret(T, duals), N+1, n)
+    ndrange = (n, N)
+    ev = getpartials_jac_kernel!(device)(
+        compressedJ, duals_,
+        ndrange=ndrange, dependencies=Event(device),
+    )
     wait(ev)
 end
 
-function partials_hess!(compressedH::AbstractMatrix, duals, map, device)
+function partials_hess!(
+    compressedH::AbstractMatrix,
+    duals::AbstractVector{ForwardDiff.Dual{Nothing, T, N}},
+    map, device,
+) where {T, N}
+    n = length(map)
+    @assert size(compressedH) == (N, n)
+    duals_ = reshape(reinterpret(Float64, duals), N+1, length(duals))
+    ndrange = (n, N)
     ev = getpartials_hess_kernel!(device)(
-        compressedH, duals, map,
-        ndrange=length(map), dependencies=Event(device),
+        compressedH, duals_, map,
+        ndrange=ndrange, dependencies=Event(device),
     )
     wait(ev)
 end
@@ -178,129 +175,4 @@ function uncompress_kernel!(J, compressedJ, coloring, device)
     wait(ev)
 end
 
-# BATCH AUTODIFF
-# Init seeding
-function batch_init_seed_hessian!(dest, tmp, v::Matrix, nmap, device)
-    nbatch = size(dest, 2)
-    @inbounds for i in 1:nmap
-        for j in 1:nbatch
-            dest[i, j] = ForwardDiff.Partials{1, Float64}(NTuple{1, Float64}(v[i, j]))
-        end
-    end
-    return
-end
-
-@kernel function _gpu_init_seed_hessian!(dest, v)
-    i, j = @index(Global, NTuple)
-    @inbounds dest[i, j] = ForwardDiff.Partials{1, Float64}(NTuple{1, Float64}(v[i, j]))
-end
-
-function batch_init_seed_hessian!(dest, tmp, v::CUDA.CuMatrix, nmap, device)
-    ndrange = (nmap, size(dest, 2))
-    ev = _gpu_init_seed_hessian!(device)(dest, v, ndrange=ndrange, dependencies=Event(device), workgroupsize=256)
-    wait(ev)
-end
-
-# Seeds
-@kernel function batch_seed_kernel_hessian!(
-    duals::AbstractMatrix{ForwardDiff.Dual{T, V, N}},
-    x::AbstractVector{V},
-    seeds::AbstractMatrix{ForwardDiff.Partials{N, V}}
-) where {T,V,N}
-    i, j = @index(Global, NTuple)
-    duals[i, j] = ForwardDiff.Dual{T,V,N}(x[i], seeds[i, j])
-end
-
-function batch_seed_hessian!(t1sseeds, varx, t1svarx, device)
-    kernel! = batch_seed_kernel_hessian!(device)
-    nvars = size(t1svarx, 1)
-    nbatch = size(t1svarx, 2)
-    ndrange = (nvars, nbatch)
-    ev = kernel!(t1svarx, varx, t1sseeds, ndrange=ndrange, dependencies=Event(device), workgroupsize=256)
-    wait(ev)
-end
-
-@kernel function batch_seed_kernel_jacobian!(
-    duals::AbstractMatrix{ForwardDiff.Dual{T, V, N}},
-    x::AbstractMatrix{V},
-    seeds::AbstractVector{ForwardDiff.Partials{N, V}}
-) where {T,V,N}
-    i, j = @index(Global, NTuple)
-    duals[i, j] = ForwardDiff.Dual{T,V,N}(x[i, j], seeds[i])
-end
-
-function batch_seed_jacobian!(t1sseeds, varx, t1svarx, device)
-    kernel! = batch_seed_kernel_jacobian!(device)
-    nvars = size(t1svarx, 1)
-    nbatch = size(t1svarx, 2)
-    ndrange = (nvars, nbatch)
-    ev = kernel!(t1svarx, varx, t1sseeds, ndrange=ndrange, dependencies=Event(device), workgroupsize=256)
-    wait(ev)
-end
-
-# Partials
-@kernel function batch_getpartials_hv_kernel!(hv, adj_t1sx, map)
-    i, j = @index(Global, NTuple)
-    hv[i, j] = ForwardDiff.partials(adj_t1sx[map[i], j]).values[1]
-end
-
-function batch_partials_hessian!(hv::AbstractMatrix, adj_t1sx, map, device)
-    kernel! = batch_getpartials_hv_kernel!(device)
-    nvars = size(hv, 1)
-    nbatch = size(hv, 2)
-    ndrange = (nvars, nbatch)
-    ev = kernel!(hv, adj_t1sx, map, ndrange=ndrange, dependencies=Event(device), workgroupsize=256)
-    wait(ev)
-end
-
-@kernel function batch_getpartials_jac_kernel!(compressedJ, t1sF)
-    i, j = @index(Global, NTuple)
-    compressedJ[:, i, j] .= ForwardDiff.partials.(t1sF[i, j]).values
-end
-
-@kernel function batch_getpartials_jac_kernel_gpu!(compressedJ, t1sF)
-    i, j = @index(Global, NTuple)
-    p = ForwardDiff.partials.(t1sF[i, j]).values
-    for k in eachindex(p)
-        @inbounds compressedJ[k, i, j] = p[k]
-    end
-end
-
-function batch_partials_jacobian!(compressedJ::AbstractArray{T, 3}, t1sF, device) where T
-    kernel! = batch_getpartials_jac_kernel_gpu!(device)
-    ndrange = size(t1sF)
-    ev = kernel!(compressedJ, t1sF, ndrange=ndrange, dependencies=Event(device), workgroupsize=256)
-    wait(ev)
-end
-
-# Uncompress kernels
-@kernel function batch_uncompress_kernel_gpu!(J_rowPtr, J_colVal, J_nzVal, compressedJ, coloring)
-    i, j = @index(Global, NTuple)
-    for k in J_rowPtr[i]:J_rowPtr[i+1]-1
-        J_nzVal[k, j] = compressedJ[coloring[J_colVal[k]], i, j]
-    end
-end
-
-@kernel function batch_uncompress_kernel_cpu!(J_colptr, J_rowval, J_nzval, compressedJ, coloring)
-    i, j = @index(Global, NTuple)
-    @inbounds for k in J_colptr[i]:J_colptr[i+1]-1
-        @inbounds J_nzval[j][k] = compressedJ[coloring[i], J_rowval[k], j]
-    end
-end
-
-function batch_uncompress!(Js, compressedJ, coloring, device)
-    if isa(device, GPU)
-        kernel! = batch_uncompress_kernel_gpu!(device)
-        ndrange = (size(Js, 2), size(compressedJ, 3))
-        ev = kernel!(Js.rowPtr, Js.colVal, Js.nzVal, compressedJ, coloring, ndrange=ndrange, dependencies=Event(device))
-    else
-        kernel! = batch_uncompress_kernel_cpu!(device)
-        Jsnzval = Vector{Float64}[J.nzval for J in Js]
-        J = Js[1]
-        ndrange = (size(J, 2), size(compressedJ, 3))
-        ev = kernel!(J.colptr, J.rowval, Jsnzval, compressedJ, coloring, ndrange=ndrange, dependencies=Event(device))
-    end
-    wait(ev)
-end
-
 end
diff --git a/src/cuda_wrapper.jl b/src/cuda_wrapper.jl
index f1f89841..7b09a9e2 100644
--- a/src/cuda_wrapper.jl
+++ b/src/cuda_wrapper.jl
@@ -10,11 +10,11 @@ function PolarForm(pf::PS.PowerNetwork, device::CUDADevice)
     return PolarForm{Float64, CuVector{Int}, CuVector{Float64}, CuMatrix{Float64}}(pf, device)
 end
 
-default_sparse_matrix(::CUDADevice) = CuSparseMatrixCSR
+default_sparse_matrix(::CUDADevice) = CuSparseMatrixCSR{Float64, Int32}
 xnorm(x::CUDA.CuVector) = CUBLAS.nrm2(x)
 
 function get_jacobian_types(::CUDADevice)
-    SMT = CuSparseMatrixCSR
+    SMT = CuSparseMatrixCSR{Float64, Int32}
     A = CUDA.CuVector
     return SMT, A
 end
@@ -25,6 +25,8 @@ function Base.unsafe_wrap(Atype::Type{CUDA.CuArray{T, 1, CUDA.Mem.DeviceBuffer}}
     unsafe_wrap(CUDA.CuArray{T, 1}, p, (dim,); own, ctx)
 end
 
+CuSparseMatrixCSR{Tv, Int32}(A::SparseMatrixCSC{Tv, Ti}) where {Tv, Ti} = CuSparseMatrixCSR(A)
+
 #=
     LinearSolvers
 =#
@@ -82,26 +84,40 @@ end
 #=
     Generic SpMV for CuSparseMatrixCSR
 =#
+function ForwardDiff.npartials(vec::CuVector{ForwardDiff.Dual{T, V, N}}) where {T, V, N}
+    return N
+end
 
 # Differentiable LinearAlgebra.mul! for ForwardDiff
-@kernel function _spmm_kernel!(Y, X, colVal, rowPtr, nzVal, alpha, beta, n, m)
+@kernel function _spmv_csr_kernel!(Y, X, colVal, rowPtr, nzVal, alpha, beta, n, m)
     i, k = @index(Global, NTuple)
-    Y[i, k] *= beta
+    Y[k, i] *= beta
     @inbounds for c in rowPtr[i]:rowPtr[i+1]-1
         j = colVal[c]
-        Y[i, k] += alpha * nzVal[c] * X[j, k]
+        Y[k, i] += alpha * nzVal[c] * X[k, j]
     end
 end
 
-function LinearAlgebra.mul!(Y::AbstractArray{T, 1}, A::CUSPARSE.CuSparseMatrixCSR, X::AbstractArray{T, 1}, alpha::Number, beta::Number) where {T <: ForwardDiff.Dual}
+function LinearAlgebra.mul!(
+    Y::CuArray{T, 1},
+    A::CUSPARSE.CuSparseMatrixCSR,
+    X::AbstractArray{T, 1},
+    alpha::Number, beta::Number,
+) where {T <: ForwardDiff.Dual}
     n, m = size(A)
-    p = 1
     @assert size(Y, 1) == n
     @assert size(X, 1) == m
 
+    N = ForwardDiff.npartials(Y)
+    p = 1 + N
+
+    # Reinterpret duals as double.
+    Ys = reshape(reinterpret(Float64, Y), p, n)
+    Xs = reshape(reinterpret(Float64, X), p, m)
+
     ndrange = (n, p)
-    ev = _spmm_kernel!(CUDADevice())(
-        Y, X, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m,
+    ev = _spmv_csr_kernel!(CUDADevice())(
+        Ys, Xs, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m,
         ndrange=ndrange,
     )
     wait(ev)
@@ -138,7 +154,7 @@ function LinearAlgebra.mul!(
 
     B = A.parent
 
-    nthreads = 32
+    nthreads = 256
     threads_y = p
     threads_x = div(nthreads, threads_y)
     threads = (threads_x, threads_y)
diff --git a/test/Polar/hessian.jl b/test/Polar/hessian.jl
index 1530472e..e822d72c 100644
--- a/test/Polar/hessian.jl
+++ b/test/Polar/hessian.jl
@@ -34,15 +34,19 @@ function test_hessprod_with_finitediff(polar, device, MT; rtol=1e-6, atol=1e-6)
     ExaPF.hprod!(HessianAD, dev_projp, stack, dev_μ, dev_tgt)
     projp = Array(dev_projp)
 
-    function lagr_x(z)
+    ∂stack = ExaPF.NetworkStack(polar)
+    empty!(∂stack)
+    function grad_lagr_x(z)
         stack.input[mymap] .= z
         mycons(c, stack)
-        return dot(μ, c)
+        empty!(∂stack)
+        ExaPF.adjoint!(mycons, ∂stack, stack, dev_μ)
+        return ∂stack.input[mymap]
     end
     x0 = stack.input[mymap]
-    H_fd = FiniteDiff.finite_difference_hessian(lagr_x, x0)
+    H_fd = FiniteDiff.finite_difference_jacobian(grad_lagr_x, x0)
     proj_fd = similar(x0, nx+nu)
-    mul!(proj_fd, H_fd.data, dev_tgt, 1, 0)
+    mul!(proj_fd, H_fd, dev_tgt, 1, 0)
 
     @test myisapprox(projp, Array(proj_fd), rtol=rtol)
 end
@@ -56,28 +60,36 @@ function test_full_space_hessian(polar, device, MT)
     mymap = [ExaPF.my_map(polar, State()); ExaPF.my_map(polar, Control())]
 
     constraints = [
-        # ExaPF.CostFunction(polar),
+        ExaPF.CostFunction(polar),
         ExaPF.PowerFlowBalance(polar),
+        ExaPF.VoltageMagnitudePQ(polar),
         ExaPF.PowerGenerationBounds(polar),
         ExaPF.LineFlows(polar),
     ]
     mycons = ExaPF.MultiExpressions(constraints) ∘ basis
 
     m = length(mycons)
-    y = rand(m) |> MT
+    y_cpu = rand(m)
+    y = y_cpu |> MT
 
     hess = ExaPF.FullHessian(polar, mycons, mymap)
     H = ExaPF.hessian!(hess, stack, y)
+
     c = zeros(m) |> MT
+    ∂stack = ExaPF.NetworkStack(polar)
 
-    function hess_fd_x(x)
+    function grad_fd_x(x)
         stack.input[mymap] .= x
         mycons(c, stack)
-        return dot(c, y)
+        empty!(∂stack)
+        ExaPF.adjoint!(mycons, ∂stack, stack, y)
+        return ∂stack.input[mymap]
     end
     x = stack.input[mymap]
-    Hd = FiniteDiff.finite_difference_hessian(hess_fd_x, x)
-    @test myisapprox(Hd.data, H, rtol=1e-5)
+    Hd = FiniteDiff.finite_difference_jacobian(grad_fd_x, x)
+
+    # Test that both Hessians match
+    @test myisapprox(Hd, H, rtol=1e-5)
     return
 end
 

From 990eca8174e581c73c8b8a7ad4469728f7eecda6 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Fri, 14 Jan 2022 15:30:04 -0600
Subject: [PATCH 30/34] add missing functions for Argos

---
 src/LinearSolvers/LinearSolvers.jl | 15 ---------
 src/Polar/functions.jl             | 51 ++++++++++++++++++++++--------
 src/cuda_wrapper.jl                | 27 ++++++++++++++++
 3 files changed, 64 insertions(+), 29 deletions(-)

diff --git a/src/LinearSolvers/LinearSolvers.jl b/src/LinearSolvers/LinearSolvers.jl
index 5b26e711..3aef9826 100644
--- a/src/LinearSolvers/LinearSolvers.jl
+++ b/src/LinearSolvers/LinearSolvers.jl
@@ -107,17 +107,6 @@ function update!(s::DirectSolver, J::AbstractMatrix)
     lu!(s.factorization, J) # Update factorization inplace
 end
 
-function lsolve!(s::DirectSolver, y::AbstractArray)
-    LinearAlgebra.ldiv!(s.factorization, y)
-end
-function lsolve!(s::DirectSolver, y::AbstractArray, x::AbstractArray)
-    LinearAlgebra.ldiv!(y, s.factorization, x)
-end
-
-function rsolve!(s::DirectSolver, y::AbstractArray, x::AbstractArray)
-    LinearAlgebra.ldiv!(y, s.factorization', x)
-end
-
 # Reuse factorization in update
 function ldiv!(s::DirectSolver{<:LinearAlgebra.Factorization}, y::AbstractVector, J::AbstractMatrix, x::AbstractVector)
     LinearAlgebra.ldiv!(y, s.factorization, x) # Forward-backward solve
@@ -137,10 +126,6 @@ function rdiv!(s::DirectSolver{<:LinearAlgebra.Factorization}, y::AbstractArray,
     LinearAlgebra.ldiv!(y, s.factorization', x) # Forward-backward solve
     return 0
 end
-function rdiv!(s::DirectSolver{<:LinearAlgebra.Factorization}, y::Array, J::SparseMatrixCSC, x::Array)
-    LinearAlgebra.ldiv!(y, s.factorization', x) # Forward-backward solve
-    return 0
-end
 
 function ldiv!(::DirectSolver{Nothing}, y::Vector, J::AbstractMatrix, x::Vector)
     F = lu(J)
diff --git a/src/Polar/functions.jl b/src/Polar/functions.jl
index e080bc31..3c4e035c 100644
--- a/src/Polar/functions.jl
+++ b/src/Polar/functions.jl
@@ -1,8 +1,23 @@
 
 
-abstract type AbstractStack end
+abstract type AbstractStack{VT} end
 
-struct NetworkStack{VT,NT} <: AbstractStack
+
+function Base.copyto!(stack::AbstractStack{VT}, map::AbstractVector{Int}, src::VT) where VT
+    @assert length(map) == length(src)
+    for i in eachindex(map)
+        stack.input[map[i]] = src[i]
+    end
+end
+
+function Base.copyto!(dest::VT, stack::AbstractStack{VT}, map::AbstractVector{Int}) where VT
+    @assert length(map) == length(dest)
+    for i in eachindex(map)
+        dest[i] = stack.input[map[i]]
+    end
+end
+
+struct NetworkStack{VT,NT} <: AbstractStack{VT}
     # INPUT
     input::VT
     vmag::VT # voltage magnitudes
@@ -41,16 +56,22 @@ function NetworkStack(nbus, ngen, nlines, VT)
     return NetworkStack(input, vmag, vang, pgen, ψ, intermediate)
 end
 
+function init!(polar::PolarForm, stack::NetworkStack)
+    vmag = abs.(polar.network.vbus)
+    vang = angle.(polar.network.vbus)
+    pg = get(polar.network, PS.ActivePower())
+
+    copyto!(stack.vmag, vmag)
+    copyto!(stack.vang, vang)
+    copyto!(stack.pgen, pg)
+end
+
 function NetworkStack(polar::PolarForm{T,VI,VT,MT}) where {T,VI,VT,MT}
     nbus = get(polar, PS.NumberOfBuses())
     ngen = get(polar, PS.NumberOfGenerators())
     nlines = get(polar, PS.NumberOfLines())
-
     stack = NetworkStack(nbus, ngen, nlines, VT)
-    # Initiate with initial solution
-    copyto!(stack.vmag, abs.(polar.network.vbus))
-    copyto!(stack.vang, angle.(polar.network.vbus))
-    copyto!(stack.pgen, get(polar.network, PS.ActivePower()))
+    init!(polar, stack)
     return stack
 end
 
@@ -62,16 +83,18 @@ function Base.empty!(state::NetworkStack)
     return
 end
 
-function init!(polar::PolarForm, stack::NetworkStack)
-    vmag = abs.(polar.network.vbus)
-    vang = angle.(polar.network.vbus)
-    pg = get(polar.network, PS.ActivePower())
+function bounds(polar::PolarForm{T, VI, VT, MT}, stack::NetworkStack) where {T, VI, VT, MT}
+    nbus = polar.network.nbus
+    vmag_min, vmag_max = PS.bounds(polar.network, PS.Buses(), PS.VoltageMagnitude())
+    vang_min, vang_max = fill(-Inf, nbus), fill(Inf, nbus)
+    pgen_min, pgen_max = PS.bounds(polar.network, PS.Generators(), PS.ActivePower())
 
-    copyto!(stack.vmag, vmag)
-    copyto!(stack.vang, vang)
-    copyto!(stack.pgen, pg)
+    lb = [vmag_min; vang_min; pgen_min]
+    ub = [vmag_max; vang_max; pgen_max]
+    return convert(VT, lb), convert(VT, ub)
 end
 
+
 voltage(buf::NetworkStack) = buf.vmag .* exp.(im .* buf.vang)
 voltage_host(buf::NetworkStack) = voltage(buf) |> Array
 
diff --git a/src/cuda_wrapper.jl b/src/cuda_wrapper.jl
index 7b09a9e2..c1311d79 100644
--- a/src/cuda_wrapper.jl
+++ b/src/cuda_wrapper.jl
@@ -27,6 +27,33 @@ end
 
 CuSparseMatrixCSR{Tv, Int32}(A::SparseMatrixCSC{Tv, Ti}) where {Tv, Ti} = CuSparseMatrixCSR(A)
 
+
+# AbstractStack
+
+@kernel function _transfer_to_input!(input, map, src)
+    i = @index(Global, Linear)
+    input[map[i]] = src[i]
+end
+
+@kernel function _transfer_fr_input!(dest, input, map)
+    i = @index(Global, Linear)
+    dest[i] = input[map[i]]
+end
+
+function Base.copyto!(stack::AbstractStack{VT}, map::AbstractVector{Int}, vals::VT) where {VT <: CuArray}
+    @assert length(map) == length(vals)
+    ndrange = (length(map),)
+    ev = _transfer_to_input!(CUDADevice())(stack.input, map, vals, ndrange=ndrange)
+    wait(ev)
+end
+
+function Base.copyto!(dest::VT, stack::AbstractStack{VT}, map::AbstractVector{Int}) where {VT <: CuArray}
+    @assert length(map) == length(vals)
+    ndrange = (length(map),)
+    ev = _transfer_fr_input!(CUDADevice())(dest, stack.input, map, ndrange=ndrange)
+    wait(ev)
+end
+
 #=
     LinearSolvers
 =#

From 88de7da4cc3b490e19c60cda79a6ffaec6c3fa92 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Fri, 14 Jan 2022 15:31:20 -0600
Subject: [PATCH 31/34] clean functions in matpower.jl

---
 src/PowerSystem/matpower.jl | 178 ------------------------------------
 1 file changed, 178 deletions(-)

diff --git a/src/PowerSystem/matpower.jl b/src/PowerSystem/matpower.jl
index 62a9df35..a27ddb27 100644
--- a/src/PowerSystem/matpower.jl
+++ b/src/PowerSystem/matpower.jl
@@ -131,181 +131,3 @@ function _matpower_hessian(V, Ybus, λ)
     return (G11, transpose(G21), G22)
 end
 
-function residual_hessian(V, Ybus, λ, pv, pq, ref)
-    # decompose vector
-    n = length(V)
-    λp = zeros(n) ; λq = zeros(n)
-    npv = length(pv)
-    npq = length(pq)
-    nref = length(ref)
-    λp[pv] = λ[1:npv]
-    λp[pq] = λ[npv+1:npv+npq]
-    λq[pq] = λ[npv+npq+1:end]
-
-    Gp11, Gp12, Gp22 = _matpower_hessian(V, Ybus, λp)
-    Pθθ = real.(Gp11)
-    Pvθ = real.(Gp12)
-    Pvv = real.(Gp22)
-
-    Gq11, Gq12, Gq22 = _matpower_hessian(V, Ybus, λq)
-    Qθθ = imag.(Gq11)
-    Qvθ = imag.(Gq12)
-    Qvv = imag.(Gq22)
-
-    # w.r.t. xx
-    H11 = Pθθ[pv, pv] + Qθθ[pv, pv]
-    H12 = Pθθ[pv, pq] + Qθθ[pv, pq]
-    H13 = Pvθ[pv, pq] + Qvθ[pv, pq]
-    H22 = Pθθ[pq, pq] + Qθθ[pq, pq]
-    H23 = Pvθ[pq, pq] + Qvθ[pq, pq]
-    H33 = Pvv[pq, pq] + Qvv[pq, pq]
-
-    Hxx = [
-        H11  H12  H13
-        H12' H22  H23
-        H13' H23' H33
-    ]::SparseMatrixCSC{Float64, Int}
-
-    # w.r.t. uu
-    H11 = Pvv[ref, ref] + Qvv[ref, ref]
-    H12 = Pvv[ref,  pv] + Qvv[ref,  pv]
-    H22 = Pvv[pv,   pv] + Qvv[pv,   pv]
-
-    Huu = [
-         H11  H12 spzeros(nref, npv)
-         H12' H22 spzeros(npv, npv)
-         spzeros(npv, nref + 2 * npv)
-    ]::SparseMatrixCSC{Float64, Int}
-
-    # w.r.t. xu
-    Pvθ = real.(transpose(Gp12))
-    Qvθ = imag.(transpose(Gq12))
-    H11 = Pvθ[ref, pv] + Qvθ[ref, pv]
-    H12 = Pvθ[ref, pq] + Qvθ[ref, pq]
-    H13 = Pvv[ref, pq] + Qvv[ref, pq]
-    H21 = Pvθ[pv,  pv] + Qvθ[pv,  pv]
-    H22 = Pvθ[pv,  pq] + Qvθ[pv,  pq]
-    H23 = Pvv[pv,  pq] + Qvv[pv,  pq]
-
-    Hxu = [
-        H11  H12  H13
-        H21  H22  H23
-        spzeros(npv, npv + 2 * npq)
-    ]::SparseMatrixCSC{Float64, Int}
-
-    return (Hxx, Hxu, Huu)
-end
-
-# ∂²pg_ref / ∂²x
-function active_power_hessian(V, Ybus, pv, pq, ref)
-    # decompose vector
-    n = length(V)
-    npv = length(pv)
-    npq = length(pq)
-    nref = length(ref)
-
-    λp = zeros(n)
-    # Pick only components wrt ref nodes
-    λp[ref] .= 1.0
-
-    G11, G12, G22 = _matpower_hessian(V, Ybus, λp)
-    Pθθ = real.(G11)
-    Pvθ = real.(G12)
-    Pvv = real.(G22)
-
-    H11 = Pθθ[pv, pv]
-    H12 = Pθθ[pv, pq]
-    H13 = Pvθ[pv, pq]
-    H22 = Pθθ[pq, pq]
-    H23 = Pvθ[pq, pq]
-    H33 = Pvv[pq, pq]
-
-    # w.r.t. xx
-    Hxx = [
-        H11  H12  H13
-        H12' H22  H23
-        H13' H23' H33
-    ]::SparseMatrixCSC{Float64, Int}
-
-    # w.r.t. uu
-    H11 = Pvv[ref, ref]
-    H12 = Pvv[ref, pv]
-    H22 = Pvv[pv, pv]
-
-    Huu = [
-         H11  H12 spzeros(nref, npv)
-         H12' H22 spzeros(npv, npv)
-         spzeros(npv, nref + 2 * npv)
-    ]::SparseMatrixCSC{Float64, Int}
-
-    # w.r.t. xu
-    Pvθ = real.(transpose(G12))
-    Qvθ = imag.(transpose(G12))
-    H11 = Pvθ[ref, pv]
-    H12 = Pvθ[ref, pq]
-    H13 = Pvv[ref, pq]
-    H21 = Pvθ[pv, pv]
-    H22 = Pvθ[pv, pq]
-    H23 = Pvv[pv, pq]
-
-    Hxu = [
-        H11  H12  H13
-        H21  H22  H23
-        spzeros(npv, npv + 2 * npq)
-    ]::SparseMatrixCSC{Float64, Int}
-
-    return (Hxx, Hxu, Huu)
-end
-
-# ∂²qg / ∂²x * λ
-function reactive_power_hessian(V, Ybus, λ, pv, pq, ref)
-    n = length(V)
-    npv = length(pv)
-    npq = length(pq)
-    nref = length(ref)
-
-    G11, G12, G22 = _matpower_hessian(V, Ybus, λ)
-    Qθθ = imag.(G11)
-    Qvθ = imag.(G12)
-    Qvv = imag.(G22)
-
-    H11 = Qθθ[pv, pv]
-    H12 = Qθθ[pv, pq]
-    H13 = Qvθ[pv, pq]
-    H22 = Qθθ[pq, pq]
-    H23 = Qvθ[pq, pq]
-    H33 = Qvv[pq, pq]
-
-    Hxx = [
-        H11  H12  H13
-        H12' H22  H23
-        H13' H23' H33
-    ]::SparseMatrixCSC{Float64, Int}
-
-    H11 = Qvv[ref, ref]
-    H12 = Qvv[ref, pv]
-    H22 = Qvv[pv, pv]
-
-    Huu = [
-         H11  H12 spzeros(nref, npv)
-         H12' H22 spzeros(npv, npv)
-         spzeros(npv, nref + 2* npv)
-    ]::SparseMatrixCSC{Float64, Int}
-
-    Pvθ = real.(transpose(G12))
-    Qvθ = imag.(transpose(G12))
-    H11 = Qvθ[ref, pv]
-    H12 = Qvθ[ref, pq]
-    H13 = Qvv[ref, pq]
-    H21 = Qvθ[pv, pv]
-    H22 = Qvθ[pv, pq]
-    H23 = Qvv[pv, pq]
-
-    Hxu = [
-        H11  H12  H13
-        H21  H22  H23
-        spzeros(npv, 2*npq+npv)
-    ]::SparseMatrixCSC{Float64, Int}
-    return (Hxx, Hxu, Huu)
-end
-

From faa13c368fcde4cb049866689919e415e2b47ef6 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Mon, 17 Jan 2022 15:05:51 -0600
Subject: [PATCH 32/34] [skip ci] more fixes on GPU

---
 src/cuda_wrapper.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cuda_wrapper.jl b/src/cuda_wrapper.jl
index c1311d79..ac027ef2 100644
--- a/src/cuda_wrapper.jl
+++ b/src/cuda_wrapper.jl
@@ -48,7 +48,7 @@ function Base.copyto!(stack::AbstractStack{VT}, map::AbstractVector{Int}, vals::
 end
 
 function Base.copyto!(dest::VT, stack::AbstractStack{VT}, map::AbstractVector{Int}) where {VT <: CuArray}
-    @assert length(map) == length(vals)
+    @assert length(map) == length(dest)
     ndrange = (length(map),)
     ev = _transfer_fr_input!(CUDADevice())(dest, stack.input, map, ndrange=ndrange)
     wait(ev)
@@ -145,7 +145,7 @@ function LinearAlgebra.mul!(
     ndrange = (n, p)
     ev = _spmv_csr_kernel!(CUDADevice())(
         Ys, Xs, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m,
-        ndrange=ndrange,
+        ndrange=ndrange, dependencies=Event(CUDADevice()),
     )
     wait(ev)
 end

From 8bd349183565a1d862c2d5712100ae4f3aa2d967 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Mon, 17 Jan 2022 18:30:01 -0600
Subject: [PATCH 33/34] fix: properly initialize sparsity pattern for Jacobians

---
 src/Polar/first_order.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/Polar/first_order.jl b/src/Polar/first_order.jl
index f069bf5c..d9eb3615 100644
--- a/src/Polar/first_order.jl
+++ b/src/Polar/first_order.jl
@@ -22,10 +22,8 @@ Base.size(jac::MyJacobian, n::Int) = size(jac.J, n)
 # Coloring
 function jacobian_sparsity(polar::PolarForm, func::AbstractExpression)
     nbus = get(polar, PS.NumberOfBuses())
-    Vre = Float64[i for i in 1:nbus]
-    Vim = Float64[i for i in nbus+1:2*nbus]
-    V = Vre .+ im .* Vim
-    return matpower_jacobian(polar, func, V)
+    v = polar.network.vbus .+ 0.01 .* rand(ComplexF64, nbus)
+    return matpower_jacobian(polar, func, v)
 end
 
 function get_jacobian_colors(polar::PolarForm, func::AbstractExpression, map::Vector{Int})

From d1b2dea50f8be95a028929289784fca913ad28a2 Mon Sep 17 00:00:00 2001
From: fpacaud <francoispacaud8@gmail.com>
Date: Mon, 17 Jan 2022 21:15:18 -0600
Subject: [PATCH 34/34] add support for multi-generators

---
 src/Polar/legacy.jl              |  2 +-
 src/PowerSystem/power_network.jl | 10 +---------
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/Polar/legacy.jl b/src/Polar/legacy.jl
index 960f3d76..3ddc2222 100644
--- a/src/Polar/legacy.jl
+++ b/src/Polar/legacy.jl
@@ -59,7 +59,7 @@ function matpower_jacobian(polar::PolarForm, func::PowerGenerationBounds, V)
 
     j21 = imag(dSbus_dVm[[ref; pv], :])
     j22 = imag(dSbus_dVa[[ref; pv], :])
-    j23 = spzeros(ngen, ngen)
+    j23 = spzeros(nref + npv, ngen)
     # w.r.t. control
     return [
         j11 j12 j13 ;
diff --git a/src/PowerSystem/power_network.jl b/src/PowerSystem/power_network.jl
index 2f8b8f0a..2f8c3b17 100644
--- a/src/PowerSystem/power_network.jl
+++ b/src/PowerSystem/power_network.jl
@@ -44,7 +44,7 @@ struct PowerNetwork <: AbstractPowerSystem
     sbus::Vector{Complex{Float64}}
     sload::Vector{Complex{Float64}}
 
-    function PowerNetwork(data::Dict{String, Array}; remove_lines=Int[], multi_generators=:aggregate)
+    function PowerNetwork(data::Dict{String, Array}; remove_lines=Int[])
         # Parsed data indexes
         BUS_I, BUS_TYPE, PD, QD, GS, BS, BUS_AREA, VM, VA, BASE_KV, ZONE, VMAX, VMIN,
         LAM_P, LAM_Q, MU_VMAX, MU_VMIN = IndexSet.idx_bus()
@@ -59,14 +59,6 @@ struct PowerNetwork <: AbstractPowerSystem
         # BUSES
         bus_id_to_indexes = get_bus_id_to_indexes(bus)
 
-        # GENERATORS
-        if has_multiple_generators(gen) && multi_generators == :aggregate
-            gen, σg = merge_multi_generators(gen)
-            if !isnothing(cost_coefficients)
-                cost_coefficients = merge_cost_coefficients(cost_coefficients, gen, σg)
-            end
-        end
-
         # LINES
         # Remove specified lines
         lines = get_active_branches(lines, remove_lines)