feat: show diffs when #guard_msgs fails (#3912)

Adds the ability to show a diff when `guard_msgs` fails, using the histogram diff algorithm pioneered in jgit. This algorithm tends to produce more user-friendly diffs, but it can be quadratic in the worst case. Empirically, the quadratic case of this implementation doesn't seem to be slow enough to matter for messages smaller than hundreds of megabytes, but if it's ever a problem, we can mitigate it the same way jgit does by falling back to Myers diff. See lean/run/guard_msgs.lean in the tests directory for some examples of its output.
leanprover · Apr 18, 2024 · b6d77be · b6d77be
1 parent 0c9f9ab
commit b6d77be
Show file tree

Hide file tree

Showing 11 changed files with 626 additions and 1 deletion.
diff --git a/RELEASES.md b/RELEASES.md
@@ -99,6 +99,10 @@ v4.8.0 (development in progress)
   and `#guard_msgs (ordering := sorted) in cmd` sorts the messages in lexicographic order before checking.
   PR [#3883](https://github.com/leanprover/lean4/pull/3883).
 
+* The `#guard_msgs` command now supports showing a diff between the expected and actual outputs. This feature is currently
+  disabled by default, but can be enabled with `set_option guard_msgs.diff true`. Depending on user feedback, this option
+  may default to `true` in a future version of Lean.
+
 Breaking changes:
 
 * Automatically generated equational theorems are now named using suffix `.eq_<idx>` instead of `._eq_<idx>`, and `.def` instead of `._unfold`. Example:

diff --git a/src/Init/Data.lean b/src/Init/Data.lean
@@ -14,6 +14,7 @@ import Init.Data.String
 import Init.Data.List
 import Init.Data.Int
 import Init.Data.Array
+import Init.Data.Array.Subarray.Split
 import Init.Data.ByteArray
 import Init.Data.FloatArray
 import Init.Data.Fin

diff --git a/src/Init/Data/Array/Subarray.lean b/src/Init/Data/Array/Subarray.lean
@@ -29,6 +29,12 @@ namespace Subarray
 def size (s : Subarray α) : Nat :=
   s.stop - s.start
 
+theorem size_le_array_size {s : Subarray α} : s.size ≤ s.array.size := by
+  let {array, start, stop, start_le_stop, stop_le_array_size} := s
+  simp [size]
+  apply Nat.le_trans (Nat.sub_le stop start)
+  assumption
+
 def get (s : Subarray α) (i : Fin s.size) : α :=
   have : s.start + i.val < s.array.size := by
    apply Nat.lt_of_lt_of_le _ s.stop_le_array_size

diff --git a/src/Init/Data/Array/Subarray/Split.lean b/src/Init/Data/Array/Subarray/Split.lean
@@ -0,0 +1,71 @@
+/-
+Copyright (c) 2024 Lean FRO, LLC. All rights reserved.
+Released under Apache 2.0 license as described in the file LICENSE.
+Authors: David Thrane Christiansen
+-/
+
+prelude
+import Init.Data.Array.Basic
+import Init.Data.Array.Subarray
+import Init.Omega
+
+/-
+This module contains splitting operations on subarrays that crucially rely on `omega` for proof
+automation. Placing them in another module breaks an import cycle, because `omega` itself uses the
+array library.
+-/
+
+namespace Subarray
+/--
+Splits a subarray into two parts.
+-/
+def split (s : Subarray α) (i : Fin s.size.succ) : (Subarray α × Subarray α) :=
+  let ⟨i', isLt⟩ := i
+  have := s.start_le_stop
+  have := s.stop_le_array_size
+  have : i' ≤ s.stop - s.start := Nat.lt_succ.mp isLt
+  have : s.start + i' ≤ s.stop := by omega
+  have : s.start + i' ≤ s.array.size := by omega
+  have : s.start + i' ≤ s.stop := by
+    simp only [size] at isLt
+    omega
+  let pre := {s with
+    stop := s.start + i',
+    start_le_stop := by omega,
+    stop_le_array_size := by assumption
+  }
+  let post := {s with
+    start := s.start + i'
+    start_le_stop := by assumption
+  }
+  (pre, post)
+
+/--
+Removes the first `i` elements of the subarray. If there are `i` or fewer elements, the resulting
+subarray is empty.
+-/
+def drop (arr : Subarray α) (i : Nat) : Subarray α where
+  array := arr.array
+  start := min (arr.start + i) arr.stop
+  stop := arr.stop
+  start_le_stop := by
+    rw [Nat.min_def]
+    split <;> simp only [Nat.le_refl, *]
+  stop_le_array_size := arr.stop_le_array_size
+
+/--
+Keeps only the first `i` elements of the subarray. If there are `i` or fewer elements, the resulting
+subarray is empty.
+-/
+def take (arr : Subarray α) (i : Nat) : Subarray α where
+  array := arr.array
+  start := arr.start
+  stop := min (arr.start + i) arr.stop
+  start_le_stop := by
+    have := arr.start_le_stop
+    rw [Nat.min_def]
+    split <;> omega
+  stop_le_array_size := by
+    have := arr.stop_le_array_size
+    rw [Nat.min_def]
+    split <;> omega
diff --git a/src/Lean/Elab/GuardMsgs.lean b/src/Lean/Elab/GuardMsgs.lean
@@ -5,6 +5,7 @@ Authors: Kyle Miller
 -/
 prelude
 import Lean.Elab.Notation
+import Lean.Util.Diff
 import Lean.Server.CodeActions.Attr
 
 /-! `#guard_msgs` command for testing commands
@@ -15,6 +16,12 @@ See the docstring on the `#guard_msgs` command.
 
 open Lean Parser.Tactic Elab Command
 
+register_builtin_option guard_msgs.diff : Bool := {
+  defValue := false
+  descr := "When true, show a diff between expected and actual messages if they don't match. "
+}
+
+
 namespace Lean.Elab.Tactic.GuardMsgs
 
 /-- Gives a string representation of a message without source position information.
@@ -151,7 +158,12 @@ def MessageOrdering.apply (mode : MessageOrdering) (msgs : List String) : List S
     else
       -- Failed. Put all the messages back on the message log and add an error
       modify fun st => { st with messages := initMsgs ++ msgs }
-      logErrorAt tk m!"❌ Docstring on `#guard_msgs` does not match generated message:\n\n{res}"
+      let feedback :=
+        if (← getOptions).getBool `guard_msgs.diff false then
+          let diff := Diff.diff (expected.split (· == '\n')).toArray (res.split (· == '\n')).toArray
+          Diff.linesToString diff
+        else res
+      logErrorAt tk m!"❌ Docstring on `#guard_msgs` does not match generated message:\n\n{feedback}"
       pushInfoLeaf (.ofCustomInfo { stx := ← getRef, value := Dynamic.mk (GuardMsgFailure.mk res) })
   | _ => throwUnsupportedSyntax
 

diff --git a/src/Lean/Util/Diff.lean b/src/Lean/Util/Diff.lean
@@ -0,0 +1,198 @@
+/-
+Copyright (c) 2024 Lean FRO, LLC. All rights reserved.
+Released under Apache 2.0 license as described in the file LICENSE.
+Authors: David Thrane Christiansen
+-/
+prelude
+import Init.Data
+import Lean.Data.HashMap
+import Init.Omega
+
+namespace Lean.Diff
+/--
+An action in an edit script to transform one source sequence into a target sequence.
+-/
+inductive Action where
+  /-- Insert the item into the source -/
+  | insert
+  /-- Delete the item from the source -/
+  | delete
+  /-- Leave the item in the source -/
+  | skip
+deriving Repr, BEq, Hashable, Repr
+
+instance : ToString Action where
+  toString
+    | .insert => "insert"
+    | .delete => "delete"
+    | .skip => "skip"
+
+/--
+Determines a prefix to show on a given line when printing diff results.
+
+For `delete`, the prefix is `"-"`, for `insert`, it is `"+"`, and for `skip` it is `" "`.
+-/
+def Action.linePrefix : (action : Action) → String
+  | .delete => "-"
+  | .insert => "+"
+  | .skip => " "
+
+/--
+A histogram entry consists of the number of times the element occurs in the left and right input
+arrays along with an index at which the element can be found, if applicable.
+-/
+structure Histogram.Entry (α : Type u) (lsize rsize : Nat) where
+  /-- How many times the element occurs in the left array -/
+  leftCount : Nat
+  /-- An index of the element in the left array, if applicable -/
+  leftIndex : Option (Fin lsize)
+  /-- Invariant: the count is non-zero iff there is a saved index -/
+  leftWF : leftCount = 0 ↔ leftIndex = none
+  /-- How many times the element occurs in the right array -/
+  rightCount : Nat
+  /-- An index of the element in the right array, if applicable -/
+  rightIndex : Option (Fin rsize)
+  /-- Invariant: the count is non-zero iff there is a saved index -/
+  rightWF : rightCount = 0 ↔ rightIndex = none
+
+/-- A histogram for arrays maps each element to a count and, if applicable, an index.-/
+def Histogram (α : Type u) (lsize rsize : Nat) [BEq α] [Hashable α] :=
+  Lean.HashMap α (Histogram.Entry α lsize rsize)
+
+
+section
+
+variable [BEq α] [Hashable α]
+
+/-- Add an element from the left array to a histogram -/
+def Histogram.addLeft (histogram : Histogram α lsize rsize) (index : Fin lsize) (val : α)
+    : Histogram α lsize rsize :=
+  match histogram.find? val with
+  | none => histogram.insert val {
+      leftCount := 1, leftIndex := some index,
+      leftWF := by simp,
+      rightCount := 0, rightIndex := none
+      rightWF := by simp
+    }
+  | some x => histogram.insert val {x with
+      leftCount := x.leftCount + 1, leftIndex := some index, leftWF := by simp
+    }
+
+/-- Add an element from the right array to a histogram -/
+def Histogram.addRight (histogram : Histogram α lsize rsize) (index : Fin rsize) (val : α)
+    : Histogram α lsize rsize :=
+  match histogram.find? val with
+  | none => histogram.insert val {
+      leftCount := 0, leftIndex := none,
+      leftWF := by simp,
+      rightCount := 1, rightIndex := some index,
+      rightWF := by simp
+    }
+  | some x => histogram.insert val {x with
+      rightCount := x.leftCount + 1, rightIndex := some index,
+      rightWF := by simp
+    }
+
+/-- Given two `Subarray`s, find their common prefix and return their differing suffixes -/
+def matchPrefix (left right : Subarray α) : Array α × Subarray α × Subarray α :=
+  let rec go (pref : Array α) : Array α × Subarray α × Subarray α :=
+    let i := pref.size
+    if _ : i < left.size ∧ i < right.size then
+      if left[i]'(by omega) == right[i]'(by omega) then
+        go <| pref.push <| left[i]'(by omega)
+      else (pref, left.drop pref.size, right.drop pref.size)
+    else (pref, left.drop pref.size, right.drop pref.size)
+  termination_by left.size - pref.size
+  go #[]
+
+
+/-- Given two `Subarray`s, find their common suffix and return their differing prefixes -/
+def matchSuffix (left right : Subarray α) : Subarray α × Subarray α × Array α :=
+  let rec go (i : Nat) : Subarray α × Subarray α × Array α :=
+    if _ : i < left.size ∧ i < right.size then
+      if left[left.size - i - 1]'(by omega) == right[right.size - i - 1]'(by omega) then
+        go i.succ
+      else
+        (left.take (left.size - i), right.take (right.size - i), (left.drop (left.size - i)).toArray)
+    else
+      (left.take (left.size - i), right.take (right.size - i), (left.drop (left.size - i)).toArray)
+    termination_by left.size - i
+  go 0
+
+/--
+Finds the least common subsequence of two arrays using a variant of jgit's histogram diff algorithm.
+
+Unlike jgit's implementation, this one does not perform a cutoff on histogram bucket sizes, nor does
+it fall back to the Myers diff algorithm. This means that it has quadratic worst-case complexity.
+Empirically, however, even quadratic cases of this implementation can handle hundreds of megabytes
+in a second or so, and it is intended to be used for short strings like error messages. Be
+cautious when applying it to larger workloads.
+-/
+partial def lcs (left right : Subarray α) : Array α := Id.run do
+  let (pref, left, right) := matchPrefix left right
+  let (left, right, suff) := matchSuffix left right
+  let mut hist : Histogram α left.size right.size := .empty
+  for h : i in [0:left.size] do
+    hist := hist.addLeft ⟨i, Membership.get_elem_helper h rfl⟩ left[i]
+  for h : i in [0:right.size] do
+    hist := hist.addRight ⟨i, Membership.get_elem_helper h rfl⟩ right[i]
+  let mut best := none
+  for (k, v) in hist.toList do
+    if let {leftCount := lc, leftIndex := some li, rightCount := rc, rightIndex := some ri, ..} := v then
+      match best with
+      | none =>
+        best := some ((lc + rc), k, li, ri)
+      | some (count, _) =>
+        if lc + rc < count then
+          best := some ((lc + rc), k, li, ri)
+  let some (_, v, li, ri) := best
+    | return pref ++ suff
+
+  let lefts := left.split ⟨li.val, by cases li; simp_arith; omega⟩
+  let rights := right.split ⟨ri.val, by cases ri; simp_arith; omega⟩
+
+  pref ++ lcs lefts.1 rights.1 ++ #[v] ++ lcs (lefts.2.drop 1) (rights.2.drop 1) ++ suff
+
+/--
+Computes an edit script to transform `left` into `right`.
+-/
+def diff [Inhabited α] (original edited : Array α) : Array (Action × α) :=
+  if h : ¬(0 < original.size) then
+    edited.map (.insert, ·)
+  else if h' : ¬(0 < edited.size) then
+    original.map (.delete, ·)
+  else Id.run do
+    have : original.size > 0 := by omega
+    have : edited.size > 0 := by omega
+    let mut out := #[]
+    let ds := lcs original.toSubarray edited.toSubarray
+    let mut i := 0
+    let mut j := 0
+    for d in ds do
+      while i < original.size && original[i]! != d do
+        out := out.push <| (.delete, original[i]!)
+        i := i.succ
+      while j < edited.size && edited[j]! != d do
+        out := out.push <| (.insert, edited[j]!)
+        j := j.succ
+      out := out.push <| (.skip, d)
+      i := i.succ
+      j := j.succ
+    while h : i < original.size do
+      out := out.push <| (.delete, original[i])
+      i := i.succ
+    while h : j < edited.size do
+      out := out.push <| (.insert, edited[j])
+      j := j.succ
+    out
+
+/-- Shows a line-by-line edit script with markers for added and removed lines. -/
+def linesToString [ToString α] (lines : Array (Action × α)) : String := Id.run do
+  let mut out : String := ""
+  for (act, line) in lines do
+    let lineStr := toString line
+    if lineStr == "" then
+      out := out ++ s!"{act.linePrefix}\n"
+    else
+      out := out ++ s!"{act.linePrefix} {lineStr}\n"
+  out