Skip to content

Commit

Permalink
fix: String.splitOn bug (#3832)
Browse files Browse the repository at this point in the history
Fixes #3829. As reported on Zulip (both
[recently](https://leanprover.zulipchat.com/#narrow/stream/270676-lean4/topic/current.20definition.20of.20.60String.2EsplitOn.60.20is.20incorrect/near/430930535)
and [a year
ago](https://leanprover.zulipchat.com/#narrow/stream/270676-lean4/topic/should.20we.20redefine.20.60String.2EsplitOnAux.60.3F/near/365899332)),
`String.splitOn` has a bug when dealing with separators of more than one
character (which are luckily rare). The code change here is very small,
replacing a `i` with `i - j`, but it makes termination more complex so
that's where the rest of the line count goes.
  • Loading branch information
digama0 authored Apr 4, 2024
1 parent d988849 commit e41cd31
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 4 deletions.
50 changes: 46 additions & 4 deletions src/Init/Data/String/Basic.lean
Original file line number Diff line number Diff line change
Expand Up @@ -245,12 +245,21 @@ termination_by s.endPos.1 - i.1
@[specialize] def split (s : String) (p : Char → Bool) : List String :=
splitAux s p 0 0 []

/--
Auxiliary for `splitOn`. Preconditions:
* `sep` is not empty
* `b <= i` are indexes into `s`
* `j` is an index into `sep`, and not at the end
It represents the state where we have currently parsed some split parts into `r` (in reverse order),
`b` is the beginning of the string / the end of the previous match of `sep`, and the first `j` bytes
of `sep` match the bytes `i-j .. i` of `s`.
-/
def splitOnAux (s sep : String) (b : Pos) (i : Pos) (j : Pos) (r : List String) : List String :=
if h : s.atEnd i then
if s.atEnd i then
let r := (s.extract b i)::r
r.reverse
else
have := Nat.sub_lt_sub_left (Nat.gt_of_not_le (mt decide_eq_true h)) (lt_next s _)
if s.get i == sep.get j then
let i := s.next i
let j := sep.next j
Expand All @@ -259,9 +268,42 @@ def splitOnAux (s sep : String) (b : Pos) (i : Pos) (j : Pos) (r : List String)
else
splitOnAux s sep b i j r
else
splitOnAux s sep b (s.next i) 0 r
termination_by s.endPos.1 - i.1
splitOnAux s sep b (s.next (i - j)) 0 r
termination_by (s.endPos.1 - (i - j).1, sep.endPos.1 - j.1)
decreasing_by
all_goals simp_wf
focus
rename_i h _ _
left; exact Nat.sub_lt_sub_left
(Nat.lt_of_le_of_lt (Nat.sub_le ..) (Nat.gt_of_not_le (mt decide_eq_true h)))
(Nat.lt_of_le_of_lt (Nat.sub_le ..) (lt_next s _))
focus
rename_i i₀ j₀ _ eq h'
rw [show (s.next i₀ - sep.next j₀).1 = (i₀ - j₀).1 by
show (_ + csize _) - (_ + csize _) = _
rw [(beq_iff_eq ..).1 eq, Nat.add_sub_add_right]; rfl]
right; exact Nat.sub_lt_sub_left
(Nat.lt_of_le_of_lt (Nat.le_add_right ..) (Nat.gt_of_not_le (mt decide_eq_true h')))
(lt_next sep _)
focus
rename_i h _
left; exact Nat.sub_lt_sub_left
(Nat.lt_of_le_of_lt (Nat.sub_le ..) (Nat.gt_of_not_le (mt decide_eq_true h)))
(lt_next s _)

/--
Splits a string `s` on occurrences of the separator `sep`. When `sep` is empty, it returns `[s]`;
when `sep` occurs in overlapping patterns, the first match is taken. There will always be exactly
`n+1` elements in the returned list if there were `n` nonoverlapping matches of `sep` in the string.
The default separator is `" "`. The separators are not included in the returned substrings.
```
"here is some text ".splitOn = ["here", "is", "some", "text", ""]
"here is some text ".splitOn "some" = ["here is ", " text "]
"here is some text ".splitOn "" = ["here is some text "]
"ababacabac".splitOn "aba" = ["", "bac", "c"]
```
-/
def splitOn (s : String) (sep : String := " ") : List String :=
if sep == "" then [s] else splitOnAux s sep 0 0 0 []

Expand Down
10 changes: 10 additions & 0 deletions tests/lean/string_imp.lean
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,13 @@
#eval "αβcc".mkIterator.next.next.pos
#eval "αβcc".mkIterator.next.setCurr 'a'
#eval "αβcd".mkIterator.toEnd.pos

#eval "012".splitOn "12"
#eval "007".splitOn "07"
#eval "ababcab".splitOn "abc"
#eval "αbαbcαbcαααbcα".splitOn "αb"
#eval "αbαbcαbcαααbcα".splitOn "αbcα"
#eval "here is some text ".splitOn
#eval "here is some text ".splitOn "some"
#eval "here is some text ".splitOn ""
#eval "ababacabac".splitOn "aba"
9 changes: 9 additions & 0 deletions tests/lean/string_imp.lean.expected.out
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,12 @@
{ byteIdx := 4 }
String.Iterator.mk "αacc" { byteIdx := 2 }
{ byteIdx := 6 }
["0", ""]
["0", ""]
["ab", "ab"]
["", "", "c", "cαα", "cα"]
["αb", "bcαα", ""]
["here", "is", "some", "text", ""]
["here is ", " text "]
["here is some text "]
["", "bac", "c"]

0 comments on commit e41cd31

Please sign in to comment.