From 14ad2513eaf15ae68cb46fc8af427813f1674387 Mon Sep 17 00:00:00 2001
From: Jaro Reinders <jaro.reinders@gmail.com>
Date: Sun, 30 Jul 2023 21:02:53 +0200
Subject: [PATCH] More docs

---
 experiments/Experiment2.hs | 106 ++++++++++++++++++++++++++++---------
 1 file changed, 80 insertions(+), 26 deletions(-)

diff --git a/experiments/Experiment2.hs b/experiments/Experiment2.hs
index a2b7452..35952e5 100644
--- a/experiments/Experiment2.hs
+++ b/experiments/Experiment2.hs
@@ -1,5 +1,73 @@
 {-# LANGUAGE GADTs #-}
 
+{-
+To deal with *left recursion*, we essentially transform the grammar
+following this general example:
+
+    X ::= X Y | Z    ==>    X ::= Z Y*    (1)
+
+Where Y* means repeating Y zero or more times. And note that any
+left-recursive nonterminal can be written in this form (if we allow an
+in-line disjuction operator or add more nonterminals), e.g.:
+
+    X ::= X + X | X - X | 0 | 1
+==>
+    X ::= X (+ X | - X) | (0 | 1)         (2)
+==> 
+    X ::= (0 | 1) (+ X | - X)*
+
+There are two main edge-cases: indirect left-recursion and empty
+productions. 
+
+We deal with *indirect left-recursion* using a combination of a static
+analysis, before parsing, and dynamic checks, during parsing. 
+
+* Statically, we recursively look through all nonterminals which are in
+  leftmost position to search for possible left-recursion. For each
+  left-recursive loop we find, we collect the continuation, e.g. Y in (1)
+  or + X and - X in (2).
+
+  This is done in the 'loops' function.
+
+* Dynamically, we prevent entering the same nonterminal twice by keeping
+  track of the visited nonterminals in a set. We clear the set whenever
+  the parser consumes an actual character.
+
+  This is done in the 'parse' function.
+
+As for *empty productions*, we don't deal with those yet. For now it is
+not that bad to avoid it manually. However, we do plan on resolving it
+before a 1.0 release. There seem to be two promising approaches:
+
+* Statically transform the grammar to factor out nonterminals which accept
+  the empty string. This could cause nonterminals to expand quadratically
+  if done naively, e.g.:
+
+      X  ::= X1  X2  X3  X4
+      X' ::= X1' X2  X3  X4
+           | X1  X2' X3  X4
+           | X1  X2  X3' X4
+           | X1  X2  X3  X4'
+
+  Where the primes indicate the non-empty variant of each nonterminal.
+   
+  It's also be possible to limit the blowup to be be linear if we add
+  more helper nonterminals, e.g.:
+     
+      X345 ::= X3 X45
+      X45  ::= X4 X5
+      X12  ::= X1 X2
+      X123 ::= X12 X3
+
+      X' ::= X1' X2 X345
+           | X1  X2' X345
+           | X12  X3' X45
+           | X123 X4' X5
+           | X123 X4 X5'
+
+* Dynamically enforce that input is consumed and bail out otherwise.
+-}
+
 import Control.Applicative
 import Data.Char
 import Data.Type.Equality
@@ -14,14 +82,12 @@ newtype P p a = P [P' p a] deriving Functor
 instance Applicative (P p) where
   pure x = P [Pure x]
   -- Note that the standard 'P ps <*> P qs = P [p <*> q | p <- ps, q <- qs]'
-  -- would **not** work because this would combine all the alternatives of
-  -- the second parser with the first parser. 
-  -- Essentially, that would mean we would have to choose an alternative
-  -- from the first parser **and** the second parser up front.
-  -- Instead, it is possible and better to postpone choosing the alternatives 
-  -- of the second parser.
-  -- In particular, the 'chain' combinator we use below depends on this
-  -- postponement.
+  -- would **not** work because this would combine all the alternatives of the
+  -- second parser with the first parser. Essentially, that would mean we would
+  -- have to choose an alternative from the first parser **and** the second
+  -- parser up front. Instead, it is possible and better to postpone choosing
+  -- the alternatives of the second parser. In particular, the 'chain'
+  -- combinator which we use further down depends on this postponement.
   P ps <*> q = asum (map (`go` q) ps) where
     go (Pure f) k' = fmap f k'
     go (Match c k) k' = P [Match c (k <*> k')]
@@ -43,24 +109,8 @@ send x = P [Free x (pure id)]
 parse :: forall f a. (GCompare f) => (forall x. f x -> P f x) -> f a -> String -> [a]
 parse g p0 xs0 = map fst $ filter ((== "") . snd) $ go mempty xs0 (g p0) where
 
-  -- We use the set 's :: Set (SomeNT f)' to avoid recursing into the same
-  -- nonterminal indefinitely, which would otherwise happen if the grammar
-  -- was left-recursive.  Of course that means we could miss those parses which
-  -- would require taking those loops. 
-
-  -- To account for those, we essentially transform following this general example:
-  -- X ::= X Y | Z
-  -- ==>
-  -- X ::= Z Y*
-  -- (where * means repeating zero or more times)
-  
-  -- More concretely, we analyse the grammar, simulating a single
-  -- left-recursive loop and record how parsing would continue after exiting
-  -- the loop at that point. Using the 'chain' combinator, we take this
-  -- continuation and run it zero or more times, thus simulating an arbitrary
-  -- number of left-recursive loops. We recover the missed parses by appending
-  -- this chained continuation to the body of each nonterminal we parse.
-
+  -- We use the set 's :: Set (Some f)' to avoid recursing into the same
+  -- nonterminal indefinitely.
   go :: Set (Some f) -> String -> P f b -> [(b, String)]
   go s xs (P ps) = go' s xs =<< ps
 
@@ -89,6 +139,10 @@ loops g x0 = go mempty (g x0) where
     -- TODO: what if 'x' accepts the empty string?
   go' _ _ = []
 
+--------------------------------------------------------------------------------
+-- Examples
+--------------------------------------------------------------------------------
+
 data Gram a where
   Digit :: Gram Int
   Number :: Gram Int