-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgrammar.go
239 lines (197 loc) · 6.95 KB
/
grammar.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
// package grammar allows defining regexp rules with comments, whitespace and
// newlines to make them less dense, and easier to read:
//
// ` // a NUMBER
// [+-]? // first, match an optional sign
// ( // then match integers or f.p. mantissas:
// \d+\.\d+ // mantissa of the form a.b
// |\d+\. // mantissa of the form a.
// |\.\d+ // mantissa of the form .b
// |\d+ // integer of the form a
// )
// ( [eE] [+-]? \d+ )? // finally, optionally match an exponent
// `
//
// result: [+-]?(\d+\.\d+|\d+\.|\.\d+|\d+)([eE][+-]?\d+)?
//
// Complex rules can be assembled by simpler rules (subrules) using string interpolation.
//
// `
// ^
// ${NUMBER} // start with number
// (?: // don't capture
// \s+ ${NUMBER} // followed by one ore more numbers, separated by whitespace
// )+
// $
// `
//
// Any number of rules can be added to a grammar, dependent or independent,
// as long as there are no cyclic dependencies.
//
package grammar
import (
"fmt"
"regexp"
)
// Make rule names type safe, too many strings on the road.
type ruleName string
// isValid reports whether the rulename adheres to a certain pattern.
func (name ruleName) isValid() bool {
return rxMatchSubRuleStrict.MatchString(string(name))
}
// substitute/interploate ${SUBRULE} with final string.
type replaceMap map[ruleName]string
var (
// regexps for trim.
rxComment = regexp.MustCompile(`(?m://.*$)`)
rxSpaces = regexp.MustCompile(`\s+`)
// regexps for interpolation.
rxGrepSubRuleRelaxed = regexp.MustCompile(Trim(`\$\{ (?P<SUBRULE> [^{}]+ ) \}`))
rxMatchSubRuleStrict = regexp.MustCompile(Trim(`^ [a-zA-Z_] \w* $`))
)
// Trim removes all comments and whitespace from string.
//
// Trim is a helper function and would normally not be public,
// but it is also helpful if you don't want to build whole grammars,
// but just want to remove whitespace and comments from patterns.
func Trim(s string) string {
s = rxComment.ReplaceAllString(s, "")
s = rxSpaces.ReplaceAllString(s, "")
return s
}
// Grammar is a container for related and maybe dependent rules.
// Subrules are string interpolated in other rules before compiling to regexp.
type Grammar struct {
name string // give the grammar a name
rules map[ruleName]*rule // the map of all rules, the rule name is the key
compiled bool // all dependencies are resolved und all rules are compiled
}
// rule is a container for a regexp, based on a raw string, ?trimmed?,
// parsed and interpolated with regexp strings from other rules in same grammar.
type rule struct {
name ruleName // give the rule a name
subrules []ruleName // a slice of all ${SUBRULE} the rule depends on
pattern string // the input, trimmed or unaltered
final string // all subrules interpolated
rx *regexp.Regexp // the compiled regexp
}
// New initializes a new grammar.
func New(name string) *Grammar {
return &Grammar{
name: name,
rules: make(map[ruleName]*rule),
}
}
// Add rule to grammar, returns error if rule with same name already exists
// or grammar is already compiled. The pattern string gets trimmed.
func (g *Grammar) Add(name string, pattern string) error {
return g.add(ruleName(name), Trim(pattern))
}
// AddVerbatim is similar to Add, but no trimming takes place.
// Use this method if whitespace is significant.
func (g *Grammar) AddVerbatim(name string, pattern string) error {
return g.add(ruleName(name), pattern)
}
func (g *Grammar) add(ruleName ruleName, pattern string) error {
if !ruleName.isValid() {
return fmt.Errorf("grammar %q, rulename %q not allowed", g.name, ruleName)
}
if g.compiled {
return fmt.Errorf("grammar %q is already compiled, can't add rule %q", g.name, ruleName)
}
if _, ok := g.rules[ruleName]; ok {
return fmt.Errorf("grammar %q, rule with name %q already exists", g.name, ruleName)
}
r := &rule{name: ruleName, pattern: pattern}
r.subrules = findSubrules(r)
for _, subName := range r.subrules {
if !subName.isValid() {
return fmt.Errorf("grammar %q, rule %q, wrong subrule name %q", g.name, ruleName, subName)
}
if subName == r.name {
return fmt.Errorf("grammar %q, rule %q is self referencing", g.name, ruleName)
}
}
g.rules[ruleName] = r
return nil
}
// Compile all rules in grammar. Resolve dependencies, interpolate strings and compile all rules to regexp.
func (g *Grammar) Compile() error {
if g.compiled {
return fmt.Errorf("grammar %q is already compiled", g.name)
}
// for all rules check if subrules exists in grammar
for _, rule := range g.rules {
for _, subName := range rule.subrules {
if _, ok := g.rules[subName]; !ok {
return fmt.Errorf("compiling grammar %q, rule %q depends on missing subrule %q", g.name, rule.name, subName)
}
}
}
sorted, err := g.toposort()
if err != nil {
return err
}
for _, ruleName := range sorted {
rule := g.rules[ruleName]
replace := replaceMap{}
// build replace map: replace ${SUBRULE} with final string of SUBRULE
for _, subruleName := range rule.subrules {
subrule := g.rules[subruleName]
replace[subruleName] = subrule.final
}
// and now replace the subrules and compile the pattern to regexp
if err := rule.compile(replace); err != nil {
return fmt.Errorf("grammar %q, %w", g.name, err)
}
}
g.compiled = true
return nil
}
// compile the regexp for rule, but before replace all subrules with their final string.
func (r *rule) compile(replace replaceMap) error {
if r.rx != nil {
panic("logic error, rule is already compiled")
}
// replace the subrules with their final string
s := r.pattern
for subrule, final := range replace {
rx := regexp.MustCompile(`\Q${` + string(subrule) + `}\E`)
s = rx.ReplaceAllLiteralString(s, final)
}
r.final = s
var err error
r.rx, err = regexp.Compile(r.final)
if err != nil {
return fmt.Errorf("regexp compilation of rule %q, %w", r.name, err)
}
return nil
}
// Rx returns the compiled regexp for named rule or error if rule is not added or not compiled.
func (g *Grammar) Rx(name string) (*regexp.Regexp, error) {
r, ok := g.rules[ruleName(name)]
if !ok {
return nil, fmt.Errorf("grammar %q, rule %q is not added", g.name, name)
}
if !g.compiled {
return nil, fmt.Errorf("grammar %q is not compiled", g.name)
}
return r.rx, nil
}
// findSubrules is a helper to find all ${RULENAME} in string and returns the slice of ruleNames or nil.
func findSubrules(r *rule) []ruleName {
var result []ruleName
for _, matches := range rxGrepSubRuleRelaxed.FindAllStringSubmatch(r.pattern, -1) {
for i, captureGroup := range rxGrepSubRuleRelaxed.SubexpNames() {
// index 0 is always the empty string
if i == 0 {
continue
}
if captureGroup != "SUBRULE" {
panic("logic error, unexpected named capture group: " + captureGroup)
}
result = append(result, ruleName(matches[i]))
}
}
return result
}