diff --git a/api.go b/api.go
index 1fbcbc21..01df20d0 100644
--- a/api.go
+++ b/api.go
@@ -17,3 +17,10 @@ type Parseable interface {
 	// Nil should be returned if parsing was successful.
 	Parse(lex *lexer.PeekingLexer) error
 }
+
+// Fuzzable can be implemented by any element in the grammar to provide custom fuzzing.
+type Fuzzable interface {
+	// Fuzz generates a valid string that, when parsed, produces a value for
+	// the corresponding node.
+	Fuzz(l lexer.Fuzzer) string
+}
diff --git a/fuzzer_test.go b/fuzzer_test.go
new file mode 100644
index 00000000..8011f170
--- /dev/null
+++ b/fuzzer_test.go
@@ -0,0 +1,56 @@
+package participle_test
+
+import (
+	"math/rand"
+	"testing"
+	"time"
+
+	"github.com/alecthomas/participle/v2"
+	"github.com/alecthomas/participle/v2/lexer"
+	"github.com/alecthomas/repr"
+)
+
+func doFuzzTest(grammar interface{}, t *testing.T) {
+	parser := participle.MustBuild(grammar)
+
+	rand.Seed(0)
+
+	for i := 0; i < 5; i++ {
+		start := time.Now()
+		t.Log("start fuzz")
+		data := parser.Fuzz(lexer.DefaultDefinition.(lexer.Fuzzer))
+		t.Logf("fuzz took %s", time.Since(start))
+
+		err := parser.ParseString("test", data, grammar)
+		if err != nil {
+			t.Fatalf("error parsing (%s): %s", repr.String(data), err)
+		}
+
+		t.Logf("fuzz+parse took %s", time.Since(start))
+	}
+}
+
+func TestFuzz_LookAhead(t *testing.T) {
+	type val struct {
+		Str string ` @String`
+		Int int    `| @Int`
+	}
+	type op struct {
+		Op      string `@('+' | '*' (?= @Int))`
+		Operand val    `@@`
+	}
+	type sum struct {
+		Left val  `@@`
+		Ops  []op `@@*`
+	}
+
+	doFuzzTest(&sum{}, t)
+}
+
+func TestFuzz_Disjunction(t *testing.T) {
+	type grammar struct {
+		Whatever string `'a' | @String | 'b'`
+	}
+
+	doFuzzTest(&grammar{}, t)
+}
diff --git a/lexer/api.go b/lexer/api.go
index 74f22ec0..e4371b40 100644
--- a/lexer/api.go
+++ b/lexer/api.go
@@ -45,6 +45,11 @@ type Lexer interface {
 	Next() (Token, error)
 }
 
+// A Fuzzer generates random valid token text for a given token type.
+type Fuzzer interface {
+	Fuzz(t TokenType) string
+}
+
 // SymbolsByRune returns a map of lexer symbol names keyed by rune.
 func SymbolsByRune(def Definition) map[TokenType]string {
 	symbols := def.Symbols()
diff --git a/lexer/text_scanner.go b/lexer/text_scanner.go
index 5607c333..115094e2 100644
--- a/lexer/text_scanner.go
+++ b/lexer/text_scanner.go
@@ -3,8 +3,11 @@ package lexer
 import (
 	"bytes"
 	"io"
+	"math/rand"
+	"strconv"
 	"strings"
 	"text/scanner"
+	"unicode"
 )
 
 // TextScannerLexer is a lexer that uses the text/scanner module.
@@ -48,6 +51,104 @@ func (d *textScannerLexerDefinition) Symbols() map[string]TokenType {
 	}
 }
 
+func count16(rang unicode.Range16) int {
+	return int(((rang.Hi - rang.Lo) / rang.Stride) + 1)
+}
+
+func count32(rang unicode.Range32) int {
+	return int(((rang.Hi - rang.Lo) / rang.Stride) + 1)
+}
+
+func totalRunesInRange(tables []*unicode.RangeTable) int {
+	total := 0
+	for _, table := range tables {
+		for _, r16 := range table.R16 {
+			total += count16(r16)
+		}
+		for _, r32 := range table.R32 {
+			total += count32(r32)
+		}
+	}
+	return total
+}
+
+// nthRuneFromTables indexes into the tables as if they were
+// concatenated into one contiguous range of runes.
+func nthRuneFromTables(at int, tables []*unicode.RangeTable) rune {
+	n := at
+
+	for _, table := range tables {
+		for _, rang := range table.R16 {
+			num := count16(rang)
+			if n <= num-1 {
+				return rune(int(rang.Lo) + (int(rang.Stride) * n))
+			}
+			n -= num
+		}
+		for _, rang := range table.R32 {
+			num := count32(rang)
+			if n <= num-1 {
+				return rune(int(rang.Lo) + (int(rang.Stride) * n))
+			}
+			n -= num
+		}
+	}
+
+	return ' '
+}
+
+func randomRune(n int, tables ...*unicode.RangeTable) rune {
+	return nthRuneFromTables(
+		rand.Intn(n),
+		tables)
+}
+
+var cleaner = strings.NewReplacer(
+	"\x00", "",
+)
+
+var defaultTableCount = totalRunesInRange([]*unicode.RangeTable{unicode.Letter, unicode.Symbol, unicode.Number})
+var letterTableCount = totalRunesInRange([]*unicode.RangeTable{unicode.Letter})
+var letterNumberTableCount = totalRunesInRange([]*unicode.RangeTable{unicode.Letter, unicode.Number})
+
+func randomString(length int, tableLength int, tables ...*unicode.RangeTable) string {
+	s := make([]rune, 0, length)
+
+	if len(tables) == 0 {
+		tables = append(tables, unicode.Letter, unicode.Symbol, unicode.Number)
+	}
+
+	for i := 0; i < length; i++ {
+		char := randomRune(tableLength, tables...)
+		s = append(s, char)
+	}
+
+	return cleaner.Replace(string(s))
+}
+
+func (d *textScannerLexerDefinition) Fuzz(t TokenType) string {
+	switch t {
+	case EOF:
+		return ""
+	case scanner.Char:
+		return "'" + string(randomRune(letterTableCount, unicode.Letter)) + "'" // quoted so it lexes back as a Char token
+	case scanner.Ident:
+		return string(randomRune(letterTableCount, unicode.Letter)) + randomString(rand.Intn(100), letterNumberTableCount, unicode.Letter, unicode.Number)
+	case scanner.Int:
+		return strconv.Itoa(rand.Int())
+	case scanner.Float:
+		return strconv.FormatFloat(rand.Float64(), 'f', -1, 64)
+	case scanner.String:
+		return `"` + strings.ReplaceAll(randomString(rand.Intn(50), defaultTableCount), "\n", " ") + `"`
+	case scanner.RawString:
+		return "`" + strings.ReplaceAll(randomString(rand.Intn(50), defaultTableCount), "`", " ") + "`"
+	case scanner.Comment:
+		return "/* " + randomString(rand.Intn(50), defaultTableCount) + " */" // block-comment form so it lexes as a single Comment token
+	default:
+		return string(rune(t))
+	}
+}
+
 // textScannerLexer is a Lexer based on text/scanner.Scanner
 type textScannerLexer struct {
 	scanner *scanner.Scanner
diff --git a/nodes.go b/nodes.go
index 81dfc050..63eee67b 100644
--- a/nodes.go
+++ b/nodes.go
@@ -4,6 +4,8 @@ import (
 	"encoding"
 	"errors"
 	"fmt"
+	"math"
+	"math/rand"
 	"reflect"
 	"strconv"
 	"strings"
@@ -34,6 +36,9 @@ type node interface {
 	// Returned slice will be nil if the node does not match.
 	Parse(ctx *parseContext, parent reflect.Value) ([]reflect.Value, error)
 
+	// Fuzz returns a random valid string that, when parsed, produces a value for this node.
+	Fuzz(l lexer.Fuzzer) string
+
 	// Return a decent string representation of the Node.
 	fmt.Stringer
 
@@ -72,6 +77,15 @@ func (p *parseable) Parse(ctx *parseContext, parent reflect.Value) (out []reflec
 	return []reflect.Value{rv.Elem()}, nil
 }
 
+func (p *parseable) Fuzz(l lexer.Fuzzer) string {
+	rv := reflect.New(p.t)
+	v, ok := rv.Interface().(Fuzzable)
+	if !ok {
+		panic(fmt.Sprintf("%s does not support fuzzing", p.t))
+	}
+	return v.Fuzz(l)
+}
+
 // @@
 type strct struct {
 	typ reflect.Type
@@ -125,6 +139,10 @@ func (s *strct) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Va
 	return []reflect.Value{sv}, ctx.Apply()
 }
 
+func (s *strct) Fuzz(l lexer.Fuzzer) string {
+	return s.expr.Fuzz(l)
+}
+
 func (s *strct) maybeInjectStartToken(token lexer.Token, v reflect.Value) {
 	if s.posFieldIndex == nil {
 		return
@@ -184,6 +202,40 @@ type group struct {
 	mode groupMatchMode
 }
 
+func (g *group) Fuzz(l lexer.Fuzzer) string {
+	var (
+		maxCount int
+		minCount int
+	)
+
+	switch g.mode {
+	case groupMatchOnce:
+		minCount, maxCount = 1, 1
+	case groupMatchZeroOrOne:
+		minCount, maxCount = 0, 1
+	case groupMatchZeroOrMore:
+		minCount, maxCount = 0, math.MaxInt
+	case groupMatchOneOrMore:
+		minCount, maxCount = 1, math.MaxInt
+	case groupMatchNonEmpty:
+		minCount, maxCount = 1, 1
+	}
+
+	items := minCount
+	if maxCount == math.MaxInt {
+		items += rand.Intn(10) // cap unbounded repetition so fuzzing terminates
+	} else if maxCount > minCount {
+		items += rand.Intn(maxCount-minCount+1)
+	}
+	var s strings.Builder
+	for i := 0; i < items; i++ {
+		s.WriteString(g.expr.Fuzz(l))
+		if i < items-1 {
+			s.WriteString(" ")
+		}
+	}
+	return s.String()
+}
 func (g *group) String() string   { return ebnf(g) }
 func (g *group) GoString() string { return fmt.Sprintf("group{%s}", g.mode) }
 func (g *group) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
@@ -254,6 +306,9 @@ type lookaheadGroup struct {
 
 func (n *lookaheadGroup) String() string   { return ebnf(n) }
 func (n *lookaheadGroup) GoString() string { return "lookaheadGroup{}" }
+func (n *lookaheadGroup) Fuzz(l lexer.Fuzzer) string {
+	return n.expr.Fuzz(l)
+}
 func (n *lookaheadGroup) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
 	// Create a branch to avoid advancing the parser as any match will be discarded
@@ -278,6 +333,9 @@ type disjunction struct {
 
 func (d *disjunction) String() string   { return ebnf(d) }
 func (d *disjunction) GoString() string { return "disjunction{}" }
+func (d *disjunction) Fuzz(l lexer.Fuzzer) string {
+	return d.nodes[rand.Intn(len(d.nodes))].Fuzz(l)
+}
 func (d *disjunction) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
 	var (
@@ -325,6 +383,16 @@ type sequence struct {
 
 func (s *sequence) String() string   { return ebnf(s) }
 func (s *sequence) GoString() string { return "sequence{}" }
+func (s *sequence) Fuzz(l lexer.Fuzzer) string {
+	var sb strings.Builder
+	for n := s; n != nil; n = n.next {
+		sb.WriteString(n.node.Fuzz(l))
+		if n.next != nil {
+			sb.WriteString(" ")
+		}
+	}
+	return sb.String()
+}
 func (s *sequence) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
 	for n := s; n != nil; n = n.next {
@@ -356,6 +424,9 @@ type capture struct {
 
 func (c *capture) String() string   { return ebnf(c) }
 func (c *capture) GoString() string { return "capture{}" }
+func (c *capture) Fuzz(l lexer.Fuzzer) string {
+	return c.node.Fuzz(l)
+}
 func (c *capture) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
 	start := ctx.RawCursor()
@@ -380,6 +451,9 @@ type reference struct {
 
 func (r *reference) String() string   { return ebnf(r) }
 func (r *reference) GoString() string { return fmt.Sprintf("reference{%s}", r.identifier) }
+func (r *reference) Fuzz(l lexer.Fuzzer) string {
+	return l.Fuzz(r.typ)
+}
 func (r *reference) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
 	token, err := ctx.Peek(0)
@@ -400,6 +474,13 @@ type optional struct {
 
 func (o *optional) String() string   { return ebnf(o) }
 func (o *optional) GoString() string { return "optional{}" }
+func (o *optional) Fuzz(l lexer.Fuzzer) string {
+	if rand.Intn(2) == 0 {
+		return ""
+	} else {
+		return o.node.Fuzz(l)
+	}
+}
 func (o *optional) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
 	branch := ctx.Branch()
@@ -425,6 +506,19 @@ type repetition struct {
 
 func (r *repetition) String() string   { return ebnf(r) }
 func (r *repetition) GoString() string { return "repetition{}" }
+func (r *repetition) Fuzz(l lexer.Fuzzer) string {
+	var (
+		s   strings.Builder
+		max = rand.Intn(100)
+	)
+	for i := 0; i < max; i++ {
+		s.WriteString(r.node.Fuzz(l))
+		if i < max-1 {
+			s.WriteString(" ")
+		}
+	}
+	return s.String()
+}
 
 // Parse a repetition. Once a repetition is encountered it will always match, so grammars
 // should ensure that branches are differentiated prior to the repetition.
@@ -466,6 +560,9 @@ type literal struct {
 
 func (l *literal) String() string   { return ebnf(l) }
 func (l *literal) GoString() string { return fmt.Sprintf("literal{%q, %q}", l.s, l.tt) }
+func (lit *literal) Fuzz(l lexer.Fuzzer) string {
+	return lit.s
+}
 func (l *literal) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
 	token, err := ctx.Peek(0)
@@ -494,6 +591,9 @@ type negation struct {
 
 func (n *negation) String() string   { return ebnf(n) }
 func (n *negation) GoString() string { return "negation{}" }
+func (n *negation) Fuzz(l lexer.Fuzzer) string {
+	panic("fuzzing is not implemented for negation")
+}
 func (n *negation) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
 	// Create a branch to avoid advancing the parser, but call neither Stop nor Accept on it
diff --git a/parser.go b/parser.go
index f45b153c..f245119a 100644
--- a/parser.go
+++ b/parser.go
@@ -245,6 +245,12 @@ func (p *Parser) parseOne(ctx *parseContext, rv reflect.Value) error {
 	return nil
 }
 
+// Fuzz generates a random source string that should be accepted by the parser's
+// grammar, using l to produce text for individual tokens.
+func (p *Parser) Fuzz(l lexer.Fuzzer) string {
+	return p.root.Fuzz(l)
+}
+
 func (p *Parser) parseInto(ctx *parseContext, rv reflect.Value) error {
 	if rv.IsNil() {
 		return fmt.Errorf("target must be a non-nil pointer to a struct, but is a nil %s", rv.Type())
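
Usage sketch (not part of the diff): a minimal program showing how the new fuzzing hooks are intended to fit together, mirroring doFuzzTest above. The Expr grammar and all names in it are illustrative; the only APIs assumed beyond the ones added here (Parser.Fuzz, lexer.Fuzzer) are participle.MustBuild, Parser.ParseString and lexer.DefaultDefinition, which fuzzer_test.go already uses.

// Sketch only — the Expr grammar below is illustrative, not part of this change.
package main

import (
	"fmt"

	"github.com/alecthomas/participle/v2"
	"github.com/alecthomas/participle/v2/lexer"
)

type Expr struct {
	Left string   `@Ident`
	Rest []string `('+' @Ident)*`
}

func main() {
	parser := participle.MustBuild(&Expr{})

	// The default text/scanner lexer definition implements the new lexer.Fuzzer interface.
	fuzzer := lexer.DefaultDefinition.(lexer.Fuzzer)

	// Generate a random input the grammar should accept, then round-trip it through the parser.
	input := parser.Fuzz(fuzzer)
	expr := &Expr{}
	if err := parser.ParseString("fuzz", input, expr); err != nil {
		fmt.Printf("generated input %q failed to parse: %s\n", input, err)
		return
	}
	fmt.Printf("parsed %q into %+v\n", input, *expr)
}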