Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Introduce Fuzz(...) #205

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions api.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,10 @@ type Parseable interface {
// Nil should be returned if parsing was successful.
Parse(lex *lexer.PeekingLexer) error
}

// Fuzzable can be implemented by any element in the grammar that wants to
// provide its own fuzzing instead of relying on the generated behaviour.
type Fuzzable interface {
	// Fuzz generates a valid string that can be parsed to get a value from
	// the corresponding Node. The supplied lexer.Fuzzer produces text for
	// individual token types.
	Fuzz(fuzzer lexer.Fuzzer) string
}
56 changes: 56 additions & 0 deletions fuzzer_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package participle_test

import (
"math/rand"
"testing"
"time"

"github.com/alecthomas/participle/v2"
"github.com/alecthomas/participle/v2/lexer"
"github.com/alecthomas/repr"
)

// doFuzzTest builds a parser for grammar, then repeatedly asks the parser to
// generate a fuzzed input and checks that the parser accepts its own output.
//
// The global math/rand source is seeded with a constant first so that the
// generated inputs are the same on every run. The test fails immediately if
// the default lexer definition cannot act as a lexer.Fuzzer or if any
// generated input fails to parse.
func doFuzzTest(grammar interface{}, t *testing.T) {
	t.Helper()

	parser := participle.MustBuild(grammar)

	// Use the comma-ok form so a missing Fuzz implementation is reported as a
	// test failure rather than a panic.
	fuzzer, ok := lexer.DefaultDefinition.(lexer.Fuzzer)
	if !ok {
		t.Fatalf("default lexer definition %T does not implement lexer.Fuzzer", lexer.DefaultDefinition)
	}

	rand.Seed(0)

	for i := 0; i < 5; i++ {
		fuzzStart := time.Now()
		data := parser.Fuzz(fuzzer)
		// time.Since yields a positive elapsed duration; start.Sub(time.Now())
		// would report it negated.
		t.Logf("fuzz %d took %s", i, time.Since(fuzzStart))

		parseStart := time.Now()
		if err := parser.ParseString("test", data, grammar); err != nil {
			t.Fatalf("error parsing (%s): %s", repr.String(data), err)
		}
		t.Logf("parse %d took %s", i, time.Since(parseStart))
	}
}

// TestFuzz_LookAhead fuzzes a small expression grammar whose operator rule
// contains a lookahead group.
func TestFuzz_LookAhead(t *testing.T) {
	type (
		// val is either a string or an integer operand.
		val struct {
			Str string ` @String`
			Int int    `| @Int`
		}
		// op is an operator followed by its operand.
		op struct {
			Op      string `@('+' | '*' (?= @Int))`
			Operand val    `@@`
		}
		// sum is a left operand followed by zero or more operations.
		sum struct {
			Left val  `@@`
			Ops  []op `@@*`
		}
	)

	root := &sum{}
	doFuzzTest(root, t)
}

// TestFuzz_Disjunction fuzzes a grammar made of a single three-way
// alternation between two literals and a captured string.
func TestFuzz_Disjunction(t *testing.T) {
	type grammar struct {
		Whatever string `'a' | @String | 'b'`
	}

	root := &grammar{}
	doFuzzTest(root, t)
}
5 changes: 5 additions & 0 deletions lexer/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ type Lexer interface {
Next() (Token, error)
}

// A Fuzzer produces random text for tokens: given a token kind, it returns a
// random valid string for that kind.
type Fuzzer interface {
	// Fuzz returns a random valid string for the given token kind.
	Fuzz(kind TokenType) string
}

// SymbolsByRune returns a map of lexer symbol names keyed by rune.
func SymbolsByRune(def Definition) map[TokenType]string {
symbols := def.Symbols()
Expand Down
102 changes: 102 additions & 0 deletions lexer/text_scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@ package lexer
import (
"bytes"
"io"
"math"
"math/rand"
"strconv"
"strings"
"text/scanner"
"unicode"
)

// TextScannerLexer is a lexer that uses the text/scanner module.
Expand Down Expand Up @@ -48,6 +52,104 @@ func (d *textScannerLexerDefinition) Symbols() map[string]TokenType {
}
}

// count16 returns the number of runes described by a 16-bit Unicode range,
// i.e. the values Lo, Lo+Stride, ... up to and including Hi.
//
// The operands are widened to int before any arithmetic: doing the sum in
// uint16 wraps to zero for a range that spans the full 16-bit space.
func count16(rang unicode.Range16) int {
	return (int(rang.Hi)-int(rang.Lo))/int(rang.Stride) + 1
}

// count32 returns the number of runes described by a 32-bit Unicode range,
// i.e. the values Lo, Lo+Stride, ... up to and including Hi.
//
// The operands are widened to int before any arithmetic so that the "+ 1"
// cannot wrap around in uint32 (mirrors count16).
func count32(rang unicode.Range32) int {
	return (int(rang.Hi)-int(rang.Lo))/int(rang.Stride) + 1
}

// totalRunesInRange adds up the rune counts of every 16-bit and 32-bit range
// in each of the given tables and returns the grand total.
func totalRunesInRange(tables []*unicode.RangeTable) int {
	var sum int
	for i := range tables {
		tbl := tables[i]
		for j := range tbl.R16 {
			sum += count16(tbl.R16[j])
		}
		for j := range tbl.R32 {
			sum += count32(tbl.R32[j])
		}
	}
	return sum
}

// nthRuneFromTables treats the given tables as if all their ranges were laid
// end to end (each table's R16 ranges first, then its R32 ranges) and returns
// the rune at zero-based position at in that combined sequence.
//
// If at lies beyond the last rune of the last table, a space is returned.
func nthRuneFromTables(at int, tables []*unicode.RangeTable) (ret rune) {
	remaining := at

	for _, tbl := range tables {
		for _, r := range tbl.R16 {
			size := count16(r)
			if remaining < size {
				return rune(int(r.Lo) + remaining*int(r.Stride))
			}
			// Not in this range: skip past all of its runes.
			remaining -= size
		}
		for _, r := range tbl.R32 {
			size := count32(r)
			if remaining < size {
				return rune(int(r.Lo) + remaining*int(r.Stride))
			}
			remaining -= size
		}
	}

	return ' '
}

// randomRune picks a pseudo-random position in [0, len) using the global
// math/rand source and returns the rune found at that position in the given
// tables. rand.Intn panics when len is not positive.
func randomRune(len int, tables ...*unicode.RangeTable) rune {
	position := rand.Intn(len)
	return nthRuneFromTables(position, tables)
}

var (
	// cleaner removes NUL characters from a string.
	cleaner = strings.NewReplacer("\x00", "")

	// Rune totals for the table combinations used when generating random
	// text; each is computed once at package initialisation.
	defaultTableCount      = totalRunesInRange([]*unicode.RangeTable{unicode.Letter, unicode.Symbol, unicode.Number})
	letterTableCount       = totalRunesInRange([]*unicode.RangeTable{unicode.Letter})
	letterNumberTableCount = totalRunesInRange([]*unicode.RangeTable{unicode.Letter, unicode.Number})
)

// randomString returns a string of length pseudo-random runes drawn from the
// given tables, with any NUL characters removed.
//
// tableLength is passed to randomRune as the number of runes to choose from.
// When no tables are supplied, unicode.Letter, unicode.Symbol and
// unicode.Number are used.
func randomString(length int, tableLength int, tables ...*unicode.RangeTable) string {
	if len(tables) == 0 {
		tables = []*unicode.RangeTable{unicode.Letter, unicode.Symbol, unicode.Number}
	}

	// Allocate capacity only. make([]rune, length) followed by append would
	// prefix the result with length NUL runes that then have to be stripped.
	runes := make([]rune, 0, length)
	for i := 0; i < length; i++ {
		runes = append(runes, randomRune(tableLength, tables...))
	}

	return cleaner.Replace(string(runes))
}

// Fuzz returns pseudo-random text for the given token type, using the global
// math/rand source.
//
// Token types that are not one of the text/scanner classes handled below are
// treated as a single rune and returned as that rune.
func (d *textScannerLexerDefinition) Fuzz(t TokenType) string {
	switch t {
	case EOF:
		return ""
	case scanner.Char:
		// NOTE(review): rand.Intn(math.MaxInt) usually exceeds unicode.MaxRune
		// (and overflows rune on 64-bit platforms), in which case string()
		// yields U+FFFD. The result is also not wrapped in single quotes,
		// which text/scanner requires for a Char token. Left unchanged here;
		// needs a follow-up.
		return string(rune(rand.Intn(math.MaxInt)))
	case scanner.Ident:
		// text/scanner's default identifier rule accepts letters, '_' and
		// (after the first rune) unicode.IsDigit runes only. unicode.Number
		// also contains non-digit numerics, so the tail is drawn from
		// unicode.Digit instead.
		tail := []*unicode.RangeTable{unicode.Letter, unicode.Digit}
		return string(randomRune(letterTableCount, unicode.Letter)) +
			randomString(rand.Intn(100), totalRunesInRange(tail), tail...)
	case scanner.Int:
		return strconv.Itoa(rand.Int())
	case scanner.Float:
		return strconv.FormatFloat(rand.Float64(), 'f', -1, 64)
	case scanner.String:
		// NOTE(review): assumes the default tables contain neither '"' nor
		// '\\', so no escaping is needed — confirm.
		return `"` + strings.ReplaceAll(randomString(rand.Intn(50), defaultTableCount), "\n", " ") + `"`
	case scanner.RawString:
		// unicode.Symbol contains the backtick (U+0060), which would end the
		// raw string early, so it is removed from the body.
		body := randomString(rand.Intn(50), defaultTableCount)
		return "`" + strings.ReplaceAll(body, "`", "") + "`"
	case scanner.Comment:
		// NOTE(review): the text carries no comment delimiters ("//" or
		// "/* */"); presumably it will not lex as a Comment token — confirm.
		return randomString(rand.Intn(50), defaultTableCount)
	default:
		return string(rune(t))
	}
}

// textScannerLexer is a Lexer based on text/scanner.Scanner
type textScannerLexer struct {
scanner *scanner.Scanner
Expand Down
100 changes: 100 additions & 0 deletions nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import (
"encoding"
"errors"
"fmt"
"math"
"math/rand"
"reflect"
"strconv"
"strings"
Expand Down Expand Up @@ -34,6 +36,9 @@ type node interface {
// Returned slice will be nil if the node does not match.
Parse(ctx *parseContext, parent reflect.Value) ([]reflect.Value, error)

// Returns a random valid string that can be parsed to get a value
Fuzz(l lexer.Fuzzer) string

// Return a decent string representation of the Node.
fmt.Stringer

Expand Down Expand Up @@ -72,6 +77,15 @@ func (p *parseable) Parse(ctx *parseContext, parent reflect.Value) (out []reflec
return []reflect.Value{rv.Elem()}, nil
}

// Fuzz allocates a new value of the wrapped type and delegates to its Fuzz
// method. It panics if a pointer to that type does not implement Fuzzable.
func (p *parseable) Fuzz(l lexer.Fuzzer) string {
	if fuzzable, ok := reflect.New(p.t).Interface().(Fuzzable); ok {
		return fuzzable.Fuzz(l)
	}
	panic(fmt.Sprintf("%s does not support fuzzing", p.t))
}

// @@
type strct struct {
typ reflect.Type
Expand Down Expand Up @@ -125,6 +139,10 @@ func (s *strct) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Va
return []reflect.Value{sv}, ctx.Apply()
}

// Fuzz generates text for the struct by fuzzing its expression.
func (s *strct) Fuzz(l lexer.Fuzzer) string {
	text := s.expr.Fuzz(l)
	return text
}

func (s *strct) maybeInjectStartToken(token lexer.Token, v reflect.Value) {
if s.posFieldIndex == nil {
return
Expand Down Expand Up @@ -184,6 +202,40 @@ type group struct {
mode groupMatchMode
}

func (g *group) Fuzz(l lexer.Fuzzer) string {
var (
maxCount int
minCount int
)

switch g.mode {
case groupMatchOnce:
minCount, maxCount = 1, 1
case groupMatchZeroOrOne:
minCount, maxCount = 0, 1
case groupMatchZeroOrMore:
minCount, maxCount = 0, math.MaxInt
case groupMatchOneOrMore:
minCount, maxCount = 1, math.MaxInt
case groupMatchNonEmpty:
minCount, maxCount = 1, 1
}

var items int
if maxCount-minCount == 0 {
items = minCount
} else {
items = minCount + rand.Intn(maxCount-minCount)
}
var s strings.Builder
for i := 0; i < items; i++ {
s.WriteString(g.expr.Fuzz(l))
if i < items-1 {
s.WriteString(" ")
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a really interesting idea, but this implementation makes assumptions about whitespace that will not hold true for all lexers. For example, it is possible to have a lexer that relies on indentation, which this would break on.

I love the idea though, so perhaps there is a way to make it work in the general case.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a really interesting idea, but this implementation makes assumptions about whitespace that will not hold true for all lexers. For example, it is possible to have a lexer that relies on indentation, which this would break on.

Hm, how would you model this? I was trying to keep it stateless, but I'm not sure if I could avoid that and fix this problem.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that is definitely the question, and I'm not sure what the answer is :(

}
}
return s.String()
}
func (g *group) String() string { return ebnf(g) }
func (g *group) GoString() string { return fmt.Sprintf("group{%s}", g.mode) }
func (g *group) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
Expand Down Expand Up @@ -254,6 +306,9 @@ type lookaheadGroup struct {

// String returns the result of ebnf for the lookahead group.
func (n *lookaheadGroup) String() string {
	return ebnf(n)
}

// GoString returns a fixed Go-syntax-like description of the node.
func (n *lookaheadGroup) GoString() string {
	return "lookaheadGroup{}"
}

// Fuzz generates text by fuzzing the lookahead expression.
//
// NOTE(review): the generated text is emitted into the output, yet a
// lookahead match is presumably discarded without advancing the parser, so
// whatever follows would have to match this text as well — confirm that the
// output is parseable.
func (n *lookaheadGroup) Fuzz(l lexer.Fuzzer) string {
	text := n.expr.Fuzz(l)
	return text
}

func (n *lookaheadGroup) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
// Create a branch to avoid advancing the parser as any match will be discarded
Expand All @@ -278,6 +333,9 @@ type disjunction struct {

// String returns the result of ebnf for the disjunction.
func (d *disjunction) String() string {
	return ebnf(d)
}

// GoString returns a fixed Go-syntax-like description of the node.
func (d *disjunction) GoString() string {
	return "disjunction{}"
}

// Fuzz picks one alternative uniformly at random and returns its fuzzed
// text. rand.Intn panics if the disjunction has no alternatives.
func (d *disjunction) Fuzz(l lexer.Fuzzer) string {
	choice := rand.Intn(len(d.nodes))
	return d.nodes[choice].Fuzz(l)
}

func (d *disjunction) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
var (
Expand Down Expand Up @@ -325,6 +383,16 @@ type sequence struct {

// String returns the result of ebnf for the sequence.
func (s *sequence) String() string {
	return ebnf(s)
}

// GoString returns a fixed Go-syntax-like description of the node.
func (s *sequence) GoString() string {
	return "sequence{}"
}

// Fuzz walks the linked list of sequence elements, fuzzes each element's
// node in order and joins the pieces with a single space.
func (s *sequence) Fuzz(l lexer.Fuzzer) string {
	var parts []string
	for cur := s; cur != nil; cur = cur.next {
		parts = append(parts, cur.node.Fuzz(l))
	}
	return strings.Join(parts, " ")
}

func (s *sequence) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
for n := s; n != nil; n = n.next {
Expand Down Expand Up @@ -356,6 +424,9 @@ type capture struct {

// String returns the result of ebnf for the capture.
func (c *capture) String() string {
	return ebnf(c)
}

// GoString returns a fixed Go-syntax-like description of the node.
func (c *capture) GoString() string {
	return "capture{}"
}

// Fuzz generates text by fuzzing the captured node.
func (c *capture) Fuzz(l lexer.Fuzzer) string {
	text := c.node.Fuzz(l)
	return text
}

func (c *capture) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
start := ctx.RawCursor()
Expand All @@ -380,6 +451,9 @@ type reference struct {

// String returns the result of ebnf for the reference.
func (r *reference) String() string {
	return ebnf(r)
}

// GoString returns a Go-syntax-like description including the identifier.
func (r *reference) GoString() string {
	return fmt.Sprintf("reference{%s}", r.identifier)
}

// Fuzz asks the lexer fuzzer for text matching the referenced token type.
func (r *reference) Fuzz(l lexer.Fuzzer) string {
	text := l.Fuzz(r.typ)
	return text
}

func (r *reference) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
token, err := ctx.Peek(0)
Expand All @@ -400,6 +474,13 @@ type optional struct {

// String returns the result of ebnf for the optional node.
func (o *optional) String() string { return ebnf(o) }

// GoString returns a fixed Go-syntax-like description of the node.
func (o *optional) GoString() string { return "optional{}" }

// Fuzz returns either an empty string or the fuzzed text of the optional
// node, each with equal probability.
func (o *optional) Fuzz(l lexer.Fuzzer) string {
	// rand.Intn(n) returns a value in [0, n). With n == 1 the result is
	// always 0 and the node would never be generated, so flip a real coin.
	if rand.Intn(2) == 0 {
		return ""
	}
	return o.node.Fuzz(l)
}

func (o *optional) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
branch := ctx.Branch()
Expand All @@ -425,6 +506,19 @@ type repetition struct {

// String returns the result of ebnf for the repetition.
func (r *repetition) String() string {
	return ebnf(r)
}

// GoString returns a fixed Go-syntax-like description of the node.
func (r *repetition) GoString() string {
	return "repetition{}"
}

// Fuzz fuzzes the repeated node a pseudo-random number of times in [0, 100)
// and joins the results with a single space.
func (r *repetition) Fuzz(l lexer.Fuzzer) string {
	count := rand.Intn(100)
	parts := make([]string, 0, count)
	for len(parts) < count {
		parts = append(parts, r.node.Fuzz(l))
	}
	return strings.Join(parts, " ")
}

// Parse a repetition. Once a repetition is encountered it will always match, so grammars
// should ensure that branches are differentiated prior to the repetition.
Expand Down Expand Up @@ -466,6 +560,9 @@ type literal struct {

// String returns the result of ebnf for the literal.
func (l *literal) String() string {
	return ebnf(l)
}

// GoString returns a Go-syntax-like description with the literal's text and
// token type, both quoted.
func (l *literal) GoString() string {
	return fmt.Sprintf("literal{%q, %q}", l.s, l.tt)
}

// Fuzz returns the literal's own text; the fuzzer argument is not used.
func (lit *literal) Fuzz(l lexer.Fuzzer) string {
	text := lit.s
	return text
}

func (l *literal) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
token, err := ctx.Peek(0)
Expand Down Expand Up @@ -494,6 +591,9 @@ type negation struct {

// String returns the result of ebnf for the negation.
func (n *negation) String() string {
	return ebnf(n)
}

// GoString returns a fixed Go-syntax-like description of the node.
func (n *negation) GoString() string {
	return "negation{}"
}

// Fuzz is not implemented for negations yet and always panics.
func (n *negation) Fuzz(l lexer.Fuzzer) string {
	panic("todo")
}

func (n *negation) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
// Create a branch to avoid advancing the parser, but call neither Stop nor Accept on it
Expand Down
Loading