forked from jspam/mj-test
-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexfuzz.py
executable file
·172 lines (139 loc) · 7.05 KB
/
lexfuzz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#! /usr/bin/env python
"""Prints a randomly generated string to stdout, and the corresponding
list of tokens (in the format --lextest produces) to stderr.
Usage: lexfuzz.py [LENGTH | --aggressive]
LENGTH Average length (in tokens) of generated test cases
--aggressive use a more aggressive mode (which may produce an incorrect reference
token stream)
"""
from __future__ import print_function
from enum import Enum
import random
import re
import string
import sys
keywords = ["abstract", "assert", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue", "default", "double", "do", "else", "enum", "extends", "false", "finally", "final", "float", "for", "goto", "if", "implements", "import", "instanceof", "interface", "int", "long", "native", "new", "null", "package", "private", "protected", "public", "return", "short", "static", "strictfp", "super", "switch", "synchronized", "this", "throws", "throw", "transient", "true", "try", "void", "volatile", "while"]
operators = [':', '!=', '!', '(', ')', '*=', '*', '++', '+=', '+', ',', '-=', '--', '-', '.', '/=', '/', ':', ';', '<<=', '<<', '<=', '<', '==', '=', '>=', '>>=', '>>>=', '>>>', '>>', '>', '?', '%=', '%', '&=', '&&', '&', '[', ']', '^=', '^', '{', '}', '~', '|=', '||', '|']
needs_whitespace = [('/', '*='), ('/', '*'), ('*', '/'), ('!=', '='), ('!', '=='), ('!', '='), ('*=', '='), ('*', '=='), ('*', '='), ('++', '+'), ('++', '='), ('+=', '='), ('+', '++'), ('+', '+='), ('+', '+'), ('+', '=='), ('+', '='), ('-=', '='), ('--', '-'), ('--', '='), ('-', '-='), ('-', '--'), ('-', '-'), ('-', '=='), ('-', '='), ('/=', '='), ('/', '=='), ('/', '='), ('<<=', '='), ('<<', '<='), ('<<', '<'), ('<<', '=='), ('<<', '='), ('<=', '='), ('<', '<<='), ('<', '<<'), ('<', '<='), ('<', '<'), ('<', '=='), ('<', '='), ('==', '='), ('=', '=='), ('=', '='), ('>=', '='), ('>>=', '='), ('>>>=', '='), ('>>>', '=='), ('>>>', '='), ('>>>', '>='), ('>>>', '>>='), ('>>>', '>>'), ('>>>', '>'), ('>>', '=='), ('>>', '='), ('>>', '>='), ('>>', '>>='), ('>>', '>>>='), ('>>', '>>>'), ('>>', '>>'), ('>>', '>'), ('>', '=='), ('>', '='), ('>', '>='), ('>', '>>='), ('>', '>>>='), ('>', '>>>'), ('>', '>>'), ('>', '>'), ('%=', '='), ('%', '=='), ('%', '='), ('&=', '='), ('&&', '='), ('&&', '&'), ('&', '=='), ('&', '='), ('&', '&='), ('&', '&&'), ('&', '&'), ('^=', '='), ('^', '=='), ('^', '='), ('|=', '='), ('||', '='), ('||', '|'), ('|', '=='), ('|', '='), ('|', '|='), ('|', '||'), ('|', '|')]
def make_needs_whitespace():
# this algorithm runs in O(yolo)
global needs_whitespace
needs_whitespace = [('/', '*='), ('/', '*'), ('*', '/')] # since comment start and end are special tokens, this needs to be hardcoded
for kw1 in operators:
for kw2 in operators:
for cand in operators + ['']:
for cand2 in operators + ['']:
if cand+cand2 == kw1 + kw2 and (cand != kw1 or cand2 != kw2) and (kw1,kw2) not in needs_whitespace:
needs_whitespace.append((kw1,kw2))
class Token(Enum):
T_WHITESPACE = 0
T_IDENT = 1
T_INTEGER = 2
T_KEYWORD = 3
T_OPERATOR = 4
WHITESPACE_PROB = 50 # percent
STRING_LENGTH = 5
COMMENT_PROB = 5
COMMENT_LENGTH = 10
COMMENT_NESTING_PROB = 10 # percent
def _make_comment():
length = random.randrange(COMMENT_LENGTH)
c = "/*"
while random.randrange(100) < COMMENT_NESTING_PROB:
c += get_string(length, False)
if c[-1] == '*':
# avoid accidentally closing the comment
c += _make_whitespace(comments=False)
c += "/*" + _make_whitespace(comments=False)
c += get_string(length, False) + "*/"
return c
def _make_whitespace(comments=True):
wtsp = ''
for _ in range(random.randrange(10) + 1):
ws = ['\n','\r','\t',' ']
ws_type = random.choice(ws)
if comments and random.randrange(100) < COMMENT_PROB:
ws_type = None # None -> comment
if ws_type:
wtsp += ws_type
else:
wtsp += _make_comment()
return wtsp
def _make_identifier():
"""We cheat by making every identifier include a keyword, which are non-overlapping
Thus we can be sure not to accidentally generate a keyword"""
ident = ''
def letter():
r = random.randrange(100)
if r < 33:
return random.choice(string.ascii_letters)
if r < 66:
return '_'
return str(random.randrange(20))
if random.getrandbits(1):
ident += random.choice(string.ascii_letters) if random.getrandbits(1) else '_'
kw = random.choice(keywords)
ident += kw
if random.getrandbits(1) or len(ident) == len(kw):
ident += letter()
return ident
def _make_integer():
return str(random.getrandbits(32))
def _is_var(tok):
c = string.ascii_letters + '0123456789_'
return tok[0] in c and tok[-1] in c
def _pad_tokens(tokens, comments=True, pad_operators=True):
"""Some tokens need to be padded with whitespace, e.g. two consecutive keywords"""
padded_tokens = []
last_token = None
for t_type, t in tokens:
if last_token and _is_var(last_token) and _is_var(t):
padded_tokens.append((Token.T_WHITESPACE, _make_whitespace(comments)))
elif ((last_token, t) in needs_whitespace) and pad_operators:
padded_tokens.append((Token.T_WHITESPACE, _make_whitespace(comments)))
padded_tokens.append((t_type, t))
last_token = t
if random.randrange(100) < WHITESPACE_PROB:
wtsp = _make_whitespace(comments)
padded_tokens.append((Token.T_WHITESPACE, wtsp))
last_token = wtsp
return padded_tokens
def _random_token():
r = random.randrange(100)
if r < 30:
return (Token.T_KEYWORD, random.choice(keywords))
if r < 60:
return (Token.T_OPERATOR, random.choice(operators))
if r < 80:
return (Token.T_INTEGER, _make_integer())
return (Token.T_IDENT, _make_identifier())
def get_tokens(length):
tokens = []
for _ in range(length):
tokens.append(_random_token())
return tokens
def get_string(length, comments=True):
return ''.join([t[1] for t in _pad_tokens(get_tokens(length), comments)])
def lextest_str(token):
if token[0] == Token.T_IDENT:
return "identifier " + token[1]
if token[0] == Token.T_INTEGER:
return "integer literal " + token[1]
return token[1]
if __name__ == '__main__':
length = STRING_LENGTH
pad_operators = True
comments = True
if '--aggressive' in sys.argv:
# don't use smart token padding logic
# this will run into some edge cases the normal mode avoids, but has
# the disadvantage that its results have to be manually inspected
pad_operators = False
comments = False
length = 10
else:
if len(sys.argv) > 1:
length = int(sys.argv[1])
tokens = get_tokens(length)
print(''.join([t[1] for t in _pad_tokens(tokens, comments=comments, pad_operators=pad_operators)]), file=sys.stdout)
print('\n'.join([lextest_str(t) for t in tokens] + ['EOF']), file=sys.stderr)