-
Notifications
You must be signed in to change notification settings - Fork 12
/
minify.py
683 lines (599 loc) · 23.2 KB
/
minify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
## {{{ http://code.activestate.com/recipes/576704/ (r16)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# pyminifier.py
#
# Copyright 2009 Dan McDougall <[email protected]>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; Version 3 of the License
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, the license can be downloaded here:
#
# http://www.gnu.org/licenses/gpl.html
# Meta
__version__ = '1.4.1'
__license__ = "GNU General Public License (GPL) Version 3"
__version_info__ = (1, 4, 1)
__author__ = 'Dan McDougall <[email protected]>'
"""
**Python Minifier:** Reduces the size of (minifies) Python code for use on
embedded platforms.
Performs the following:
- Removes docstrings.
- Removes comments.
- Minimizes code indentation.
- Joins multiline pairs of parentheses, braces, and brackets (and removes extraneous whitespace within).
- Preserves shebangs and encoding info (e.g. "# -- coding: utf-8 --").
Various examples and edge cases are sprinkled throughout the pyminifier code so
that it can be tested by minifying itself. The way to test is thus:
.. code-block:: bash
$ python pyminifier.py pyminifier.py > minified_pyminifier.py
$ python minified_pyminifier.py pyminifier.py > this_should_be_identical.py
$ diff minified_pyminifier.py this_should_be_identical.py
$
If you get an error executing minified_pyminifier.py or
'this_should_be_identical.py' isn't identical to minified_pyminifier.py then
something is broken.
"""
import sys, re, cStringIO, tokenize
from optparse import OptionParser
# Compile our regular expressions for speed.
# Raw strings are used throughout so backslashes reach the regex engine
# intact (the original non-raw patterns such as '\s' only worked because
# Python passes unknown string escapes through, which is deprecated).
multiline_quoted_string = re.compile(r'(\'\'\'|\"\"\")')
not_quoted_string = re.compile(r'(\".*\'\'\'.*\"|\'.*\"\"\".*\')')
trailing_newlines = re.compile(r'\n\n')
shebang = re.compile(r'^#\!.*$')
encoding = re.compile(r".*coding[:=]\s*([-\w.]+)")
multiline_indicator = re.compile(r'\\(\s*#.*)?\n')
# The above also removes trailing comments: "test = 'blah \ # comment here"
# These aren't used but they're a pretty good reference:
double_quoted_string = re.compile(r'((?<!\\)".*?(?<!\\)")')
single_quoted_string = re.compile(r"((?<!\\)'.*?(?<!\\)')")
single_line_single_quoted_string = re.compile(r"((?<!\\)'''.*?(?<!\\)''')")
# Bug fix: this previously duplicated the SINGLE-quote pattern above
# (copy-paste error) and so never matched a '"""..."""' string:
single_line_double_quoted_string = re.compile(r'((?<!\\)""".*?(?<!\\)""")')
def remove_comments_and_docstrings(source):
    """
    Returns 'source' minus comments and docstrings.

    :param source: Python source code, as a string.
    :returns: The source with COMMENT tokens and docstring-position STRING
              tokens dropped, as a string.

    **Note**: Uses Python's built-in tokenize module to great effect.

    Example:

    .. code-block:: python

        def noop(): # This is a comment
            '''
            Does nothing.
            '''
            pass # Don't do anything

    Will become:

    .. code-block:: python

        def noop():
            pass
    """
    io_obj = cStringIO.StringIO(source)
    out = ""
    # Start as INDENT so a string that is the very first token in the file is
    # treated as a docstring (the STRING branch below drops it).
    prev_toktype = tokenize.INDENT
    last_lineno = -1
    last_col = 0
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        ltext = tok[4]  # Full physical line (unused; kept for reference)
        # The following two conditionals preserve indentation.
        # This is necessary because we're not using tokenize.untokenize()
        # (because it spits out code with copious amounts of oddly-placed
        # whitespace).
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            out += (" " * (start_col - last_col))
        # Remove comments:
        if token_type == tokenize.COMMENT:
            pass
        # This series of conditionals removes docstrings:
        elif token_type == tokenize.STRING:
            if prev_toktype != tokenize.INDENT:
                # This is likely a docstring; double-check we're not inside an operator:
                if prev_toktype != tokenize.NEWLINE:
                    # Note regarding NEWLINE vs NL: The tokenize module
                    # differentiates between newlines that start a new statement
                    # and newlines inside of operators such as parens, brackes,
                    # and curly braces. Newlines inside of operators are
                    # NEWLINE and newlines that start new code are NL.
                    # Catch whole-module docstrings:
                    if start_col > 0:
                        # Unlabelled indentation means we're inside an operator
                        out += token_string
                    # Note regarding the INDENT token: The tokenize module does
                    # not label indentation inside of an operator (parens,
                    # brackets, and curly braces) as actual indentation.
                    # For example:
                    # def foo():
                    #     "The spaces before this docstring are tokenize.INDENT"
                    #     test = [
                    #         "The spaces before this string do not get a token"
                    #     ]
        else:
            # Every other token is copied through verbatim
            out += token_string
        prev_toktype = token_type
        last_col = end_col
        last_lineno = end_line
    return out
def reduce_operators(source):
    """
    Remove spaces between operators in 'source' and returns the result.

    :param source: Python source code, as a string.
    :returns: The source with the space immediately before and after each
              mid-line OP token removed (only when that character really is
              a space), as a string.

    Example:

    .. code-block:: python

        def foo(foo, bar, blah):
            test = "This is a %s" % foo

    Will become:

    .. code-block:: python

        def foo(foo,bar,blah):
            test="This is a %s"%foo
    """
    io_obj = cStringIO.StringIO(source)
    remove_columns = []  # Columns on the current line that may hold a space to delete
    out = ""
    out_line = ""  # The physical line currently being rebuilt
    prev_toktype = tokenize.INDENT
    # NOTE(review): initialized as 'prev_tok' here but assigned as
    # 'prev_token' inside the loop below -- neither name is ever read, so
    # both appear to be dead code.
    prev_tok = None
    last_lineno = -1
    last_col = 0
    # Deleting a character shifts every later column left by one; 'lshift'
    # tracks the cumulative shift for the current line (see 'col - lshift').
    lshift = 1
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        ltext = tok[4]  # Full physical line (unused; kept for reference)
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            out_line += (" " * (start_col - last_col))
        if token_type == tokenize.OP:
            # Operators that begin a line such as @ or open parens should be
            # left alone
            start_of_line_types = [ # These indicate we're starting a new line
                tokenize.NEWLINE, tokenize.DEDENT, tokenize.INDENT]
            if prev_toktype not in start_of_line_types:
                # This is just a regular operator; remove spaces
                remove_columns.append(start_col) # Before OP
                remove_columns.append(end_col+1) # After OP
        if token_string.endswith('\n'):
            # End of a physical line (presumably a NEWLINE/NL token, whose
            # string carries the '\n'): apply the queued space removals.
            out_line += token_string
            if remove_columns:
                for col in remove_columns:
                    col = col - lshift
                    try:
                        # This was really handy for debugging (looks nice, worth saving):
                        #print out_line + (" " * col) + "^"
                        # The above points to the character we're looking at
                        if out_line[col] == " ": # Only if it is a space
                            out_line = out_line[:col] + out_line[col+1:]
                            lshift += 1 # To re-align future changes on this line
                    except IndexError: # Reached and end of line, no biggie
                        pass
            out += out_line
            remove_columns = []
            out_line = ""
            lshift = 1
        else:
            out_line += token_string
        prev_toktype = token_type
        prev_token = tok
        last_col = end_col
        last_lineno = end_line
    # This makes sure to capture the last line if it doesn't end in a newline:
    out += out_line
    # The tokenize module doesn't recognize @ sign before a decorator
    return out
# NOTE: This isn't used anymore... Just here for reference in case someone
# searches the internet looking for a way to remove similarly-styled end-of-line
# comments from non-python code. It also acts as an edge case of sorts with
# that raw triple quoted string inside the "quoted_string" assignment.
def remove_comment(single_line):
    """
    Removes the comment at the end of the line (if any) and returns the result.
    """
    quoted_string = re.compile(
        r'''((?<!\\)".*?(?<!\\)")|((?<!\\)'.*?(?<!\\)')'''
    )
    # Split the line into alternating quoted and unquoted pieces; re.split()
    # emits None/'' entries for unmatched groups, so drop the empties:
    pieces = [piece for piece in quoted_string.split(single_line) if piece]
    result = ""
    for piece in pieces:
        if piece[0] in ("'", '"'):
            # Quoted string -- a '#' inside it is not a comment; keep verbatim
            result += piece
        elif '#' in piece:
            # An unquoted '#' starts a comment: keep what precedes it and
            # ignore the remainder of the line
            result += piece.split('#')[0]
            break
        else:
            # Plain code with no comment; keep as-is
            result += piece
    return result.rstrip() # Strip trailing whitespace before returning
def join_multiline_pairs(text, pair="()"):
    """
    Finds and removes newlines in multiline matching pairs of characters in
    'text'.  For example, "(.*\\n.*), {.*\\n.*}, or [.*\\n.*]".

    By default it joins parens () but it will join any two characters given via
    the 'pair' variable.

    **Note:** Doesn't remove extraneous whitespace that ends up between the pair.
    Use reduce_operators() for that.

    :param text: Python source code, as a string.
    :param pair: Two-character string: pair[0] is the opener, pair[1] the closer.
    :returns: 'text' with newlines removed inside matched pairs, as a string.

    Example:

    .. code-block:: python

        test = (
            "This is inside a multi-line pair of parentheses"
        )

    Will become:

    .. code-block:: python

        test = ( "This is inside a multi-line pair of parentheses" )
    """
    # Compiled locally (rather than relying on module globals) so this
    # function is self-contained; the re module caches compiled patterns so
    # repeated calls stay cheap.
    multiline_quoted_string = re.compile(r'(\'\'\'|\"\"\")')
    not_quoted_string = re.compile(r'(\".*\'\'\'.*\"|\'.*\"\"\".*\')')
    trailing_newlines = re.compile(r'\n\n')
    # Readability variables
    opener = pair[0]
    closer = pair[1]
    # Tracking variables
    inside_pair = False
    inside_quotes = False
    inside_double_quotes = False
    inside_single_quotes = False
    quoted_string = False
    openers = 0
    closers = 0
    # Regular expressions
    opener_regex = re.compile(r'\%s' % opener)
    closer_regex = re.compile(r'\%s' % closer)
    output = ""
    for line in text.split('\n'):
        escaped = False
        # First we rule out multi-line strings
        multline_match = multiline_quoted_string.search(line)
        not_quoted_string_match = not_quoted_string.search(line)
        if multline_match and not not_quoted_string_match and not quoted_string:
            # A triple quote that both opens AND closes on this line splits
            # the line into three or more parts.
            # Bug fix: the original second test had no comparison at all
            # (len(...) is always >= 1, i.e. always truthy) and the first used
            # "> 1", which is guaranteed by the regex match above -- so the
            # multiline-string state below was unreachable and string contents
            # could be joined as if they were code.
            if len(line.split('"""')) > 2 or len(line.split("'''")) > 2:
                # This is a single line that uses the triple quotes twice
                # Treat it as if it were just a regular line:
                output += line + '\n'
                quoted_string = False
            else:
                # The triple quote opens here and closes on a later line
                output += line + '\n'
                quoted_string = True
        elif quoted_string and multiline_quoted_string.search(line):
            # Closing line of the multiline string
            output += line + '\n'
            quoted_string = False
        # Now let's focus on the lines containing our opener and/or closer:
        elif not quoted_string:
            if opener_regex.search(line) or closer_regex.search(line) or inside_pair:
                for character in line:
                    if character == opener:
                        if not escaped and not inside_quotes:
                            openers += 1
                            inside_pair = True
                            output += character
                        else:
                            escaped = False
                            output += character
                    elif character == closer:
                        if not escaped and not inside_quotes:
                            if openers and openers == (closers + 1):
                                # This closer balances the outermost opener
                                closers = 0
                                openers = 0
                                inside_pair = False
                                output += character
                            else:
                                closers += 1
                                output += character
                        else:
                            escaped = False
                            output += character
                    elif character == '\\':
                        if escaped:
                            escaped = False
                            output += character
                        else:
                            escaped = True
                            output += character
                    elif character == '"' and escaped:
                        output += character
                        escaped = False
                    elif character == "'" and escaped:
                        output += character
                        escaped = False
                    elif character == '"' and inside_quotes:
                        if inside_single_quotes:
                            output += character
                        else:
                            inside_quotes = False
                            inside_double_quotes = False
                            output += character
                    elif character == "'" and inside_quotes:
                        if inside_double_quotes:
                            output += character
                        else:
                            inside_quotes = False
                            inside_single_quotes = False
                            output += character
                    elif character == '"' and not inside_quotes:
                        inside_quotes = True
                        inside_double_quotes = True
                        output += character
                    elif character == "'" and not inside_quotes:
                        inside_quotes = True
                        inside_single_quotes = True
                        output += character
                    elif character == ' ' and inside_pair and not inside_quotes:
                        # Collapse runs of whitespace between the pair
                        if not output[-1] in [' ', opener]:
                            output += ' '
                    else:
                        if escaped:
                            escaped = False
                        output += character
                if inside_pair == False:
                    output += '\n'
            else:
                output += line + '\n'
        else:
            # Interior line of a multiline string: copy through untouched
            output += line + '\n'
    # Clean up
    output = trailing_newlines.sub('\n', output)
    return output
def dedent(source):
    """
    Minimizes indentation to save precious bytes: each nesting level becomes
    a single space (see 'indentation' below).

    :param source: Python source code, as a string.
    :returns: The re-indented source, as a string.

    Example:

    .. code-block:: python

        def foo(bar):
            test = "This is a test"

    Will become:

    .. code-block:: python

        def foo(bar):
         test = "This is a test"
    """
    io_obj = cStringIO.StringIO(source)
    out = ""
    last_lineno = -1
    last_col = 0
    prev_start_line = 0
    indentation = ""
    indentation_level = 0  # Net count of INDENT minus DEDENT tokens so far
    for i,tok in enumerate(tokenize.generate_tokens(io_obj.readline)):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        if start_line > last_lineno:
            last_col = 0
        if token_type == tokenize.INDENT:
            # Track the level but never copy the INDENT text itself
            indentation_level += 1
            continue
        if token_type == tokenize.DEDENT:
            indentation_level -= 1
            continue
        indentation = " " * indentation_level  # One space per nesting level
        if start_line > prev_start_line:
            # First token on a new line: emit the minimized indentation
            out += indentation + token_string
        elif start_col > last_col:
            # Whitespace separated this token from the previous one;
            # collapse it to a single space
            out += " " + token_string
        else:
            out += token_string
        prev_start_line = start_line
        last_col = end_col
        last_lineno = end_line
    return out
def fix_empty_methods(source):
    """
    Appends 'pass' to empty methods/functions (i.e. where there was nothing but
    a docstring before we removed it =).

    Example:

    .. code-block:: python

        # Note: This triple-single-quote inside a triple-double-quote is also a
        # pyminifier self-test
        def myfunc():
            '''This is just a placeholder function.'''

    Will become:

    .. code-block:: python

        def myfunc(): pass
    """
    def_pattern = re.compile(r'^\s*def\s*.*\(.*\):.*$')
    output = ""
    pending_def = None     # The 'def' line being held back
    awaiting_body = False  # True right after a 'def' line was seen
    def_indent = 0         # Indentation of the held-back 'def' line
    for current_line in source.split('\n'):
        stripped = current_line.strip()
        if not stripped:
            # Blank lines pass straight through (and do not resolve a
            # held-back 'def' line)
            output += "\n"
            continue
        if awaiting_body:
            # First non-blank line after a 'def': if it sits at the same
            # indentation as the 'def' itself, the body was empty -- give
            # the 'def' an inline 'pass'.
            body_indent = len(current_line.rstrip()) - len(stripped)
            if body_indent == def_indent:
                output += pending_def + " pass\n" + current_line + "\n"
            else:
                output += pending_def + "\n" + current_line + "\n"
            awaiting_body = False
        elif def_pattern.match(current_line):
            def_indent = len(current_line) - len(stripped)
            awaiting_body = True
            pending_def = current_line
        else:
            output += current_line + "\n"
    return output
def remove_blank_lines(source):
    """
    Removes blank lines from 'source' and returns the result.

    Example:

    .. code-block:: python

        test = "foo"

        test2 = "bar"

    Will become:

    .. code-block:: python

        test = "foo"
        test2 = "bar"
    """
    reader = cStringIO.StringIO(source)
    kept_lines = []
    for raw_line in reader.readlines():
        # Keep only lines that contain something besides whitespace
        if raw_line.strip():
            kept_lines.append(raw_line)
    return "".join(kept_lines)
def minify(source):
    """
    Remove all docstrings, comments, blank lines, and minimize code
    indentation from 'source' then return the result.
    """
    kept_shebang = None
    kept_encoding = None
    # Things like shebangs must be preserved byte-for-byte; they can only
    # legitimately appear on the first two lines of the file.
    for header_line in source.split('\n')[0:2]:
        # A shebang (e.g. '#!/usr/bin/env python') <--also a self test!
        if shebang.match(header_line): # Must be first line
            kept_shebang = header_line
            continue
        # The encoding string may be the first or second line in the file
        if encoding.match(header_line):
            kept_encoding = header_line
    # Collapse explicit line continuations (a '\' followed by a newline)
    source = multiline_indicator.sub('', source)
    # Remove docstrings (Note: Must run before fix_empty_methods())
    source = remove_comments_and_docstrings(source)
    # Give now-empty (single line) methods/functions a 'pass' body
    source = fix_empty_methods(source)
    # Join multiline pairs of parens, brackets, and braces
    for bracket_pair in ("()", "[]", "{}"):
        source = join_multiline_pairs(source, bracket_pair)
    # Remove whitespace between operators
    source = reduce_operators(source)
    # Minimize indentation
    source = dedent(source)
    # Re-add the preserved header lines (shebang ends up first)
    if kept_encoding:
        source = kept_encoding + "\n" + source
    if kept_shebang:
        source = kept_shebang + "\n" + source
    # Remove blank lines and the stubborn last newline
    return remove_blank_lines(source).rstrip('\n')
def bz2_pack(source):
    """
    Returns 'source' as a bzip2-compressed, self-extracting python script.

    :param source: Script source; str or bytes (str is encoded as UTF-8
                   before compression).
    :returns: A short script (str) that decompresses and executes the
              original source when run.
    """
    import bz2, base64
    # Bug fix: under Python 3 the original crashed (bz2.compress() needs
    # bytes; str += bytes fails).  Under Python 2 str IS bytes, so this
    # branch preserves the old behavior exactly.
    payload = source if isinstance(source, bytes) else source.encode('utf-8')
    compressed_source = bz2.compress(payload)
    encoded = base64.b64encode(compressed_source)
    if not isinstance(encoded, str): # Python 3: b64encode returns bytes
        encoded = encoded.decode('ascii')
    out = 'import bz2, base64\n'
    # The exec(...) call form is valid in BOTH Python 2 and Python 3; the
    # original emitted a Python-2-only "exec ..." statement.
    out += "exec(bz2.decompress(base64.b64decode('"
    out += encoded
    out += "')))\n"
    return out
def gz_pack(source):
    """
    Returns 'source' as a gzip(zlib)-compressed, self-extracting python script.

    :param source: Script source; str or bytes (str is encoded as UTF-8
                   before compression).
    :returns: A short script (str) that decompresses and executes the
              original source when run.
    """
    import zlib, base64
    # Bug fix: under Python 3 the original crashed (zlib.compress() needs
    # bytes; str += bytes fails).  Under Python 2 str IS bytes, so this
    # branch preserves the old behavior exactly.
    payload = source if isinstance(source, bytes) else source.encode('utf-8')
    compressed_source = zlib.compress(payload)
    encoded = base64.b64encode(compressed_source)
    if not isinstance(encoded, str): # Python 3: b64encode returns bytes
        encoded = encoded.decode('ascii')
    out = 'import zlib, base64\n'
    # The exec(...) call form is valid in BOTH Python 2 and Python 3; the
    # original emitted a Python-2-only "exec ..." statement.
    out += "exec(zlib.decompress(base64.b64decode('"
    out += encoded
    out += "')))\n"
    return out
# The test.+() functions below are for testing pyminifer...
def test_decorator(f):
    """Decorator that does nothing"""
    # Identity decorator: returns the decorated function unchanged.  Exists
    # as a minifier fixture; applied to test_class.foo below.
    return f
def test_reduce_operators():
    """Test the case where an operator such as an open paren starts a line"""
    # Fixture: the '(' below is an OP token at the start of a line, which
    # reduce_operators() must leave alone (see its start_of_line_types check).
    (a, b) = 1, 2 # The indentation level should be preserved
    pass
def test_empty_functions():
    """
    This is a test method.
    This should be replaced with 'def empty_method: pass'
    """

# NOTE(review): test_empty_functions above is deliberately docstring-only --
# it is a fixture for fix_empty_methods(), which should emit
# "def test_empty_functions(): pass" once the docstring has been stripped.
class test_class(object):
    "Testing indented decorators"
    @test_decorator
    def foo(self):
        pass
def test_function():
    """
    This function encapsulates the edge cases to prevent them from invading the
    global namespace.

    Everything in here is a fixture: this file minifies itself as a self-test
    (see the module-level string near the top), so these statements exist to
    exercise the minifier, not to be executed.
    """
    foo = ("The # character in this string should " # This comment
           "not result in a syntax error") # ...and this one should go away
    test_multi_line_list = [
        'item1',
        'item2',
        'item3'
    ]
    test_multi_line_dict = {
        'item1': 1,
        'item2': 2,
        'item3': 3
    }
    # It may seem strange but the code below tests our docstring removal code.
    # NOTE(review): imaginary_function is deliberately undefined; it would
    # raise NameError if test_function() were ever called -- no call to
    # test_function() is visible in this file.
    test_string_inside_operators = imaginary_function(
        "This string was indented but the tokenizer won't see it that way."
    ) # To understand how this could mess up docstring removal code see the
    # remove_comments_and_docstrings() function starting at this line:
    # "elif token_type == tokenize.STRING:"
    # This tests remove_extraneous_spaces():
    this_line_has_leading_indentation = '''<--That extraneous space should be
                                           removed''' # But not these spaces
def main():
usage = '%prog [options] "<input file>"'
parser = OptionParser(usage=usage, version=__version__)
parser.disable_interspersed_args()
parser.add_option(
"-o", "--outfile",
dest="outfile",
default=None,
help="Save output to the given file.",
metavar="<file path>"
)
parser.add_option(
"--bzip2",
action="store_true",
dest="bzip2",
default=False,
help="bzip2-compress the result into a self-executing python script."
)
parser.add_option(
"--gzip",
action="store_true",
dest="gzip",
default=False,
help="gzip-compress the result into a self-executing python script."
)
options, args = parser.parse_args()
try:
source = open(args[0]).read()
except Exception, e:
print e
parser.print_help()
sys.exit(2)
# Minify our input script
result = minify(source)
# Compress it if we were asked to do so
if options.bzip2:
result = bz2_pack(result)
elif options.gzip:
result = gz_pack(result)
# Either save the result to the output file or print it to stdout
if options.outfile:
f = open(options.outfile, 'w')
f.write(result)
f.close()
else:
print result
# Standard entry-point guard: only run main() when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    main()
## end of http://code.activestate.com/recipes/576704/ }}}