# badchars.py

#!/usr/bin/python3
#
# Bad characters matching tool capable of transforming input bytes from
# many different hexdump / hex string / escaped hex strings formats.
# To be used during exploit development stage when the shellcode gets corrupted
# due to filtered bytes. Additionally armed with modified LCS algorithm designed by
# Peter Van Eeckhoutte from Corelan.be (originally taken from his Mona.py).
#
# LICENSE note:
#   This program contains adapted source code taken from Mona.py script, that was
#   originally written by Peter Van Eeckhoutte - Corelan GCV.
#   Specifically his MemoryComparator class and couple of supplying routines like
#   draw_chunk_table or guess_bad_chars.
#   One can refer to the original Mona's license here:
#       https://github.com/corelan/mona/blob/master/LICENSE  
#
# Written by: 
# Mariusz B. / mgeeky, 2017-2020
#
# Converted to Python 3 by:
# Z.Y. Liew / onlylonly, Oct 2019
#

import re
import sys
import types
import string
import os.path
import itertools
from optparse import OptionParser
from operator import itemgetter
from collections import defaultdict, namedtuple

# Version string of this tool.
VERSION = '0.3'

# Runtime settings shared by the whole module. The helpers below read
# attributes from it (options.quiet, options.colored, options.debug, ...),
# so parse_options() is expected to rebind it before they are called.
options = { }
# Declared global by parse_options(); presumably holds the input file names
# given on the command line - confirm against parse_options().
filenames = []
# The two byte buffers being compared: buffers[0] is used as the reference
# input and buffers[1] is the one compared against it.
buffers = [[], []]

class bcolors:
    """ANSI escape sequences used to colorize terminal output."""

    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

    @staticmethod
    def strip_colors(line):
        """Return `line` with every escape sequence defined above removed."""
        # The sequences are removed one after another, ENDC last.
        sequences = (
            bcolors.HEADER,
            bcolors.OKBLUE,
            bcolors.OKGREEN,
            bcolors.WARNING,
            bcolors.FAIL,
            bcolors.ENDC,
        )
        for sequence in sequences:
            line = line.replace(sequence, '')
        return line

def _out(x, color=None):
    """Prepare message `x` for printing.

    Returns an empty string in quiet mode. Otherwise returns `x`, wrapped
    in `color` and the reset sequence when a color was given and colored
    output is enabled.
    """
    if options.quiet:
        return ""
    if color and options.colored:
        return color + x + bcolors.ENDC
    return x

def out(x):
    """Print message `x` unless it ends up empty (e.g. in quiet mode)."""
    text = _out(x)
    if text:
        print (text)
        
def ok(x):
    """Format `x` as a success message ("[+] " prefix, green when colored)."""
    message = "[+] " + x
    return _out(message, bcolors.OKGREEN)
def dbg(x):
    """Format `x` as a debug message; empty string when debugging is off."""
    if not options.debug:
        return ''
    return _out("[dbg] " + x, bcolors.OKBLUE)

def warn(x):
    """Format `x` as a warning message ("[?] " prefix, yellow when colored)."""
    message = "[?] " + x
    return _out(message, bcolors.WARNING)
def err(x):
    """Format `x` as an error message (no prefix, red when colored)."""
    return _out(x, color=bcolors.FAIL)

class BytesParser():
    """Turn a hex dump / hex string / raw buffer into a flat sequence of bytes.

    The input format is either given by the caller or guessed line by line
    with the regular expressions in `formats_rex`. After construction,
    `parsed` tells whether any bytes were fetched and `get_bytes()` returns
    them: a list of ints, or the input buffer itself for the 'raw' format.
    """

    # One regular expression per supported format; each matches a single
    # input line and captures the part of the line that carries the bytes.
    formats_rex = {
        'xxd': r'^[^0-9a-f]*[0-9a-f]{2,}\:\s((?:[0-9a-f]{4}\s)+)\s+.+$',
        'hexdump': r'^[^0-9a-f]*[0-9a-f]{2,}\s+([0-9a-f\s]+[0-9a-f])$',
        'classic-hexdump':r'^[0-9a-f]*[0-9a-f]{2,}(?:\:|\s)+\s([0-9a-f\s]+)\s{2,}.+$',
        'hexdump-C': r'^[0-9a-f]*[0-9a-f]{2,}\s+\s([0-9a-f\s]+)\s*\|', 
        'escaped-hexes': r'^[^\'"]*((?:\'[\\\\x0-9a-f]{8,}\')|(?:"[\\\\x0-9a-f]{8,}"))',
        'hexstring': r'^([0-9a-f ]+)$',
        'powershell': r'^[^0x]+((?:0x[0-9a-f]{1,2},?)+)$',
        'byte-array': r'^[^0x]*((?:0x[0-9a-f]{2}(?:,\s?))+)',
        'js-unicode': r'^[^%u0-9a-f]*((?:%u[0-9a-f]{4})+)$',
        #modify from r'^(?:((?:0x[0-9a-f]{1,8}\s[<>\w\+]+)):\s*)?((?:0x[0-9a-f]{8},?\s*)+)$
        #include match of GDB address 
        'dword': r'^(?:((?:0x[0-9a-f]{1,8}\s[<>\w\+]+)|(?:0x[0-9a-f]{1,8})):\s*)?((?:0x[0-9a-f]{8},?\s*)+)$',
    }
    # Alternative, user-facing names accepted for some of the formats above.
    formats_aliases = {
        'classic-hexdump': ['ollydbg'],
        'escaped-hexes': ['ruby','c', 'carray', 'python'],
        'dword': ['gdb']
    }
    # Cache of compiled `formats_rex` patterns, filled by compile_regexps().
    formats_compiled = {}

    def __init__(self, input, name = None, format = None):
        """Parse `input` right away.

        input  -- bytes object holding the file contents.
        name   -- label used for this buffer in log messages.
        format -- optional format name ('raw', a key of `formats_rex` or one
                  of the aliases, case-insensitive); guessed when omitted.

        Raises AssertionError when `format` names no known format.
        """
        self.input = input[:]
        self.name = name
        self.bytes = []
        self.parsed = False
        self.format = None

        BytesParser.compile_regexps()

        # Do not normalize the input in raw format, to prevent tampering
        # with the bytes.
        if str(format).lower() != "raw":
            self.normalize_input()

        if format:
            out(dbg("Using user-specified format: %s" % format))

            if str(format).lower() == "raw":
                self.format = "raw"

            else:
                try:
                    self.format = BytesParser.interpret_format_name(format)
                except Exception as e:
                    out(dbg(str(e)))

                if self.format is None:
                    # Not an alias, so it has to be a canonical format name.
                    # Matched case-insensitively, like aliases and 'raw'.
                    canonical = str(format).lower()
                    if canonical not in BytesParser.formats_rex:
                        # Raised explicitly (instead of `assert`) so that the
                        # check is not stripped when running with -O.
                        raise AssertionError("Format '%s' is not implemented." % format)
                    self.format = canonical

        else:
            self.recognize_format()

        if not self.format:
            self.parsed = False
        else:
            if self.fetch_bytes():
                out(ok("Fetched %d bytes successfully from %s" % (len(self.bytes), self.name)))
                self.parsed = True
            else:
                if format and len(format):
                    out(err("Could not parse %s with user-specified format: %s" % (self.name, format)))
                else:
                    out(err("Recognized input %s as formatted with %s but failed fetching bytes." %
                        (self.name, self.format)))

    def normalize_input(self):
        """Strip every input line and escape it with 'unicode_escape'.

        Decoding the input may raise UnicodeDecodeError for binary data.
        """
        input = []
        for line in self.input.decode().split('\n'):
            line = line.strip()
            line2 = line.encode('unicode_escape')
            input.append(line2)
        self.input = b'\n'.join(input)

    @staticmethod
    def interpret_format_name(name):
        """Return the canonical format name for alias `name`.

        Raises Exception when `name` is not a known alias.
        """
        for k, v in BytesParser.formats_aliases.items():
            if name.lower() in v:
                return k
        raise Exception("Format name: %s not recognized as alias." % name)

    @staticmethod
    def compile_regexps():
        """Compile all `formats_rex` patterns (case-insensitive), only once."""
        if len(BytesParser.formats_compiled) == 0:
            for name, rex in BytesParser.formats_rex.items():
                BytesParser.formats_compiled[name] = re.compile(rex, re.I)

    @staticmethod
    def make_line_printable(line):
        """Return `line` with every non-printable character replaced by '.'."""
        return ''.join([c if c in string.printable else '.' for c in line])

    def recognize_format(self):
        """Guess the input format and store it in `self.format`.

        The first line matching any known pattern decides. Returns True when
        a format (possibly 'raw') was recognized, False otherwise.
        """
        # Work on the decoded text: iterating the bytes object itself yields
        # ints, which cannot be tested against string.printable.
        text = self.input.decode()

        for line in text.split('\n'):
            if self.format: break
            line = BytesParser.make_line_printable(line)
            for format, rex in BytesParser.formats_compiled.items():
                out(dbg("Trying format %s on ('%s')" % (format, line)))
                
                if rex.match(line):
                    out(ok("%s has been recognized as %s formatted." % (self.name, format)))
                    self.format = format
                    break

        if not self.format:
            if not all(c in string.printable for c in text):
                out(ok("%s has been recognized as RAW bytes." % (self.name)))
                self.format = 'raw'
                return True
            else:
                out(err("Could not recognize input bytes format of the %s!" % self.name))
                return False

        return (len(self.format) > 0)

    @staticmethod
    def post_process_bytes_line(line):
        """Convert the extracted part of a line into a list of byte values.

        All decorations (prefixes, separators, quotes) are removed and the
        remaining hex digits are read two at a time.
        """
        outb = []
        l = line.strip()[:]
        strip = ['0x', ',', ' ', '\\', 'x', '%u', '+', '.', "'", '"']
        for s in strip:
            l = l.replace(s, '')

        for i in range(0, len(l), 2):
            outb.append(int(l[i:i+2], 16))
        return outb

    @staticmethod
    def preprocess_bytes_line(line):
        """Remove noise ('(byte)', '+', '.') from a line before matching it."""
        l = line.strip()[:]
        strip = ['(byte)', '+', '.']
        for s in strip:
            l = l.replace(s, '')
        return l

    @staticmethod
    def unpack_dword(line):
        """Rewrite a line of 32-bit words as a plain hex string and match it.

        Every '0x' + 8 hex digits token is emitted least significant byte
        first. Tokens directly followed by ':' are skipped, which removes
        the address column of GDB output such as:

            0xffffd67a:     0xdfb8c2db      0xd9db029c      0x5bf42474
            0xffffd68a:     0x031a4331      0xeb831a43      0xf62ae2fc

        (pattern modified from r'((?:0x[0-9a-f]{8},?\\s*))').

        Returns the 'hexstring' match object for the rewritten line, or None
        when the line contained no such token.
        """
        hexes = []
        # Case-insensitive, like the compiled 'dword' format pattern.
        for m in re.finditer(r'((?:0x[0-9a-f]{8}(?!:),?\s*))', line, re.I):
            l = m.group(0)
            l = l.replace(',', '')
            l = l.replace(' ', '')
            dword = int(l, 16)
            for shift in (0, 8, 16, 24):
                hexes.append('%02x' % ((dword >> shift) & 0xff))
        outs = ''.join(hexes)

        out(dbg("After callback ('%s')" % outs))
        return BytesParser.formats_compiled['hexstring'].match(outs)

    def fetch_bytes(self):
        """Extract the bytes from the input according to `self.format`.

        Returns True when at least one byte has been collected.
        """
        if not self.format:
            out(err("fetch_bytes(): Format has not been specified!"))
            return False

        if self.format == 'raw':
            out(dbg("Parsing %s as raw bytes." % self.name))
            self.bytes = self.input
            return len(self.bytes) > 0
        
        callback = BytesParser.formats_callbacks.get(self.format)

        for line in self.input.decode().split('\n'):
            callback_called = False
            if callback:
                out(dbg("Before callback ('%s')" % line))
                # The table stores staticmethod objects, hence __func__.
                m = callback.__func__(line)
                callback_called = True
            else:
                line = BytesParser.preprocess_bytes_line(line[:])
                m = BytesParser.formats_compiled[self.format].match(line)

            if m:
                # Use the last non-empty captured group; groups that did not
                # take part in the match are None and are skipped.
                extract = ''
                for mg in m.groups():
                    if mg:
                        extract = mg
                line_bytes = BytesParser.post_process_bytes_line(extract)
                if not line_bytes:
                    out(err("Could not process %s bytes line ('%s') as %s formatted! Quitting." \
                            % (self.name, line, self.format)))
                else:
                    out(dbg("Line ('%s'), bytes ('%s'), extracted ('%s'), len: %d" % (line, extract, line_bytes, len(line_bytes))))
                    self.bytes.extend(line_bytes)
            else:
                if callback_called:
                    out(dbg("Callback failure: transformed string ('%s') did not catched on returned match" % (line)))
                else:
                    out(dbg("Parsing line ('%s') failed with format '%s'." % (line, self.format)))

        return len(self.bytes) > 0

    def get_bytes(self):
        """Return the bytes collected by fetch_bytes()."""
        return self.bytes

    # Formats whose lines are transformed by a callback instead of being
    # matched directly with their `formats_rex` pattern.
    formats_callbacks = {
        'dword': unpack_dword
    }

def memoized(func):
    ''' Decorator caching the return values of `func`, keyed by its
    positional arguments. A returned generator is turned into a list first
    and cached in that form. '''
    results = {}

    def wrapper(*args):
        if args not in results:
            value = func(*args)
            if isinstance(value, types.GeneratorType):
                value = list(value)
            results[args] = value
        return results[args]

    wrapper.__name__ = '%s_memoized' % func.__name__
    wrapper.__doc__ = func.__doc__
    return wrapper

def bin2hex(binbytes):
    """
    Renders a sequence of bytes as space-separated two-digit hex values.

    The sequence may hold one-character strings or integers; the type of
    the first element decides how all elements are interpreted.
    """
    if len(binbytes) > 0 and type(binbytes[0]) is str:
        values = (ord(ch) for ch in binbytes)
    else:
        values = binbytes
    return ' '.join('%02x' % value for value in values)

def bad_chars(comp):
    """Return (broken, guessed_bc) for comparator `comp`.

    `broken` lists (index, original, mapped) for every byte of the first
    global buffer whose guessed mapping differs from the byte itself;
    `guessed_bc` is the result of guess_bad_chars(comp).
    """
    mapped = map(''.join, comp.guess_mapping())
    original = [chr(value) for value in buffers[0]]

    broken = []
    for index, (x, y) in enumerate(zip(original, mapped)):
        if x != y:
            broken.append((index, x, y))

    return (broken, guess_bad_chars(comp))

def rrange(x, y = 0):
    """ Builds a descending range counting from x - 1 down to y (inclusive).
        Example:
        >>> list(rrange(10, 0)) # => [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    """
    first = x - 1
    stop = y - 1  # exclusive bound, so that y itself is still produced
    return range(first, stop, -1)

def guess_bad_chars(comp):
    ''' Tries to guess bad characters from the modified chunks of `comp` and
    returns them as a list without duplicates (in no particular order). '''
    changed_counts = defaultdict(int)
    chunks = comp.get_chunks()
    last_unmodified = comp.get_last_unmodified_chunk()

    for index, chunk in enumerate(chunks):
        if chunk.unmodified:
            continue
        if index == last_unmodified + 1:
            # only report the first character as bad in the final corrupted chunk
            changed_counts[chunk.xchunk[0]] += 1
            break
        for byte in set(chunk.xchunk):
            changed_counts[byte] += 1

    guessed = []

    # bytes seen in more than two modified chunks are the strongest candidates
    likely = sorted(char for char, count in changed_counts.items() if count > 2)
    if likely:
        out(dbg("Very likely bad chars: %s" % bin2hex(likely)))
        guessed.extend(likely)
        out(dbg("Possibly bad chars: %s" % bin2hex(sorted(changed_counts))))

    guessed.extend(sorted(changed_counts))

    # list bytes already omitted from the input
    omitted = set(chr(value) for value in range(0, 256)) - set(comp.x)
    if omitted:
        out(dbg("Bytes omitted from input: %s" % bin2hex(sorted(omitted))))
        guessed.extend(sorted(omitted))

    # going through a set removes the duplicates
    return list(set(guessed))

def shorten_bytes(bytes, size=8):
    """Render `bytes` as hex, abbreviating sequences longer than `size`.

    Up to `size` elements are rendered in full with bin2hex(); longer
    sequences are reduced to 'first ... last'. Elements may be
    one-character strings or integers, the same inputs bin2hex() accepts.
    """
    if len(bytes) <= size: return bin2hex(bytes)

    def as_int(b):
        # ord() only accepts characters; integer elements are used as they are.
        return ord(b) if isinstance(b, str) else b

    return '%02x ... %02x' % (as_int(bytes[0]), as_int(bytes[-1]))

def draw_chunk_table(comp):
    ''' Prints a table listing the chunks found by `comp`, showing for each
    one its offsets, its sizes and its bytes in the input file next to its
    bytes in memory, plus a note describing what happened to it. '''

    def describe(chunk):
        # the checks are ordered: a missing chunk is never called "compacted"
        if chunk.dy == 0:
            return 'missing'
        if chunk.dx > chunk.dy:
            return 'compacted'
        if chunk.dx < chunk.dy:
            return 'expanded'
        return 'unmodified!' if chunk.unmodified else 'corrupted'

    separators = (' ', ' ', ' ', ' | ', ' | ', ' | ', '')
    last_unmodified = comp.get_last_unmodified_chunk()

    rows = [('', '', '', '', 'File', 'Memory', 'Note')]
    for chunk in comp.get_chunks():
        rows.append((
            chunk.i, chunk.j, chunk.dx, chunk.dy,
            shorten_bytes(chunk.xchunk), shorten_bytes(chunk.ychunk),
            describe(chunk),
        ))

    # every column is as wide as its widest cell
    widths = tuple(max(len(str(cell)) for cell in column) for column in zip(*rows))
    rule = '\t' + '-' * (sum(widths) + sum(len(sep) for sep in separators))

    for index, row in enumerate(rows):
        cells = [str(cell).ljust(width) + sep
                 for cell, width, sep in zip(row, widths, separators)]
        out('\t' + ''.join(cells))
        # a rule follows the header and the row right after the last unmodified chunk
        if index == 0 or (index == last_unmodified + 1 and index < len(rows)):
            out(rule)


#
# Memory comparison algorithm originally taken from Mona.py by Peter Van Eeckhoutte - Corelan GCV
# https://github.com/corelan/mona
#
# It utilizes modified Longest Common Subsequence algorithm to mark number of modifications over
# supplied input to let it be transformed into another input, as compared to.
#
class MemoryComparator(object):
    ''' Solve the memory comparison problem with a special dynamic programming
    algorithm similar to that for the LCS problem.

    `x` is the original sequence and `y` the (possibly modified) sequence it
    is compared with. Results of the methods below are cached per instance
    by the `memoized` decorator, which also turns the generators into lists. '''

    # One aligned piece of both inputs: `i`/`j` are its offsets in x/y,
    # `dx`/`dy` its sizes in x/y and `xchunk`/`ychunk` the covered items.
    Chunk = namedtuple('Chunk', 'unmodified i j dx dy xchunk ychunk')

    # Maps a move id stored in the grid to the (dy, dx) step taken in y and x
    # when walking the grid; 0 means "stop".
    move_to_gradient = {
        0: (0, 0),
        1: (0, 1),
        2: (1, 1),
        3: (2, 1),
    }

    def __init__(self, x, y):
        self.x, self.y = x, y

    @memoized
    def get_last_unmodified_chunk(self):
        ''' Returns the index of the last chunk of size > 1 that is unmodified,
        or -1 when there is no such chunk. '''
        return max(
            (i for i, c in enumerate(self.get_chunks()) if c.unmodified and c.dx > 1),
            default=-1)

    @memoized
    def get_grid(self):
        ''' Builds a 2-d suffix grid for our DP algorithm.

        Returns (values, moves): two (len(y) + 1) x (len(x) + 1) tables, where
        `moves` holds the keys of `move_to_gradient`. Only the first
        2 * len(x) items of y are taken into account. '''
        x = self.x
        y = self.y[:len(x)*2]
        width, height  = len(x), len(y)
        values = [[0] * (width + 1) for j in range(height + 1)]
        moves  = [[0] * (width + 1) for j in range(height + 1)]
        equal  = [[x[i] == y[j] for i in range(width)] for j in range(height)]
        # extra row, so that equal[height] can be indexed below
        equal.append([False] * width)

        # the grid is filled starting from its bottom-right corner
        for j, i in itertools.product(rrange(height + 1), rrange(width + 1)):
            value = values[j][i]
            if i >= 1 and j >= 1:
                if equal[j-1][i-1]:
                    values[j-1][i-1] = value + 1
                    moves[j-1][i-1] = 2
                elif value > values[j][i-1]:
                    values[j-1][i-1] = value
                    moves[j-1][i-1] = 2
            if i >= 1 and not equal[j][i-1] and value - 2 > values[j][i-1]:
                values[j][i-1] = value - 2
                moves[j][i-1] = 1
            if i >= 1 and j >= 2 and not equal[j-2][i-1] and value - 1 > values[j-2][i-1]:
                values[j-2][i-1] = value - 1
                moves[j-2][i-1] = 3
        return (values, moves)

    @memoized
    def get_blocks(self):
        '''
            Compares two binary strings under the assumption that y is the result of
            applying the following transformations onto x:

             * change single bytes in x (likely)
             * expand single bytes in x to two bytes (less likely)
             * drop single bytes in x (even less likely)

            Yields elements of the form (unmodified, xdiff, ydiff), where each
            item represents a binary chunk with "unmodified" denoting whether the
            chunk is the same in both strings, "xdiff" denoting the size of the chunk in x
            and "ydiff" denoting the size of the chunk in y.

            Example:
            >>> x = "abcdefghijklm"
            >>> y = "mmmcdefgHIJZklm"
            >>> list(MemoryComparator(x, y).get_blocks())
            [(False, 2, 3), (True, 5, 5),
             (False, 3, 4), (True, 3, 3)]
        '''
        x, y = self.x, self.y
        _, moves = self.get_grid()

        # walk the grid
        path = []
        i, j = 0, 0
        while True:
            dy, dx = self.move_to_gradient[moves[j][i]]
            if dy == dx == 0: break
            path.append((dy == 1 and x[i] == y[j], dy, dx))
            j, i = j + dy, i + dx

        # whatever is left of x is compared one to one with the rest of y
        for i, j in zip(range(i, len(x)), itertools.count(j)):
            if j < len(y): path.append((x[i] == y[j], 1, 1))
            else:          path.append((False,        0, 1))

        # merge consecutive steps sharing the same "unmodified" flag; every
        # step covers exactly one item of x, so the group length is dx
        for unmodified, subpath in itertools.groupby(path, itemgetter(0)):
            ydiffs = list(map(itemgetter(1), subpath))
            dx, dy = len(ydiffs), sum(ydiffs)
            yield unmodified, dx, dy

    @memoized
    def get_chunks(self):
        ''' Yields a Chunk for every block reported by get_blocks(), with the
        offsets and the covered slices of x and y filled in. '''
        i = j = 0
        for unmodified, dx, dy in self.get_blocks():
            yield self.Chunk(unmodified, i, j, dx, dy, self.x[i:i+dx], self.y[j:j+dy])
            i += dx
            j += dy

    @memoized
    def guess_mapping(self):
        ''' Tries to guess how the bytes in x have been mapped to substrings in y by
            applying nasty heuristics. Yields one tuple of y items per guessed
            mapping (an empty tuple for a dropped byte).

            Examples:
            >>> list(MemoryComparator("abcdefghijklm", "mmmcdefgHIJZklm").guess_mapping())
            [('m', 'm'), ('m',), ('c',), ('d',), ('e',), ('f',), ('g',), ('H', 'I'), ('J',),
             ('Z',), ('k',), ('l',), ('m',)]
            >>> list(MemoryComparator("abcdefgcbadefg", "ABBCdefgCBBAdefg").guess_mapping())
            [('A',), ('B', 'B'), ('C',), ('d',), ('e',), ('f',), ('g',), ('C',), ('B', 'B'),
             ('A',), ('d',), ('e',), ('f',), ('g',)]
        '''
        # first pass: for every byte of x found in a small expanded chunk,
        # count the slices of y (one or two items long) it may have become
        mappings_by_byte = defaultdict(lambda: defaultdict(int))
        for c in self.get_chunks():
            dx, dy = c.dx, c.dy
            # heuristics to detect expansions
            if dx < dy and dy - dx <= 3 and dy <= 5:
                for i, b in enumerate(c.xchunk):
                    slices = set()
                    for start in range(i, min(2*i + 1, dy)):
                        for size in range(1, min(dy - start + 1, 3)):
                            slc = tuple(c.ychunk[start:start+size])
                            if slc in slices: continue
                            mappings_by_byte[b][slc] += 1
                            slices.add(slc)

        # candidates of every byte: most frequent first, longer ones first on
        # ties. The key function receives one (value, count) pair.
        for b, values in mappings_by_byte.items():
            mappings_by_byte[b] = sorted(values.items(),
                                     key=lambda item: (-item[1], -len(item[0])))

        # second pass: emit the mappings chunk by chunk
        for c in self.get_chunks():
            dx, dy, xchunk, ychunk = c.dx, c.dy, c.xchunk, c.ychunk
            if dx < dy:  # expansion
                # try to apply heuristics for small chunks
                if dx <= 10:
                    res = []
                    for b in xchunk:
                        if dx == dy or dy >= 2*dx: break
                        for value, count in mappings_by_byte[b]:
                            if tuple(ychunk[:len(value)]) != value: continue
                            res.append(value)
                            ychunk = ychunk[len(value):]
                            dy -= len(value)
                            break
                        else:
                            yield (ychunk[0],)
                            ychunk = ychunk[1:]
                            dy -= 1
                        dx -= 1
                    for mapped in res: yield mapped

                # ... or do it the stupid way. If n bytes were changed to m, simply do
                # as much drops/expansions as necessary at the beginning and than
                # yield the rest of the y chunk as single-byte modifications
                for k in range(dy - dx): yield tuple(ychunk[2*k:2*k+2])
                ychunk = ychunk[2*(dy - dx):]
            elif dx > dy:
                for _ in range(dx - dy): yield ()

            for b in ychunk: yield (b,)

class HexDumpPrinter:
    """Render the two global buffers as a hex dump with differences marked.

    Depending on `options.dont_use_lcs`, the dump is either produced from a
    MemoryComparator mapping (second dump shows only what changed) or as two
    plain hex dumps compared line by line. str(printer) gives the report.
    """

    def __init__(self, options, good_buffer, bad_buffer):
        """Prepare both dumps.

        options -- settings object; `dont_use_lcs`, `match_empty` and
                   `colored` are read here.
        good_buffer, bad_buffer -- not read by this class: the dumps are
                   built from the module-level `buffers` list.
                   NOTE(review): presumably callers pass buffers[0] and
                   buffers[1] here - confirm.
        """
        self.comparator = None
        self.fill_matching = ''
        if not options.dont_use_lcs:
            self.dump1 = []
            self.dump2 = []

            # filler shown in the second dump in place of matching bytes
            if not options.match_empty:
                self.fill_matching = ' ' 
            else:
                self.fill_matching = '0' 
            self.use_comparator()
        else:
            self.dump1 = HexDumpPrinter.hex_dump(buffers[0]).split('\n')
            self.dump2 = HexDumpPrinter.hex_dump(buffers[1]).split('\n')

        self.minlen = min(len(self.dump1), len(self.dump2))

        self.bad_start_diff = bcolors.FAIL
        self.bad_stop_diff = bcolors.ENDC

        self.good_start_diff = bcolors.OKGREEN
        self.good_stop_diff = bcolors.ENDC

        self.address_good = '+'
        self.address_bad = '-'

        if not options.colored:
            # without colors, differing bytes are wrapped in plain markers
            self.good_start_diff = '-'
            self.good_stop_diff = '-'
            self.bad_start_diff = '!'
            self.bad_stop_diff = '!'
        else:
            self.address_good = bcolors.OKGREEN + self.address_good + bcolors.ENDC
            self.address_bad = bcolors.FAIL + self.address_bad + bcolors.ENDC

    def get_comparator(self): return self.comparator
        
    def use_comparator(self):
        """Build both dumps from a MemoryComparator over the global buffers.

        Returns (comparator, broken, mapped_chunks), where `broken` lists
        (index, original, mapped) for every byte whose mapping differs.
        """
        buffer1 = [chr(c) for c in buffers[0]]
        buffer2 = [chr(c) for c in buffers[1]]

        comp = MemoryComparator(buffer1, buffer2)
        self.comparator = comp

        # map() and zip() return one-shot iterators; both results are used
        # more than once below, so they are stored as lists.
        mapped_chunks = list(map(''.join, comp.guess_mapping()))
        mapping = list(zip(buffer1, mapped_chunks))

        self.construct_comparator_dump(mapping)

        broken = [(i,x,y) for i,(x,y) in enumerate(mapping) if x != y]
        return (comp, broken, mapped_chunks)


    @staticmethod
    def hex_dump(data):
        """Return a classic hex dump of `data`, 16 bytes per line.

        Elements of `data` may be ints, one-character strings or lists whose
        first element is one of those. Returns '<empty>' for empty input.
        Raises Exception for an element of any other type.
        """
        def val(x):
            if isinstance(x, int): return x
            elif isinstance(x, str): return ord(x)
            elif isinstance(x, list): return val(x[0])
            raise Exception("Unknown type of single byte.")

        if len(data) == 0:
            return '<empty>'

        lines = []
        for i in range(0, len(data), 16):
            row = [val(data[j]) for j in range(i, min(i + 16, len(data)))]

            line = '%04x | ' % (i)
            line += ''.join('%02x ' % v for v in row)
            # pad the hex column to its full width (offset + 16 bytes)
            line += ' ' * (3 * 16 + 7 - len(line)) + ' | '
            line += ''.join(chr(v) if 0x20 <= v <= 0x7e else '.' for v in row)

            lines.append(line)

        return '\n'.join(lines)

    @staticmethod
    def extract_chunks(iterable):
        """ Yields the items of the :iterable as tuples of up to 16 items """
        fill = object()
        gen = itertools.zip_longest(fillvalue=fill, *([iter(iterable)] * 16))
        return (tuple(x for x in chunk if x is not fill) for chunk in gen)

    def construct_comparator_dump(self, mapping):
        """Fill dump1/dump2 from `mapping`, pairs of (original, mapped).

        dump1 shows the original bytes; dump2 shows, per byte, the filler
        (match), '-1' (dropped), '+1' (expanded) or the new hex value.
        """
        def toprint(x, src):
            c = x
            if len(c) == 0: c = ' '
            elif len(c) == 2: c = x[1]

            if ord(c) >= 0x20 and ord(c) < 0x7f:
                return c
            else: 
                return '.'

        for i, chunk in enumerate(HexDumpPrinter.extract_chunks(mapping)):
            chunk = list(chunk)  # save generator result in a list
            src, mapped = zip(*chunk)
            values = []
            for left, right in zip(src, mapped):
                if   left == right:   values.append('')             # byte matches original
                elif len(right) == 0: values.append('-1')           # byte dropped
                elif len(right) == 2: values.append('+1')           # byte expanded
                else:                 values.append(bin2hex(right)) # byte modified

            line1 = '%04x' % (i * 16) + ' | ' + bin2hex(src).ljust(49, ' ')
            line2 = '%04x' % (i * 16) + ' | ' + ' '.join(sym.ljust(2, self.fill_matching) for sym in values)

            line1 += '| ' + ''.join(map(lambda x: x if ord(x) >= 0x20 and ord(x) < 0x7f else '.', src)).ljust(16, ' ')
            ascii2 = '| '
            for k in range(len(values)): ascii2 += toprint(values[k], src[k])
            for k in range(len(values), 16): ascii2 += ' '
            line2 = line2.ljust(56, ' ')
            line2 += ascii2

            self.dump1.append(line1)
            self.dump2.append(line2)


    @staticmethod
    def wide_line(letter, d1, d2):
        """Join dump lines `d1` and `d2` into one side-by-side line."""
        d1t = d1.split(' | ')
        d2t = d2.split(' | ')

        if options.colored:
            letter = bcolors.HEADER + letter + bcolors.ENDC

        # color codes take no room on screen, so they are not counted
        padding = ' ' * (17 - len(bcolors.strip_colors(d1t[2])))
        return '{}{} | {} | {}{}| {} | {}'.format(letter, d1t[0], d1t[1], d1t[2], padding, d2t[1], d2t[2])

    @staticmethod
    def extract_bytes(line):
        """Return the two-character byte cells of the dump line `line`."""
        linet = line.split(' | ')
        # every cell takes three characters: two digits and a separator
        return [linet[1][i:i+2] for i in range(0, len(linet[1]), 3)]


    @staticmethod
    def reconstruct_line(letter, line, bytes):
        """Rebuild dump line `line` from its (possibly marked) byte cells.

        letter -- marker put in front of the address.
        bytes  -- cells as returned by extract_bytes(); a cell that is not
                  exactly two characters long is treated as a difference.
        """
        bytes_line = ''
        linet = line.split(' | ')
        color_address = False

        diff_indexes = []
        i = 0
        for b in bytes:
            if len(b) != 2:
                # difference
                diff_indexes.append(i)
                color_address = True
                if len(b) == 4:
                    # not colored difference: the opening marker replaces the
                    # separator written after the previous cell
                    l = list(bytes_line)
                    if len(l) > 1:
                        l[-1] = b[0]
                    bytes_line = ''.join(l)
                    bytes_line += b[1:]
                else:
                    # colored difference
                    bytes_line += b + ' '
            else:
                bytes_line += b + ' '
            i += 1

        address = linet[0]
        ascii = linet[2]

        for b in range(len(bytes), 16):
            bytes_line += ' ' * 3
            ascii += ' '

        if options.colored:
            new_ascii = ''
            for j in range(len(ascii)):
                if j in diff_indexes:
                    new_ascii += bcolors.FAIL + ascii[j] + bcolors.ENDC
                else:
                    new_ascii += ascii[j]
            ascii = new_ascii

        if color_address or len(letter) > 1:
            if options.colored:
                address = bcolors.OKBLUE + address + bcolors.ENDC

        return '{}{} | {} | {}'.format(letter, address, bytes_line, ascii.ljust(16))

    def highlight_differences(self, d1, d2):
        """Return dump lines `d1`, `d2` with their differing bytes marked."""
        if d1 != d2:
            d1bytes = HexDumpPrinter.extract_bytes(d1)
            d2bytes = HexDumpPrinter.extract_bytes(d2)
            minlen = min(len(d1bytes), len(d2bytes))

            for i in range(minlen):
                if d1bytes[i] != d2bytes[i]:
                    # in LCS mode a cell made of filler only stands for a match
                    if not options.dont_use_lcs and d2bytes[i] == self.fill_matching * len(d2bytes[i]):
                        continue
                    d1bytes[i] = self.good_start_diff + d1bytes[i] + self.good_stop_diff
                    d2bytes[i] = self.bad_start_diff + d2bytes[i] + self.bad_stop_diff

            d1 = HexDumpPrinter.reconstruct_line(self.address_good, d1, d1bytes)
            d2 = HexDumpPrinter.reconstruct_line(self.address_bad, d2, d2bytes)
        
        return (d1, d2)
    
    def __str__(self):
        """Return the complete comparison report as text."""
        buff = ''

        if not options.wide:
            buff += ' ' * 5 + ' | ' + ' '.join(['%02x' % x for x in range(16)]) + '  |\n'
            buff += ' ' * 5 + ' |' + '-' * 50 + '|\n'
        else:
            buff += ' ' * 5 + ' | ' + ' '.join(['%02x' % x for x in range(16)]) + '  |'
            buff = buff + ' ' * 12 + buff + '\n'
            buff += ' ' * 5 + ' |' + '-' * 50 + '|' + ' ' * 12
            buff += ' ' * 5 + ' |' + '-' * 50 + '|\n'

        for i in range(self.minlen):
            d1 = self.dump1[i]
            d2 = self.dump2[i]
            d2t = d2.split(' | ')

            # identical lines, or a second line whose byte column is blank
            if d1 == d2 or (d2t[1].count(' ') == len(d2t[1])):
                if not options.wide:
                    buff += ' ' + d1
                    if options.match_empty:
                        buff += '\n ' + d2
                else:
                    buff += HexDumpPrinter.wide_line(' ', d1, d2)
            else:
                (d1, d2) = self.highlight_differences(d1, d2)
                if not options.wide:
                    buff += d1 + '\n' + d2
                else:
                    num = 1
                    if options.colored: num = 6
                    buff += HexDumpPrinter.wide_line('>', d1[num:], d2)

            buff += '\n'

        # lines present in the longer dump only; a long tail is abbreviated
        good_longer = len(self.dump1) > len(self.dump2)
        maxlen = max(len(self.dump1), len(self.dump2))
        once = False
        for i in range(self.minlen, maxlen):
            if (self.minlen + 8 < maxlen - 5) and i > self.minlen + 5 and i < maxlen - 5: 
                if not once: 
                    buff += ' ...' + '\n'
                    once = True
                continue
            if good_longer:
                buff += ' ' + self.dump1[i] + '\n'
            else:
                buff += ' ' + self.dump2[i] + '\n'

        return buff

def fetch_file(filename, name, format):
    """Read *filename* and return its contents decoded by ``BytesParser``.

    Args:
        filename: path of the file to read (opened in binary mode).
        name:     label passed to ``BytesParser`` and used in the debug message.
        format:   input format passed to ``BytesParser``.

    Returns:
        Whatever ``BytesParser.get_bytes()`` returns for the parsed buffer.

    If parsing raises ``UnicodeDecodeError`` the file is parsed again as 'raw'.
    The process exits with status 1 when the parser reports ``parsed`` as false.
    """
    out(dbg("Opening file '%s' as %s in format: %s" % (filename, name, format)))

    with open(filename, 'rb') as handle:
        raw = handle.read()

    try:
        parser = BytesParser(raw, name, format)
    except UnicodeDecodeError:
        out(warn("Bytes parsing failed on format %s and file (%s). Fall back to RAW instead." % (format, filename)))
        parser = BytesParser(raw, name, 'raw')

    if parser.parsed:
        return parser.get_bytes()

    sys.exit(1)

def parse_options():
    """Parse the command line into the module-level ``options`` and ``filenames``.

    Builds the list of accepted format names ('raw', every key of
    ``BytesParser.formats_rex`` and every alias listed in
    ``BytesParser.formats_aliases``), parses the arguments with ``optparse``
    and validates them:

    * exactly two positional arguments must be given,
    * both must be existing files,
    * ``--format1`` / ``--format2`` (lower-cased in place) must be known formats.

    Returns:
        True when the arguments are valid.

    On invalid input this function does not return: ``OptionParser.error()``
    prints the usage text plus the message to stderr and exits with status 2.
    """
    global options
    global filenames

    avail_formats = ['raw']
    avail_formats.extend(BytesParser.formats_rex.keys())
    for aliases in BytesParser.formats_aliases.values():
        avail_formats.extend(aliases)

    formats = ', '.join("'" + x + "'" for x in avail_formats)
    usage = "Usage: %prog [options] good_buffer bad_buffer\n\n"
    usage += "Buffers explanation:\n\t- good_buffer\t- file "
    usage += "containing buffer considered to be a model one, "
    usage += "having expected bytes in it.\n\t- bad_buffer\t- "
    usage += "file that has tainted/modified/varying bytes comparing to good_buffer."
    usage += "\n\nAvailable formats:\n\t" + formats

    parser = OptionParser(usage = usage)
    parser.add_option("", "--format1", metavar="FORMAT", dest="format1", default=None, 
                        help="Enforce specific format on first buffer.")
    parser.add_option("", "--format2", metavar="FORMAT", dest="format2", default=None, 
                        help="Enforce specific format on second buffer.")
    parser.add_option("-C", "--nocolors", action="store_false", dest="colored", default=True, 
                        help="Don't apply colors to the comparison output.")
    parser.add_option("-w", "--wide", action="store_true", dest="wide", default=False, 
                        help="Wide mode, display hex dumps next to each other.")
    parser.add_option("-e", "--match-empty", action="store_true", dest="match_empty", default=False, 
                        help="Print matching bytes as empty line from bad_buffer.")
    parser.add_option("-n", "--no-lcs", action="store_true", dest="dont_use_lcs", default=False, 
                        help="Don't use LCS (Longest Common Subsequence) algorithm in hex dump printing. Go with simple comparison.")
    parser.add_option("-f", "--first-bytes-only", action="store_true", dest="first_bytes", default=False, 
                        help="Compare only N first bytes from a bigger file. If good_buffer is smaller than bad_buffer, only first Len(good_buffer) bytes of bad_buffer will be processed.")
    parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False, 
                        help="Debug mode - more verbose.")
    parser.add_option("-q", "--quiet", action="store_true", dest="quiet", default=False, 
                        help="Quiet mode, no infos. Return 1 if not equal, 0 otherwise.")

    (options, args) = parser.parse_args()

    # parser.error() prints the usage itself and terminates the process, so
    # each check needs exactly one call and nothing after it.
    if len(args) != 2:
        parser.error(err("You have to specify two buffer files."))

    if not os.path.isfile(args[0]):
        parser.error(err("First file specified does not exist."))

    if not os.path.isfile(args[1]):
        parser.error(err("Second file specified does not exist."))

    if options.format1: options.format1 = options.format1.lower()
    if options.format2: options.format2 = options.format2.lower()

    # The list of valid formats goes into the same message as the complaint:
    # a second parser.error() call would never be reached.
    if options.format1 and options.format1 not in avail_formats:
        parser.error(err("Format1 that was specified is not recognized. Valid formats: %s" % formats))

    if options.format2 and options.format2 not in avail_formats:
        parser.error(err("Format2 that was specified is not recognized. Valid formats: %s" % formats))

    filenames = args

    return True

def check_if_match():
    """Compare the two module-level ``buffers`` position by position.

    Only the common prefix (the length of the shorter buffer) is compared.

    Returns:
        A ``(diff, bad_chars)`` tuple where ``diff`` is the number of positions
        holding different values and ``bad_chars`` is a ``defaultdict(list)``
        mapping each differing value from ``buffers[0]`` to the list of values
        found in its place in ``buffers[1]``.

        When the buffers differ in length, the special key ``-1`` receives the
        last element of the shorter buffer. Nothing is recorded under ``-1``
        when the shorter buffer is empty (it has no last element).
    """
    good, bad = buffers[0], buffers[1]
    diff = 0
    bad_chars = defaultdict(list)

    # zip() stops at the end of the shorter buffer, i.e. the common prefix.
    for expected, actual in zip(good, bad):
        if expected != actual:
            diff += 1
            bad_chars[expected].append(actual)

    if len(good) != len(bad):
        shorter = bad if len(good) > len(bad) else good
        # Guard against an empty buffer: indexing [-1] would raise IndexError.
        if shorter:
            bad_chars[-1].append(shorter[-1])

    return (diff, bad_chars)

def banner():
    """Write the tool's name, version and tagline to stderr."""
    text = ''.join((
        "\n\t:: BadChars.py (v:%s) - Exploit Development Bad Characters hunting tool." % VERSION,
        "\n\t\tEquipped with Corelan.be Mona's buffers comparison LCS-based algorithm\n\n",
    ))
    sys.stderr.write(text)

def _mapping_char(value):
    """Return the character for byte *value* if it can be shown inline, else '.'."""
    if 0 < value < 256:
        char = chr(value)
        # string.printable also contains tab/newline/etc., which would break
        # the one-line-per-mapping layout, so only a plain space is let through.
        if char in string.printable and (char == ' ' or not char.isspace()):
            return char
    return '.'


def main(argv):
    """Program entry point: load both buffers, compare them and print a report.

    Args:
        argv: accepted for the conventional entry-point signature; it is not
              read here (parse_options() calls ``parser.parse_args()`` without
              arguments).

    Returns:
        0 when the buffers match or after the difference report was printed,
        1 when parse_options() fails, or in quiet mode when the buffers differ.
    """
    banner()
    if not parse_options():
        return 1

    buffers[0].extend(fetch_file(filenames[0], 'good_buffer', options.format1))
    buffers[1].extend(fetch_file(filenames[1], 'bad_buffer', options.format2))

    len0 = len(buffers[0])
    len1 = len(buffers[1])

    if len0 != len1:
        out("\n"+warn("Specified buffer files differ in contents length (%d, %d)!" \
                    % (len0, len1)))

        if options.first_bytes:
            # Trim the longer buffer so only the common length is compared.
            if len0 > len1:
                buffers[0] = buffers[0][:len1]
                out(dbg("Comparing only first %d bytes from good_buffer." % len1))
            else:
                buffers[1] = buffers[1][:len0]
                out(dbg("Comparing only first %d bytes from bad_buffer." % len0))
    else:
        out(ok("Buffers are of same size: %d bytes." % len0))

    res, bad_chars_dict = check_if_match()

    if not res:
        # Quiet mode must not print anything, not even the spacer line.
        if not options.quiet:
            print()
        out(ok("Buffers match. No Bad characters found.\n"))
        return 0

    if options.quiet:
        return 1

    # Key -1 is check_if_match()'s length-mismatch marker, not a byte value.
    bad_chars_flatten = [x for x in bad_chars_dict if x != -1]

    printer = HexDumpPrinter(options, buffers[0], buffers[1])

    minlen = min(len0, len1)
    proc = (float(res)/float(minlen) * 100.0)

    if not options.dont_use_lcs:
        # NOTE(review): `bad_chars` must resolve to a module-level callable
        # defined elsewhere in this file - not visible from here, confirm.
        (_broken, values) = bad_chars(printer.get_comparator())
        bad_chars_flatten = [ord(c) for c in values]
        bad_chars_dict = {}

    chars = ', '.join('0x%02x' % c for c in bad_chars_flatten)
    bad_chars_string = _out("Likely to be bad chars: " + bcolors.HEADER + chars + "\n", bcolors.WARNING)

    if bad_chars_dict:
        bad_chars_string += _out("Found mappings:\n", bcolors.WARNING)

        added = set()
        for a, replacements in bad_chars_dict.items():
            if a == -1:
                continue
            for b in replacements:
                # Report every (expected, found) pair only once.
                if (a, b) not in added:
                    bad_chars_string += "\t0x%02x (%s) => 0x%02x (%s)\n" % (a, _mapping_char(a), b, _mapping_char(b))
                    added.add((a, b))

    if proc != 100.0:
        out(err("\n\tBuffers differ! Found at least %d differences (%d/%d, %0.2f%%) and %d bad chars\n" \
                % (res, res, minlen, proc, len(bad_chars_flatten))))
    else:
        out(err("\n\tBuffers differ entirely.\n"))

    out(str(printer))

    if options.dont_use_lcs:
        if proc < 10.0:
            out(bad_chars_string)
        else:
            out(warn("Too many differences to guess bad chars correctly."))
    else:
        draw_chunk_table(printer.get_comparator())

    print()
    out(warn("You may as well consider changing buffers formats (--format1 / --format2) as bytes could be fetched wrongly."))

    return 0
        
if __name__ == '__main__':
    # Run the tool and hand main()'s return value to the interpreter as the
    # process exit status.
    raise SystemExit(main(sys.argv))