-
Notifications
You must be signed in to change notification settings - Fork 0
/
htmlDiff.py
458 lines (403 loc) · 15.6 KB
/
htmlDiff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
# DIFF v1.5
from lxml import etree
from lxml.html.soupparser import parse, fromstring
import xml.etree.ElementTree as ET
import difflib
from BeautifulSoup import BeautifulSoup
import os
from difflib import SequenceMatcher
"""In order for our algorithm to work, we must make the elment names unique
lxml names every child according to its type image, script, body, etc
We have a uniques path to get to nodes in the tree """
def makeChildNamesUnique(node, key, isChangeTree, originalFileHash):
children = node.getchildren()
uniqueSet = {}
checkedKeys = [] #Used for performance optimazation
if isChangeTree: #Check to see if this is the original or modified
if "/!!/" in key:
finish = key.rfind("/!/")
path = key[:finish] + '/!!/' + key[finish + 3:] + '/!/'
else:
path = key
matchingSubset = slicedict(originalFileHash, path) #Find possible matches in other tree
#Go through each child node and rename it based on
#1. If this is the orginal tree? or
#2. Have we seen this name before
for child in children:
if isChangeTree:
maxTag = None
maxScore = 0
didFindSimilarMatch = False
elementString1 = removeExtraChars(
etree.tostring(child))
for originalFileKey in matchingSubset: # Cycle through all new keys
if originalFileKey in checkedKeys:
continue
elementString2 = removeExtraChars(
etree.tostring(originalFileHash[originalFileKey]))
temp = similar(elementString1, elementString2)
try:
if temp > maxScore and child.tag in originalFileKey:
maxScore = temp
finish = originalFileKey.rfind("/!/")
maxTag = originalFileKey[finish + 3:]
didFindSimilarMatch = True
if maxScore == 1: #Perfect match
checkedKeys.append(originalFileKey)
break
except TypeError: #Escape Comments
continue
#Do the actual renaming based on the match above
try:
if didFindSimilarMatch and child.tag in maxTag:
if maxTag in uniqueSet:
if maxScore > uniqueSet[maxTag]:
for tempChild in children:
if tempChild.tag == maxTag:
tempTag = generateTag(
tempChild, child.tag, uniqueSet, False)
tempChild.tag = tempTag
uniqueSet[tempTag] = 0
break
child.tag = maxTag
uniqueSet[maxTag] = maxScore
else:
tempTag = generateTag(
child, child.tag, uniqueSet, True)
continue
else:
child.tag = maxTag
uniqueSet[maxTag] = maxScore
else:
tempTag = generateTag(child, child.tag, uniqueSet, True)
except TypeError:
pass
else: #We are utilizing the original file, just generate new Tag
generateTag(child, child.tag, uniqueSet, True)
return node
""" Generates new tags """
def generateTag(child, tag, mySet, doRename):
if tag in mySet: # Make every child tag unique
i = 1
while True:
tempkey = '%s_%d' % (tag, i)
if tempkey in mySet: # Have we seen this tag already
i = i + 1
continue
else: # No,so add it to the set
if doRename:
try:
child.tag = tempkey
mySet[tempkey] = 0
break
except AttributeError: # Catches immutable nodes
return tag
else:
return tempkey
else: # Its not in the set so add it
if doRename:
mySet[tag] = 0
return tag
""" Recursive function used to create a hash map of all the leaf
nodes in an html file"""
def hashNodes(tree, data, isleaf, isChangeTree, originalFileHash):
for node in tree:
if node.getchildren(): # if true, not a leaf
key = hashSingleNode(node, data, isleaf, 0)
node = makeChildNamesUnique(
node, key, isChangeTree, originalFileHash)
# Recursive call to go deeper into the tree
hashNodes(node, data, isleaf, isChangeTree, originalFileHash)
else: # This node must be a leaf node
hashSingleNode(node, data, isleaf, 1)
def hashSingleNode(node, data, isleaf, mark):
key = node.tag
if node.getparent():
parent_node = node.getparent()
tempKey = parent_node.tag
while parent_node.getparent():
parent_node = parent_node.getparent()
tempKey = parent_node.tag + '/!!/' + tempKey
key = '%s/!/%s' % (tempKey, key)
if key in data: # Handles duplicate paths like 2 scripts inside a body tag, (Shouldnt be called!)
i = 1
while True:
tempkey = '%s_%d' % (key, i)
if tempkey in data:
i = i + 1
continue
else:
key = tempkey
data[key] = node # Put element in hashmap
if mark == 1:
isleaf[key] = True # This tag belongs to a leaf node
else:
isleaf[key] = False # This tag belongs to a parent node
break
else:
data[key] = node
if mark == 1:
isleaf[key] = True
else:
isleaf[key] = False
return key
""" Function used to store any changes inside a dictionary"""
def reportChange(
outputDict,
afterText,
rawChange,
elementType,
listOfStringOperations,
fullText,
otherInfo=""):
output = {}
for op in listOfStringOperations:
output['afterText'] = afterText
output['rawChange'] = rawChange
output['elementType'] = elementType
output['op'] = op
output['fullText'] = fullText
output['otherInfo'] = otherInfo
outputDict[getDiffOutputNumber(outputDict)] = output
""" Online Function I found to find changes between two strings and output it with tags"""
def show_diff(seqm):
"""Unify operations between two compared strings
seqm is a difflib.SequenceMatcher instance whose a & b are strings"""
output = []
for opcode, a0, a1, b0, b1 in seqm.get_opcodes():
if opcode == 'equal':
output.append(seqm.a[a0:a1])
elif opcode == 'insert':
output.append("<ins>" + seqm.b[b0:b1] + "</ins>")
elif opcode == 'delete':
output.append("<del>" + seqm.a[a0:a1] + "</del>")
elif opcode == 'replace':
output.append("<replace>" + seqm.a[a0:a1] + "</replace>")
#raise NotImplementedError, "what to do with 'replace' opcode?"
else:
raise RuntimeError("unexpected opcode")
return ''.join(output)
""" Function to elegantly report any text mismatches"""
def reportStringChange(outputDict, stringA, stringB, tag, otherInfo=""):
if stringA is None:
stringA = ""
if stringB is None:
stringB = ""
textChange = show_diff(difflib.SequenceMatcher(None, stringA, stringB))
soup = BeautifulSoup(textChange)
insertTags = soup.findAll('ins')
deleteTags = soup.findAll('del')
replaceTags = soup.findAll('replace')
for i in insertTags:
reportChange(
outputDict,
stringB,
i.text,
tag,
['ins'],
textChange,
otherInfo)
for d in deleteTags:
reportChange(
outputDict,
stringB,
d.text,
tag,
['del'],
textChange,
otherInfo)
for r in replaceTags:
reportChange(
outputDict,
stringB,
r.text,
tag,
['replace'],
textChange,
otherInfo)
""" Main diff Function used to diff two elements"""
def isSameNode(a, b, isLeaf, outputDict):
output = {}
for attrib in a.attrib: # Compare values assigned to attributes
if a.get(attrib) != b.get(attrib):
if b.get(attrib) is not None:
reportStringChange(
outputDict,
a.get(attrib),
b.get(attrib),
a.tag,
"ATTRIBUTE VALUE CHANGE")
else:
output['afterText'] = ""
output['rawChange'] = a.get(attrib)
output['elementType'] = a.tag
output['op'] = ""
output['fullText'] = ""
output['otherInfo'] = "DELETED ATTRIBUTE"
outputDict[getDiffOutputNumber(outputDict)] = output
for attrib in b.attrib:
if a.get(attrib) is None:
output['afterText'] = b.get(attrib)
output['rawChange'] = ""
output['elementType'] = a.tag
output['op'] = ""
output['fullText'] = ""
output['otherInfo'] = "ADDED ATTRIBUTE"
outputDict[getDiffOutputNumber(outputDict)] = output
if isLeaf and a.text != b.text: # Compare text inside element
reportStringChange(outputDict, a.text, b.text, a.tag, "TEXT CHANGE")
# if tagsame==True:
# if a.tag != b.tag:
# return False
# if a.prefix != b.prefix:
# return False
# if a.tail != b.tail:
# return False
# if a.values()!=b.values(): #redundant to the attrib matching
# return False
# if sorted(a.keys()) != sorted(b.keys()): #may also be redundant to the attrib matching, #See if any attributes were added/removed
# str1 = ''.join(sorted(a.keys()))
# str2 = ''.join(sorted(b.keys()))
# reportStringChange(str1, str2, a.tag, "ATTRIBUTE CHANGE")
def storeDict(path, outputdict):
if not os.path.exists(path):
os.makedirs(path)
for key in outputdict:
for key2 in outputdict[key]:
try:
# print path+key+key2;
f = open(path + key + key2 + '.json', 'a')
json.dump(outputdict[key][key2], f)
f.close()
except Exception:
os.remove(path + key + key2 + '.json')
return
""" Function to detect similartiy between two strings """
def similar(a, b):
return SequenceMatcher(None, a, b).ratio()
""" Returns a portion of a dictionary matching a prefix, s """
def slicedict(d, s):
return {k: v for k, v in d.iteritems() if k.startswith(s)}
""" Remove un-needed chars from a string """
def removeExtraChars(string1):
return string1.replace(" ", "")
def findSimilarNodes(originalFileHash, newFileHash):
originalSet = set(originalFileHash)
newSet = set(newFileHash)
checkedKeys = []
for originalFileKey, originalFileValue in originalFileHash.iteritems(
): # Cycle through all original keys
myMax = 0
maxNode = None
maxKey = None
didFindSimilarMatch = False
if "li_1" in originalFileKey:
g = originalFileHash[originalFileKey]
myMax = 0
if "/!!/" in originalFileKey:
start = originalFileKey.rfind("/!!/")
finish = originalFileKey.rfind("/!/")
path = originalFileKey[:finish + 4]
else:
path = originalFileKey
if "!!/body/" in path or "!!/head/!" in path:
matchingSubset = slicedict(newFileHash, path)
else:
print "skipped path -- %s", path
continue
elementString1 = removeExtraChars(
etree.tostring(originalFileHash[originalFileKey]))
for newFileKey in matchingSubset: # Cycle through all new keys
if newFileKey in checkedKeys:
continue
elementString2 = removeExtraChars(
etree.tostring(newFileHash[newFileKey]))
temp = similar(elementString1, elementString2)
if temp > myMax:
myMax = temp
maxNode = newFileHash[newFileKey]
maxKey = newFileKey
didFindSimilarMatch = True
if myMax == 1:
checkedKeys.append(originalFileKey)
break
if didFindSimilarMatch:
if originalFileKey == maxKey:
continue
elif originalFileKey in newFileHash:
tempNode = newFileHash[originalFileKey]
newFileHash[originalFileKey] = maxNode
newFileHash[maxKey] = tempNode
else:
newFileHash[originalFileKey] = maxNode
return checkedKeys
""" Reports the number of changes found """
def getDiffOutputNumber(diffHash):
return len(diffHash)
""" Main differ function """
def htmldiff(path1, path2):
tree1 = parse(path1).getroot()
tree2 = parse(path2).getroot()
elementsA_hash = {}
elementsB_hash = {}
isLeafNodeA = {}
isLeafNodeB = {}
outputDict = {}
numberOfChanges = 0
hashNodes(tree1, elementsA_hash, isLeafNodeA, False, None)
hashNodes(tree2, elementsB_hash, isLeafNodeB, True, elementsA_hash)
sameKeys = findSimilarNodes(elementsA_hash, elementsB_hash)
for key, value in elementsA_hash.iteritems():
output = {}
try:
if key in sameKeys:
continue
isSameNode(
elementsA_hash[key],
elementsB_hash[key],
isLeafNodeA[key],
outputDict)
except KeyError as e:
node = elementsA_hash[key]
# print 'I got a KeyError - reason "%s"' % str(e)
output['afterText'] = ""
output['afterAttribute'] = ""
output['rawTextChange'] = node.text
output['rawAttributeChange'] = node.attrib
output['elementType'] = node.tag
output['op'] = ""
output['fullText'] = ""
output['otherInfo'] = "DELETED NODE"
outputDict[getDiffOutputNumber(outputDict)] = output
for key, value in elementsB_hash.iteritems():
output = {}
try:
# Check to see if this node exist in the original file
tempNode = elementsA_hash[key]
except KeyError as e:
node = elementsB_hash[key]
# print 'I got a KeyError - reason "%s"' % str(e)
output['afterText'] = node.text
output['afterAttribute'] = node.attrib
output['rawTextChange'] = node.text
output['rawAttributeChange'] = node.attrib
output['elementType'] = node.tag
output['op'] = ""
output['fullText'] = ""
output['otherInfo'] = "ADDED NODE"
outputDict[getDiffOutputNumber(outputDict)] = output
print getDiffOutputNumber(outputDict)
return outputDict
if __name__ == "__main__":
import sys
from pprint import *
if len(sys.argv) == 3:
f1 = sys.argv[1]
f2 = sys.argv[2]
diff = htmldiff(f1, f2)
pprint(diff)
elif len(sys.argv) == 2:
f1 = sys.argv[1] + '/before.html'
f2 = sys.argv[1] + '/after.html'
diff = htmldiff(f1, f2)
pprint(diff)