-
Notifications
You must be signed in to change notification settings - Fork 0
/
plagiarism_detector.py
134 lines (110 loc) · 4.25 KB
/
plagiarism_detector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
This module defines a plagiarism detector in Python.
"""
class PlagiarismDetector(object):
"""
A class which defines the plagiarism detector
"""
def __init__(self, N=3):
"""
Initializes the plagiarism detector.
@param N the size of N-tuple. default value = 3
"""
self.N = N
def get_similiarity_percentage_str(self, file1_words, file2_words):
"""
Returns the similiarity score (see get_similiarity_score) as a
percentage string.
@param file1_words word list from file1 with synonyms replaced
@param file2_words word list from file2 with synonyms replaced
@returns the similiarity score percentage as a string.
"""
similiarity_score = self.get_similiarity_score(file1_words,
file2_words)
return str(int(round(100.0 * similiarity_score, 3))) + "%"
def get_similiarity_score(self, file1_words, file2_words):
"""
Returns the similiarity score for the given file words by
calculating the number of common tuples divided by the total
tuples in file1.
@param file1_words word list from file1 with synonyms replaced
@param file2_words word list from file2 with synonyms replaced
@returns number of common N-tuples/number of tuples in
file1_words
"""
file1_tuples_container = NTupleContainer(file1_words, self.N)
file2_tuples_contianer = NTupleContainer(file2_words, self.N)
similar_tuple_count = 0.0
total_tuple_count = float(len(file1_tuples_container))
# This for loop takes O(n) time. The __contains__ method
# defined in the NTupleContainer class words in O(1) so the
# line `t in # file2_tuples_contianer` should run in O(1).
for n_tuple in file1_tuples_container:
if n_tuple in file2_tuples_contianer:
similar_tuple_count += 1
similarity_score = similar_tuple_count/total_tuple_count
return similarity_score
class NTupleContainer(object):
"""
A custom ADT which holds the N-tuples found in a word list.
This is used by the PlagiarismDetector class to efficiently
calculate the similarity score
"""
def __init__(self, word_list, N):
"""
initializes the container by generating all N-sized tuples from
the word_list.
"""
self.tuple_dict = {}
self.populate_tuple_dict(word_list, N)
def populate_tuple_dict(self, word_list, N):
"""
Populates the tuple dict by finding all N tuples in word_list.
"""
if N > len(word_list):
raise NValueTooLarge("N is larger than the length of word_list.")
if N <= 0:
raise NValueTooSmall("Value of N cannot be <= 0")
self.tuple_dict = {}
sliding_window = []
for word in word_list:
sliding_window.append(word)
if len(sliding_window) == N:
self.tuple_dict[tuple(sliding_window)] = True
# remove the last element so we can slide to the next
# element when we append later
sliding_window.pop(0)
def __contains__(self, item):
"""
Allows the use of the "in" keyword with this container.
@return True if the container contains the given tuple in O(1)
otherwise returns False
"""
return item in self.tuple_dict
def __len__(self):
"""
Allows the len() function to work on this container.
@return the number of tuples held in the container.
"""
return len(self.tuple_dict)
def __iter__(self):
"""
Allows the container to be iterable in python.
@returns an iterator over the tuples in the container
"""
return iter(self.tuple_dict.keys())
def __str__(self):
"""
@return the elements contained in the container as a string.
"""
return str(self.tuple_dict.keys())
class NValueTooLarge(Exception):
"""
A custom exception which is raised when the value of N is too large.
"""
pass
class NValueTooSmall(Exception):
"""
A custom exception which is raised when the value of N <= 0.
"""
pass