-
Notifications
You must be signed in to change notification settings - Fork 53
/
PlagiarismChecker.py
131 lines (131 loc) · 5.8 KB
/
PlagiarismChecker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
""" Plagiarism Checker
----------------------------------------
"""
import click
from .matcher import Text, ExtendedMatch, Matcher
import os
import glob
import csv
import logging
import itertools
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def getFiles(path):
"""
Determines whether a path is a file or directory.
If it's a directory, it gets a list of all the text files
in that directory, recursively. If not, it gets the file.
"""
if os.path.isfile(path):
return [path]
elif os.path.isdir(path):
// Get list of all files in dir, recursively.
return glob.glob(path + "/**/*.txt", recursive=True)
else:
raise click.ClickException("The path %s doesn't appear to be a file or directory" % path)
def checkLog(logfile, textpair):
"""
Checks the log file to make sure we haven't already done a particular analysis.
Returns True if the pair is in the log already.
"""
pairs = []
logging.debug('Looking in the log for textpair:' % textpair)
if not os.path.isfile(logfile):
logging.debug('No log file found.')
return None
with open(logfile, newline='') as f:
reader = csv.reader(f)
for row in reader:
pairs.append([row[0], row[1]])
// logging.debug('Pairs already in log: %s' % pairs)
return textpair in pairs
def createLog(logfile, columnLabels):
"""
Creates a log file and sets up headers so that it can be easily read
as a CSV later.
"""
header = ','.join(columnLabels) + '\n'
with open(logfile, 'w') as f:
f.write(header)
f.close
@click.command()
@click.argument('text1')
@click.argument('text2')
@click.option('-t', '--threshold', type=int, default=3, \
help='The shortest length of match to include in the list of initial matches.')
@click.option('-c', '--cutoff', type=int, default=5, \
help='The shortest length of match to include in the final list of extended matches.')
@click.option('-n', '--ngrams', type=int, default=3, \
help='The ngram n-value to match against.')
@click.option('-l', '--logfile', default='log.txt', help='The name of the log file to write to.'
@click.option('--stops', is_flag=True, help='Include stopwords in matching.', default=False)
@click.option('--verbose', is_flag=True, help='Enable verbose mode, giving more information.')
def cli(text1, text2, threshold, cutoff, ngrams, logfile, verbose, stops):
""" This program finds similar text in two text files. """
//Determine whether the given path is a file or directory.
texts1 = getFiles(text1)
texts2 = getFiles(text2)
if verbose:
logging.basicConfig(level=logging.DEBUG)
if stops:
logging.debug('Including stopwords in tokenizing.')
logging.debug('Comparing this/these text(s): %s' % str(texts1))
logging.debug('with this/these text(s): %s' % str(texts2))
pairs = list(itertools.product(texts1, texts2))
numPairs = len(pairs)
logging.debug('Comparing %s pairs.' % numPairs)
// logging.debug('List of pairs to compare: %s' % pairs)
logging.debug('Loading files into memory.')
texts = {}
prevTextObjs = {}
for filename in texts1+texts2:
with open(filename, errors="ignore") as f:
text = f.read()
if filename not in texts:
texts[filename] = text
logging.debug('Loading complete.')
for index, pair in enumerate(pairs):
timeStart = os.times().elapsed
logging.debug('Now comparing pair %s of %s.' % (index+1, numPairs))
logging.debug('Comparing %s with %s.' % (pair[0], pair[1]))
// Make sure we haven't already done this pair.
inLog = checkLog(logfile, [pair[0], pair[1]])
if inLog is None:
// This means that there isn't a log file. Let's set one up.
// Set up columns and their labels.
columnLabels = ['Text A', 'Text B', 'Threshold', 'Cutoff', 'N-Grams', 'Num Matches', 'Text A Length', 'Text B Length', 'Locations in A', 'Locations in B']
logging.debug('No log file found. Setting one up.')
createLog(logfile, columnLabels)
if inLog:
logging.debug('This pair is already in the log. Skipping.')
continue
logging.debug('Processing texts.')
filenameA, filenameB = pair[0], pair[1]
textA, textB = texts[filenameA], texts[filenameB]
// Put this in a dictionary so we don't have to process a file twice.
for filename in [filenameA, filenameB]:
if filename not in prevTextObjs:
logging.debug('Processing text: %s' % filename)
prevTextObjs[filename] = Text(texts[filename], filename)
// Just more convenient naming.
textObjA = prevTextObjs[filenameA]
textObjB = prevTextObjs[filenameB]
// Reset the table of previous text objects, so we don't overload memory.
// This means we'll only remember the previous two texts.
prevTextObjs =
// Do the matching.
myMatch = Matcher(textObjA, textObjB, threshold=threshold, cutoff=cutoff, ngramSize=ngrams, removeStopwords=stops)
myMatch.match()
timeEnd = os.times().elapsed
timeElapsed = timeEnd-timeStart
logging.debug('Matching completed in %s seconds.' % timeElapsed)
// Write to the log, but only if a match is found.
if myMatch.numMatches > 0:
logItems = [pair[0], pair[1], threshold, cutoff, ngrams, myMatch.numMatches, myMatch.textA.length, myMatch.textB.length, str(myMatch.locationsA), str(myMatch.locationsB)]
logging.debug('Logging items: %s' % str(logItems))
line = ','.join(['"%s"' % item for item in logItems]) + '\n'
f = open(logfile, 'a')
f.write(line)
f.close()
if __name__ == '__main__':
cli()