forked from chrismattmann/tika-similarity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
value-similarity.py
executable file
·202 lines (154 loc) · 5.76 KB
/
value-similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env python2.7
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
import tika
from tika import parser
from pprint import pprint
import os
import sys
import getopt
import json
import operator
# Module-wide verbosity switch: verboseLog() only emits output while this is
# true, and main() sets it to True when -v/--verbose is passed.
_verbose = False
# Usage text; main() raises it wrapped in _Usage for -h/--help or when no
# options are supplied, and it is then reported on standard error.
_helpMessage = '''
Usage: similarity [-v] [-f directory] [-c file1 file2]
Options:
-v, --verbose
Work verbosely rather than silently.
-f, --directory [path to directory]
read files from this directory recursively
-c, --file [file1 file2]
compare similarity of given files
--accept [jpeg pdf etc...]
Optional: compute similarity only on specified IANA MIME Type(s)
-h --help
show help on the screen
'''
def verboseLog(message):
    """Write *message* plus a newline to standard error in verbose mode.

    Nothing is written unless the module-level ``_verbose`` flag is true.

    The message is formatted with ``%s`` so non-string objects are accepted,
    and ``sys.stderr.write`` is used instead of the ``print >>`` chevron form
    because the latter is Python 2-only syntax (under Python 3 it evaluates
    ``print >> sys.stderr`` and raises ``TypeError``).
    """
    if _verbose:
        sys.stderr.write("%s\n" % (message,))
class _Usage(Exception):
    """Signals a command-line usage problem.

    The text (or underlying error object) to show the user is kept on the
    ``msg`` attribute, which is what main() prints when it catches this.
    """

    def __init__(self, msg):
        # Explicit base-class initialisation; the exception's ``args`` end up
        # as ``(msg,)`` exactly as they did before.
        Exception.__init__(self, msg)
        self.msg = msg
def _option_tail(argv, flags, value, args):
    """Return every command-line token that follows the first flag found.

    *flags* are tried in order against the raw *argv*.  When none of them
    appears literally (for example ``--file=a.pdf`` or an abbreviated long
    option), fall back to the value getopt parsed plus the remaining
    positional *args* instead of raising ValueError from ``list.index``.
    """
    for flag in flags:
        if flag in argv:
            return argv[argv.index(flag) + 1:]
    return [value] + list(args)


def _walk_visible_files(directory):
    """Recursively list files under *directory*, skipping dot-files/dot-dirs."""
    found = []
    for root, dirnames, files in os.walk(directory):
        # Prune in place so os.walk does not descend into hidden directories.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        found.extend(os.path.join(root, name)
                     for name in files if not name.startswith('.'))
    return found


def _mime_subtype(metadata):
    """Return the subtype part of the metadata's Content-Type, or None.

    Handles a missing entry and a list-valued entry (first element is used),
    and drops any ``; parameter`` suffix so that e.g.
    ``text/plain; charset=UTF-8`` yields ``plain``.
    """
    content_type = metadata.get("Content-Type")
    if isinstance(content_type, list):
        content_type = content_type[0] if content_type else None
    if not content_type:
        return None
    return content_type.split(";")[0].split("/")[-1].strip()


def main(argv = None):
    """Score files by how much of the pooled Tika metadata each one carries.

    Command-line options (see ``_helpMessage``): ``-f/--directory`` walks a
    directory recursively, ``-c/--file`` names the files to compare,
    ``-a/--accept`` restricts the comparison to the given MIME subtypes,
    ``-v/--verbose`` sets the module-level ``_verbose`` flag and
    ``-h/--help`` shows the usage text.

    Every accepted file is parsed once with ``parser.from_file``; each
    ``"key: value"`` metadata pair is a feature.  A file's resemblance score
    is the number of its features divided by the size of the union of all
    features (0.0 when no features exist at all).  The scores are written,
    highest first, to ``similarity-scores.txt`` in the current directory.

    Args:
        argv: full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        ``2`` after reporting a usage error on standard error, otherwise
        ``None``.
    """
    global _verbose
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(
                argv[1:], 'hvf:c:a:',
                ['help', 'verbose', 'directory=', 'file=', 'accept='])
        except getopt.error as msg:
            raise _Usage(msg)

        if not opts:
            raise _Usage(_helpMessage)

        filenames = []
        filename_list = []
        allowed_mime_types = []
        directory_flag = False

        for option, value in opts:
            if option in ('-h', '--help'):
                raise _Usage(_helpMessage)
            elif option in ('-c', '--file'):
                # getopt only binds the first file to the option, so take
                # everything after the flag from the raw command line.
                filenames = _option_tail(argv, ('-c', '--file'), value, args)
            elif option in ('-f', '--directory'):
                directory_flag = True
                filename_list.extend(_walk_visible_files(value))
            elif option in ('-a', '--accept'):
                # A real tuple is required here: ``option in ('--accept')``
                # is a substring test against a plain string.
                allowed_mime_types = _option_tail(
                    argv, ('--accept', '-a'), value, args)
            elif option in ('-v', '--verbose'):
                _verbose = True

        # Explicitly named files are only used when no directory was given;
        # tokens that are not existing files (e.g. other options) are dropped.
        if not directory_flag:
            for name in filenames:
                name = name.strip().strip('\'\n')
                if os.path.isfile(name):
                    filename_list.append(name)

        if len(filename_list) < 2:
            raise _Usage("you need to type in at least two valid files")

        allowed = set(allowed_mime_types)
        if not allowed:
            print("Accepting all MIME Types.....")

        union_feature_names = set()
        file_parsed_data = {}
        resemblance_scores = {}
        file_metadata = {}

        for filename in filename_list:
            # Parse each file exactly once; the result serves both the MIME
            # filter and the feature extraction.
            parsed = parser.from_file(filename)
            metadata = parsed.get("metadata") if parsed else None
            if metadata is None:
                continue
            if allowed and _mime_subtype(metadata) not in allowed:
                continue

            features = set()
            for key, value in metadata.items():
                if isinstance(value, list):
                    value = ", ".join(value)
                features.add(u"%s: %s" % (key.strip(' '), value.strip(' ')))

            # Commas are removed from the name because it is written into a
            # comma-separated line below.
            filename_stripped = filename.replace(",", "")
            file_metadata[filename_stripped] = metadata
            file_parsed_data[filename_stripped] = features
            union_feature_names |= features

        total_num_features = len(union_feature_names)

        # Every file's features are a subset of the union, so the overlap
        # with the union is simply the file's own feature count.
        for filename, features in file_parsed_data.items():
            if total_num_features:
                score = float(len(features)) / total_num_features
            else:
                score = 0.0  # no metadata anywhere; avoid dividing by zero
            resemblance_scores[filename] = score

        sorted_resemblance_scores = sorted(
            resemblance_scores.items(),
            key=operator.itemgetter(1), reverse=True)

        with open("similarity-scores.txt", "w") as f:
            f.write("Resemblance : \n")
            for filename, score in sorted_resemblance_scores:
                f.write(os.path.basename(filename.rstrip(os.sep)) + ","
                        + str(score) + "," + filename + ","
                        + convertUnicode(file_metadata[filename]) + '\n')

    except _Usage as err:
        sys.stderr.write(
            os.path.basename(sys.argv[0]) + ': ' + str(err.msg) + '\n')
        return 2
def convertUnicode( fileDict ) :
    """Return ``str()`` of a copy of *fileDict* with text keys/values cleaned.

    Text keys and values have surrounding whitespace stripped; on Python 2,
    ``unicode`` objects are additionally encoded to UTF-8 byte strings so
    the result carries no ``u'...'`` prefixes.  Anything that is not text
    (numbers, lists, ...) is copied unchanged.

    Each value is read together with its *original* key.  Looking it up
    again after the key has been encoded/stripped would miss whenever that
    transformation changes the key, silently turning the value into None.

    Args:
        fileDict: mapping of metadata names to values.

    Returns:
        The string representation of the cleaned dictionary.
    """
    try:
        text_type = unicode  # Python 2: text must be encoded to bytes
        needs_encoding = True
    except NameError:
        text_type = str      # Python 3: str is already text
        needs_encoding = False

    def to_native(item):
        if not isinstance(item, text_type):
            return item
        if needs_encoding:
            item = item.encode('utf-8')
        return item.strip()

    fileUTFDict = {}
    for key, value in fileDict.items():
        fileUTFDict[to_native(key)] = to_native(value)
    return str(fileUTFDict)
# Script entry point: run main() and hand its return value to sys.exit() as
# the process exit status (main() returns 2 after reporting a usage error).
if __name__ == "__main__":
    sys.exit(main())