forked from Codeshastra-7/Lost-n-Found
-
Notifications
You must be signed in to change notification settings - Fork 0
/
NLTK similarity.py
35 lines (29 loc) · 935 Bytes
/
NLTK similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# X = input("Enter first string: ").lower()
# Y = input("Enter second string: ").lower()
X ="red wallet"
Y ="maroon purse"
# tokenization
X_list = word_tokenize(X)
Y_list = word_tokenize(Y)
# sw contains the list of stopwords
sw = stopwords.words('english')
l1 =[];l2 =[]
# remove stop words from the string
X_set = {w for w in X_list if not w in sw}
Y_set = {w for w in Y_list if not w in sw}
# form a set containing keywords of both strings
rvector = X_set.union(Y_set)
for w in rvector:
if w in X_set: l1.append(1) # create a vector
else: l1.append(0)
if w in Y_set: l2.append(1)
else: l2.append(0)
c = 0
# cosine formula
for i in range(len(rvector)):
c+= l1[i]*l2[i]
cosine = c / float((sum(l1)*sum(l2))**0.5)
print("similarity: ", cosine)