forked from sergeio/text_clustering
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Sergei Orlov
committed
May 5, 2013
1 parent
b704af4
commit fee7bc1
Showing
4 changed files
with
263 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import random | ||
|
||
from similarity import similarity | ||
|
||
|
||
class KMeans(object):
    """K-Means clustering. Uses cosine similarity as the distance function.

    `similarity` returns larger values for more alike vectors, so every
    vector is assigned to the center that maximises it.

    Attributes:
        centers: the current cluster centers, one vector per cluster.
        clusters: one list per center; `clusters[i]` holds the vectors that
            are currently closest to `centers[i]`.
        vectors: the input vectors being clustered.
    """

    def __init__(self, k, vectors):
        """Pick `k` distinct input vectors at random as the initial centers.

        Raises:
            ValueError: if fewer than `k` vectors were supplied.
        """
        # Compare the *number* of vectors with k; comparing the list itself
        # with an int is meaningless (always true on Python 2, TypeError on 3).
        if len(vectors) < k:
            raise ValueError(
                'need at least %d vectors to form %d clusters, got %d'
                % (k, k, len(vectors)))
        self.centers = random.sample(vectors, k)
        self.clusters = [[] for _ in self.centers]
        self.vectors = vectors

    def update_clusters(self):
        """Determine which cluster center each of `self.vectors` is closest to.

        Rebuilds `self.clusters` from scratch.
        """
        def closest_center_index(vector):
            """Get the index of the center most similar to `vector`."""
            # `max` returns the first maximal index, so ties go to the
            # earliest center.
            return max(
                range(len(self.centers)),
                key=lambda index: similarity(self.centers[index], vector))

        self.clusters = [[] for _ in self.centers]
        for vector in self.vectors:
            self.clusters[closest_center_index(vector)].append(vector)

    def update_centers(self):
        """Move `self.centers` to the centers of `self.clusters`.

        Return True if centers moved, else False.
        """
        new_centers = []
        for old_center, cluster in zip(self.centers, self.clusters):
            if cluster:
                # Component-wise mean of all vectors in the cluster.
                center = [average(ci) for ci in zip(*cluster)]
            else:
                # An empty cluster has no mean; keep its previous center
                # instead of collapsing it to a zero-length vector.
                center = old_center
            new_centers.append(center)

        if new_centers == self.centers:
            return False

        self.centers = new_centers
        return True

    def main_loop(self):
        """Perform k-means clustering until the centers stop moving."""
        self.update_clusters()
        while self.update_centers():
            self.update_clusters()
|
||
|
||
def average(sequence):
    """Return the arithmetic mean of the numbers in `sequence` as a float.

    `sequence` must support `len()`. An empty sequence raises
    ZeroDivisionError.
    """
    # Force float division so integer input is not floor-divided on Python 2.
    return float(sum(sequence)) / len(sequence)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from math import sqrt | ||
|
||
|
||
def dot_product(v1, v2):
    """Return the dot product of two vectors.

    For A = [a1, a2, a3] and B = [b1, b2, b3] the result is
    (a1 * b1) + (a2 * b2) + (a3 * b3).

    Input vectors must be the same length; `zip` pairs elements only up to
    the end of the shorter vector.
    """
    total = 0
    for a, b in zip(v1, v2):
        total += a * b
    return total
|
||
|
||
def magnitude(vector):
    """Return the Euclidean length of `vector`.

    Computed as the square root of the vector's dot product with itself.
    """
    squared_length = dot_product(vector, vector)
    return sqrt(squared_length)
|
||
|
||
def similarity(v1, v2):
    """Cosine similarity of two vectors.

    This is the dot product divided by the product of the two magnitudes.
    """
    numerator = dot_product(v1, v2)
    # The tiny constant keeps the denominator non-zero when either vector has
    # zero magnitude.
    denominator = magnitude(v1) * magnitude(v2) + .00000000001
    return numerator / denominator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from string import split | ||
from unittest import TestCase, main | ||
|
||
from vectorizer import ( | ||
cluster_paragraphs, | ||
make_word_set, | ||
make_word_vectors, | ||
) | ||
|
||
|
||
class WhenCreatingAVectorSpace(TestCase):
    """`make_word_set` must contain every word of every word list."""

    def setUp(self):
        texts = [
            'this is some text',
            'unique text',
            'and even way more text',
        ]

        # Use str.split in a list comprehension rather than mapping the
        # deprecated `string.split` function: this yields a real list that
        # can safely be iterated again in the test below.
        self.word_vectors = [text.split() for text in texts]

        self.returned = make_word_set(self.word_vectors)

    def test_foo(self):
        for vector in self.word_vectors:
            for word in vector:
                # assertIn reports the missing word on failure.
                self.assertIn(word, self.returned)
|
||
class WhenCalculatingWordVectors(TestCase):
    """`make_word_vectors` must turn word lists into frequency vectors."""

    def setUp(self):
        self.word_vectors = [
            ['some', 'text'],
            ['unique', 'text'],
            ['even', 'way', 'more', 'text'],
        ]
        self.vector_space = make_word_set(self.word_vectors)

        self.returned = make_word_vectors(self.vector_space, self.word_vectors)

    def test_return_value(self):
        # Expected frequency of each word within its own word list.
        expected_frequencies = [
            {'some': 0.5, 'text': 0.5},
            {'unique': 0.5, 'text': 0.5},
            {'even': 0.25, 'way': 0.25, 'more': 0.25, 'text': 0.25},
        ]
        # The column order of the vectors follows the iteration order of the
        # word set, which is not fixed; derive the expectation from that same
        # set instead of hard-coding one particular ordering.
        expected_return = [
            [frequencies.get(word, 0) for word in self.vector_space]
            for frequencies in expected_frequencies
        ]
        self.assertEqual(list(self.returned), expected_return)
|
||
class WhenClusteringParagraphs(TestCase):
    """Similar paragraphs must share a cluster; unrelated ones must not."""

    def setUp(self):
        self.text1 = 'A study on the effectiveness of milk and micronutrients.'
        self.text2 = 'A study on the effectiveness of milk.'
        self.text3 = 'Something completely unrelated'

        self.returned = cluster_paragraphs(
            [self.text1, self.text2, self.text3])

    def test_cluster_correctness(self):
        # Membership checks rather than a full equality check, so the order
        # of the clusters does not matter. assertIn prints the actual
        # clusters when the expectation is not met.
        self.assertIn([self.text1, self.text2], self.returned)
        self.assertIn([self.text3], self.returned)
|
||
|
||
# Run the unittest runner when this file is executed directly as a script.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
from collections import defaultdict | ||
import re | ||
|
||
from similarity import similarity | ||
from k_means import KMeans | ||
|
||
|
||
def word_frequencies(word_vector):
    """Compute the relative frequency of every word in `word_vector`.

    Returns a plain dictionary mapping each distinct word to the fraction of
    entries in `word_vector` that are that word. An empty input gives an
    empty dictionary.
    """
    total = len(word_vector)
    frequencies = {}
    for word in word_vector:
        # Each occurrence contributes an equal share of the total.
        frequencies[word] = frequencies.get(word, 0.0) + 1.0 / total
    return frequencies
|
||
|
||
def compare_vectors(word_vector1, word_vector2):
    """Numerical similarity between two lists of words. Higher is better.

    Both lists are turned into word-frequency vectors over their combined
    vocabulary and compared with cosine similarity.

    Result range: 0 (no words in common) to roughly 1 (the same words in the
    same proportions).
    """
    vocabulary = list(set(word_vector1) | set(word_vector2))

    frequency_vectors = []
    for words in (word_vector1, word_vector2):
        frequencies = word_frequencies(words)
        # Words absent from this list get a frequency of 0.
        frequency_vectors.append(
            [frequencies.get(word, 0) for word in vocabulary])

    return similarity(frequency_vectors[0], frequency_vectors[1])
|
||
|
||
# Punctuation characters that are stripped from text before it is split.
_PUNCTUATION_RE = re.compile(r'[,.?";:\-!@#$%^&*()]')

# Very common English words that carry little meaning for clustering.
# source: http://www.bckelk.ukfsn.org/words/uk1000n.html
# All entries are lower-case because the text is lower-cased before filtering.
_COMMON_WORDS = frozenset([
    'the', 'and', 'to', 'of', 'a', 'i', 'in',
    'was', 'he', 'that', 'it', 'his', 'her', 'you', 'as',
    'had', 'with', 'for', 'she', 'not', 'at', 'but', 'be',
    'my', 'on', 'have', 'him', 'is', 'said', 'me', 'which',
    'by', 'so', 'this', 'all', 'from', 'they', 'no', 'were',
    'if', 'would', 'or', 'when', 'what', 'there', 'been',
    'one', 'could', 'very', 'an', 'who',
])


def vectorize_text(text):
    """Turn raw text into a list of its significant, lower-cased words.

    The text is lower-cased, stripped of punctuation, split on whitespace
    and filtered of very common English words.

    Returns the remaining words in their original order, duplicates
    included.
    """
    words = _PUNCTUATION_RE.sub('', text.lower()).split()
    return [word for word in words if word not in _COMMON_WORDS]
|
||
|
||
def compare_texts(text1, text2):
    """Return how similar the two input paragraphs are.

    Each paragraph is reduced to its list of significant words, and the two
    lists are then compared with `compare_vectors`.
    """
    words1 = vectorize_text(text1)
    words2 = vectorize_text(text2)
    return compare_vectors(words1, words2)
|
||
|
||
################################ | ||
|
||
|
||
def make_word_lists(paragraphs):
    """Convert every paragraph into its list of significant words.

    Returns a list with one word list per paragraph, in input order.
    """
    # Build a real list: the result is iterated more than once by callers,
    # which a lazy `map` object would not survive.
    return [vectorize_text(paragraph) for paragraph in paragraphs]
|
||
def make_word_set(word_lists):
    """Return the set of all distinct words found in any of `word_lists`."""
    vocabulary = set()
    for words in word_lists:
        vocabulary.update(words)
    return vocabulary
|
||
def make_word_vectors(word_set, word_lists):
    """Build one frequency vector per word list.

    Each vector has one component per word in `word_set`, in the iteration
    order of `word_set`, holding that word's relative frequency in the word
    list (0 when the word does not occur).

    Returns a list of lists, one per entry of `word_lists`, in input order.
    """
    # Fix the word order once so every vector shares the same columns.
    ordered_words = list(word_set)

    vectors = []
    for words in word_lists:
        frequencies = word_frequencies(words)
        vectors.append([frequencies.get(word, 0) for word in ordered_words])
    return vectors
|
||
def translator(clusters, paragraph_map):
    """Translate vectors back into paragraphs, to make them human-readable.

    `paragraph_map` maps the `str()` form of a vector to its paragraph.
    Vectors without an entry translate to None.

    Returns a list of lists with the same shape as `clusters`.
    """
    # Real lists (not lazy `map` objects) so callers can compare the result
    # and test membership with plain list semantics.
    return [
        [paragraph_map.get(str(vector)) for vector in cluster]
        for cluster in clusters
    ]
|
||
def cluster_paragraphs(paragraphs, k=2):
    """Group `paragraphs` into `k` clusters of similar wording.

    Each paragraph is converted to a word-frequency vector, the vectors are
    clustered with K-Means, and the clusters are translated back into the
    original paragraph texts.

    Args:
        paragraphs: the texts to cluster.
        k: number of clusters to form; defaults to 2.

    Returns a list of clusters, each a list of paragraphs.

    NOTE: vectors are mapped back to paragraphs through their `str()` form,
    so paragraphs that produce identical vectors share a single map entry.
    """
    # Materialise every intermediate sequence: each one is consumed more
    # than once below.
    word_lists = list(make_word_lists(paragraphs))
    word_set = make_word_set(word_lists)
    word_vectors = [
        list(vector) for vector in make_word_vectors(word_set, word_lists)]

    paragraph_map = dict(
        zip((str(vector) for vector in word_vectors), paragraphs))

    k_means = KMeans(k, word_vectors)
    k_means.main_loop()
    return [
        list(cluster)
        for cluster in translator(k_means.clusters, paragraph_map)
    ]
|
||
|
||
|
||
# TODO: `vectorize_text` does not actually vectorize anything; it only
# splits and strips the text. The real vectorization happens in
# `compare_texts`, where the word lists are replaced by the frequencies of
# their occurrence. The functions should be renamed to reflect this.