Initial implementation
Sergei Orlov committed May 5, 2013
1 parent b704af4 commit fee7bc1
Showing 4 changed files with 263 additions and 0 deletions.
53 changes: 53 additions & 0 deletions k_means.py
@@ -0,0 +1,53 @@
import random

from similarity import similarity


class KMeans(object):
    """K-Means clustering. Uses cosine similarity as the distance function."""

    def __init__(self, k, vectors):
        assert len(vectors) >= k
        self.centers = random.sample(vectors, k)
        self.clusters = [[] for c in self.centers]
        self.vectors = vectors

    def update_clusters(self):
        """Assign each vector in `self.vectors` to the closest cluster center."""
        def closest_center_index(vector):
            """Get the index of the cluster center closest to `vector`."""
            similarity_to_vector = lambda center: similarity(center, vector)
            center = max(self.centers, key=similarity_to_vector)
            return self.centers.index(center)

        self.clusters = [[] for c in self.centers]
        for vector in self.vectors:
            index = closest_center_index(vector)
            self.clusters[index].append(vector)

    def update_centers(self):
        """Move `self.centers` to the centers of `self.clusters`.

        Return True if the centers moved, else False.
        """
        new_centers = []
        for cluster in self.clusters:
            center = [average(ci) for ci in zip(*cluster)]
            new_centers.append(center)

        if new_centers == self.centers:
            return False

        self.centers = new_centers
        return True

    def main_loop(self):
        """Perform k-means clustering."""
        self.update_clusters()
        while self.update_centers():
            self.update_clusters()


def average(sequence):
    return sum(sequence) / len(sequence)
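
For context, a minimal sketch of how this KMeans class can be driven (the two-dimensional example vectors and the choice of k=2 are illustrative assumptions, not part of this commit):

    from k_means import KMeans

    # Two obvious directional groups of 2-D vectors (made-up example data).
    vectors = [[1.0, 0.0], [0.9, 0.1], [0.1, 0.9], [0.0, 1.0]]

    k_means = KMeans(2, vectors)
    k_means.main_loop()

    # Each cluster now holds the vectors whose cosine similarity to one of
    # the two centers is highest.
    print(k_means.clusters)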
24 changes: 24 additions & 0 deletions similarity.py
@@ -0,0 +1,24 @@
from math import sqrt


def dot_product(v1, v2):
    """Get the dot product of the two vectors.

    If A = [a1, a2, a3] and B = [b1, b2, b3], then
    dot_product(A, B) == (a1 * b1) + (a2 * b2) + (a3 * b3).

    Input vectors must be the same length.
    """
    return sum(a * b for a, b in zip(v1, v2))


def magnitude(vector):
    """Return the numerical length / magnitude of the vector."""
    return sqrt(dot_product(vector, vector))


def similarity(v1, v2):
    """Ratio of the dot product to the product of the vectors' magnitudes."""
    # The small constant guards against division by zero for zero-length vectors.
    return dot_product(v1, v2) / (magnitude(v1) * magnitude(v2) + .00000000001)
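
A few hand-checkable sanity checks for these helpers (the vectors are arbitrary examples chosen for easy arithmetic):

    from similarity import dot_product, magnitude, similarity

    assert dot_product([1, 2, 3], [4, 5, 6]) == 32       # 1*4 + 2*5 + 3*6
    assert magnitude([3, 4]) == 5.0                      # 3-4-5 right triangle
    assert abs(similarity([1, 0], [1, 0]) - 1.0) < 1e-6  # same direction
    assert similarity([1, 0], [0, 1]) == 0.0             # orthogonal vectors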
67 changes: 67 additions & 0 deletions test_vectorizer.py
@@ -0,0 +1,67 @@
from string import split
from unittest import TestCase, main

from vectorizer import (
    cluster_paragraphs,
    make_word_set,
    make_word_vectors,
)


class WhenCreatingAVectorSpace(TestCase):

    def setUp(self):
        text1 = 'this is some text'
        text2 = 'unique text'
        text3 = 'and even way more text'

        self.word_vectors = map(split, [text1, text2, text3])

        self.returned = make_word_set(self.word_vectors)

    def test_all_words_in_set(self):
        for vector in self.word_vectors:
            for word in vector:
                self.assertTrue(word in self.returned)

class WhenCalculatingWordVectors(TestCase):

    def setUp(self):
        text1 = 'this is some text'
        text2 = 'unique text'
        text3 = 'and even way more text'

        self.word_vectors = [
            ['some', 'text'],
            ['unique', 'text'],
            ['even', 'way', 'more', 'text'],
        ]
        self.vector_space = make_word_set(self.word_vectors)

        self.returned = make_word_vectors(self.vector_space, self.word_vectors)

    def test_return_value(self):
        expected_return = [
            [0, 0.5, 0.5, 0, 0, 0],
            [0, 0.5, 0, 0, 0.5, 0],
            [0.25, 0.25, 0, 0.25, 0, 0.25],
        ]
        self.assertEqual(self.returned, expected_return)

class WhenClusteringParagraphs(TestCase):

    def setUp(self):
        self.text1 = 'A study on the effectiveness of milk and micronutrients.'
        self.text2 = 'A study on the effectiveness of milk.'
        self.text3 = 'Something completely unrelated'

        self.returned = cluster_paragraphs(
            [self.text1, self.text2, self.text3])

    def test_cluster_correctness(self):
        self.assertTrue([self.text1, self.text2] in self.returned)
        self.assertTrue([self.text3] in self.returned)


if __name__ == '__main__':
    main()
119 changes: 119 additions & 0 deletions vectorizer.py
@@ -0,0 +1,119 @@
from collections import defaultdict
import re

from similarity import similarity
from k_means import KMeans


def word_frequencies(word_vector):
    """What percent of the time does each word in the vector appear?

    Returns a dictionary mapping each word to its frequency.
    """
    num_words = len(word_vector)
    frequencies = defaultdict(float)
    for word in word_vector:
        frequencies[word] += 1.0 / num_words

    return dict(frequencies)


def compare_vectors(word_vector1, word_vector2):
    """Numerical similarity between lists of words. Higher is better.

    Uses cosine similarity. The result ranges from 0 (no words in common)
    to 1 (the same words in the same proportions).
    """
    all_words = list(set(word_vector1).union(set(word_vector2)))
    frequency_dict1 = word_frequencies(word_vector1)
    frequency_dict2 = word_frequencies(word_vector2)

    frequency_vector1 = [frequency_dict1.get(word, 0) for word in all_words]
    frequency_vector2 = [frequency_dict2.get(word, 0) for word in all_words]

    return similarity(frequency_vector1, frequency_vector2)


def vectorize_text(text):
    """Takes in text, processes it, and vectorizes it."""

    def remove_punctuation(text):
        """Removes special characters from text."""
        return re.sub('[,.?";:\-!@#$%^&*()]', '', text)

    def remove_common_words(text_vector):
        """Removes the 50 most common words in UK English.

        Source: http://www.bckelk.ukfsn.org/words/uk1000n.html
        """
        common_words = set(['the', 'and', 'to', 'of', 'a', 'I', 'in',
            'was', 'he', 'that', 'it', 'his', 'her', 'you', 'as',
            'had', 'with', 'for', 'she', 'not', 'at', 'but', 'be',
            'my', 'on', 'have', 'him', 'is', 'said', 'me', 'which',
            'by', 'so', 'this', 'all', 'from', 'they', 'no', 'were',
            'if', 'would', 'or', 'when', 'what', 'there', 'been',
            'one', 'could', 'very', 'an', 'who'])
        return [word for word in text_vector if word not in common_words]

    text = text.lower()
    text = remove_punctuation(text)
    words_list = text.split()
    words_list = remove_common_words(words_list)

    return words_list


def compare_texts(text1, text2):
    """How similar are the two input paragraphs?"""
    return compare_vectors(vectorize_text(text1), vectorize_text(text2))


################################


def make_word_lists(paragraphs):
    return map(vectorize_text, paragraphs)

def make_word_set(word_lists):
    """Return the set of all distinct words across the word lists."""
    return set(word for words in word_lists for word in words)

def make_word_vectors(word_set, word_lists):
    """Return one word-frequency vector over `word_set` per word list."""

    def vectorize(frequency_dict):
        return [frequency_dict.get(word, 0) for word in word_set]

    frequencies = map(word_frequencies, word_lists)

    return map(vectorize, frequencies)

def translator(clusters, paragraph_map):
    """Translate vectors back into paragraphs, to make them human-readable."""
    def item_translator(vector):
        return paragraph_map.get(str(vector))

    def cluster_translator(cluster):
        return map(item_translator, cluster)

    return map(cluster_translator, clusters)

def cluster_paragraphs(paragraphs):
    """Split the paragraphs into two clusters of similar texts."""
    word_lists = make_word_lists(paragraphs)
    word_set = make_word_set(word_lists)
    word_vectors = make_word_vectors(word_set, word_lists)

    paragraph_map = dict(zip(map(str, word_vectors), paragraphs))

    k_means = KMeans(2, word_vectors)
    k_means.main_loop()
    return translator(k_means.clusters, paragraph_map)



# The `vectorize_text` function is not actually vectorizing; it only splits
# the text and strips punctuation and common words. The vectorization happens
# in the `compare_texts` path, where the word lists are replaced by the
# frequencies of their occurrence. I should rename the functions to reflect this.
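
And a minimal end-to-end sketch of driving the module (the paragraphs mirror the ones used in test_vectorizer.py):

    from vectorizer import cluster_paragraphs

    paragraphs = [
        'A study on the effectiveness of milk and micronutrients.',
        'A study on the effectiveness of milk.',
        'Something completely unrelated',
    ]

    clusters = cluster_paragraphs(paragraphs)
    # Expected: two clusters, with the two "milk" paragraphs grouped together
    # and the unrelated paragraph on its own.
    print(clusters)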
