Initial implementation
Sergei Orlov committed May 5, 2013
1 parent b704af4 commit fee7bc1
Showing 4 changed files with 263 additions and 0 deletions.
53 changes: 53 additions & 0 deletions k_means.py
@@ -0,0 +1,53 @@
import random

from similarity import similarity


class KMeans(object):
    """K-Means clustering. Uses cosine similarity as the distance function."""

    def __init__(self, k, vectors):
        assert len(vectors) >= k
        self.centers = random.sample(vectors, k)
        self.clusters = [[] for c in self.centers]
        self.vectors = vectors

    def update_clusters(self):
        """Assign each vector in `self.vectors` to the closest cluster center."""
        def closest_center_index(vector):
            """Get the index of the cluster center closest to `vector`."""
            similarity_to_vector = lambda center: similarity(center, vector)
            center = max(self.centers, key=similarity_to_vector)
            return self.centers.index(center)

        self.clusters = [[] for c in self.centers]
        for vector in self.vectors:
            index = closest_center_index(vector)
            self.clusters[index].append(vector)

    def update_centers(self):
        """Move `self.centers` to the centers of `self.clusters`.

        Return True if the centers moved, else False.
        """
        new_centers = []
        for cluster in self.clusters:
            center = [average(ci) for ci in zip(*cluster)]
            new_centers.append(center)

        if new_centers == self.centers:
            return False

        self.centers = new_centers
        return True

    def main_loop(self):
        """Perform k-means clustering."""
        self.update_clusters()
        while self.update_centers():
            self.update_clusters()


def average(sequence):
    return sum(sequence) / len(sequence)
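
For context, a minimal sketch of how this KMeans class can be driven (the two-dimensional example vectors and the choice of k=2 are illustrative assumptions, not part of this commit):

    from k_means import KMeans

    # Two obvious directional groups of 2-D vectors (made-up example data).
    vectors = [[1.0, 0.0], [0.9, 0.1], [0.1, 0.9], [0.0, 1.0]]

    k_means = KMeans(2, vectors)
    k_means.main_loop()

    # Each cluster now holds the vectors whose cosine similarity to one of
    # the two centers is highest.
    print(k_means.clusters)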
24 changes: 24 additions & 0 deletions similarity.py
@@ -0,0 +1,24 @@
from math import sqrt


def dot_product(v1, v2):
    """Get the dot product of the two vectors.

    If A = [a1, a2, a3] and B = [b1, b2, b3], then
    dot_product(A, B) == (a1 * b1) + (a2 * b2) + (a3 * b3).

    Input vectors must be the same length.
    """
    return sum(a * b for a, b in zip(v1, v2))


def magnitude(vector):
    """Return the numerical length / magnitude of the vector."""
    return sqrt(dot_product(vector, vector))


def similarity(v1, v2):
    """Ratio of the dot product to the product of the vectors' magnitudes."""
    # The small constant guards against division by zero for zero-length vectors.
    return dot_product(v1, v2) / (magnitude(v1) * magnitude(v2) + .00000000001)
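
A few hand-checkable sanity checks for these helpers (the vectors are arbitrary examples chosen for easy arithmetic):

    from similarity import dot_product, magnitude, similarity

    assert dot_product([1, 2, 3], [4, 5, 6]) == 32       # 1*4 + 2*5 + 3*6
    assert magnitude([3, 4]) == 5.0                      # 3-4-5 right triangle
    assert abs(similarity([1, 0], [1, 0]) - 1.0) < 1e-6  # same direction
    assert similarity([1, 0], [0, 1]) == 0.0             # orthogonal vectors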
67 changes: 67 additions & 0 deletions test_vectorizer.py
@@ -0,0 +1,67 @@
from string import split
from unittest import TestCase, main

from vectorizer import (
    cluster_paragraphs,
    make_word_set,
    make_word_vectors,
)


class WhenCreatingAVectorSpace(TestCase):

    def setUp(self):
        text1 = 'this is some text'
        text2 = 'unique text'
        text3 = 'and even way more text'

        self.word_vectors = map(split, [text1, text2, text3])

        self.returned = make_word_set(self.word_vectors)

    def test_all_words_in_set(self):
        for vector in self.word_vectors:
            for word in vector:
                self.assertTrue(word in self.returned)

class WhenCalculatingWordVectors(TestCase):

    def setUp(self):
        text1 = 'this is some text'
        text2 = 'unique text'
        text3 = 'and even way more text'

        self.word_vectors = [
            ['some', 'text'],
            ['unique', 'text'],
            ['even', 'way', 'more', 'text'],
        ]
        self.vector_space = make_word_set(self.word_vectors)

        self.returned = make_word_vectors(self.vector_space, self.word_vectors)

    def test_return_value(self):
        expected_return = [
            [0, 0.5, 0.5, 0, 0, 0],
            [0, 0.5, 0, 0, 0.5, 0],
            [0.25, 0.25, 0, 0.25, 0, 0.25],
        ]
        self.assertEqual(self.returned, expected_return)

class WhenClusteringParagraphs(TestCase):

    def setUp(self):
        self.text1 = 'A study on the effectiveness of milk and micronutrients.'
        self.text2 = 'A study on the effectiveness of milk.'
        self.text3 = 'Something completely unrelated'

        self.returned = cluster_paragraphs(
            [self.text1, self.text2, self.text3])

    def test_cluster_correctness(self):
        self.assertTrue([self.text1, self.text2] in self.returned)
        self.assertTrue([self.text3] in self.returned)


if __name__ == '__main__':
    main()
119 changes: 119 additions & 0 deletions vectorizer.py
@@ -0,0 +1,119 @@
from collections import defaultdict
import re

from similarity import similarity
from k_means import KMeans


def word_frequencies(word_vector):
    """What percent of the time does each word in the vector appear?

    Returns a dictionary mapping each word to its frequency.
    """
    num_words = len(word_vector)
    frequencies = defaultdict(float)
    for word in word_vector:
        frequencies[word] += 1.0 / num_words

    return dict(frequencies)


def compare_vectors(word_vector1, word_vector2):
    """Numerical similarity between lists of words. Higher is better.

    Uses cosine similarity. The result ranges from 0 (no words in common)
    to 1 (the same words in the same proportions).
    """
    all_words = list(set(word_vector1).union(set(word_vector2)))
    frequency_dict1 = word_frequencies(word_vector1)
    frequency_dict2 = word_frequencies(word_vector2)

    frequency_vector1 = [frequency_dict1.get(word, 0) for word in all_words]
    frequency_vector2 = [frequency_dict2.get(word, 0) for word in all_words]

    return similarity(frequency_vector1, frequency_vector2)


def vectorize_text(text):
    """Takes in text, processes it, and vectorizes it."""

    def remove_punctuation(text):
        """Removes special characters from text."""
        return re.sub('[,.?";:\-!@#$%^&*()]', '', text)

    def remove_common_words(text_vector):
        """Removes the 50 most common words in UK English.

        Source: http://www.bckelk.ukfsn.org/words/uk1000n.html
        """
        common_words = set(['the', 'and', 'to', 'of', 'a', 'I', 'in',
            'was', 'he', 'that', 'it', 'his', 'her', 'you', 'as',
            'had', 'with', 'for', 'she', 'not', 'at', 'but', 'be',
            'my', 'on', 'have', 'him', 'is', 'said', 'me', 'which',
            'by', 'so', 'this', 'all', 'from', 'they', 'no', 'were',
            'if', 'would', 'or', 'when', 'what', 'there', 'been',
            'one', 'could', 'very', 'an', 'who'])
        return [word for word in text_vector if word not in common_words]

    text = text.lower()
    text = remove_punctuation(text)
    words_list = text.split()
    words_list = remove_common_words(words_list)

    return words_list


def compare_texts(text1, text2):
    """How similar are the two input paragraphs?"""
    return compare_vectors(vectorize_text(text1), vectorize_text(text2))


################################


def make_word_lists(paragraphs):
    return map(vectorize_text, paragraphs)

def make_word_set(word_lists):
    """Return the set of all distinct words across the word lists."""
    return set(word for words in word_lists for word in words)

def make_word_vectors(word_set, word_lists):
    """Return one word-frequency vector over `word_set` per word list."""

    def vectorize(frequency_dict):
        return [frequency_dict.get(word, 0) for word in word_set]

    frequencies = map(word_frequencies, word_lists)

    return map(vectorize, frequencies)

def translator(clusters, paragraph_map):
    """Translate vectors back into paragraphs, to make them human-readable."""
    def item_translator(vector):
        return paragraph_map.get(str(vector))

    def cluster_translator(cluster):
        return map(item_translator, cluster)

    return map(cluster_translator, clusters)

def cluster_paragraphs(paragraphs):
    """Split the paragraphs into two clusters of similar texts."""
    word_lists = make_word_lists(paragraphs)
    word_set = make_word_set(word_lists)
    word_vectors = make_word_vectors(word_set, word_lists)

    paragraph_map = dict(zip(map(str, word_vectors), paragraphs))

    k_means = KMeans(2, word_vectors)
    k_means.main_loop()
    return translator(k_means.clusters, paragraph_map)



# The `vectorize_text` function is not actually vectorizing; it only splits
# the text and strips punctuation and common words. The vectorization happens
# in the `compare_texts` path, where the word lists are replaced by the
# frequencies of their occurrence. I should rename the functions to reflect this.
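
And a minimal end-to-end sketch of driving the module (the paragraphs mirror the ones used in test_vectorizer.py):

    from vectorizer import cluster_paragraphs

    paragraphs = [
        'A study on the effectiveness of milk and micronutrients.',
        'A study on the effectiveness of milk.',
        'Something completely unrelated',
    ]

    clusters = cluster_paragraphs(paragraphs)
    # Expected: two clusters, with the two "milk" paragraphs grouped together
    # and the unrelated paragraph on its own.
    print(clusters)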
