-
Notifications
You must be signed in to change notification settings - Fork 5
/
create_model.py
96 lines (69 loc) · 2.47 KB
/
create_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# In [1]:
import numpy as np
import pandas as pd
import math
import random
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import gzip
import time
import re
# Configuration: the scrape window starts at 1940 and runs up to (but
# not including) the current year.
initYear = 1940
now = datetime.now()
today = now.today()
yearToday = now.today().year
startProcessingTime = time.time()

# Load each per-year "top movies" CSV into one list of DataFrames.
dfs = [
    pd.read_csv('scraped_movies/top_movies_of_%d.csv' % year,
                encoding='cp1252')
    for year in range(initYear, yearToday)
]
def cleanTitle(title):
    """Strip the trailing ' (YYYY)' year suffix from an IMDb title.

    Also swallows an optional roman-numeral disambiguator such as
    '(II)' that IMDb inserts between the title and the year.

    Parameters
    ----------
    title : str, e.g. "The Godfather (1972)" or "Hamlet (II) (1990)".

    Returns
    -------
    str — the bare title; if the string does not match the expected
    'Title (Year)' shape the input is returned unchanged (the previous
    version raised AttributeError on the failed match).
    """
    matchObject = re.match(r"(.*?) (?:\(I+\) )?\(([0-9]{4})", title)
    if matchObject is None:
        # Unexpected format — pass the title through rather than crash.
        return title
    return matchObject.group(1)
def addSimpleTitle(df):
    """Return a copy of *df* carrying a new 'simpleTitle' column.

    The column holds each row's 'title' value run through
    cleanTitle(), i.e. with the year suffix removed.
    """
    cleaned = df["title"].apply(cleanTitle)
    return df.assign(simpleTitle=cleaned)
# Stack the per-year frames, keep only the identifying columns, and
# derive the year-free 'simpleTitle' column alongside.
movie_data = pd.concat(dfs)[["IMDbId", "title", "release_year"]]
movie_data_complete = addSimpleTitle(movie_data)

# In [3]:
# Load the per-year keyword CSVs and stack them into a single frame.
dfs = []
for year in range(initYear, yearToday):
    path = 'scraped_movies/keywords_for_top_movies_of_%d.csv' % year
    dfs.append(pd.read_csv(path, encoding='cp1252'))
keywords = pd.concat(dfs)

# In [4]:
# Replace the concatenated (duplicated) indexes with plain 0..n-1
# ranges so positional lookups on the two frames line up.
movie_data.index = range(len(movie_data))
keywords.index = range(len(keywords))
# In [24]:
# Throwing the whole process into a little method
def make_matrix(df, countvectoriser):
    """Build a tf-idf-weighted similarity matrix from movie keywords.

    Parameters
    ----------
    df : pandas.DataFrame with a 'keywords' column of '|'-separated
        strings; NaN cells are treated as empty keyword lists.
    countvectoriser : an sklearn CountVectorizer (fit here).

    Returns
    -------
    (matrix, terms) — *matrix* is the sparse movie-by-movie dot
    product of the tf-idf-weighted keyword vectors, *terms* is a
    pandas Series of the fitted vocabulary.
    """
    tfidf = TfidfTransformer()
    # Bag-of-keywords counts; missing keyword cells become ''.
    sparse = countvectoriser.fit_transform(
        pd.Series(df.keywords.fillna('').values))
    weighted = tfidf.fit_transform(sparse)
    # movie x movie similarity via the weighted term matrix.
    matrix = weighted.dot(weighted.T)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement get_feature_names_out() when it exists.
    if hasattr(countvectoriser, 'get_feature_names_out'):
        names = countvectoriser.get_feature_names_out()
    else:
        names = countvectoriser.get_feature_names()
    movies = pd.Series(names)
    return matrix, movies
# In [25]:
# Tokenise keywords on '|' and drop any term appearing in fewer than
# 10 movies.
keyword_vectoriser = CountVectorizer(
    tokenizer=lambda text: text.split('|'), min_df=10)
similarity, vocabulary = make_matrix(keywords, keyword_vectoriser)

# In [26]:
# Compress the movie-by-movie similarity matrix down to 100 latent
# dimensions with non-negative matrix factorisation.
nmf_model = NMF(n_components=100)
shrunk_100 = nmf_model.fit_transform(similarity.toarray())
def saveToFileAndCompress(obj, filename):
    """Pickle *obj* and write it gzip-compressed to *filename*.

    Parameters
    ----------
    obj : any picklable object (renamed from ``object`` to avoid
        shadowing the builtin).
    filename : path of the .gz file to create/overwrite.

    Uses a context manager so the handle is closed even if pickling
    raises (the previous version leaked the file on error), and
    streams with pickle.dump instead of building the bytes in memory.
    """
    with gzip.open(filename, 'wb') as fh:
        pickle.dump(obj, fh)
# Persist the reduced vector model and its movie index, both stamped
# with today's date.
dateStamp = today.strftime("%d-%m-%Y")
fileName = "hanibalVectorModel-" + dateStamp + ".gz"
indexFileName = "hanibalVectorIndex-" + dateStamp + ".gz"
saveToFileAndCompress(shrunk_100, fileName)
saveToFileAndCompress(movie_data_complete, indexFileName)

# Report total wall-clock runtime for the whole pipeline.
endProcessingTime = time.time()
elapsedTimeInSeconds = endProcessingTime - startProcessingTime
print("All completed in: %ss" % elapsedTimeInSeconds)