-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
74 lines (58 loc) · 2.34 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# NOTE: "%matplotlib inline" (presumably a leftover notebook magic) is not needed when running this file as a script.
def main():
    """Cluster the rows of ``./dataset.csv`` in PCA space and export the result.

    Pipeline:

    1. Read ``./dataset.csv`` and drop the columns listed below
       (presumably the non-numeric / descriptive ones -- confirm against
       the dataset; the remaining columns must be numeric for scaling).
    2. Standardize the remaining columns and project them onto five
       principal components.
    3. Fit K-Means with five clusters on the component scores.
    4. Append the scores (``Component1`` .. ``Component5``), the cluster id
       (``cluster``) and a readable label (``Segment``) to the original
       frame, print the frame, draw a scatter plot of the first two
       components and write everything to ``pca.csv``.

    Takes no arguments and returns ``None``. Errors raised by pandas or
    scikit-learn (missing file, missing columns, non-numeric data, fewer
    than five usable features) are not caught here.
    """
    n_components = 5
    n_clusters = 5
    component_names = ['Component1', 'Component2', 'Component3',
                       'Component4', 'Component5']
    # One label per K-Means cluster id (0 .. n_clusters - 1); a missing key
    # would turn into NaN in the 'Segment' column.
    segment_names = {
        0: "first",
        1: "second",
        2: "third",
        3: "fourth",
        4: "fifth",
    }

    df = pd.read_csv('./dataset.csv')
    features = df.drop(
        labels=['company', 'country', 'director', 'genre', 'name', 'rating',
                'star', 'released', 'writer', 'year'],
        axis=1,
    )

    # Standardize first so that every feature contributes on the same scale.
    X_std = StandardScaler().fit_transform(features)
    scores = PCA(n_components=n_components).fit_transform(X_std)

    kmeans_pca = KMeans(n_clusters)
    kmeans_pca.fit(scores)

    # Name the component columns at construction time and reuse df's index
    # so that concat aligns row-for-row with the original records.
    components = pd.DataFrame(scores, columns=component_names, index=df.index)
    df_kmeas = pd.concat([df, components], axis=1)
    df_kmeas['cluster'] = kmeans_pca.labels_
    print(df_kmeas)

    df_kmeas['Segment'] = df_kmeas['cluster'].map(segment_names)

    # x / y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    sns.scatterplot(
        x=df_kmeas['Component1'],
        y=df_kmeas['Component2'],
        hue=df_kmeas['Segment'],
    )
    # The figure is only created, not displayed or saved; call plt.show()
    # here to view it interactively.

    df_kmeas.to_csv('pca.csv', index=False)
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()