-
Notifications
You must be signed in to change notification settings - Fork 0
/
Plot_embeddings.py
146 lines (122 loc) · 6.48 KB
/
Plot_embeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from rdkit import Chem
import rdkit
from rdkit.Chem import Descriptors
import sys
def filter(ar):
    """Return only the finite entries of ``ar``.

    ``ar`` is indexed with the boolean result of ``np.isfinite(ar)``, so
    NaN, +inf and -inf entries are dropped. For a NumPy array this boolean
    indexing yields a flattened 1-D array regardless of the input's shape.

    NOTE: this name shadows Python's builtin ``filter`` for the rest of the
    module.
    """
    finite_mask = np.isfinite(ar)
    return ar[finite_mask]
if True:
    # Always-true guard; presumably kept as a manual on/off switch for this
    # loading stage.
    #
    # Splits are listed in the exact order their rows are stacked. Note that
    # BindingDB is read train/val/test while BIOSNAP and DAVIS are read
    # train/test/val; anything matched row-by-row against these arrays has
    # to follow the same order.
    embedding_splits = (
        'BindingDB_train', 'BindingDB_val', 'BindingDB_test',
        'BIOSNAP_train', 'BIOSNAP_test', 'BIOSNAP_val',
        'DAVIS_train', 'DAVIS_test', 'DAVIS_val',
    )

    # Stack the per-split protein embedding arrays along the first axis.
    protein_data = np.concatenate([
        np.load(f'dataset/protein_embeddings/{split}_prot.dat-embeddings.npy')
        for split in embedding_splits
    ])

    # Same for the SMILES embedding arrays.
    smiles_data = np.concatenate([
        np.load(f'dataset/smi_embeddings/{split}.smi-embeddings.npy')
        for split in embedding_splits
    ])

    print(protein_data.shape)
    print(smiles_data.shape)
if False:
    # Disabled stage: with a constant-false guard nothing below runs.
    # Presumably flipped to True by hand when the 2-D t-SNE projections have
    # to be regenerated. The stage ends with sys.exit(), so the plotting code
    # further down the file is skipped on such a run.
    #
    # The projections are written under dataset/ because that is where the
    # plotting code loads them from (dataset/tsne_prot.npy and
    # dataset/tsne_smi.npy).

    # --- Protein ---
    # Overwrite NaN/inf entries with 0, then mask every entry equal to 0.
    # NOTE(review): scikit-learn appears to convert its input to a plain
    # ndarray, which would drop the mask - confirm whether the masking has
    # any effect on the fit.
    protein_data[~np.isfinite(protein_data)] = 0
    protein_data = np.ma.masked_equal(protein_data, 0)
    print(protein_data)

    prot_embedded = TSNE(n_components=2).fit_transform(protein_data)
    np.save('dataset/tsne_prot.npy', prot_embedded)
    print(prot_embedded.shape)
    print(prot_embedded)

    plt.scatter(prot_embedded[:, 0], prot_embedded[:, 1], s=2, color='black')
    plt.savefig('ProteinEmbedded.png')
    plt.close()

    # --- SMILES ---
    # Same clean-up and projection for the SMILES embeddings.
    smiles_data[~np.isfinite(smiles_data)] = 0
    smiles_data = np.ma.masked_equal(smiles_data, 0)
    print(smiles_data)

    smi_embedded = TSNE(n_components=2).fit_transform(smiles_data)
    np.save('dataset/tsne_smi.npy', smi_embedded)
    print(smi_embedded.shape)
    print(smi_embedded)

    plt.scatter(smi_embedded[:, 0], smi_embedded[:, 1], s=2, color='black')
    plt.savefig('SmiEmbedded.png')
    plt.close()

    # Stop here: a projection run does not continue into the colored plots.
    sys.exit()
# --- Color data ---
# Protein: scatter the saved protein t-SNE projection, colored by the
# length of each row's target sequence.
import pandas as pd

prot_embedded = np.load('dataset/tsne_prot.npy')

# Row i of the projection is matched against row i of this table, so the
# CSVs must be stacked in the same split order as the embeddings were
# concatenated: BindingDB train/val/test, then BIOSNAP and DAVIS as
# train/test/val.
# NOTE(review): assumes dataset/tsne_prot.npy was fit on the embeddings in
# that concatenation order - confirm for any projection produced elsewhere.
protein_seq = pd.concat(
    [
        pd.read_csv('dataset/BindingDB_train_prot.dat'),
        pd.read_csv('dataset/BindingDB_val_prot.dat'),
        pd.read_csv('dataset/BindingDB_test_prot.dat'),
        pd.read_csv('dataset/BIOSNAP_train_prot.dat'),
        pd.read_csv('dataset/BIOSNAP_test_prot.dat'),
        pd.read_csv('dataset/BIOSNAP_val_prot.dat'),
        pd.read_csv('dataset/DAVIS_train_prot.dat'),
        pd.read_csv('dataset/DAVIS_test_prot.dat'),
        pd.read_csv('dataset/DAVIS_val_prot.dat'),
    ],
    ignore_index=True,  # one clean 0..n-1 index instead of repeated per-file indices
)

# Fail early with a readable message instead of an indexing error later on.
if len(protein_seq) != len(prot_embedded):
    raise ValueError(
        'protein table has %d rows but the t-SNE projection has %d'
        % (len(protein_seq), len(prot_embedded))
    )

protein_seq['length'] = protein_seq['Target Sequence'].map(len)
print(protein_seq)

# Only sequences shorter than 2000 are plotted.
mask = protein_seq['length'] < 2000
shown = prot_embedded[mask.to_numpy()]

plt.scatter(
    x=shown[:, 0],
    y=shown[:, 1],
    s=2,
    c=protein_seq.loc[mask, 'length'],
    cmap='tab20c',
)
plt.colorbar(label="Sequence length", orientation="horizontal")
plt.savefig('ProteinEmbedded_length.png', bbox_inches='tight')
plt.close()
# SMILES: scatter the saved SMILES t-SNE projection, colored by the exact
# molecular weight of each row's molecule.
smi_embedded = np.load('dataset/tsne_smi.npy')

# Row i of the projection is matched against row i of this table, so the
# CSVs must be stacked in the same split order as the embeddings were
# concatenated: BindingDB train/val/test, then BIOSNAP and DAVIS as
# train/test/val.
# NOTE(review): assumes dataset/tsne_smi.npy was fit on the embeddings in
# that concatenation order - confirm for any projection produced elsewhere.
smiles_seq = pd.concat(
    [
        pd.read_csv('dataset/BindingDB_train.smi'),
        pd.read_csv('dataset/BindingDB_val.smi'),
        pd.read_csv('dataset/BindingDB_test.smi'),
        pd.read_csv('dataset/BIOSNAP_train.smi'),
        pd.read_csv('dataset/BIOSNAP_test.smi'),
        pd.read_csv('dataset/BIOSNAP_val.smi'),
        pd.read_csv('dataset/DAVIS_train.smi'),
        pd.read_csv('dataset/DAVIS_test.smi'),
        pd.read_csv('dataset/DAVIS_val.smi'),
    ],
    ignore_index=True,  # one clean 0..n-1 index instead of repeated per-file indices
)

# Fail early with a readable message instead of an indexing error later on.
if len(smiles_seq) != len(smi_embedded):
    raise ValueError(
        'SMILES table has %d rows but the t-SNE projection has %d'
        % (len(smiles_seq), len(smi_embedded))
    )


def _exact_mol_weight(smi):
    """Return the exact molecular weight for a SMILES string, or NaN.

    Chem.MolFromSmiles returns None when it cannot parse the string; NaN is
    returned in that case so a single bad row does not abort the script.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return float('nan')
    return Descriptors.ExactMolWt(mol)


smiles_seq['weight'] = [_exact_mol_weight(smi) for smi in smiles_seq['SMILES']]
print(smiles_seq)

# Only molecules lighter than 1000 are plotted. NaN weights compare False
# and are therefore left out as well.
mask = smiles_seq['weight'] < 1000
print(smiles_seq['weight'].max())  # Series.max skips NaN
shown = smi_embedded[mask.to_numpy()]

plt.scatter(
    x=shown[:, 0],
    y=shown[:, 1],
    s=2,
    c=smiles_seq.loc[mask, 'weight'],
    cmap='tab20c',
    vmin=0,
    vmax=1000,
)
plt.colorbar(label="Molecular Weight", orientation="horizontal")
plt.savefig('SMILESEmbedded_length.png', bbox_inches='tight')
plt.close()
# Protein embedding with molecular weight: scatter the protein t-SNE
# projection, colored by the molecular weight stored in the same row of
# smiles_seq.
# NOTE(review): assumes row i of prot_embedded and row i of smiles_seq
# describe the same protein-ligand pair - confirm against the dataset files.

# Recompute the weight filter here so the plot does not depend on whichever
# `mask` happened to be assigned last earlier in the script.
ligand_weight = smiles_seq['weight'].to_numpy()
weight_mask = ligand_weight < 1000
shown = prot_embedded[weight_mask]

plt.scatter(
    x=shown[:, 0],
    y=shown[:, 1],
    s=2,
    c=ligand_weight[weight_mask],
    cmap='tab20c',
)
# The colors encode molecular weight, so the colorbar is labelled accordingly.
plt.colorbar(label="Molecular Weight", orientation="horizontal")
plt.savefig('ProteinEmbedded_molweight.png', bbox_inches='tight')
plt.close()