-
Notifications
You must be signed in to change notification settings - Fork 3
/
feature_selection_data_preproccess_CGP.py
63 lines (49 loc) · 2.76 KB
/
feature_selection_data_preproccess_CGP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
# Load the mRNA expression matrix: a tab-separated table whose
# `ensembl_gene` column holds the gene identifier of each row.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (malformed rows are skipped
# and a warning is emitted for each one).
data = pd.read_csv(
    'data/CGP/sanger1018_brainarray_ensemblgene_rma.txt',
    delimiter='\t',
    on_bad_lines='warn',
)
print(data.shape)
print(data.columns)
print(data.head(5))

# Use the gene identifiers as row labels and drop the now-redundant column.
# The index is left unnamed so the frames printed below look the same as
# they did before.
data = data.set_index('ensembl_gene').rename_axis(None)

# Swap rows and columns: after stack() + unstack(0) the former column labels
# become the row index and the gene identifiers become the columns.
data_unstack = data.stack().unstack(0)
print(data_unstack.head(5))

# Optional export of the transposed matrix (disabled); index=False omits the
# row labels from the file.
# data_unstack.to_csv('data/CGP/gene_expression/ccle_expression_trans_index_col.csv',index=False, float_format='%.2f')
# Feature genes (Ensembl gene IDs) per drug. Only the PD-0325901 list is
# active; the other lists are kept, commented out, so they can be swapped in.
#
# Lapatinib
# cols = ['ENSG00000154639','ENSG00000151012','ENSG00000175591','ENSG00000102595','ENSG00000142765',
# 'ENSG00000171004','ENSG00000128512']
#
# Sorafenib
# cols = ['ENSG00000131149', 'ENSG00000104093', 'ENSG00000135069', 'ENSG00000110002', 'ENSG00000196387',
# 'ENSG00000102445', 'ENSG00000169583', 'ENSG00000102362', 'ENSG00000198846']
#
# PHA-665752
# cols = ['ENSG00000175183', 'ENSG00000170323', 'ENSG00000086205', 'ENSG00000154589', 'ENSG00000163083',
# 'ENSG00000100852', 'ENSG00000167191', 'ENSG00000082497', 'ENSG00000105497', 'ENSG00000116014']
#
# Erlotinib (ENSG00000122641 is not available, so that column was dropped)
# cols = ['ENSG00000163359', 'ENSG00000152661', 'ENSG00000129038', 'ENSG00000196611',
# 'ENSG00000113269', 'ENSG00000080823', 'ENSG00000127074', 'ENSG00000166510', 'ENSG00000142765']
#
# Paclitaxel (ENSG00000133131 -> ENST00000355610 is not available, so that column was dropped)
# cols = ['ENSG00000170231', 'ENSG00000135069', 'ENSG00000137673', 'ENSG00000023892', 'ENSG00000085563',
# 'ENSG00000167766', 'ENSG00000177706', 'ENSG00000169902', 'ENSG00000072682', 'ENSG00000172716']
#
# PD-0325901
cols = [
    'ENSG00000075213',
    'ENSG00000114771',
    'ENSG00000112137',
    'ENSG00000185022',
    'ENSG00000115461',
    'ENSG00000221968',
    'ENSG00000152952',
    'ENSG00000087253',
    'ENSG00000101311',
    'ENSG00000151692',
]

# Keep only the selected gene columns, in the order listed above.
data_unstack = data_unstack.loc[:, cols]
print(data_unstack.head(5))
print(data_unstack.shape)
# Drug table for PD-0325901. The file is read without a header row, so its
# columns are addressed by position: 0 = cell identifier, 1 = label.
response = pd.read_csv('data/CGP/drug_cell/drug/PD0325901.csv', header=None)
cell_ids = response[0]
labels = response[1]
print(cell_ids)
print(type(cell_ids))

# The cell identifiers are converted to strings before being used as row
# labels for the lookup in the expression frame.
cell_keys = [str(cell_id) for cell_id in cell_ids]

# Pick the rows of the listed cells (in the drug table's order) and append
# the labels positionally as a new 'label' column.
train_data = data_unstack.loc[cell_keys].assign(label=labels.values)
print(train_data)

# Missing values are written as-is; an earlier fillna(0) step is disabled.
# The row labels (cell identifiers) are not written to the output file.
train_data.to_csv(
    'data/CGP/drug_cell/common_drugs/PD-0325901_train_data.csv',
    index=False,
    float_format='%.2f',
)