-
Notifications
You must be signed in to change notification settings - Fork 3
/
feature_selection_data_preproccess.py
61 lines (52 loc) · 2.94 KB
/
feature_selection_data_preproccess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
# Load the CCLE mRNA expression table (tab-separated) and reshape it so that
# the former column labels become the row index and the values of the `Name`
# column become the column labels.
_EXPRESSION_PATH = 'data/CCLE_Expression_Entrez_2012-09-29 _copy.gct'
try:
    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
    # on_bad_lines='warn' keeps the old behaviour of
    # error_bad_lines=False with the default warn_bad_lines=True:
    # malformed rows are reported and skipped instead of aborting the load.
    data = pd.read_csv(_EXPRESSION_PATH, delimiter='\t', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`; use the legacy keyword.
    data = pd.read_csv(_EXPRESSION_PATH, delimiter='\t', error_bad_lines=False)
print(data.shape)
print(data.columns)
# Drop the Description column and keep Name as the identifier. Original
# author's note: the two roles cannot be swapped because Description
# contains NaN values.
data = data.drop('Description', axis=1)
cols = data.columns
print(cols)
# Promote the Name column to the row index, then remove it from the values.
data.index = data['Name'].tolist()
data = data.drop('Name', axis=1)
# Swap rows and columns: stack to a long Series, then unstack level 0
# (the former row index) back out as the columns.
data_stack = data.stack()
data_unstack = data_stack.unstack(0)
print(data_unstack.head(5))
# Optional export of the reshaped table (index=False omits the index column):
# data_unstack.to_csv('data/gene_expression/ccle_expression_trans_index_col.csv',index=False, float_format='%.2f')
# Combine one drug's labels with the gene expression data and write the
# result as that drug's training file.
#
# Change DRUG_NAME to process a different drug; the input and output paths
# are both derived from it so they cannot get out of sync.
# Names that appeared as commented-out alternatives in the original script
# ("-c" / "-discard" are the original author's marks; their meaning is not
# documented here):
#   17-AAG, Erlotinib (-c), Irinotecan, Lapatinib (-c), PD-0325901 (-c),
#   AEW541, PHA-665752 (-c), Paclitaxel (-c), Sorafenib (-c),
#   PLX4720 (-c -discard), AZD0530, LBW242, Nutlin-3, Panobinostat,
#   PD-0332991, PF2341066, RAF265, TAE684, TKI258, ZD-6474
DRUG_NAME = 'ZD-6474'
DRUG_DIR = 'data/drug_cell/drug/'
drug_info = pd.read_csv(DRUG_DIR + DRUG_NAME + '.csv', header=None)
drug_info_cell_Col = drug_info[0]  # column 0: cell identifiers
drug_info_label_Col = drug_info[1]  # column 1: labels
print(drug_info_label_Col)
print(len(drug_info_label_Col))
# Select the expression rows for the listed cells, in drug-file order.
data_unstack_select = data_unstack.loc[drug_info_cell_Col]
# `.values` attaches the labels by position rather than by index alignment
# (the selected rows are indexed by cell id, the labels by row number).
data_unstack_select['label'] = drug_info_label_Col.values
print(data_unstack_select)
# index=False: the cell identifiers are not written to the output file.
data_unstack_select.to_csv(DRUG_DIR + DRUG_NAME + '_train_data.csv', index=False, float_format='%.2f')