-
Notifications
You must be signed in to change notification settings - Fork 0
/
input_data_prep.py
134 lines (118 loc) · 4.9 KB
/
input_data_prep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import numpy as np
import pandas as pd
import sys
import IPython
##
# @file
# This file implements functions used for reading, reshaping and pre-processing the input
# datasets for TSS prediction.
#
def _load_pos_neg(input_loc, key):
    """Load the positive and negative example tables listed under *key*.

    Params:
    - input_loc: DataFrame indexed by input type whose first two columns hold
      the paths of the positive and the negative example file.
    - key: input type to load (a row label of input_loc).

    Returns a tuple (pos, neg, merged): the two DataFrames as read from disk
    and one ndarray with the positive rows stacked on top of the negative rows.
    """
    locations = input_loc.loc[key]
    pos = pd.read_csv(locations.iloc[0], header=None, index_col=0)
    neg = pd.read_csv(locations.iloc[1], header=None, index_col=0)
    merged = np.concatenate((pos.values, neg.values), axis=0)
    return pos, neg, merged


def read_input_files(filename, test_size=8000, TSS_pos=500):
    """Load the input tables, shuffle the examples and split them into a
    testing and a training set.

    Params:
    - filename: tab-delimited file containing locations of all input tables in format:
        <input_type> <positive example file> <negative example file>
      Example of such file:
        CA true_promoters/CA_pos.csv false_promoters_dist350/CA_neg.csv
        CG true_promoters/CG_pos.csv false_promoters_dist350/CG_neg.csv
        COV true_promoters/Coverage_pos.csv false_promoters_dist350/Coverage_neg.csv
        MAF true_promoters/MAF_pos.csv false_promoters_dist350/MAF_neg.csv
        METH true_promoters/METH_pos.csv false_promoters_dist350/METH_neg.csv
        seq true_promoters/promoters_pos.csv false_promoters_dist350/promoters_neg.csv
        SNPs true_promoters/SNPs_pos.csv false_promoters_dist350/SNPs_neg.csv
        TATA true_promoters/TATA_pos.csv false_promoters_dist350/TATA_neg.csv
      Only the seq, CG, CA, COV, SNPs and METH rows are read by this function;
      other rows (e.g. MAF, TATA) are ignored.
    - test_size: number of shuffled examples put into the testing set; the
      remaining examples form the training set (default 8000). If it is not
      smaller than the number of examples, the training set is empty.
    - TSS_pos: column around which the CA table is cropped; the columns
      [TSS_pos-20, TSS_pos+20) are kept (default 500).
      NOTE(review): presumably expected to be >= 20, otherwise the slice
      start becomes negative - confirm with callers.

    Returns:
      dict with the following keys:
      - "test_size", "TSS_pos": the values used for this call,
      - "<type>_pos", "<type>_neg": the tables as read from disk (DataFrames)
        for every type in seq, CG, CA, COV, SNPs, METH,
      - "labels_1", "labels_0": unshuffled labels of the positive / negative
        examples ([1, 0] for positive, [0, 1] for negative),
      - "shuffle_index": the permutation applied to all arrays,
      - "seq", "CG", "CA", "COV", "SNPs", "METH", "labels": shuffled arrays;
        seq has shape (examples, columns/4, 4), the other feature arrays carry
        a trailing channel axis of length 1,
      - "test_<name>", "train_<name>": the first test_size rows and the
        remaining rows of each shuffled array.
      The bookkeeping entries "ret" and "system_variables", which the former
      locals()-based implementation leaked into the result, are not included.

    All arrays are shuffled with one permutation drawn from numpy's global
    random state, so rows stay aligned across arrays; seed it with
    np.random.seed for reproducible splits.
    """
    input_loc = pd.read_csv(filename, sep="\t", index_col=0, header=None)

    # The result is assembled explicitly instead of being scraped from
    # locals(): on Python 3 the locals().keys() view is live, which made the
    # old filter drop every variable and return an empty dict.
    ret = {"test_size": test_size, "TSS_pos": TSS_pos}

    # Arrays that get shuffled and split; filled in the original read order.
    merged = {}
    for key in ("seq", "CG", "CA", "COV", "SNPs", "METH"):
        pos, neg, merged[key] = _load_pos_neg(input_loc, key)
        ret[key + "_pos"] = pos
        ret[key + "_neg"] = neg

    # Reshape promoter sequences from 2d to 3d (gene, pos, nucleotide).
    # Floor division keeps the dimension an int; "/" yields a float on
    # Python 3 and reshape rejects it.
    seq = merged["seq"]
    merged["seq"] = seq.reshape(seq.shape[0], seq.shape[1] // 4, 4)

    # Keep only the 40 columns of the CA mask around the TSS.
    merged["CA"] = merged["CA"][:, TSS_pos - 20:TSS_pos + 20]

    # Add the channel layer (as axis 2) to every single-channel input.
    for key in ("CG", "CA", "COV", "SNPs", "METH"):
        merged[key] = np.expand_dims(merged[key], 2)

    # Two-column labels: [1, 0] marks a true promoter, [0, 1] a false one.
    n_pos = ret["seq_pos"].shape[0]
    n_neg = ret["seq_neg"].shape[0]
    labels_1 = np.column_stack([np.ones(n_pos), np.zeros(n_pos)])
    labels_0 = np.column_stack([np.zeros(n_neg), np.ones(n_neg)])
    ret["labels_1"] = labels_1
    ret["labels_0"] = labels_0
    merged["labels"] = np.concatenate([labels_1, labels_0])

    # One shared permutation keeps features and labels aligned.
    shuffle_index = np.random.permutation(np.arange(merged["seq"].shape[0]))
    ret["shuffle_index"] = shuffle_index

    # Shuffle, then divide each dataset into test and train parts.
    for key, values in merged.items():
        shuffled = values[shuffle_index]
        ret[key] = shuffled
        ret["test_" + key] = shuffled[:test_size]
        ret["train_" + key] = shuffled[test_size:]

    return ret