-
Notifications
You must be signed in to change notification settings - Fork 20
/
data_integration.py
63 lines (44 loc) · 2.37 KB
/
data_integration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# DNA methylation
import numpy as np
import pandas as pd
# Restrict the methylation matrix to the samples listed in the "both samples"
# file, then write one TSV per chromosome plus a table of probe counts.

# Select both samples
both_ids = np.loadtxt('data/PANCAN/GDC-PANCAN_both_samples.tsv', delimiter='\t', dtype='U32')
# usecols must also name the probe-ID column because it is used as the index
both_ids_index = np.insert(both_ids, 0, 'Composite Element REF')

file_path = 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed.tsv'
# DNA methylation: 392761 rows × 8764 columns
# The trailing [both_ids] reorders the columns to follow the sample list.
input_df = pd.read_csv(file_path, sep='\t', header=0, index_col=0, usecols=both_ids_index)[both_ids]

# Select specific chr
all_index_set = set(input_df.index)
mapping = pd.read_csv('data/illuminaMethyl450_hg38_GDC', sep='\t', header=0, index_col=0)
chrs = mapping['chrom'].unique()
# NOTE(review): drops the entry at position 17 of the unique 'chrom' values.
# Presumably this is a placeholder for probes without a chromosome -- confirm
# against the mapping file, since the position depends on the file's row order.
chrs = np.delete(chrs, 17)

# Store the number of probes for each chromosome.
# The zero columns are sized from chrs instead of a fixed 24 so the table
# still builds when the mapping yields a different number of chromosomes.
chrs_number_dict = {
    'chrs': list(chrs),
    'in_mapping': list(np.zeros(len(chrs))),
    'in_data': list(np.zeros(len(chrs))),
}
chrs_number_df = pd.DataFrame(chrs_number_dict)
chrs_number_df.set_index(['chrs'], inplace=True)

for chrom in chrs:
    # Probe IDs the mapping assigns to this chromosome
    chr_index_set = set(mapping[mapping['chrom'] == chrom].index)
    chrs_number_df.loc[chrom, 'in_mapping'] = len(chr_index_set)

    # Probes of this chromosome that are present in the data matrix
    chr_index_exi_set = all_index_set & chr_index_set
    chrs_number_df.loc[chrom, 'in_data'] = len(chr_index_exi_set)

    # Select rows with a boolean mask so they keep the order of input_df.
    # Indexing with a list built from the set would emit rows in set
    # iteration order, which is not stable between interpreter runs.
    chr_df = input_df.loc[input_df.index.isin(chr_index_exi_set)]

    output_path = 'data/PANCAN/GDC-PANCAN_methylation450_preprocessed_both_' + chrom + '.tsv'
    chr_df.to_csv(output_path, sep='\t')

chrs_number_df.to_csv('data/PANCAN/GDC-PANCAN_methylation450_preprocessed_chr_number.tsv', sep='\t')

# Full (all chromosomes) matrix restricted to the selected samples
input_df.to_csv('data/PANCAN/GDC-PANCAN_methylation450_preprocessed_both.tsv', sep='\t')
# Stack the DNA methylation and gene expression matrices (restricted to the
# shared samples) on top of each other and write them out as one file.
input_path = 'data/PANCAN/GDC-PANCAN_'
expr_path = f'{input_path}htseq_fpkm_'
methy_path = f'{input_path}methylation450_'
out_path = f'{input_path}preprocessed_both.tsv'

sample_id = np.loadtxt(f'{input_path}both_samples.tsv', delimiter='\t', dtype='str')

# Read every sample column as float32 to save memory
all_cols_f32 = dict.fromkeys(sample_id, np.float32)
read_options = dict(sep='\t', header=0, index_col=0, dtype=all_cols_f32)

print('Loading gene expression data...')
expr_df = pd.read_csv(f'{expr_path}preprocessed_both.tsv', **read_options)

print('Loading DNA methylation data...')
methy_df = pd.read_csv(f'{methy_path}preprocessed_both.tsv', **read_options)

# Methylation rows first, expression rows after
multi_df = pd.concat([methy_df, expr_df])
multi_df.to_csv(out_path, sep='\t')