Skip to content

Commit

Permalink
build either on rnaseq or snps with a choice of subset
Browse files Browse the repository at this point in the history
  • Loading branch information
hyoo committed May 16, 2019
1 parent 73ff5cb commit 9d48285
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 8 deletions.
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
```
$ python build.py -h
usage: build.py [-h] [--top_n TOP_N] [--drug_descriptor {dragon7,mordred}]
[--cell_feature {rnaseq}]
[--cell_feature {rnaseq,snps}]
[--cell_feature_subset {lincs1000,oncogenes,all}]
[--format {csv,tsv,parquet,hdf5,feather}]
[--response_type {reg,bin}]
Expand All @@ -12,8 +13,10 @@ optional arguments:
--top_n TOP_N Number of cancer types to be included. Default 6
--drug_descriptor {dragon7,mordred}
Drug descriptors
--cell_feature {rnaseq}
--cell_feature {rnaseq,snps}
Cell line features
--cell_feature_subset {lincs1000,oncogenes,all}
Subset of cell line features. Default lincs1000
--format {csv,tsv,parquet,hdf5,feather}
Dataframe file format. Default hdf5
--response_type {reg,bin}
Expand Down
33 changes: 29 additions & 4 deletions build.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
import pandas as pd
import argparse
from pathlib import Path
from functools import reduce

# input files
base_data_dir = './data'
response_path = Path('./data/combined_single_response_agg')
cell_cancer_types_map_path = Path('./data/combined_cancer_types')
drug_list_path = Path('./data/drugs_1800')
cell_rnaseq_path = Path('./data/combined_rnaseq_data_lincs1000_combat')


def parse_arguments(model_name=''):
Expand All @@ -20,8 +20,11 @@ def parse_arguments(model_name=''):
choices=['dragon7', 'mordred'],
help='Drug descriptors')
parser.add_argument('--cell_feature', default='rnaseq',
choices=['rnaseq'],
choices=['rnaseq', 'snps'],
help='Cell line features')
parser.add_argument('--cell_feature_subset', default='lincs1000',
choices=['lincs1000', 'oncogenes', 'all'],
help='Subset of cell line features. Default lincs1000')
parser.add_argument('--format', default='hdf5',
choices=['csv', 'tsv', 'parquet', 'hdf5', 'feather'],
help='Dataframe file format. Default hdf5')
Expand All @@ -33,6 +36,27 @@ def parse_arguments(model_name=''):
return args, unparsed


def check_file(filepath):
    """Report whether ``filepath`` points at an existing regular file.

    A "checking ..." line is always printed; when the file is missing a
    second line saying so is printed as well.

    Args:
        filepath: Location to test. A ``pathlib.Path`` or anything ``Path``
            accepts (``str`` / ``os.PathLike``).

    Returns:
        bool: ``True`` if the path is an existing regular file, else ``False``.
    """
    # Coerce so that plain string paths work too; a Path passes through as-is.
    path = Path(filepath)
    print("checking {}".format(filepath))
    status = path.is_file()
    if not status:
        print("File {} is not found in data dir.".format(filepath))
    return status


def check_data_files(args):
    """Check that every input file needed for the selected options exists.

    The files checked are the response table, the cell/cancer-type map, the
    drug list, and the cell-feature and drug-descriptor files chosen by
    ``args``.

    Args:
        args: Parsed command-line arguments, forwarded to
            ``get_cell_feature_path`` and ``get_drug_descriptor_path``.

    Returns:
        bool: ``True`` only if ``check_file`` succeeded for every file.
    """
    required = [
        response_path,
        cell_cancer_types_map_path,
        drug_list_path,
        get_cell_feature_path(args),
        get_drug_descriptor_path(args),
    ]
    all_present = check_file(required[0])
    for path in required[1:]:
        # Deliberately no short-circuit: every file is checked so that each
        # missing one gets reported by check_file.
        all_present = all_present & check_file(path)
    return all_present


def get_cell_feature_path(args):
    """Build the path of the cell-line feature file selected by ``args``.

    Args:
        args: Object with ``cell_feature`` and ``cell_feature_subset``
            attributes (the parsed command-line arguments).

    Returns:
        pathlib.Path: ``base_data_dir`` joined with
        ``combined_<feature>_data_combat`` when the subset is ``'all'``,
        otherwise ``combined_<feature>_data_<subset>_combat``.
    """
    subset = args.cell_feature_subset
    if subset != 'all':
        # A named subset is spliced into the file name before the suffix.
        return Path(base_data_dir, 'combined_{}_data_{}_combat'.format(args.cell_feature, subset))
    return Path(base_data_dir, 'combined_{}_data_combat'.format(args.cell_feature))


def get_drug_descriptor_path(args):
    """Build the path of the drug-descriptor file selected by ``args``.

    Args:
        args: Object with a ``drug_descriptor`` attribute (the parsed
            command-line arguments).

    Returns:
        pathlib.Path: ``base_data_dir`` joined with
        ``combined_<drug_descriptor>_descriptors``.
    """
    descriptor_file = 'combined_{}_descriptors'.format(args.drug_descriptor)
    return Path(base_data_dir) / descriptor_file
Expand Down Expand Up @@ -80,7 +104,7 @@ def build_dataframe(args):
df_response.rename(columns={'AUC': 'Response'}, inplace=True)

# Join response data with drug descriptors & cell line features (RNASeq or SNPs)
df_rnaseq = pd.read_csv(cell_rnaseq_path, sep='\t', low_memory=False)
df_rnaseq = pd.read_csv(get_cell_feature_path(args), sep='\t', low_memory=False)
df_rnaseq = df_rnaseq[df_rnaseq['Sample'].isin(cl_filter)].reset_index(drop=True)

df_rnaseq.rename(columns={'Sample': 'CELL'}, inplace=True)
Expand Down Expand Up @@ -113,4 +137,5 @@ def build_dataframe(args):

if __name__ == '__main__':
FLAGS, unparsed = parse_arguments()
build_dataframe(FLAGS)
if check_data_files(FLAGS):
build_dataframe(FLAGS)
16 changes: 14 additions & 2 deletions download.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
# download cell descriptors
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/rna-seq/combined_rnaseq_data_lincs1000_combat data/
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/dose_response/combined_single_response_agg data/
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/cell_lines/combined_cancer_types data/
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/rna-seq/combined_rnaseq_data_oncogenes_combat data/
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/rna-seq/combined_rnaseq_data_combat data/
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/snps/combined_snps_data_lincs1000_combat data/
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/snps/combined_snps_data_oncogenes_combat data/
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/snps/combined_snps_data_combat data/

# download drug descriptors
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/drug_descriptors/combined_dragon7_descriptors data/
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/drug_descriptors/combined_mordred_descriptors data/

# download response data
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/dose_response/combined_single_response_agg data/

# download data files
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/drugs/drugs_1800 data/
scp biologin-4.mcs.anl.gov:/vol/nciftp/private/data_frames/cell_lines/combined_cancer_types data/

0 comments on commit 9d48285

Please sign in to comment.