diff --git a/README.md b/README.md index 65d46e0..41ddd5a 100644 --- a/README.md +++ b/README.md @@ -51,13 +51,14 @@ optional arguments: ## Notes -- The input tsv should have at least four columns, including `Run`, `Platform`, `Model`, and `LibraryLayout` (header must be presented). +- The input tsv should have at least five columns, including `Run`, `Platform`, `Model`, `LibraryLayout` (header must be presented), and `download_path`. - `Run` column represents the paths to the SRA files. You can use either relative path to your current directory or absolute path. To make less confusion, we recommned to use absolute path. - `Platform` column represents the sequencer's brand. We will recognize `ILLUMINA` and `ABI_SOLID` (although we will not process `ABI_SOLID`) in this field, because it determines the adapters used in the pipeline with `Model`. - `Model` column represents the sequencer's model. We will recongize `Illumina HiSeq ...`, `Illumina MiSeq ...`, and `Illumina Genome Analyzer II ...`, because it determines the adapters used in the pipeline with `Platform`. Don't forget that if you have any space in the name of the model. You need to escape them using quoting such as `"Illumina Hiseq 2000"` or back slash. - `LibraryLayout` column represents what's the strategy of RNA-Seq experiment. It can be only `SINGLE` or `PAIRED`. - - Currently, the `ABI_SOLID` sequencer is not supported. - - For paired-end layout, `Trimmomatic` will produces four fastq files: forward\_paired, forward\_unpaired, reverse\_paired, reverse\_unpaired, but we will only use the paired data in alignment (by HISAT2) + - Currently, the `ABI_SOLID` sequencer is not supported. + - For paired-end layout, `Trimmomatic` will produces four fastq files: forward\_paired, forward\_unpaired, reverse\_paired, reverse\_unpaired, but we will only use the paired data in alignment (by HISAT2) + - `download_path` column represents where we can download the SRA files. - When using on server, make sure you use the `JAVA_TOOL_OPTIONS` environment to set the maximum memory usage like `export JAVA_TOOL_OPTIONS="-Xmx2g"` when running the toolkit. You can also check an example [here](example/example_script.sh). ## Tests diff --git a/rnannot/RNAseq_annotate.py b/rnannot/RNAseq_annotate.py index b193a40..94dccad 100755 --- a/rnannot/RNAseq_annotate.py +++ b/rnannot/RNAseq_annotate.py @@ -9,9 +9,10 @@ import gzip import shutil from itertools import islice +from six.moves import urllib -def run_pipeline(file, genome, outdir, name, layout, platform, model): +def run_pipeline(file, genome, outdir, name, layout, platform, model, download_link): # create the output folder output_prefix = path.join(outdir, name) os.mkdir(output_prefix) @@ -32,6 +33,10 @@ def run_pipeline(file, genome, outdir, name, layout, platform, model): sra_file_name = path.basename(file) genome_file_name = path.basename(genome) + # check if SRA file exist or download it first + if not path.exists(file): + urllib.request.urlretrieve(download_link, file) + # convert SRA file to fastq file(s) print('Unpacking the SRA file: {} ...'.format(file)) f_stdout = open( @@ -374,9 +379,10 @@ def read_sam_errors(file_path): platform_ind = col_names.index('Platform') model_ind = col_names.index('Model') layout_ind = col_names.index('LibraryLayout') + download_ind = col_names.index('download_path') print('Checking the input tsv file: {}'.format(args.input)) - for ind, name in zip([run_ind, platform_ind, model_ind, layout_ind], - ['Run', 'Platform', 'Model', 'LibraryLayout']): + for ind, name in zip([run_ind, platform_ind, model_ind, layout_ind, download_ind], + ['Run', 'Platform', 'Model', 'LibraryLayout', 'download_path']): if ind == -1: print('{} column is missing in input tsv file.'.format(name)) exit(1) @@ -384,14 +390,16 @@ def read_sam_errors(file_path): platforms = [] models = [] layouts = [] + download_links = [] for line in f: temp = line.rstrip('\n').split('\t') runs.append(temp[run_ind]) platforms.append(temp[platform_ind]) models.append(temp[model_ind]) layouts.append(temp[layout_ind]) + download_links.append(temp[download_ind]) files_for_merge = [] - for run, platform, model, layout in zip(runs, platforms, models, layouts): + for run, platform, model, layout, download_link in zip(runs, platforms, models, layouts, download_links): print('Processing the file: {}'.format(run)) if not path.isabs(run): run = path.abspath(run) @@ -403,7 +411,9 @@ def read_sam_errors(file_path): name=run_file_name, layout=layout, platform=platform, - model=model) + model=model, + download_link=download_link + ) if return_status: files_for_merge.append( path.join(args.outdir, args.name, run_file_name, 'output.bam')) diff --git a/rnannot/download_sra_metadata.py b/rnannot/download_sra_metadata.py new file mode 100644 index 0000000..71cf9a5 --- /dev/null +++ b/rnannot/download_sra_metadata.py @@ -0,0 +1,23 @@ +import subprocess +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("-t","--taxid", help="your tax_id",type=str) +parser.add_argument("-o","--output", help="directory of output folder at, if not specified, use current folder",type=str) +args = parser.parse_args() +tax_id=args.taxid +print('Processing tax id:'+args.taxid) + +proc_esearch = subprocess.Popen(['esearch', '-db', 'taxonomy', '-query' , tax_id + '[uid]'], stdout=subprocess.PIPE) +proc_elink = subprocess.Popen(['elink', '-target', 'sra'], stdin=proc_esearch.stdout, stdout=subprocess.PIPE) +proc_efilter = subprocess.Popen(['efilter', '-query', 'rna seq[stra] AND Transcriptomic[src]'], stdin=proc_elink.stdout, stdout=subprocess.PIPE) +proc_efetch = subprocess.Popen(['efetch', '-format', 'runinfo', '-mode', 'xml'], stdin=proc_efilter.stdout, stdout=subprocess.PIPE) +proc_xtract = subprocess.Popen(['xtract', '-pattern', 'Row', '-def', 'N/A','-element', 'Run', 'ReleaseDate', 'LoadDate', 'spots', 'bases', 'spots_with_mates', 'avgLength', 'size_MB', 'download_path', 'Experiment', 'LibraryName', 'LibraryStrategy', 'LibrarySelection', 'LibrarySource', 'LibraryLayout', 'InsertSize', 'InsertDev', 'Platform', 'Model', 'SRAStudy', 'BioProject', 'Study_Pubmed_id', 'ProjectID', 'Sample', 'BioSample', 'SampleType', 'TaxID', 'ScientificName', 'SampleName', 'Sex', 'Tumor', 'Submission', 'Consent', 'RunHash', 'ReadHash' ], stdin=proc_efetch.stdout, stdout=subprocess.PIPE) +if args.output: + with open(args.output, 'w') as f: + f.write('\t'.join('Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,Sex,Tumor,Submission,Consent,RunHash,ReadHash'.split(',')) + '\n') + f.write(proc_xtract.stdout.read().decode('utf-8') ) +else: + with open(args.taxid+'.tsv', 'w') as f: + f.write('\t'.join('Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,Sex,Tumor,Submission,Consent,RunHash,ReadHash'.split(',')) + '\n') + f.write(proc_xtract.stdout.read().decode('utf-8') ) \ No newline at end of file diff --git a/setup.py b/setup.py index e4780ff..70d1996 100644 --- a/setup.py +++ b/setup.py @@ -101,8 +101,9 @@ setup( name='rnannot', version='0.0.1', + install_requires=['six'], packages=find_packages('.'), - scripts=['rnannot/RNAseq_annotate.py'], + scripts=['rnannot/RNAseq_annotate.py','rnannot/download_sra_metadata.py'], include_package_data=True, author='Yi Hsiao', author_email='hsiaoyi0504@gmail.com',