Merge pull request #19 from NAL-i5K/download_SRA_files

Download sra files
NAL-i5K · Oct 15, 2018 · b040358 · b040358
2 parents 961db58 + 51997a5
commit b040358
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -51,13 +51,14 @@ optional arguments:
 
 ## Notes
 
-- The input tsv should have at least four columns, including `Run`, `Platform`, `Model`, and `LibraryLayout` (header must be presented).
+- The input tsv should have at least five columns, including `Run`, `Platform`, `Model`, `LibraryLayout` (header must be presented), and `download_path`.
   - `Run` column represents the paths to the SRA files. You can use either relative path to your current directory or absolute path. To make less confusion, we recommned to use absolute path.
   - `Platform` column represents the sequencer's brand. We will recognize `ILLUMINA` and `ABI_SOLID` (although we will not process `ABI_SOLID`) in this field, because it determines the adapters used in the pipeline with `Model`.
   - `Model` column represents the sequencer's model. We will recongize `Illumina HiSeq ...`, `Illumina MiSeq ...`, and `Illumina Genome Analyzer II ...`, because it determines the adapters used in the pipeline with `Platform`. Don't forget that if you have any space in the name of the model. You need to escape them using quoting such as `"Illumina Hiseq 2000"` or back slash. 
   - `LibraryLayout` column represents what's the strategy of RNA-Seq experiment. It can be only `SINGLE` or `PAIRED`.
-  - Currently, the `ABI_SOLID` sequencer is not supported.
-  - For paired-end layout, `Trimmomatic` will produces four fastq files: forward\_paired, forward\_unpaired, reverse\_paired, reverse\_unpaired, but we will only use the paired data in alignment (by HISAT2)
+    - Currently, the `ABI_SOLID` sequencer is not supported.
+    - For paired-end layout, `Trimmomatic` will produces four fastq files: forward\_paired, forward\_unpaired, reverse\_paired, reverse\_unpaired, but we will only use the paired data in alignment (by HISAT2)
+  - `download_path` column represents where we can download the SRA files.
 - When using on server, make sure you use the `JAVA_TOOL_OPTIONS` environment to set the maximum memory usage like `export JAVA_TOOL_OPTIONS="-Xmx2g"` when running the toolkit. You can also check an example [here](example/example_script.sh).
 
 ## Tests

diff --git a/rnannot/RNAseq_annotate.py b/rnannot/RNAseq_annotate.py
@@ -9,9 +9,10 @@
 import gzip
 import shutil
 from itertools import islice
+from six.moves import urllib
 
 
-def run_pipeline(file, genome, outdir, name, layout, platform, model):
+def run_pipeline(file, genome, outdir, name, layout, platform, model, download_link):
     # create the output folder
     output_prefix = path.join(outdir, name)
     os.mkdir(output_prefix)
@@ -32,6 +33,10 @@ def run_pipeline(file, genome, outdir, name, layout, platform, model):
     sra_file_name = path.basename(file)
     genome_file_name = path.basename(genome)
 
+    # check if SRA file exist or download it first
+    if not path.exists(file):
+        urllib.request.urlretrieve(download_link, file)
+
     # convert SRA file to fastq file(s)
     print('Unpacking the SRA file: {} ...'.format(file))
     f_stdout = open(
@@ -374,24 +379,27 @@ def read_sam_errors(file_path):
         platform_ind = col_names.index('Platform')
         model_ind = col_names.index('Model')
         layout_ind = col_names.index('LibraryLayout')
+        download_ind = col_names.index('download_path')
         print('Checking the input tsv file: {}'.format(args.input))
-        for ind, name in zip([run_ind, platform_ind, model_ind, layout_ind],
-                             ['Run', 'Platform', 'Model', 'LibraryLayout']):
+        for ind, name in zip([run_ind, platform_ind, model_ind, layout_ind, download_ind],
+                             ['Run', 'Platform', 'Model', 'LibraryLayout', 'download_path']):
             if ind == -1:
                 print('{} column is missing in input tsv file.'.format(name))
                 exit(1)
         runs = []
         platforms = []
         models = []
         layouts = []
+        download_links = []
         for line in f:
             temp = line.rstrip('\n').split('\t')
             runs.append(temp[run_ind])
             platforms.append(temp[platform_ind])
             models.append(temp[model_ind])
             layouts.append(temp[layout_ind])
+            download_links.append(temp[download_ind])
     files_for_merge = []
-    for run, platform, model, layout in zip(runs, platforms, models, layouts):
+    for run, platform, model, layout, download_link in zip(runs, platforms, models, layouts, download_links):
         print('Processing the file: {}'.format(run))
         if not path.isabs(run):
             run = path.abspath(run)
@@ -403,7 +411,9 @@ def read_sam_errors(file_path):
             name=run_file_name,
             layout=layout,
             platform=platform,
-            model=model)
+            model=model,
+            download_link=download_link
+            )
         if return_status:
             files_for_merge.append(
                 path.join(args.outdir, args.name, run_file_name, 'output.bam'))

diff --git a/rnannot/download_sra_metadata.py b/rnannot/download_sra_metadata.py
@@ -0,0 +1,23 @@
+import subprocess
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-t","--taxid", help="your tax_id",type=str)
+parser.add_argument("-o","--output", help="directory of output folder at, if not specified, use current folder",type=str)                        
+args = parser.parse_args()
+tax_id=args.taxid
+print('Processing tax id:'+args.taxid)
+
+proc_esearch = subprocess.Popen(['esearch', '-db', 'taxonomy', '-query' , tax_id + '[uid]'], stdout=subprocess.PIPE)
+proc_elink = subprocess.Popen(['elink', '-target', 'sra'], stdin=proc_esearch.stdout, stdout=subprocess.PIPE)
+proc_efilter = subprocess.Popen(['efilter', '-query', 'rna seq[stra] AND Transcriptomic[src]'], stdin=proc_elink.stdout, stdout=subprocess.PIPE)
+proc_efetch = subprocess.Popen(['efetch', '-format', 'runinfo', '-mode', 'xml'], stdin=proc_efilter.stdout, stdout=subprocess.PIPE)
+proc_xtract = subprocess.Popen(['xtract', '-pattern', 'Row', '-def', 'N/A','-element', 'Run', 'ReleaseDate', 'LoadDate', 'spots', 'bases', 'spots_with_mates', 'avgLength', 'size_MB', 'download_path', 'Experiment', 'LibraryName', 'LibraryStrategy', 'LibrarySelection', 'LibrarySource', 'LibraryLayout', 'InsertSize', 'InsertDev', 'Platform', 'Model', 'SRAStudy', 'BioProject', 'Study_Pubmed_id', 'ProjectID', 'Sample', 'BioSample', 'SampleType', 'TaxID', 'ScientificName', 'SampleName', 'Sex', 'Tumor', 'Submission', 'Consent', 'RunHash', 'ReadHash' ], stdin=proc_efetch.stdout, stdout=subprocess.PIPE)
+if args.output:
+    with open(args.output, 'w') as f:
+         f.write('\t'.join('Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,Sex,Tumor,Submission,Consent,RunHash,ReadHash'.split(',')) + '\n')
+         f.write(proc_xtract.stdout.read().decode('utf-8') )         
+else:
+    with open(args.taxid+'.tsv', 'w') as f:
+         f.write('\t'.join('Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,Sex,Tumor,Submission,Consent,RunHash,ReadHash'.split(',')) + '\n')
+         f.write(proc_xtract.stdout.read().decode('utf-8') )
diff --git a/setup.py b/setup.py
@@ -101,8 +101,9 @@
 setup(
     name='rnannot',
     version='0.0.1',
+    install_requires=['six'],
     packages=find_packages('.'),
-    scripts=['rnannot/RNAseq_annotate.py'],
+    scripts=['rnannot/RNAseq_annotate.py','rnannot/download_sra_metadata.py'],
     include_package_data=True,
     author='Yi Hsiao',
     author_email='[email protected]',