Skip to content

Commit

Permalink
Merge pull request #19 from NAL-i5K/download_SRA_files
Browse files Browse the repository at this point in the history
Download sra files
  • Loading branch information
tony006469 authored Oct 15, 2018
2 parents 961db58 + 51997a5 commit b040358
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 9 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,14 @@ optional arguments:
## Notes
- The input tsv should have at least four columns, including `Run`, `Platform`, `Model`, and `LibraryLayout` (header must be presented).
- The input tsv should have at least five columns, including `Run`, `Platform`, `Model`, `LibraryLayout` (header must be presented), and `download_path`.
- `Run` column represents the paths to the SRA files. You can use either relative path to your current directory or absolute path. To make less confusion, we recommned to use absolute path.
- `Platform` column represents the sequencer's brand. We will recognize `ILLUMINA` and `ABI_SOLID` (although we will not process `ABI_SOLID`) in this field, because it determines the adapters used in the pipeline with `Model`.
- `Model` column represents the sequencer's model. We will recongize `Illumina HiSeq ...`, `Illumina MiSeq ...`, and `Illumina Genome Analyzer II ...`, because it determines the adapters used in the pipeline with `Platform`. Don't forget that if you have any space in the name of the model. You need to escape them using quoting such as `"Illumina Hiseq 2000"` or back slash.
- `LibraryLayout` column represents what's the strategy of RNA-Seq experiment. It can be only `SINGLE` or `PAIRED`.
- Currently, the `ABI_SOLID` sequencer is not supported.
- For paired-end layout, `Trimmomatic` will produces four fastq files: forward\_paired, forward\_unpaired, reverse\_paired, reverse\_unpaired, but we will only use the paired data in alignment (by HISAT2)
- Currently, the `ABI_SOLID` sequencer is not supported.
- For paired-end layout, `Trimmomatic` will produces four fastq files: forward\_paired, forward\_unpaired, reverse\_paired, reverse\_unpaired, but we will only use the paired data in alignment (by HISAT2)
- `download_path` column represents where we can download the SRA files.
- When using on server, make sure you use the `JAVA_TOOL_OPTIONS` environment to set the maximum memory usage like `export JAVA_TOOL_OPTIONS="-Xmx2g"` when running the toolkit. You can also check an example [here](example/example_script.sh).
## Tests
Expand Down
20 changes: 15 additions & 5 deletions rnannot/RNAseq_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
import gzip
import shutil
from itertools import islice
from six.moves import urllib


def run_pipeline(file, genome, outdir, name, layout, platform, model):
def run_pipeline(file, genome, outdir, name, layout, platform, model, download_link):
# create the output folder
output_prefix = path.join(outdir, name)
os.mkdir(output_prefix)
Expand All @@ -32,6 +33,10 @@ def run_pipeline(file, genome, outdir, name, layout, platform, model):
sra_file_name = path.basename(file)
genome_file_name = path.basename(genome)

# check if SRA file exist or download it first
if not path.exists(file):
urllib.request.urlretrieve(download_link, file)

# convert SRA file to fastq file(s)
print('Unpacking the SRA file: {} ...'.format(file))
f_stdout = open(
Expand Down Expand Up @@ -374,24 +379,27 @@ def read_sam_errors(file_path):
platform_ind = col_names.index('Platform')
model_ind = col_names.index('Model')
layout_ind = col_names.index('LibraryLayout')
download_ind = col_names.index('download_path')
print('Checking the input tsv file: {}'.format(args.input))
for ind, name in zip([run_ind, platform_ind, model_ind, layout_ind],
['Run', 'Platform', 'Model', 'LibraryLayout']):
for ind, name in zip([run_ind, platform_ind, model_ind, layout_ind, download_ind],
['Run', 'Platform', 'Model', 'LibraryLayout', 'download_path']):
if ind == -1:
print('{} column is missing in input tsv file.'.format(name))
exit(1)
runs = []
platforms = []
models = []
layouts = []
download_links = []
for line in f:
temp = line.rstrip('\n').split('\t')
runs.append(temp[run_ind])
platforms.append(temp[platform_ind])
models.append(temp[model_ind])
layouts.append(temp[layout_ind])
download_links.append(temp[download_ind])
files_for_merge = []
for run, platform, model, layout in zip(runs, platforms, models, layouts):
for run, platform, model, layout, download_link in zip(runs, platforms, models, layouts, download_links):
print('Processing the file: {}'.format(run))
if not path.isabs(run):
run = path.abspath(run)
Expand All @@ -403,7 +411,9 @@ def read_sam_errors(file_path):
name=run_file_name,
layout=layout,
platform=platform,
model=model)
model=model,
download_link=download_link
)
if return_status:
files_for_merge.append(
path.join(args.outdir, args.name, run_file_name, 'output.bam'))
Expand Down
23 changes: 23 additions & 0 deletions rnannot/download_sra_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import subprocess
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-t","--taxid", help="your tax_id",type=str)
parser.add_argument("-o","--output", help="directory of output folder at, if not specified, use current folder",type=str)
args = parser.parse_args()
tax_id=args.taxid
print('Processing tax id:'+args.taxid)

proc_esearch = subprocess.Popen(['esearch', '-db', 'taxonomy', '-query' , tax_id + '[uid]'], stdout=subprocess.PIPE)
proc_elink = subprocess.Popen(['elink', '-target', 'sra'], stdin=proc_esearch.stdout, stdout=subprocess.PIPE)
proc_efilter = subprocess.Popen(['efilter', '-query', 'rna seq[stra] AND Transcriptomic[src]'], stdin=proc_elink.stdout, stdout=subprocess.PIPE)
proc_efetch = subprocess.Popen(['efetch', '-format', 'runinfo', '-mode', 'xml'], stdin=proc_efilter.stdout, stdout=subprocess.PIPE)
proc_xtract = subprocess.Popen(['xtract', '-pattern', 'Row', '-def', 'N/A','-element', 'Run', 'ReleaseDate', 'LoadDate', 'spots', 'bases', 'spots_with_mates', 'avgLength', 'size_MB', 'download_path', 'Experiment', 'LibraryName', 'LibraryStrategy', 'LibrarySelection', 'LibrarySource', 'LibraryLayout', 'InsertSize', 'InsertDev', 'Platform', 'Model', 'SRAStudy', 'BioProject', 'Study_Pubmed_id', 'ProjectID', 'Sample', 'BioSample', 'SampleType', 'TaxID', 'ScientificName', 'SampleName', 'Sex', 'Tumor', 'Submission', 'Consent', 'RunHash', 'ReadHash' ], stdin=proc_efetch.stdout, stdout=subprocess.PIPE)
if args.output:
with open(args.output, 'w') as f:
f.write('\t'.join('Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,Sex,Tumor,Submission,Consent,RunHash,ReadHash'.split(',')) + '\n')
f.write(proc_xtract.stdout.read().decode('utf-8') )
else:
with open(args.taxid+'.tsv', 'w') as f:
f.write('\t'.join('Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,Sex,Tumor,Submission,Consent,RunHash,ReadHash'.split(',')) + '\n')
f.write(proc_xtract.stdout.read().decode('utf-8') )
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,9 @@
setup(
name='rnannot',
version='0.0.1',
install_requires=['six'],
packages=find_packages('.'),
scripts=['rnannot/RNAseq_annotate.py'],
scripts=['rnannot/RNAseq_annotate.py','rnannot/download_sra_metadata.py'],
include_package_data=True,
author='Yi Hsiao',
author_email='[email protected]',
Expand Down

0 comments on commit b040358

Please sign in to comment.