Skip to content

Commit

Permalink
Merge remote-tracking branch 'tamu-origin/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
Cory Maughmer committed May 21, 2020
2 parents 1b24941 + 5f5cfc8 commit 70a77cc
Show file tree
Hide file tree
Showing 61 changed files with 34,184 additions and 307 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,5 @@ tools/spanin/debug

## proximity
tools/proximity/*.db
tools/proximity/termHits.txt
tools/proximity/termHits.txt
tools/proximity/test-data/prox/lambda_NRBLAST.gff3
3 changes: 3 additions & 0 deletions tool_conf.xml
Original file line number Diff line number Diff line change
Expand Up @@ -148,13 +148,16 @@
</section>
<section id="5477aa4e-90ce-4672-85d1-41b08f506097" name="CPT: Sequence Search">
<tool file="cpt2/galaxy-tools/tools/fasta/fasta_regex_search.xml" label="fasta" />
<tool file="cpt2/galaxy-tools/tools/external/pattern_finder.xml"/>
<tool file="cpt2/galaxy-tools/tools/external/motif_locator.xml"/>
</section>
<section id="b751a1c4-9cb9-4a41-8f3d-68c2b440088f" name="CPT: Blast Analysis / Filtering">
<tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml"/>
<tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml"/>
<tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml"/>
<tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml"/>
<tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml"/>
<tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_makeblastdb.xml"/>
<tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_blastn_taxID.xml" label="experimental"/>
<tool file="cpt2/galaxy-tools/tools/blast/list_taxids.xml"/>
<tool file="cpt2/galaxy-tools/tools/blast/blasttab_add_dice_column.xml"/>
Expand Down
18 changes: 9 additions & 9 deletions tools/blast_latest/tools/ncbi_blast_plus/ncbi_makeblastdb.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="ncbi_makeblastdb" name="NCBI BLAST+ makeblastdb" version="@WRAPPER_VERSION@">
<tool id="ncbi_makeblastdb_latest" name="NCBI BLAST+ makeblastdb (CPT Latest)" version="@WRAPPER_VERSION@">
<description>Make BLAST database</description>
<macros>
<token name="@BINARY@">makeblastdb</token>
Expand Down Expand Up @@ -29,6 +29,7 @@ $hash_index
-in -
#if $title:
-title '${title}'
-blastdb_version $verNum
#else:
##Would default to being based on the cryptic Galaxy filenames, which is unhelpful
-title 'BLAST Database'
Expand All @@ -49,9 +50,8 @@ $hash_index
## --------------------------------------------------------------------
#if $tax.taxselect == 'id':
-taxid $tax.taxid
## TODO - Can we use a tabular file for the taxonomy mapping?
## #else if $tax.taxselect == 'map':
## -taxid_map $tax.taxmap
#else if $tax.taxselect == 'map':
-taxid_map $tax.taxmap
#end if
## --------------------------------------------------------------------
## Capture the stdout log information to the primary file (plain text):
Expand All @@ -68,6 +68,10 @@ $hash_index
<!-- Note this is a mandatory parameter - default should be most recent FASTA file -->
<param name="input_file" argument="-in" type="data" multiple="true" optional="false" format="fasta,fasta.gz" label="Input FASTA files(s)" help="One or more FASTA files" />
<param argument="-title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" />
<param name="verNum" type="select" label="Blast DB version number">
<option value="4">Version 4 (Support for older Blast versions, no TaxID support)</option>
<option value="5">Version 5 (Newer Blast only, supports TaxID mapping)</option>
</param>
<param argument="-parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="false" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '|' symbols" />
<param argument="-hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." />
<!-- SEQUENCE MASKING OPTIONS -->
Expand All @@ -79,20 +83,16 @@ $hash_index
<param name="taxselect" type="select" label="Taxonomy options">
<option value="">Do not assign a Taxonomy ID to the sequences</option>
<option value="id">Assign the same Taxonomy ID to all the sequences</option>
<!--
<option value="map">Supply text file mapping sequence IDs to taxnomy IDs</option>
-->
<option value="map">Supply text file mapping sequence IDs to taxnomy IDs (Parse Sequence Identifiers must be true)</option>
</param>
<when value="">
</when>
<when value="id">
<param argument="-taxid" type="integer" min="0" value="" label="NCBI taxonomy ID" help="Integer &gt;=0, e.g. 9606 for Homo sapiens" />
</when>
<!-- TODO: File format?
<when value="map">
<param name="taxmap" argument="-taxid_map" type="data" format="txt" label="Seq ID : Tax ID mapping file" help="Format: SequenceId TaxonomyId" />
</when>
-->
</conditional>
</inputs>
<outputs>
Expand Down
25 changes: 25 additions & 0 deletions tools/efetch/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Overview
This is a modified version of the efetch tool from NCBI and Galaxy. It is a reduced version of efetch, in that it limits the databases and return types. This can be expanded in the future; however, it is currently (05.14.2020) _just_ able to access the protein and nucleotide databases. GenBank and FASTA files are the only return file types.

The power of this tool is in fetching large numbers of files. There are built-in sleep functions that delay large queries to NCBI, and the tool attempts to resubmit GET requests if an HTTP error occurs.

Because we do not _want_ to cause too much of a ruckus if multiple users want to use this tool, I believe that in Galaxy we should limit this tool to one concurrent job. It is capable of being abused, and since NCBI __will__ block by IP, I think it's worth our time and effort to ensure we do not overwhelm their systems and do not end up on the receiving end of a ban. Helena from the Galaxy team runs their efetch tools with the following Galaxy config:

``` xml
<destination id="entrez" runner="local">
</destination>
<limit type="concurrent_jobs" id="entrez">1</limit>
<tools>
<tool id="ncbi.eutils.efetch" destination="entrez" />
<tool id="ncbi.eutils.esearch" destination="entrez" />
<tool id="ncbi.eutils.epost" destination="entrez" />
<tool id="ncbi.eutils.elink" destination="entrez" />
<tool id="ncbi.eutils.einfo" destination="entrez" />
<tool id="ncbi.eutils.esummary" destination="entrez" />
</tools>
```

# Current Implementation
* Databases: `protein`, `nucleotide`
* Return Types: `fasta`, `genbank`
* Output formats: `fasta`, `genbank`, `multifasta`, `multigenbank`
62 changes: 44 additions & 18 deletions tools/efetch/cpt-efetch.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
#!/usr/bin/env python

import sys
print(sys.version)
from time import sleep
import os
from os import path
from Bio import Entrez
from Bio import SeqIO
from urllib.error import HTTPError
import argparse
from helperFunctions import awk_files
from helperFunctions import awk_files, is_dir


Entrez.email = "[email protected]"
#Entrez.email = "[email protected]"

class CPTEfetch:
""" Object that has built in functions to retrieve data from NCBI. Initially constructued to retreive GB and FA files from the nuccore and protein NCBI databases """
Expand All @@ -20,6 +21,7 @@ def __init__(self,email,acc,db,ret_type):
self.acc = acc
self.db = db
self.ret_type = ret_type
Entrez.email = self.email


def __repr__(self):
Expand Down Expand Up @@ -61,7 +63,7 @@ def write_record(self,name,st,galaxy=True):
##### Arguments
parser = argparse.ArgumentParser(description="CPT's very own modified Efetch")

parser.add_argument("email",
parser.add_argument("--email",
type=str,
help="Entrez Required Email") # current place holder until I determine how best to use the current user's email from Galaxy

Expand Down Expand Up @@ -92,12 +94,12 @@ def write_record(self,name,st,galaxy=True):

parser.add_argument("--sleepy",
type=int,
default=20,
default=30,
help="Amount to delay a query to NCBI by")

parser.add_argument("--data",
type=argparse.FileType("w+"),
default="data_accs.txt")
type=lambda x: is_dir(parser,x,"results"),
default="results/data_accs.txt")

"""
parser.add_argument("--multi_output",
Expand All @@ -109,30 +111,53 @@ def write_record(self,name,st,galaxy=True):
help="user to run galaxy like outputs")


parser.add_argument("--data_name",
type=str,
default="data_accs.txt",
help="name of acc file")

args = parser.parse_args()
#print(args)
# Write individual records
if not os.path.exists("results"):
os.mkdir("results")
#if not os.path.exists("results"):
#os.mkdir("results")
print(os.getcwd())
path = os.path.join("results",args.data_name)

with args.data as f:
with open(path,"w+") as f:
f.writelines("accessions: "+str(args.input)+"\n")

if args.galaxy_on:
os.chdir("results")
#if args.galaxy_on:
# os.chdir("results")

if "__at__" in args.email:
splits = args.email.split("__at__")
email = splits[0]+"@"+splits[1]
else:
elif "@" in args.email:
email = args.email
elif args.email is None:
raise Exception("EMAIL IS NECESSARY TO USE TOOL")

# Join together admin emails to append to hopefully catch NCBI's eye if abuse occurs
admins = ["[email protected]","[email protected]","[email protected]"]
sep = ";"
admins.insert(0,email)
emails = sep.join(admins)

print("Logged in as: "+email)
count = 0 # add a counter, so, it will do a two minute delay every 20th query, to attempt to not bother NCBI with load.
path = os.path.join("results","output")
for acc in args.input:
c = CPTEfetch(email, acc, args.db, args.ret_type)
count += 1
if count % 20 == 0:
sleep(120)
pass
else:
pass
c = CPTEfetch(emails, acc, args.db, args.ret_type)
print(c)
if args.galaxy_on:
c.write_record(st=args.sleepy,name="output",galaxy=True)
c.write_record(st=args.sleepy,name=path,galaxy=True)
else:
c.write_record(st=args.sleepy,name="data_",galaxy=False)

Expand All @@ -141,7 +166,8 @@ def write_record(self,name,st,galaxy=True):
if args.galaxy_on:
#awk_files("DAT",output=f"outputMulti.{str(args.ret_type)}")
#awk_files(str(args.ret_type),output=f"outputMulti.{str(args.ret_type)}")
awk_files(str(args.ret_type),output="output",galaxy=True)
awk_files(str(args.ret_type),output=path,galaxy=True)
else:
awk_files(str(args.ret_type),output="outputMulti"+str(args.ret_type))

print("---finish---")
print(os.getcwd())
25 changes: 18 additions & 7 deletions tools/efetch/cpt-efetch.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
<import>macros.xml</import>
<!-- <import>cpt-macros.xml</import>-->
</macros>
<expand macro="requirements">
</expand>
<command detect_errors="aggressive"><![CDATA[
python $__tool_directory__/cpt-efetch.py
email $email
--email $email
--input $input
--db $db
--ret_format $ret_format
--ret_type $ret_type
--galaxy_on
--data $data
--sleepy $sleepy
]]></command>
Expand All @@ -35,22 +36,32 @@ email $email
<param name="sleepy" type="integer" value="20" label="Amount to slow request to NCBI by; increase if errors occur"/>
</inputs>
<outputs>
<data name="data" format="txt">
<data name="output" format="fasta">
<discover_datasets pattern="__designation_and_ext__" ext="fasta" directory="results" visible="true" assign_primary_output="true"/>
</data>
</outputs>
<tests>
<test>
<param name="email" value="[email protected]"/>
<param name="input" value="NC_001416.1"/>
<param name="db" value="nuccore"/>
<param name="ret_type" value="fasta"/>
<param name="ret_format" value="individual"/>
<output name="output">
<discovered_dataset designation="__designation_and_ext__" ftype="fasta"/>
</output>
</test>
</tests>
<help><![CDATA[
** WARNING : THIS IS AN ALPHA VERSION OF THE TOOL. IT DOES NOT WORK AS DESCRIBED IN THE CURRENT STATE. **
**WARNING : THIS IS AN ALPHA VERSION OF THE TOOL. IT DOES NOT WORK AS DESCRIBED IN THE CURRENT STATE.**
INPUT : An accession, or set of accessions separated by new line.
** aside: ** Current version has manual entry of email. This is due to the following (from NCBI): To make use of NCBI's E-utilities, NCBI requires you to specify your email address with each request. In case of excessive usage of the E-utilities, NCBI will attempt to contact a user at the email address provided before blocking access to the E-utilities.
**aside:** Current version has manual entry of email.
OUTPUT : Requested file type (genbank or fasta) individually and/or combined together.
@DISCLAIMER@
]]></help>
<citations>
<citation type="bibtex">
Expand Down
10 changes: 10 additions & 0 deletions tools/efetch/helperFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@ def pass_flag(input,flag="--output"):
except subprocess.TimeoutExpired as err:
print(err)

def redirect(input):
    """Placeholder with no implementation: accepts ``input`` and does nothing.

    Returns None implicitly. The parameter name shadows the ``input`` builtin
    inside this function.
    """
    pass


def is_dir(parser, arg, make_dir):
    """Create (or truncate) the file at ``arg``, creating ``make_dir`` first if needed.

    Parameters:
        parser: unused by this function; kept so existing callers that pass
            their argparse parser keep working.
        arg: path of the file to create. An existing file is truncated,
            because it is opened in ``"w+"`` mode.
        make_dir: directory to create when ``arg`` does not exist yet.

    Returns:
        None. NOTE(review): when this is used as an argparse ``type=``
        callable, the parsed value will therefore be None -- confirm whether
        callers expect the path back instead.

    Raises:
        OSError: if the directory or the file cannot be created.
    """
    if not os.path.exists(arg):
        # The directory may already exist while the file does not (e.g. on a
        # second run); a bare os.mkdir() would raise FileExistsError here.
        os.makedirs(make_dir, exist_ok=True)
    # "w+" both creates a missing file and truncates an existing one, so a
    # single open covers both cases. The context manager closes the handle
    # instead of leaking it.
    with open(arg, "w+"):
        pass

if __name__ == "__main__":
#cat_files("fasta")
Expand Down
Loading

0 comments on commit 70a77cc

Please sign in to comment.