Merge remote-tracking branch 'tamu-origin/master'

TAMU-CPT · May 21, 2020 · 70a77cc · 70a77cc
2 parents 1b24941 + 5f5cfc8
commit 70a77cc
Show file tree

Hide file tree

Showing 61 changed files with 34,184 additions and 307 deletions.
diff --git a/.gitignore b/.gitignore
@@ -153,4 +153,5 @@ tools/spanin/debug
 
 ## proximity
 tools/proximity/*.db
-tools/proximity/termHits.txt
+tools/proximity/termHits.txt
+tools/proximity/test-data/prox/lambda_NRBLAST.gff3
diff --git a/tool_conf.xml b/tool_conf.xml
@@ -148,13 +148,16 @@
 	</section>
 	<section id="5477aa4e-90ce-4672-85d1-41b08f506097" name="CPT: Sequence Search">
 		<tool file="cpt2/galaxy-tools/tools/fasta/fasta_regex_search.xml" label="fasta" />
+                <tool file="cpt2/galaxy-tools/tools/external/pattern_finder.xml"/>
+		<tool file="cpt2/galaxy-tools/tools/external/motif_locator.xml"/>     
 	</section>
 	<section id="b751a1c4-9cb9-4a41-8f3d-68c2b440088f" name="CPT: Blast Analysis / Filtering">
                 <tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml"/>
                 <tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml"/>
                 <tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml"/>
                 <tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml"/>
     	        <tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml"/>
+                <tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_makeblastdb.xml"/>
                 <tool file="cpt2/galaxy-tools/tools/blast_latest/tools/ncbi_blast_plus/ncbi_blastn_taxID.xml" label="experimental"/>
 		<tool file="cpt2/galaxy-tools/tools/blast/list_taxids.xml"/>
 		<tool file="cpt2/galaxy-tools/tools/blast/blasttab_add_dice_column.xml"/>

diff --git a/tools/blast_latest/tools/ncbi_blast_plus/ncbi_makeblastdb.xml b/tools/blast_latest/tools/ncbi_blast_plus/ncbi_makeblastdb.xml
@@ -1,4 +1,4 @@
-<tool id="ncbi_makeblastdb" name="NCBI BLAST+ makeblastdb" version="@WRAPPER_VERSION@">
+<tool id="ncbi_makeblastdb_latest" name="NCBI BLAST+ makeblastdb (CPT Latest)" version="@WRAPPER_VERSION@">
     <description>Make BLAST database</description>
     <macros>
         <token name="@BINARY@">makeblastdb</token>
@@ -29,6 +29,7 @@ $hash_index
 -in -
 #if $title:
 -title '${title}'
+-blastdb_version $verNum
 #else:
 ##Would default to being based on the cryptic Galaxy filenames, which is unhelpful
 -title 'BLAST Database'
@@ -49,9 +50,8 @@ $hash_index
 ## --------------------------------------------------------------------
 #if $tax.taxselect == 'id':
 -taxid $tax.taxid
-## TODO - Can we use a tabular file for the taxonomy mapping?
-## #else if $tax.taxselect == 'map':
-## -taxid_map $tax.taxmap
+#else if $tax.taxselect == 'map':
+ -taxid_map $tax.taxmap
 #end if
 ## --------------------------------------------------------------------
 ## Capture the stdout log information to the primary file (plain text):
@@ -68,6 +68,10 @@ $hash_index
         <!-- Note this is a mandatory parameter - default should be most recent FASTA file -->
         <param name="input_file" argument="-in" type="data" multiple="true" optional="false" format="fasta,fasta.gz" label="Input FASTA files(s)" help="One or more FASTA files" />
         <param argument="-title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" />
+        <param name="verNum" type="select" label="Blast DB version number">
+                <option value="4">Version 4 (Support for older Blast versions, no TaxID support)</option>
+                <option value="5">Version 5 (Newer Blast only, supports TaxID mapping)</option>
+        </param>
         <param argument="-parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="false" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '|' symbols" />
         <param argument="-hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." />
         <!-- SEQUENCE MASKING OPTIONS -->
@@ -79,20 +83,16 @@ $hash_index
             <param name="taxselect" type="select" label="Taxonomy options">
                 <option value="">Do not assign a Taxonomy ID to the sequences</option>
                 <option value="id">Assign the same Taxonomy ID to all the sequences</option>
-                <!--
-                <option value="map">Supply text file mapping sequence IDs to taxnomy IDs</option>
-                -->
+                <option value="map">Supply text file mapping sequence IDs to taxnomy IDs (Parse Sequence Identifiers must be true)</option> 
             </param>
             <when value="">
             </when>
             <when value="id">
                 <param argument="-taxid" type="integer" min="0" value="" label="NCBI taxonomy ID" help="Integer &gt;=0, e.g. 9606 for Homo sapiens" />
             </when>
-            <!-- TODO: File format?
             <when value="map">
                 <param name="taxmap" argument="-taxid_map" type="data" format="txt" label="Seq ID : Tax ID mapping file" help="Format: SequenceId TaxonomyId" />
             </when>
-            -->
         </conditional>
     </inputs>
     <outputs>

diff --git a/tools/efetch/README.md b/tools/efetch/README.md
@@ -0,0 +1,25 @@
+# Overview
+This is a modified version of the efetch tool from NCBI and Galaxy. It is a reduced version of efetch, in that it limits the databases and return types. This can be expanded in the future; however, it is currently (05.14.2020) _just_ able to access the protein and nucleotide databases. GenBank and FASTA files are the only return file types.
+
+The power of this tool is in fetching large numbers of files. There are built-in sleep functions that will delay large queries to NCBI, as well as attempt to resubmit GET requests if an HTTP error occurs.
+
+Because we do not _want_ to cause too much of a ruckus if multiple users want to use this tool, I believe we should limit it to one concurrent use in Galaxy. It is capable of being abused, and since NCBI __will__ block by IP, I think it's worth our time and effort to ensure we do not overwhelm their systems and do not end up on the receiving end of a ban. Helena from the Galaxy team runs their efetch tools with the following Galaxy config:
+
+``` xml
+<destination id="entrez" runner="local">
+</destination>
+<limit type="concurrent_jobs" id="entrez">1</limit>
+<tools>
+  <tool id="ncbi.eutils.efetch" destination="entrez" />
+  <tool id="ncbi.eutils.esearch" destination="entrez" />
+  <tool id="ncbi.eutils.epost" destination="entrez" />
+  <tool id="ncbi.eutils.elink" destination="entrez" />
+  <tool id="ncbi.eutils.einfo" destination="entrez" />
+  <tool id="ncbi.eutils.esummary" destination="entrez" />
+</tools>
+```
+
+# Current Implementation
+* Databases: `protein`, `nucleotide`
+* Return Types: `fasta`, `genbank`
+* Output formats: `fasta`, `genbank`, `multifasta`, `multigenbank`
diff --git a/tools/efetch/cpt-efetch.py b/tools/efetch/cpt-efetch.py
@@ -1,16 +1,17 @@
+#!/usr/bin/env python
+
 import sys
-print(sys.version)
 from time import sleep 
 import os
 from os import path
 from Bio import Entrez
 from Bio import SeqIO
 from urllib.error import HTTPError
 import argparse
-from helperFunctions import awk_files
+from helperFunctions import awk_files, is_dir
 
 
-Entrez.email = "[email protected]"
+#Entrez.email = "[email protected]"
 
 class CPTEfetch:
     """ Object that has built in functions to retrieve data from NCBI. Initially constructued to retreive GB and FA files from the nuccore and protein NCBI databases """
@@ -20,6 +21,7 @@ def __init__(self,email,acc,db,ret_type):
         self.acc = acc
         self.db = db
         self.ret_type = ret_type
+        Entrez.email = self.email
 
 
     def __repr__(self):
@@ -61,7 +63,7 @@ def write_record(self,name,st,galaxy=True):
         ##### Arguments
     parser = argparse.ArgumentParser(description="CPT's very own modified Efetch")
 
-    parser.add_argument("email",
+    parser.add_argument("--email",
                         type=str,
                         help="Entrez Required Email") # current place holder until I determine how best to use the current user's email from Galaxy
 
@@ -92,12 +94,12 @@ def write_record(self,name,st,galaxy=True):
 
     parser.add_argument("--sleepy",
                         type=int,
-                        default=20,
+                        default=30,
                         help="Amount to delay a query to NCBI by")
 
     parser.add_argument("--data",
-                        type=argparse.FileType("w+"),
-                        default="data_accs.txt")
+                        type=lambda x: is_dir(parser,x,"results"),
+                        default="results/data_accs.txt")
 
     """
     parser.add_argument("--multi_output",
@@ -109,30 +111,53 @@ def write_record(self,name,st,galaxy=True):
                         help="user to run galaxy like outputs")
 
 
+    parser.add_argument("--data_name",
+                        type=str,
+                        default="data_accs.txt",
+                        help="name of acc file")
+
     args = parser.parse_args()
     #print(args)
     # Write individual records
-    if not os.path.exists("results"):
-        os.mkdir("results")
+    #if not os.path.exists("results"):
+        #os.mkdir("results")
+    print(os.getcwd())
+    path = os.path.join("results",args.data_name)
 
-    with args.data as f:
+    with open(path,"w+") as f:
         f.writelines("accessions: "+str(args.input)+"\n")
 
-    if args.galaxy_on:
-        os.chdir("results")
-    
+    #if args.galaxy_on:
+    #    os.chdir("results")
+
     if "__at__" in args.email:
         splits = args.email.split("__at__")
         email = splits[0]+"@"+splits[1]
-    else:
+    elif "@" in args.email:
         email = args.email
+    elif args.email is None:
+        raise Exception("EMAIL IS NECESSARY TO USE TOOL")
+
+    #  Join together admin emails to append to hopefully catch NCBI's eye if abuse occurs
+    admins = ["[email protected]","[email protected]","[email protected]"]
+    sep = ";"
+    admins.insert(0,email)
+    emails = sep.join(admins)
 
     print("Logged in as: "+email)
+    count = 0 # add a counter, so, it will do a two minute delay every 20th query, to attempt to not bother NCBI with load.
+    path = os.path.join("results","output")
     for acc in args.input:
-        c = CPTEfetch(email, acc, args.db, args.ret_type)
+        count += 1
+        if count % 20 == 0:
+            sleep(120)
+            pass
+        else:
+            pass
+        c = CPTEfetch(emails, acc, args.db, args.ret_type)
         print(c)
         if args.galaxy_on:
-            c.write_record(st=args.sleepy,name="output",galaxy=True)
+            c.write_record(st=args.sleepy,name=path,galaxy=True)
         else:
             c.write_record(st=args.sleepy,name="data_",galaxy=False)
 
@@ -141,7 +166,8 @@ def write_record(self,name,st,galaxy=True):
         if args.galaxy_on:
             #awk_files("DAT",output=f"outputMulti.{str(args.ret_type)}")
             #awk_files(str(args.ret_type),output=f"outputMulti.{str(args.ret_type)}")
-            awk_files(str(args.ret_type),output="output",galaxy=True)
+            awk_files(str(args.ret_type),output=path,galaxy=True)
         else:
             awk_files(str(args.ret_type),output="outputMulti"+str(args.ret_type))
-
+    print("---finish---")
+    print(os.getcwd())
diff --git a/tools/efetch/cpt-efetch.xml b/tools/efetch/cpt-efetch.xml
@@ -4,15 +4,16 @@
         <import>macros.xml</import>
     <!--    <import>cpt-macros.xml</import>-->
     </macros>
+    <expand macro="requirements">
+    </expand>
     <command detect_errors="aggressive"><![CDATA[
 python $__tool_directory__/cpt-efetch.py
-email $email
+--email $email
 --input $input
 --db $db
 --ret_format $ret_format
 --ret_type $ret_type
 --galaxy_on 
---data $data
 --sleepy $sleepy
 
     ]]></command>
@@ -35,22 +36,32 @@ email $email
             <param name="sleepy" type="integer" value="20" label="Amount to slow request to NCBI by; increase if errors occur"/>
         </inputs>
         <outputs>
-            <data name="data" format="txt">
+            <data name="output" format="fasta">
+                <discover_datasets pattern="__designation_and_ext__" ext="fasta" directory="results" visible="true" assign_primary_output="true"/>
             </data>
         </outputs>
         <tests>
+            <test>
+                <param name="email" value="[email protected]"/>
+                <param name="input" value="NC_001416.1"/>
+                <param name="db" value="nuccore"/>
+                <param name="ret_type" value="fasta"/>
+                <param name="ret_format" value="individual"/>
+                <output name="output">
+                    <discovered_dataset designation="__designation_and_ext__" ftype="fasta"/>
+                </output>
+            </test>
         </tests>
     <help><![CDATA[
-** WARNING : THIS IS AN ALPHA VERSION OF THE TOOL. IT DOES NOT WORK AS DESCRIBED IN THE CURRENT STATE. **
+**WARNING : THIS IS AN ALPHA VERSION OF THE TOOL. IT DOES NOT WORK AS DESCRIBED IN THE CURRENT STATE.**
 
 INPUT : An accession, or set of accessions separated by new line. 
 
-
-** aside: ** Current version has manual entry of email. This is due to the following (from NCBI): To make use of NCBI's E-utilities, NCBI requires you to specify your email address with each request. In case of excessive usage of the E-utilities, NCBI will attempt to contact a user at the email address provided before blocking access to the E-utilities.
-
+**aside:** Current version has manual entry of email.
 
 OUTPUT : Requested file type (genbank or fasta) individually and/or combined together.
 
+@DISCLAIMER@
     ]]></help>
         <citations>
             <citation type="bibtex">

diff --git a/tools/efetch/helperFunctions.py b/tools/efetch/helperFunctions.py
@@ -33,6 +33,16 @@ def pass_flag(input,flag="--output"):
     except subprocess.TimeoutExpired as err:
         print(err)
 
+def redirect(input):
+    pass
+
+
+def is_dir(parser, arg, make_dir):
+    if not os.path.exists(arg):
+        os.mkdir(make_dir)
+        open(arg, "w+")
+    else:
+        open(arg, "w+")
 
 if __name__ == "__main__":
     #cat_files("fasta")