Merge remote-tracking branch 'tamu-origin/master'

TAMU-CPT · Sep 24, 2020 · e712834 · e712834
2 parents eb3f045 + 8a37010
commit e712834
Show file tree

Hide file tree

Showing 308 changed files with 80,885 additions and 625 deletions.
diff --git a/.gitignore b/.gitignore
@@ -154,4 +154,8 @@ tools/spanin/debug
 ## proximity
 tools/proximity/*.db
 tools/proximity/termHits.txt
-tools/proximity/test-data/prox/lambda_NRBLAST.gff3
+tools/proximity/test-data/prox/lambda_NRBLAST.gff3
+
+## SAR
+
+tools/SAR/big-data/
diff --git a/tool_conf.xml b/tool_conf.xml
@@ -4,15 +4,18 @@
 	<label id="=====1" text="Input"/>
 	<section id="8a843907-df36-4d82-a1ab-2bde1b4ce827" name="CPT: Get Data">
 		<label id="getext_1" text="Apollo"/>
-		<tool file="cpt2/galaxy-tools/tools/webapollo/export.xml"/>
-		<tool file="cpt2/galaxy-tools/tools/webapollo/fetch_organism_jbrowse.xml"/>
+		<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/list_organisms.xml" label="apollo" />
+		<tool file="cpt2/galaxy-tools/tools/webapollo/list_organism_data.xml" label="apollo" />
+		<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/export.xml" label="apollo" />
+		<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/fetch_organism_jbrowse.xml" label="apollo" />
 		<label id="getext_2" text="BaseSpace"/>
 		<tool file="cpt2/galaxy-tools/tools/sequencing/basespace-auth-1.xml"/>
 		<tool file="cpt2/galaxy-tools/tools/sequencing/basespace-auth-2.xml"/>
 		<tool file="cpt2/galaxy-tools/tools/sequencing/BaseSpaceRunList.xml"/>
 		<tool file="cpt2/galaxy-tools/tools/sequencing/BaseSpaceRunSampleList.xml"/>
 		<tool file="cpt2/galaxy-tools/tools/sequencing/BaseSpaceRunDownloader_v2.xml"/>
 		<tool file="cpt2/galaxy-tools/tools/sequencing/BaseSpaceSampleDownloader.xml"/>
+		<tool file="cpt2/galaxy-tools/tools/webapollo/gffFastaSplit.xml"/>
 		<label id="getext_3" text="NCBI"/>
 		<tool file="cpt2/galaxy-tools/tools/edirect/efetch.xml" label="dev-only"/>
 		<tool hidden="True" file="cpt2/galaxy-tools/tools/util/efetch.xml" label="experimental"/>
@@ -48,6 +51,7 @@
                 <tool file="cpt2/galaxy-tools/tools/gff3/gff3_validator.xml" label="gff3"/>
 		<tool file="cpt2/galaxy-tools/tools/gff3/gff3_fix_sixpack.xml" label="gff3" />
 		<tool file="cpt2/tools-iuc/tools/bioperl/bp_genbank2gff3.xml" label="genbank" />
+                <tool file="cpt2/galaxy-tools/tools/util/tRNAscan_to_gff_SE_format.xml"/>  
 	</section>
 	<section id="6dbd8994-74d6-4a9e-834b-131d9cd99526" name="CPT: Text Utilities">
 		<tool file="cpt2/galaxy-tools/tools/text/cat.xml"/>
@@ -82,14 +86,16 @@
 		<tool file="cpt2/galaxy-tools/tools/gff3/gff3_aa_stats.xml" label="gff3" />
 		<tool file="cpt2/galaxy-tools/tools/one-off/starts-sds-scrubber.xml" label="fasta"/>
 		<tool file="cpt2/galaxy-tools/tools/util/compare_codons.xml" label="gff3" />
+                <tool file="cpt2/galaxy-tools/tools/tRNAscan-SE-2.0/trnaScan.xml"/>
 	</section>
 	<section id="3e66d481-56c7-471f-8de2-7d496977c5eb" name="CPT: Genomic Data Editing">
 		<tool file="cpt2/galaxy-tools/tools/fasta/fasta_concat.xml"/>
-                <tool file="cpt2/galaxy-tools/tools/fasta/fasta_append.xml"/>
+		<tool file="cpt2/galaxy-tools/tools/fasta/fasta_append.xml"/>
 		<tool file="cpt2/galaxy-tools/tools/fasta/fasta_join.xml" label="fasta"/>
 		<tool file="cpt2/galaxy-tools/tools/fasta/fasta_ligate.xml" label="fasta"/>
 		<tool file="cpt2/galaxy-tools/tools/fasta/fasta_reopen.xml" label="fasta" />
-                <tool file="cpt2/galaxy-tools/tools/fasta/fasta_chop.xml"/>
+		<tool file="cpt2/galaxy-tools/tools/fasta/fasta_ID_swapper.xml" />
+		<tool file="cpt2/galaxy-tools/tools/fasta/fasta_chop.xml"/>
 		<tool file="cpt2/galaxy-tools/tools/fasta/safe_reopen.xml" label="fasta" />
 		<tool file="cpt2/galaxy-tools/tools/gbk/rename.xml" label="genbank,fasta,gff3,experimental"/>
 		<tool file="cpt2/galaxy-tools/tools/gff3/genome_editor.xml" label="experimental,gff3" />
@@ -123,6 +129,7 @@
                 <tool file="cpt2/galaxy-tools/tools/gff3/retype_features.xml" label="gff3" />
 		<tool file="cpt2/galaxy-tools/tools/gff3/rebase_features.xml" label="gff3" />
 		<tool file="cpt2/galaxy-tools/tools/gbk/feature_export.xml" label="genbank" />
+		<tool file="cpt2/galaxy-tools/tools/gbk/gbk_sectioning.xml" label="genbank" />
 		<tool file="cpt2/galaxy-tools/tools/gbk/seq_export.xml" label="genbank" />
 		<tool file="cpt2/galaxy-tools/tools/gbk/adjacent_features.xml" label="genbank" />
                 <tool file="cpt2/galaxy-tools/tools/fasta/fasta_split.xml" label="fasta" />
@@ -141,8 +148,7 @@
 		<tool file="cpt2/galaxy-tools/tools/external/tmhmm.xml"/>
 		<tool file="cpt2/galaxy-tools/tools/external/signalp.xml"/>
         <!--<tool file="cpt2/galaxy-tools/tools/external/interproscan.xml"/>-->
-		<tool file="cpt2/galaxy-tools/tools/external/interproscan-5.22.xml"/>
-		<tool file="cpt2/galaxy-tools/tools/external/interproscan-5.33.xml"/>
+		<tool file="cpt2/galaxy-tools/tools/external/interproscan-5.XX.xml"/>
 		<tool file="cpt2/galaxy-tools/tools/external/fix-aragorn-gff3.xml"/>
 		<tool file="cpt2/galaxy-tools/tools/one-off/genrand.xml"/>
 	</section>
@@ -177,6 +183,7 @@
 		<tool file="cpt2/galaxy-tools/tools/blast/blastn_to_gff3.xml" label="experimental"/>
 		<tool file="cpt2/galaxy-tools/tools/blast/blastp_to_gff3.xml" label="experimental"/>
 		<tool file="cpt2/galaxy-tools/tools/blast/blasttab_dice_filter.xml"/>
+                <tool file="cpt2/galaxy-tools/tools/genome_viz/brigaid.xml"/>
 	</section>
 	<section id="7045e32e-90c8-4e20-b4ac-a6cb070d40c3" name="CPT: Filter Features">
 		<tool file="cpt2/galaxy-tools/tools/gff3/filter_type.xml" label="gff3" />
@@ -251,23 +258,23 @@
 	<section id="6b26a975-a24a-4b05-b082-2286e1693fef" name="CPT: Genomic Viz">
 		<tool file="cpt2/galaxy-tools/tools/gbk/featuresvg.xml" label="genbank" />
 		<tool file="cpt2/galaxy-tools/tools/phage/genome_map.xml"/>
-		<!-- <tool file="cpt2/tools-iuc/tools/jbrowse/jbrowse.xml"/> -->
-		<tool file="cpt2/galaxy-tools/tools/jbrowse/jbrowse.xml"/>
+        	<tool file="cpt2/galaxy-tools/tools/genome_viz/linear_genome_plot.xml"/>
+		<tool file="cpt2/galaxy-tools/tools/jbrowse/jbrowse.xml" label="deprecated"/>
+		<tool file="cpt2/galaxy-tools/tools//jbrowse/jbrowse-iuc/jbrowse.xml" label="experimental"/>
 		<tool file="cpt2/Restriction-Digest-Tool/bin/digest_dna.xml"/>
 		<tool file="cpt2/Restriction-Digest-Tool/bin/graphic_drawer.xml"/>
 	</section>
 	<section id="8dee2684-f833-4dd9-b50d-abfdbb78f0be" name="CPT: Apollo">
-		<tool file="cpt2/galaxy-tools/tools/webapollo/create_account.xml" label="apollo" />
-		<tool file="cpt2/galaxy-tools/tools/webapollo/list_organisms.xml" label="apollo" />
-		<tool file="cpt2/galaxy-tools/tools/webapollo/list_organism_data.xml" label="apollo" />
-		<tool file="cpt2/galaxy-tools/tools/webapollo/create_or_update_organism.xml" label="apollo" />
+		<tool file="cpt2/galaxy-tools/tools/gff3/gff3_prep_for_apollo.xml" label="apollo" />
+		<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/create_account.xml" label="apollo" />
+		<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/create_features_from_gff3.xml" label="apollo" />
+		<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/create_or_update_organism.xml" label="apollo" />
+		<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/json2iframe.xml" label="apollo" />
+		<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/delete_features.xml" label="danger, apollo"/>
+		<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/delete_organism.xml" label="danger, apollo"/>
 		<tool file="cpt2/galaxy-tools/tools/webapollo/renumber_features.xml" label="new,apollo" />
-		<tool file="cpt2/galaxy-tools/tools/webapollo/delete_features.xml" label="danger,apollo" />
-		<tool file="cpt2/galaxy-tools/tools/webapollo/create_features_from_gff3.xml" label="experimental,apollo" />
-        <tool file="cpt2/galaxy-tools/tools/webapollo/rename_from_table.xml" label="new,apollo" />
-		<tool file="cpt2/galaxy-tools/tools/webapollo/json2iframe.xml" label="apollo" />
 		<tool file="cpt2/galaxy-tools/tools/gff3/gff3_fix_apollo_sd.xml" label="apollo" />
-		<tool file="cpt2/galaxy-tools/tools/webapollo/share_with.xml" label="apollo" />
+		<tool file="cpt2/galaxy-tools/tools/webapollo/shareOrg/share_with.xml" label="apollo" />
 	</section>
 	<section id="42fc75d5-cae2-4d29-8794-9a8d2f390032" name="CPT: Phage">
 		<tool file="cpt2/galaxy-tools/tools/gbk/phage_renamer.xml" label="genbank" />
@@ -303,9 +310,13 @@
                 <tool file="cpt2/galaxy-tools/tools/genemark/genemarkHMM.xml"/>
                 <tool file="cpt2/galaxy-tools/tools/spanin/generate-putative-isp.xml"/>
                 <tool file="cpt2/galaxy-tools/tools/spanin/generate-putative-osp.xml"/>
+                <tool file="cpt2/galaxy-tools/tools/spanin/generate-putative-usp.xml"/>
                 <tool file="cpt2/galaxy-tools/tools/spanin/findSpanin.xml"/>
                 <tool file="cpt2/galaxy-tools/tools/helicalWheel/generateHelicalWheel.xml"/>
                 <tool file="cpt2/galaxy-tools/tools/proximity/searchFile.xml"/>
+                <tool file="cpt2/galaxy-tools/tools/external/FlaGs/FlaGs.xml"/>
+                <tool file="cpt2/galaxy-tools/tools/SAR/SAR_finder.xml"/>
+                <tool file="cpt2/galaxy-tools/tools/external/bacphlib.xml"/>
 	</section>
 	<!-- deprecated -->
 	<tool file="cpt2/galaxy-tools/tools/gff3/gff3_ipr_merge.xml" label="deprecated" hidden="True" />

diff --git a/tools/SAR/README.md b/tools/SAR/README.md
@@ -0,0 +1,48 @@
+# Overview
+> input: multi protein fasta file
+> output: multi fasta of positive candidates and a table summarizing the stats for each candidate with identifier, length of potential SAR, topology orientation, and calculated % of hydrophilic residues.
+
+# Requirements
+* python 3.6+
+* biopython
+* <s>pandas</s> --> thought I might use this, but by the time I was wrapping this up I saw I wouldn't need it.
+* <s>numpy</s>
+
+# Outline
+1. Read in input multi fasta
+    * multi fasta parsed by `biopython_parsing.py`
+2. Check SAR requirements
+    * <s>Min peptide length check</s> --> Currently omitted
+    * <s>Max peptide length check (user dictated)</s> --> Currently omitted
+    * Hydrophobic residues (Ile, Leu, Val, Phe, Tyr, Trp, Met) except often rich in Gly, Ala, and/or Ser residues
+        * <s>Option 1: FIWLVMYAGS</s> Using option #2
+        * __Option 2: FIWLVMYCATGS # add C and T --> This is what is being used__
+    * Lysines can be present in the hydrophobic stretch if within 3 residues of the domain boundary (lysine snorkeling)
+        * <s>Currently, I'm not checking for lysine snorkels as the hydrophobic region present will still be caught.</s>
+        * Snorkelers are found as follows: if a Lys is at the first or last index of the sequence range being inspected, the method checks for hydrophobic residues between it and either the beginning or end of the sequence.
+        * More refinement will be necessary to verify a "K_nonhydro_nonhydro_nonhydro_hydro..hydro_"
+            * I would think the argument against tuning it any further is that the current method would still catch the hydrophobic domains within a given range.
+    * Topology check
+        * N term (net positive charge)
+        * C term catalytic domain
+3. Return candidates and multi fasta.
+4. Return candidates in multi gff3.
+5. Write statistics to output file in table format
+    * <s>identifier :: length of peptide :: topology orientation :: %G and %A :: likely more later</s>
+    * Been reworked to include what is currently in tab-separated format:
+        * ["Name","Protein Sequence","Protein Length","SAR Length","Putative SAR Sequence","SAR Start Location","[res%]","N-term Sequence","N-term net Charge"]
+
+# File Summaries
+* `SAR_functions.py`
+    * Has the SAR class and accompanying methods
+* `SAR_finder.py`
+    * The executed script by Galaxy
+* `biopython_parsing.py`
+    * _might scale to a_ sym link for parsing bio related files, otherwise will just be related to addressing this experiment.
+* `file_operations.py`
+    * _might scale to a_ sym link for operating on files and exporting them, otherwise will just be related to addressing this experiment, and writing the outputs.
+
+# Testing
+* Mu (mu-proteins.fa) for a TP
+* Phage-21 for a TP
+* simple-proteins.fa includes TP from Mu and TN from Mu. Used with Planemo.
diff --git a/tools/SAR/SAR_finder.py b/tools/SAR/SAR_finder.py
@@ -0,0 +1,44 @@
+import sys
+import argparse
+import os
+import re
+from biopython_parsing import FASTA_parser
+from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict
+from SAR_functions import CheckSequence
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="SAR Finder")
+    # Entry point: parse CLI arguments, screen every protein for SAR candidates, then write GFF3, table and FASTA outputs.
+    parser.add_argument("fa",type=argparse.FileType("r"),help="organism's multi fasta file")
+    # NOTE(review): --min is parsed but only referenced by the commented-out check_sizes call further down.
+    parser.add_argument("--min",type=int,default=20,help="minimum size of candidate peptide")
+    # NOTE(review): --max is likewise not used by any active code in this script.
+    parser.add_argument("--max",type=int,default=200,help="maximum size of candidate peptide")
+    # --sar_min / --sar_max are forwarded to shrink_results and to tab_from_SAR_dict below.
+    parser.add_argument("--sar_min",type=int,default=15,help="minimum size of candidate peptide TMD domain")
+
+    parser.add_argument("--sar_max",type=int,default=20,help="maximum size of candidate peptide TMD domain")
+    # The three outputs use argparse.FileType("w"); when an option is omitted its default file name is used instead.
+    parser.add_argument("--out_fa",type=argparse.FileType("w"),help="multifasta output of candidate SAR proteins",default="candidate_SAR.fa")
+
+    parser.add_argument("--out_stat",type=argparse.FileType("w"),help="summary statistic file for candidate SAR proteins, tab separated",default="candidate_SAR_stats.tsv")
+
+    parser.add_argument("--out_gff3",type=argparse.FileType("w"),help="multigff3 file for candidate SAR proteins",default="candidate_SAR.gff3")
+
+    args = parser.parse_args()
+    # Parse the input FASTA; the result is iterated below as (protein_name, protein_data) pairs.
+    fa_dict = FASTA_parser(fa=args.fa).multifasta_dict()
+    # Accumulates whatever shrink_results returns for each protein, merged into one dict.
+    sars = {}
+
+    for protein_name, protein_data in fa_dict.items():
+        sar = CheckSequence(protein_name, protein_data)
+        #sar.check_sizes(min=args.min,max=args.max)
+        hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max)
+        sars.update(hydros)
+
+    # Write all three outputs; "SGAT" is presumably the residue set for the reported percentage - confirm in file_operations.
+    gff3_from_SAR_dict(sars, args.out_gff3)
+    tab_from_SAR_dict(sars,args.out_stat,"SGAT",sar_min=args.sar_min, sar_max=args.sar_max)
+    fasta_from_SAR_dict(sars,args.out_fa)
+    #stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready.
diff --git a/tools/SAR/SAR_finder.xml b/tools/SAR/SAR_finder.xml
@@ -0,0 +1,65 @@
+<tool id="edu.tamu.cpt.sar.sar_finder" name="SAR Finder" version="1.0">
+    <description>SAR Domain Finder</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <!-- Runs SAR_finder.py on the input FASTA with the chosen SAR length bounds; path values are single-quoted so each reaches the shell as one argument. -->
+    <command detect_errors="aggressive"><![CDATA[
+python '$__tool_directory__/SAR_finder.py'
+'$fa'
+--sar_min $sar_min
+--sar_max $sar_max
+--out_fa '$out_fa'
+--out_gff3 '$out_gff3'
+--out_stat '$out_stat'
+    ]]></command>
+    <inputs>
+        <param label="Multi FASTA File" name="fa" type="data" format="fasta" />
+        <param label="SAR domain minimal size" name="sar_min" type="integer" value="15" />
+        <param label="SAR domain maximum size" name="sar_max" type="integer" value="20" />
+    </inputs>
+    <outputs>
+        <data format="tabular" name="out_stat" label="candidate_SAR_stats.tsv"/>
+        <data format="fasta" name="out_fa" label="candidate_SAR.fa"/>
+        <data format="gff3" name="out_gff3" label="candidate_SAR.gff3"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="fa" value="simple-proteins.fa"/>
+            <param name="sar_min" value="15"/>
+            <param name="sar_max" value="20"/>
+            <output name="out_stat" file="candidate_SAR_stats.tsv"/>
+            <output name="out_fa" file="candidate_SAR.fa"/>
+            <output name="out_gff3" file="candidate_SAR.gff3"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+A tool that analyzes protein sequence within the first 50 residues for a weakly hydrophobic domain sometimes found in endolysins called Signal-Arrest-Release (aka SAR).
+
+Definition: A Signal-Arrest-Release (SAR) domain is an N-terminal, weakly hydrophobic transmembrane region rich in Gly/Ala and/or Ser residues sometimes found in phage lysis proteins, including endolysins and holins. The SAR domain can be released from the membrane in a proton motive force-dependent manner.
+This tool finds proteins that contain a stretch (default 15-20 residues) of hydrophobic residues (Ile, Leu, Val, Phe, Tyr, Trp, Met, Gly, Ala, Ser) and calculates the % Gly/Ala/Ser/Thr residues in the hydrophobic stretch. The net charge on the N-terminal region is also displayed to aid in determining the SAR topology.[1]
+
+INPUT : Protein Multi FASTA
+
+OUTPUT :
+
+* Multi FASTA of candidate proteins that pass the SAR domain criteria
+
+* Text summary file containing each protein that passes the SAR domain criteria
+
+* Multi GFF3
+
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1016/bs.aivir.2018.09.003</citation>
+        <citation type="bibtex">
+            @unpublished{galaxyTools,
+            author = {C. Ross},
+            title = {CPT Galaxy Tools},
+            year = {2020-},
+            note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+        </citation>
+    </citations>
+</tool>