Skip to content

Commit

Permalink
Merge remote-tracking branch 'tamu-origin/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
Cory Maughmer committed Sep 24, 2020
2 parents eb3f045 + 8a37010 commit e712834
Show file tree
Hide file tree
Showing 308 changed files with 80,885 additions and 625 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -154,4 +154,8 @@ tools/spanin/debug
## proximity
tools/proximity/*.db
tools/proximity/termHits.txt
tools/proximity/test-data/prox/lambda_NRBLAST.gff3
tools/proximity/test-data/prox/lambda_NRBLAST.gff3

## SAR

tools/SAR/big-data/
45 changes: 28 additions & 17 deletions tool_conf.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,18 @@
<label id="=====1" text="Input"/>
<section id="8a843907-df36-4d82-a1ab-2bde1b4ce827" name="CPT: Get Data">
<label id="getext_1" text="Apollo"/>
<tool file="cpt2/galaxy-tools/tools/webapollo/export.xml"/>
<tool file="cpt2/galaxy-tools/tools/webapollo/fetch_organism_jbrowse.xml"/>
<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/list_organisms.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/list_organism_data.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/export.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/fetch_organism_jbrowse.xml" label="apollo" />
<label id="getext_2" text="BaseSpace"/>
<tool file="cpt2/galaxy-tools/tools/sequencing/basespace-auth-1.xml"/>
<tool file="cpt2/galaxy-tools/tools/sequencing/basespace-auth-2.xml"/>
<tool file="cpt2/galaxy-tools/tools/sequencing/BaseSpaceRunList.xml"/>
<tool file="cpt2/galaxy-tools/tools/sequencing/BaseSpaceRunSampleList.xml"/>
<tool file="cpt2/galaxy-tools/tools/sequencing/BaseSpaceRunDownloader_v2.xml"/>
<tool file="cpt2/galaxy-tools/tools/sequencing/BaseSpaceSampleDownloader.xml"/>
<tool file="cpt2/galaxy-tools/tools/webapollo/gffFastaSplit.xml"/>
<label id="getext_3" text="NCBI"/>
<tool file="cpt2/galaxy-tools/tools/edirect/efetch.xml" label="dev-only"/>
<tool hidden="True" file="cpt2/galaxy-tools/tools/util/efetch.xml" label="experimental"/>
Expand Down Expand Up @@ -48,6 +51,7 @@
<tool file="cpt2/galaxy-tools/tools/gff3/gff3_validator.xml" label="gff3"/>
<tool file="cpt2/galaxy-tools/tools/gff3/gff3_fix_sixpack.xml" label="gff3" />
<tool file="cpt2/tools-iuc/tools/bioperl/bp_genbank2gff3.xml" label="genbank" />
<tool file="cpt2/galaxy-tools/tools/util/tRNAscan_to_gff_SE_format.xml"/>
</section>
<section id="6dbd8994-74d6-4a9e-834b-131d9cd99526" name="CPT: Text Utilities">
<tool file="cpt2/galaxy-tools/tools/text/cat.xml"/>
Expand Down Expand Up @@ -82,14 +86,16 @@
<tool file="cpt2/galaxy-tools/tools/gff3/gff3_aa_stats.xml" label="gff3" />
<tool file="cpt2/galaxy-tools/tools/one-off/starts-sds-scrubber.xml" label="fasta"/>
<tool file="cpt2/galaxy-tools/tools/util/compare_codons.xml" label="gff3" />
<tool file="cpt2/galaxy-tools/tools/tRNAscan-SE-2.0/trnaScan.xml"/>
</section>
<section id="3e66d481-56c7-471f-8de2-7d496977c5eb" name="CPT: Genomic Data Editing">
<tool file="cpt2/galaxy-tools/tools/fasta/fasta_concat.xml"/>
<tool file="cpt2/galaxy-tools/tools/fasta/fasta_append.xml"/>
<tool file="cpt2/galaxy-tools/tools/fasta/fasta_append.xml"/>
<tool file="cpt2/galaxy-tools/tools/fasta/fasta_join.xml" label="fasta"/>
<tool file="cpt2/galaxy-tools/tools/fasta/fasta_ligate.xml" label="fasta"/>
<tool file="cpt2/galaxy-tools/tools/fasta/fasta_reopen.xml" label="fasta" />
<tool file="cpt2/galaxy-tools/tools/fasta/fasta_chop.xml"/>
<tool file="cpt2/galaxy-tools/tools/fasta/fasta_ID_swapper.xml" />
<tool file="cpt2/galaxy-tools/tools/fasta/fasta_chop.xml"/>
<tool file="cpt2/galaxy-tools/tools/fasta/safe_reopen.xml" label="fasta" />
<tool file="cpt2/galaxy-tools/tools/gbk/rename.xml" label="genbank,fasta,gff3,experimental"/>
<tool file="cpt2/galaxy-tools/tools/gff3/genome_editor.xml" label="experimental,gff3" />
Expand Down Expand Up @@ -123,6 +129,7 @@
<tool file="cpt2/galaxy-tools/tools/gff3/retype_features.xml" label="gff3" />
<tool file="cpt2/galaxy-tools/tools/gff3/rebase_features.xml" label="gff3" />
<tool file="cpt2/galaxy-tools/tools/gbk/feature_export.xml" label="genbank" />
<tool file="cpt2/galaxy-tools/tools/gbk/gbk_sectioning.xml" label="genbank" />
<tool file="cpt2/galaxy-tools/tools/gbk/seq_export.xml" label="genbank" />
<tool file="cpt2/galaxy-tools/tools/gbk/adjacent_features.xml" label="genbank" />
<tool file="cpt2/galaxy-tools/tools/fasta/fasta_split.xml" label="fasta" />
Expand All @@ -141,8 +148,7 @@
<tool file="cpt2/galaxy-tools/tools/external/tmhmm.xml"/>
<tool file="cpt2/galaxy-tools/tools/external/signalp.xml"/>
<!--<tool file="cpt2/galaxy-tools/tools/external/interproscan.xml"/>-->
<tool file="cpt2/galaxy-tools/tools/external/interproscan-5.22.xml"/>
<tool file="cpt2/galaxy-tools/tools/external/interproscan-5.33.xml"/>
<tool file="cpt2/galaxy-tools/tools/external/interproscan-5.XX.xml"/>
<tool file="cpt2/galaxy-tools/tools/external/fix-aragorn-gff3.xml"/>
<tool file="cpt2/galaxy-tools/tools/one-off/genrand.xml"/>
</section>
Expand Down Expand Up @@ -177,6 +183,7 @@
<tool file="cpt2/galaxy-tools/tools/blast/blastn_to_gff3.xml" label="experimental"/>
<tool file="cpt2/galaxy-tools/tools/blast/blastp_to_gff3.xml" label="experimental"/>
<tool file="cpt2/galaxy-tools/tools/blast/blasttab_dice_filter.xml"/>
<tool file="cpt2/galaxy-tools/tools/genome_viz/brigaid.xml"/>
</section>
<section id="7045e32e-90c8-4e20-b4ac-a6cb070d40c3" name="CPT: Filter Features">
<tool file="cpt2/galaxy-tools/tools/gff3/filter_type.xml" label="gff3" />
Expand Down Expand Up @@ -251,23 +258,23 @@
<section id="6b26a975-a24a-4b05-b082-2286e1693fef" name="CPT: Genomic Viz">
<tool file="cpt2/galaxy-tools/tools/gbk/featuresvg.xml" label="genbank" />
<tool file="cpt2/galaxy-tools/tools/phage/genome_map.xml"/>
<!-- <tool file="cpt2/tools-iuc/tools/jbrowse/jbrowse.xml"/> -->
<tool file="cpt2/galaxy-tools/tools/jbrowse/jbrowse.xml"/>
<tool file="cpt2/galaxy-tools/tools/genome_viz/linear_genome_plot.xml"/>
<tool file="cpt2/galaxy-tools/tools/jbrowse/jbrowse.xml" label="deprecated"/>
<tool file="cpt2/galaxy-tools/tools//jbrowse/jbrowse-iuc/jbrowse.xml" label="experimental"/>
<tool file="cpt2/Restriction-Digest-Tool/bin/digest_dna.xml"/>
<tool file="cpt2/Restriction-Digest-Tool/bin/graphic_drawer.xml"/>
</section>
<section id="8dee2684-f833-4dd9-b50d-abfdbb78f0be" name="CPT: Apollo">
<tool file="cpt2/galaxy-tools/tools/webapollo/create_account.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/list_organisms.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/list_organism_data.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/create_or_update_organism.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/gff3/gff3_prep_for_apollo.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/create_account.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/create_features_from_gff3.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/create_or_update_organism.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/json2iframe.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/delete_features.xml" label="danger, apollo"/>
<tool file="cpt2/galaxy-tools/tools/webapollo/gga-apollo/delete_organism.xml" label="danger, apollo"/>
<tool file="cpt2/galaxy-tools/tools/webapollo/renumber_features.xml" label="new,apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/delete_features.xml" label="danger,apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/create_features_from_gff3.xml" label="experimental,apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/rename_from_table.xml" label="new,apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/json2iframe.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/gff3/gff3_fix_apollo_sd.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/share_with.xml" label="apollo" />
<tool file="cpt2/galaxy-tools/tools/webapollo/shareOrg/share_with.xml" label="apollo" />
</section>
<section id="42fc75d5-cae2-4d29-8794-9a8d2f390032" name="CPT: Phage">
<tool file="cpt2/galaxy-tools/tools/gbk/phage_renamer.xml" label="genbank" />
Expand Down Expand Up @@ -303,9 +310,13 @@
<tool file="cpt2/galaxy-tools/tools/genemark/genemarkHMM.xml"/>
<tool file="cpt2/galaxy-tools/tools/spanin/generate-putative-isp.xml"/>
<tool file="cpt2/galaxy-tools/tools/spanin/generate-putative-osp.xml"/>
<tool file="cpt2/galaxy-tools/tools/spanin/generate-putative-usp.xml"/>
<tool file="cpt2/galaxy-tools/tools/spanin/findSpanin.xml"/>
<tool file="cpt2/galaxy-tools/tools/helicalWheel/generateHelicalWheel.xml"/>
<tool file="cpt2/galaxy-tools/tools/proximity/searchFile.xml"/>
<tool file="cpt2/galaxy-tools/tools/external/FlaGs/FlaGs.xml"/>
<tool file="cpt2/galaxy-tools/tools/SAR/SAR_finder.xml"/>
<tool file="cpt2/galaxy-tools/tools/external/bacphlib.xml"/>
</section>
<!-- deprecated -->
<tool file="cpt2/galaxy-tools/tools/gff3/gff3_ipr_merge.xml" label="deprecated" hidden="True" />
Expand Down
48 changes: 48 additions & 0 deletions tools/SAR/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Overview
> input: multi protein fasta file
> output: multi fasta of positive candidates, and a table summarizing the stats for each candidate: identifier, length of potential SAR, topology orientation, and calculated % of hydrophilic residues.
# Requirements
* python 3.6+
* biopython
* <s>pandas</s> --> thought I might use this, but by the time I was wrapping this up I saw that I wouldn't need it.
* <s>numpy</s>

# Outline
1. Read in input multi fasta
* multi fasta parsed by `biopython_parsing.py`
2. Check SAR requirements
* <s>Min peptide length check</s> --> Currently omitted
* <s>Max peptide length check (user dictated)</s> --> Currently omitted
* Hydrophobic residues (Ile, Leu, Val, Phe, Tyr, Trp, Met) except often rich in Gly, Ala, and/or Ser residues
* <s>Option 1: FIWLVMYAGS</s> Using option #2
* __Option 2: FIWLVMYCATGS # add C and T --> This is what is being used__
* Lysines can be present in the hydrophobic stretch if within 3 residues of the domain boundary (lysine snorkeling)
* <s>Currently, I'm not checking for lysine snorkels as the hydrophobic region present will still be caught.</s>
* Snorkelers are found as follows: if a Lys is at the first or last index of the sequence range being inspected, the tool checks for hydrophobic residues between it and either the beginning or the end of the sequence.
* More refinement will be necessary to verify a "K_nonhydro_nonhydro_nonhydro_hydro..hydro_"
* I would think the argument against tuning it any further is that the current method would still catch the hydrophobic domains within a given range.
* Topology check
* N term (net positive charge)
* C term catalytic domain
3. Return candidates in multi fasta.
4. Return candidates in multi gff3.
5. Write statistics to output file in table format
* <s>identifier :: length of peptide :: topology orientation :: %G and %A :: likely more later</s>
* This has been reworked; the tab-separated output currently includes:
* ["Name","Protein Sequence","Protein Length","SAR Length","Putative SAR Sequence","SAR Start Location","[res%]","N-term Sequence","N-term net Charge"]

# File Summaries
* `SAR_functions.py`
* Has the SAR class and accompanying methods
* `SAR_finder.py`
* The executed script by Galaxy
* `biopython_parsing.py`
* _might scale to a_ sym link for parsing bio related files, otherwise will just be related to addressing this experiment.
* `file_operations.py`
* _might scale to a_ sym link for operating on files and exporting them, otherwise will just be related to addressing this experiment, and writing the outputs.

# Testing
* Mu (mu-proteins.fa) for a TP
* Phage-21 for a TP
* simple-proteins.fa includes TP from Mu and TN from Mu. Used with Planemo.
44 changes: 44 additions & 0 deletions tools/SAR/SAR_finder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import sys
import argparse
import os
import re
from biopython_parsing import FASTA_parser
from file_operations import fasta_from_SAR_dict, gff3_from_SAR_dict, tab_from_SAR_dict
from SAR_functions import CheckSequence

if __name__ == "__main__":
    # Command-line entry point: parse a multi-FASTA of proteins, run every
    # record through CheckSequence, and write the accumulated candidates as
    # GFF3, a tab-separated statistics table, and a multi-FASTA.
    parser = argparse.ArgumentParser(description="SAR Finder")

    parser.add_argument(
        "fa", type=argparse.FileType("r"), help="organism's multi fasta file"
    )

    # NOTE: --min and --max are accepted but currently unused; the only
    # consumer (sar.check_sizes) is commented out in the loop below.
    parser.add_argument(
        "--min", type=int, default=20, help="minimum size of candidate peptide"
    )
    parser.add_argument(
        "--max", type=int, default=200, help="maximum size of candidate peptide"
    )

    parser.add_argument(
        "--sar_min",
        type=int,
        default=15,
        help="minimum size of candidate peptide TMD domain",
    )
    parser.add_argument(
        "--sar_max",
        type=int,
        default=20,
        help="maximum size of candidate peptide TMD domain",
    )

    parser.add_argument(
        "--out_fa",
        type=argparse.FileType("w"),
        help="multifasta output of candidate SAR proteins",
        default="candidate_SAR.fa",
    )
    parser.add_argument(
        "--out_stat",
        type=argparse.FileType("w"),
        help="summary statistic file for candidate SAR proteins, tab separated",
        default="candidate_SAR_stats.tsv",
    )
    parser.add_argument(
        "--out_gff3",
        type=argparse.FileType("w"),
        help="multigff3 file for candidate SAR proteins",
        default="candidate_SAR.gff3",
    )

    args = parser.parse_args()

    # argparse.FileType opens its handles during parse_args() and never closes
    # them. Manage all four here so the outputs are flushed and closed
    # deterministically, even if parsing or a writer raises part-way through.
    # Closing is idempotent, so this is safe even if a helper already closed
    # the handle it was given.
    with args.fa, args.out_gff3, args.out_stat, args.out_fa:
        fa_dict = FASTA_parser(fa=args.fa).multifasta_dict()

        # Candidates from every input record, merged into one dict.
        # presumably keyed by protein name -- verify against SAR_functions.py
        sars = {}

        for protein_name, protein_data in fa_dict.items():
            sar = CheckSequence(protein_name, protein_data)
            # sar.check_sizes(min=args.min,max=args.max)
            hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max)
            sars.update(hydros)

        # Write order preserved from the original: GFF3, stats table, FASTA.
        gff3_from_SAR_dict(sars, args.out_gff3)
        # "SGAT" presumably selects the residues whose percentage is reported
        # in the table -- confirm in file_operations.py.
        tab_from_SAR_dict(
            sars, args.out_stat, "SGAT", sar_min=args.sar_min, sar_max=args.sar_max
        )
        fasta_from_SAR_dict(sars, args.out_fa)
        # stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready.
65 changes: 65 additions & 0 deletions tools/SAR/SAR_finder.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<tool id="edu.tamu.cpt.sar.sar_finder" name="SAR Finder" version="1.0">
  <description>SAR Domain Finder</description>
  <macros>
    <import>macros.xml</import>
  </macros>
  <expand macro="requirements">
  </expand>
  <!--
    Runs SAR_finder.py on the selected protein multi-FASTA and writes three
    datasets (stats table, FASTA, GFF3). Path and dataset variables are
    single-quoted so that directories or file names containing spaces or
    shell metacharacters do not break the command line.
  -->
  <command detect_errors="aggressive"><![CDATA[
python '$__tool_directory__/SAR_finder.py'
'$fa'
--sar_min $sar_min
--sar_max $sar_max
--out_fa '$out_fa'
--out_gff3 '$out_gff3'
--out_stat '$out_stat'
  ]]></command>
  <inputs>
    <param label="Multi FASTA File" name="fa" type="data" format="fasta" />
    <param label="SAR domain minimal size" name="sar_min" type="integer" value="15" />
    <param label="SAR domain maximum size" name="sar_max" type="integer" value="20" />
  </inputs>
  <outputs>
    <data format="tabular" name="out_stat" label="candidate_SAR_stats.tsv"/>
    <data format="fasta" name="out_fa" label="candidate_SAR.fa"/>
    <data format="gff3" name="out_gff3" label="candidate_SAR.gff3"/>
  </outputs>
  <tests>
    <!-- Regression test on the bundled example with the default domain size window. -->
    <test>
      <param name="fa" value="simple-proteins.fa"/>
      <param name="sar_min" value="15"/>
      <param name="sar_max" value="20"/>
      <output name="out_stat" file="candidate_SAR_stats.tsv"/>
      <output name="out_fa" file="candidate_SAR.fa"/>
      <output name="out_gff3" file="candidate_SAR.gff3"/>
    </test>
  </tests>
  <!--
    Help text is rendered as reStructuredText: keep it flush-left and keep the
    blank lines between paragraphs and before the bullet list.
    NOTE(review): the residue list below omits Cys and Thr, while the tool's
    README says the hydrophobic set in use is FIWLVMYCATGS; confirm which is
    current before changing the wording.
  -->
  <help><![CDATA[
A tool that analyzes protein sequence within the first 50 residues for a weakly hydrophobic domain sometimes found in endolysins called a Signal-Arrest-Release (SAR, also written Signal-Anchor-Release) domain.

Definition: A Signal-Arrest-Release (SAR) domain is a N-terminal, weakly hydrophobic transmembrane region rich in Gly/Ala and/or Ser residues sometimes found in phage lysis proteins, including endolysins and holins. The SAR domain can be released from the membrane in a proton motive force-dependent manner.

This tool finds proteins that contain a stretch (default 15-20 residues) of hydrophobic residues (Ile, Leu, Val, Phe, Tyr, Trp, Met, Gly, Ala, Ser) and calculates the % Gly/Ala/Ser/Thr residues in the hydrophobic stretch. The net charge on the N-terminal region is also displayed to aid in determining the SAR topology.[1]

INPUT : Protein Multi FASTA

OUTPUT :

* Multi FASTA of candidate proteins that pass the SAR domain criteria
* Text summary file containing each protein that passes the SAR domain criteria
* Multi GFF3
  ]]></help>
  <citations>
    <!-- type="doi" takes the bare DOI, not a resolver URL. -->
    <citation type="doi">10.1016/bs.aivir.2018.09.003</citation>
    <citation type="bibtex">
      @unpublished{galaxyTools,
        author = {C. Ross},
        title = {CPT Galaxy Tools},
        year = {2020-},
        note = {https://github.com/tamu-cpt/galaxy-tools/}
      }
    </citation>
  </citations>
</tool>
Loading

0 comments on commit e712834

Please sign in to comment.