From 5342fa45fadb73a15f2a1e6fcb5e235370d60499 Mon Sep 17 00:00:00 2001 From: Shinichi Sunagawa Date: Sat, 4 Apr 2020 17:42:36 +0200 Subject: [PATCH] added -i option to ignore header formats. to be used with -v for single genomes per file --- README.txt | 37 +++++++++++++++++++++++-------------- fetchMGs.pl | 25 +++++++++++++++++-------- 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/README.txt b/README.txt index 4c6d8e3..93ffcd1 100755 --- a/README.txt +++ b/README.txt @@ -1,6 +1,6 @@ =============== - fetchMGs v 1.1 + fetchMGs v 1.2 =============== @@ -82,19 +82,28 @@ Usage fetchMGs.pl -m|mode [OPTIONS] Extraction mode - ./fetchMGs.pl -m extraction [optional options] - Multi-FASTA file with protein sequences from which marker genes should be extracted - -c|og_used Orthologous group id to be extracted; example: 'COG0012'; default = 'all' - -o|outdir Output directory; default = 'output' - -h|hmmdir Path to directory that contains hmm models; default = './lib' - -b|bitscore Path to bitscore cutoff file; default = 'lib/MG_BitScoreCutoffs.defaults.txt' - -p|protein_only Set if nucleotide sequences for filename.faa is not available - -v|verybesthit_only Recommended to use, if extracting sequences from reference genomes. - For this fasta identifiers should be in the form: taxID.geneID and, - if needed have ' project_id=XXX' somewhere in the header - -t|threads Number of processors/threads to be used - -d|dnaFastaFile Fasta file with DNA sequences of the same genes; not neccesary if protein file and dna file have the same with .faa and .fna suffixes - -x|xbin Path to binaries used by this script. 
default = '' --> will search for variables in $PATH + ./fetchMGs.pl [options] -m extraction + Multi-FASTA file with protein sequences from which universal single-copy marker genes should be extracted + -c|og_used Orthologous group id to be extracted; example: \'COG0012\'; default = \'all\' + -o|outdir Output directory; default = \'output\' + -b|bitscore Path to bitscore cutoff file; + default = \'\$pathInWhichThisScriptResides/lib/MG_BitScoreCutoffs.[allhits|verybesthit].txt\' (depending on -v option) + -l|library Path to directory that contains hmm models; + default = \'\$pathInWhichThisScriptResides/lib\' + -p|protein_only Set if nucleotide sequences file for is not available + -d|dnaFastaFile Multi-FASTA file with nucleotide sequences file for ; + not necessary if protein and nucleotide fasta file have the same name except .faa and .fna suffixes + -v|verybesthit_only Only extract the best hit of each OG from each genome. + Recommended to use, if extracting sequences from multiple reference genomes in the same file. + Do not use it for metagenomes. + If this option is set, fasta identifiers should be in the form: taxID.geneID and, if needed, have 'project_id=XXX' in the header. + Alternatively, set -i to ignore the headers. Then, the best hit of each OG in the whole input file will be selected, regardless of the headers used. + -i|ignore_headers If this option is set in addition to -v, the best hit of each COG will be selected. + Recommended to use, if extracting sequences from a single genome in the same file. + -t|threads Number of processors/threads to be used + -x|executables Path to executables used by this script (hmmsearch; seqtk). 
+ default = \'\$pathInWhichThisScriptResides/bin\' + If set to \'\' will search for executables in \$PATH Calibration mode ./fetchMGs.pl -m calibration diff --git a/fetchMGs.pl b/fetchMGs.pl index 5244b3d..45e926f 100755 --- a/fetchMGs.pl +++ b/fetchMGs.pl @@ -22,6 +22,7 @@ Extraction mode ./fetchMGs.pl [options] -m extraction Multi-FASTA file with protein sequences from which universal single-copy marker genes should be extracted + -c|og_used Orthologous group id to be extracted; example: \'COG0012\'; default = \'all\' -o|outdir Output directory; default = \'output\' -b|bitscore Path to bitscore cutoff file; default = \'\$pathInWhichThisScriptResides/lib/MG_BitScoreCutoffs.[allhits|verybesthit].txt\' (depending on -v option) @@ -30,12 +31,13 @@ -p|protein_only Set if nucleotide sequences file for is not available -d|dnaFastaFile Multi-FASTA file with nucleotide sequences file for ; not neccesary if protein and nucleotide fasta file have the same name except .faa and .fna suffixes - -v|verybesthit_only Only extract the best hit to each OG from each genome. - Recommended to use, if extracting sequences from reference genomes. - Please do not use for metagenomes. - If this option is set fasta identifiers should be in the form: taxID.geneID and, - if needed have \' project_id=XXX\' somewhere in the header - -c|og_used Orthologous group id to be extracted; example: \'COG0012\'; default = \'all\' + -v|verybesthit_only Only extract the best hit of each OG from each genome. + Recommended to use, if extracting sequences from multiple reference genomes in the same file. + Do not use it for metagenomes. + If this option is set fasta identifiers should be in the form: taxID.geneID and, if needed, have 'project_id=XXX' in the header. + Alternatively, set -i to ignore the headers. Then, the best hit of each OG in the whole input file will be selected, regardless of the headers used. 
+ -i|ignore_headers If this option is set in addition to -v, the best hit of each COG will be selected. + Recommended to use, if extracting sequences from a single genome in the same file. -t|threads Number of processors/threads to be used -x|executables Path to executables used by this script (hmmsearch; seqtk). default = \'\$pathInWhichThisScriptResides/bin\' @@ -65,6 +67,7 @@ my $cutoff_file = "DEFAULT"; my $protein_only = 0; my $besthit_only = 0; +my $ignoreheaders = 0; my $intProcessorNumber = 1; my $floatMin = 60; my $manual = 0; @@ -92,6 +95,7 @@ 'b|bitscore=s' => \$cutoff_file, 'p|protein_only' => \$protein_only, 'v|verybesthit_only' => \$besthit_only, + 'i|ignore_headers' => \$ignoreheaders, 't|threads=i' => \$intProcessorNumber, 'd|dnaFastaFile=s' => \$strDNAFastaFileName, 'x|executables=s' => \$bin @@ -206,7 +210,7 @@ print " =================================================================================== - FetchMGs v1.1 - extraction of marker genes from protein sequences + FetchMGs v1.2 - extraction of marker genes from protein sequences Copyright (c) 2019 Shinichi Sunagawa, Daniel R Mende =================================================================================== @@ -473,7 +477,12 @@ sub filterResultsArray_besthitAmongGenomes { for my $i ( 0 .. $#arrAllResults ) { my $strQueryId = $arrAllResults[$i][0]; - my $strTaxProjectID = $arrAllResults[$i][1]; +# EDIT + my $strTaxProjectID; + if ( $ignoreheaders ) { + $strTaxProjectID = "X" }else{ + $strTaxProjectID = $arrAllResults[$i][1]; + } my $strCOGid = $arrAllResults[$i][2]; my $floatBitscore = $arrAllResults[$i][3];