From 5342fa45fadb73a15f2a1e6fcb5e235370d60499 Mon Sep 17 00:00:00 2001
From: Shinichi Sunagawa <ssunagaw@morgan.ethz.ch>
Date: Sat, 4 Apr 2020 17:42:36 +0200
Subject: [PATCH] added -i option to ignore header formats. to be used with -v
 for single genomes per file

---
 README.txt  | 37 +++++++++++++++++++++++--------------
 fetchMGs.pl | 25 +++++++++++++++++--------
 2 files changed, 40 insertions(+), 22 deletions(-)
diff --git a/README.txt b/README.txt
index 4c6d8e3..93ffcd1 100755
--- a/README.txt
+++ b/README.txt
@@ -1,6 +1,6 @@
 
 ===============
- fetchMGs v 1.1 
+ fetchMGs v 1.2 
 ===============
 
 
@@ -82,19 +82,28 @@ Usage
        fetchMGs.pl -m|mode <extraction|calibration> [OPTIONS]
 
 Extraction mode
-       ./fetchMGs.pl -m extraction <protein sequences> [optional options]
-           <protein sequences>               Multi-FASTA file with protein sequences from which marker genes should be extracted
-           -c|og_used                        Orthologous group id to be extracted; example: 'COG0012'; default = 'all'
-           -o|outdir                         Output directory; default = 'output'
-           -h|hmmdir                         Path to directory that contains hmm models; default = './lib'
-           -b|bitscore                       Path to bitscore cutoff file; default = 'lib/MG_BitScoreCutoffs.defaults.txt'
-           -p|protein_only                   Set if nucleotide sequences for filename.faa is not available
-           -v|verybesthit_only               Recommended to use, if extracting sequences from reference genomes.
-                                               For this fasta identifiers should be in the form: taxID.geneID and,
-                                               if needed have ' project_id=XXX' somewhere in the header
-           -t|threads                        Number of processors/threads to be used
-           -d|dnaFastaFile                   Fasta file with DNA sequences of the same genes; not neccesary if protein file and dna file have the same with .faa and .fna suffixes
-           -x|xbin                           Path to binaries used by this script. default = '' --> will search for variables in $PATH
+       ./fetchMGs.pl [options] -m extraction <protein sequences>
+           <protein sequences>           Multi-FASTA file with protein sequences from which universal single-copy marker genes should be extracted
+           -c|og_used                    Orthologous group id to be extracted; example: 'COG0012'; default = 'all'
+           -o|outdir                     Output directory; default = 'output'
+           -b|bitscore                   Path to bitscore cutoff file;
+                                           default = '$pathInWhichThisScriptResides/lib/MG_BitScoreCutoffs.[allhits|verybesthit].txt' (depending on -v option)
+           -l|library                    Path to directory that contains hmm models;
+                                           default = '$pathInWhichThisScriptResides/lib'
+           -p|protein_only               Set if nucleotide sequences file for <protein sequences> is not available
+           -d|dnaFastaFile               Multi-FASTA file with the nucleotide sequences corresponding to <protein sequences>;
+                                                   not necessary if the protein and nucleotide FASTA files have the same name apart from the .faa and .fna suffixes
+           -v|verybesthit_only               Only extract the best hit of each OG from each genome.
+                                                  Recommended to use, if extracting sequences from multiple reference genomes in the same file.
+                                                  Do not use it for metagenomes.
+                                                  If this option is set, FASTA identifiers should be in the form taxID.geneID and, if needed, have 'project_id=XXX' in the header.
+                                                  Alternatively, set -i to ignore the headers; the best hit of each OG in the whole input file is then selected, regardless of the headers used.
+           -i|ignore_headers                 If this option is set in addition to -v, the best hit of each OG in the whole input file will be selected, regardless of the header format.
+                                                  Recommended to use, if the input file contains sequences from a single genome.
+           -t|threads                    Number of processors/threads to be used
+           -x|executables                Path to executables used by this script (hmmsearch; seqtk).
+                                                   default = '$pathInWhichThisScriptResides/bin'
+                                                   If set to '' will search for executables in $PATH
 
 Calibration mode
        ./fetchMGs.pl -m calibration <reference protein sequences> <true positives map>
diff --git a/fetchMGs.pl b/fetchMGs.pl
index 5244b3d..45e926f 100755
--- a/fetchMGs.pl
+++ b/fetchMGs.pl
@@ -22,6 +22,7 @@
 Extraction mode
        ./fetchMGs.pl [options] -m extraction <protein sequences> 
            <protein sequences>           Multi-FASTA file with protein sequences from which universal single-copy marker genes should be extracted
+           -c|og_used                    Orthologous group id to be extracted; example: \'COG0012\'; default = \'all\'
            -o|outdir                     Output directory; default = \'output\'
            -b|bitscore                   Path to bitscore cutoff file; 
                                            default = \'\$pathInWhichThisScriptResides/lib/MG_BitScoreCutoffs.[allhits|verybesthit].txt\' (depending on -v option)
@@ -30,12 +31,13 @@
            -p|protein_only               Set if nucleotide sequences file for <protein sequences> is not available
            -d|dnaFastaFile               Multi-FASTA file with nucleotide sequences file for <protein sequences>; 
 		                                   not neccesary if protein and nucleotide fasta file have the same name except .faa and .fna suffixes
-           -v|verybesthit_only           Only extract the best hit to each OG from each genome. 
-		                                   Recommended to use, if extracting sequences from reference genomes. 
-                                           Please do not use for metagenomes.
-                                           If this option is set fasta identifiers should be in the form: taxID.geneID and,
-                                           if needed have \' project_id=XXX\' somewhere in the header
-           -c|og_used                    Orthologous group id to be extracted; example: \'COG0012\'; default = \'all\'
+           -v|verybesthit_only               Only extract the best hit of each OG from each genome.
+                                                  Recommended to use, if extracting sequences from multiple reference genomes in the same file.
+                                                  Do not use it for metagenomes.
+                                                  If this option is set fasta identifiers should be in the form: taxID.geneID and, if needed, have 'project_id=XXX' in the header.
+                                                  Alternatively, set -i to ignore the headers. Then, the best hit of each OG in the whole input file will be selected, regardless of the headers used.
+           -i|ignore_headers                 If this option is set in addition to -v, the best hit of each COG will be selected.
+                                                  Recommended to use, if extracting sequences from a single genome in the same file.
            -t|threads                    Number of processors/threads to be used
            -x|executables                Path to executables used by this script (hmmsearch; seqtk). 
 		                                   default = \'\$pathInWhichThisScriptResides/bin\' 
@@ -65,6 +67,7 @@
 my $cutoff_file             = "DEFAULT";
 my $protein_only            = 0;
 my $besthit_only            = 0;
+my $ignoreheaders           = 0;
 my $intProcessorNumber      = 1;
 my $floatMin                = 60;
 my $manual                  = 0;
@@ -92,6 +95,7 @@
 	'b|bitscore=s'			=> \$cutoff_file,
 	'p|protein_only'		=> \$protein_only,
 	'v|verybesthit_only'	=> \$besthit_only,
+        'i|ignore_headers'      => \$ignoreheaders,
 	't|threads=i'			=> \$intProcessorNumber,
 	'd|dnaFastaFile=s'		=> \$strDNAFastaFileName,
 	'x|executables=s'		=> \$bin
@@ -206,7 +210,7 @@
 
 print "
 ===================================================================================
-         FetchMGs v1.1 - extraction of marker genes from protein sequences
+         FetchMGs v1.2 - extraction of marker genes from protein sequences
           Copyright (c) 2019 Shinichi Sunagawa, Daniel R Mende
 ===================================================================================
 
@@ -473,7 +477,12 @@ sub filterResultsArray_besthitAmongGenomes {
 	for my $i ( 0 .. $#arrAllResults ) {
 
 		my $strQueryId      = $arrAllResults[$i][0];
-		my $strTaxProjectID = $arrAllResults[$i][1];
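+# With -i (ignore_headers) every hit gets the same placeholder genome ID "X", so the best hit per OG is chosen across the whole input file.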
+		my $strTaxProjectID;
+		if ( $ignoreheaders ) {
+		    $strTaxProjectID = "X" }else{
+			$strTaxProjectID = $arrAllResults[$i][1];
+		    }
 		my $strCOGid        = $arrAllResults[$i][2];
 		my $floatBitscore   = $arrAllResults[$i][3];