push to version 3.1.0 of mOTUs. Fixed some issues with newlines in a …

…mOTUs db file. Added script that creates a fake taxonomy for added genomes
motu-tool · Jun 23, 2023 · 8061921 · 8061921
1 parent dd15f79
commit 8061921
Show file tree

Hide file tree

Showing 15 changed files with 4,611 additions and 40 deletions.
diff --git a/mOTUs-extender/0prod_fetchMGs.py b/mOTUs-extender/0prod_fetchMGs.py
@@ -60,16 +60,17 @@
         mkdir -p {params.outdir}genome/
         mkdir -p {params.outdir}prodigal/
         mkdir -p {params.outdir}fetchMGs/
-        rsync {input.fa} {params.outdir}genome/
-       	prodigal -i {input.fa} -a {params.faa} -d {params.fna} -f gff -o {params.gff} -c -m -g 11 -p single
+        python {script_folder}/reformat_genome.py {input.fa} {params.outdir}genome/{wildcards.sample}.fa
+        #rsync {input.fa} {params.outdir}genome/
+       	prodigal -i {params.outdir}genome/{wildcards.sample}.fa -a {params.faa} -d {params.fna} -f gff -o {params.gff} -c -m -g 11 -p single
         rm -rf {params.outdir}fetchMGs/{wildcards.sample}-bestMGs/
         fetchMGs.pl -m extraction -v -i -d {params.fna} -o {params.outdir}fetchMGs/{wildcards.sample}-bestMGs/ {params.faa} -t {threads}
         mkdir -p {params.outdir}fetchMGs/{wildcards.sample}-bestMGs-renamed/
         python {script_folder}/parse_fetchMGs.py {params.outdir}fetchMGs/{wildcards.sample}-bestMGs/ {params.outdir}fetchMGs/{wildcards.sample}-bestMGs-renamed/ {wildcards.sample}
         rm -r {params.outdir}fetchMGs/{wildcards.sample}-bestMGs/hmmResults/
         rm -r {params.outdir}fetchMGs/{wildcards.sample}-bestMGs/temp/
         if [ -f {params.outdir}fetchMGs/{wildcards.sample}-bestMGs-renamed/motus.mgs.count.ok ]; then
-            sh {script_folder}/extend_mOTUs_addMarkerGenes.sh {input.fa} {wildcards.sample} {dest_path_extension} {script_folder} {temp_db_folder_path} {params.outdir}fetchMGs/{wildcards.sample}-bestMGs-renamed/ {mapfile}
+            sh {script_folder}/extend_mOTUs_addMarkerGenes.sh {params.outdir}genome/{wildcards.sample}.fa {wildcards.sample} {dest_path_extension} {script_folder} {temp_db_folder_path} {params.outdir}fetchMGs/{wildcards.sample}-bestMGs-renamed/ {mapfile}
         fi
         
         ";

diff --git a/mOTUs-extender/LICENSE b/mOTUs-extender/LICENSE
diff --git a/mOTUs-extender/README.md b/mOTUs-extender/README.md
diff --git a/mOTUs-extender/combineDistances_5.py b/mOTUs-extender/combineDistances_5.py
@@ -209,7 +209,7 @@ def fillDistMatrix(distFileName, dictGene2Genome, dictGene2OG, dictGeneLengths,
             OG_2 = dictGene2OG[strGeneID_2]
 
             if (OG_1 != OG_2):
-                sys.stderr.write(str(strGeneID_1) + " " + str(OG_1) + " " + str(strGeneID_2) + " " + str(OG_2) + " have a distance but are annotated to different OGs.\n")
+                sys.stderr.write(str(strGeneID_1) + " cog1=" + str(OG_1) + " " + str(strGeneID_2) + " cog2=" + str(OG_2) + " have a distance but are annotated to different OGs.\n")
 
 #########
 #            if (strGeneID_1.count("|") > 1):

diff --git a/mOTUs-extender/extend_mOTUs_addGenome.sh b/mOTUs-extender/extend_mOTUs_addGenome.sh
@@ -68,7 +68,7 @@ while read line;
 do
 if [ -s "$new_database_folder/dbs/${genome_ID}/${genome_ID}_markerGenes2/$line.notab.fna" ]
 then
-vsearch --threads 1 --usearch_global $new_database_folder/dbs/${genome_ID}/${genome_ID}_markerGenes2/$line.notab.fna --db ${unpadded_mOTUseqsFile}.udb --id 0.8  --maxaccepts 100000 --mincols 20 --blast6out $new_database_folder/dbs/$genome_ID/vsearch/$line.distances_vs_db.m8
+vsearch --threads 1 --usearch_global $new_database_folder/dbs/${genome_ID}/${genome_ID}_markerGenes2/$line.notab.fna --db ${unpadded_mOTUseqsFile}.udb --id 0.8  --maxaccepts 100000 --mincols 20 --strand both --blast6out $new_database_folder/dbs/$genome_ID/vsearch/$line.distances_vs_db.m8
 else
 touch $new_database_folder/dbs/$genome_ID/vsearch/$line.distances_vs_db.m8
 fi

diff --git a/mOTUs-extender/extend_mOTUs_addMarkerGenes.sh b/mOTUs-extender/extend_mOTUs_addMarkerGenes.sh
@@ -35,7 +35,7 @@ cat $new_database_folder/dbs/${genome_ID}/${genome_ID}.genes.len $db_mOTU_genes_
 #make global map file
 
 cat $new_database_folder/dbs/${genome_ID}/${genome_ID}.map2genome $mapfile > $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome
-python $scriptDir/map2genome.py $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome_v5
+python $scriptDir/map2genome_v6.py $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome_v6
 
 #make global genome IDs file
 cut -f2 $new_database_folder/dbs/${genome_ID}/${genome_ID}.map2genome > $new_database_folder/dbs/$genome_ID/${genome_ID}.genomeIDs
@@ -60,7 +60,7 @@ cat $new_database_folder/dbs/$genome_ID/vsearch/normalized//COG*.distances_vs_db
 
 ls $new_database_folder/dbs/${genome_ID}/vsearch/normalized/COG*.distances_vs_db.50p.20n.m8  > $new_database_folder/dbs/${genome_ID}/vsearch/normalized//files_normalized.txt
 
-python $scriptDir/combineDistances_5.py -d 55.0 -c 3 $new_database_folder/dbs/${genome_ID}/vsearch/normalized//files_normalized.txt $new_database_folder/dbs/${genome_ID}/${genome_ID}.genes.all.len $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.genomeIDs $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome_v5 $new_database_folder/dbs/${genome_ID}/vsearch/normalized/AllCOGs.distances_vs_db.excludedPairs $new_database_folder/dbs/${genome_ID}/vsearch/combined.normalized.distances_vs_db.m8
+python $scriptDir/combineDistances_5.py -d 55.0 -c 3 $new_database_folder/dbs/${genome_ID}/vsearch/normalized//files_normalized.txt $new_database_folder/dbs/${genome_ID}/${genome_ID}.genes.all.len $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.genomeIDs $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome_v6 $new_database_folder/dbs/${genome_ID}/vsearch/normalized/AllCOGs.distances_vs_db.excludedPairs $new_database_folder/dbs/${genome_ID}/vsearch/combined.normalized.distances_vs_db.m8
 
 #python $scriptDir/combineDistances_4_useMap.py $new_database_folder/dbs/${genome_ID}/vsearch/normalized//files_normalized.txt $new_database_folder/dbs/${genome_ID}/${genome_ID}.genes.all.len $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.genomeIDs $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome $new_database_folder/dbs/${genome_ID}/vsearch/normalized/AllCOGs.distances_vs_db.excludedPairs $new_database_folder/dbs/${genome_ID}/vsearch/combined.normalized.distances_vs_db.m8
 

diff --git a/mOTUs-extender/extend_mOTUs_generateDB.sh b/mOTUs-extender/extend_mOTUs_generateDB.sh
@@ -1,4 +1,4 @@
-#set -euxo pipefail
+set -euxo pipefail
 #filter genomes from mOTU DB
 
 
@@ -73,7 +73,7 @@ cat $new_database_folder/dbs/${genome_ID}/${genome_ID}.map2genome
 done < $new_database_folder/$newDBName/genomes.filtered.list > $new_database_folder/$newDBName/${newDBName}.new.map2genome
 cut -f2 $new_database_folder/$newDBName/${newDBName}.new.map2genome | sort -u > $new_database_folder/$newDBName/${newDBName}.new.genomeIDs
 
-python $scriptDir/map2genome.py $new_database_folder/$newDBName/${newDBName}.new.map2genome $new_database_folder/$newDBName/${newDBName}.new.map2genome_v5
+python $scriptDir/map2genome_v6.py $new_database_folder/$newDBName/${newDBName}.new.map2genome $new_database_folder/$newDBName/${newDBName}.new.map2genome_v6
 
 while read genome_ID;
 do
@@ -139,7 +139,7 @@ echo "Duplicated entry in new_database_folder/$newDBName/${newDBName}.new.map2ge
 exit 1
 fi
 
-python $scriptDir/combineDistances_5.py -d 55.0 -c 3 $new_database_folder/$newDBName/vsearch/files_normalized.txt $new_database_folder/$newDBName/${newDBName}.new.len $new_database_folder/$newDBName/${newDBName}.new.genomeIDs $new_database_folder/$newDBName/${newDBName}.new.map2genome_v5 $new_database_folder/$newDBName/vsearch/AllCOGs.normalized.excludedPairs $new_database_folder/$newDBName/vsearch/combined.normalized.new.m8
+python $scriptDir/combineDistances_5.py -d 55.0 -c 3 $new_database_folder/$newDBName/vsearch/files_normalized.txt $new_database_folder/$newDBName/${newDBName}.new.len $new_database_folder/$newDBName/${newDBName}.new.genomeIDs $new_database_folder/$newDBName/${newDBName}.new.map2genome_v6 $new_database_folder/$newDBName/vsearch/AllCOGs.normalized.excludedPairs $new_database_folder/$newDBName/vsearch/combined.normalized.new.m8
 
 #python $scriptDir/combineDistances_4_useMap.py $new_database_folder/$newDBName/vsearch/files_normalized.txt $new_database_folder/$newDBName/${newDBName}.new.len $new_database_folder/$newDBName/${newDBName}.new.genomeIDs $new_database_folder/$newDBName/${newDBName}.new.map2genome $new_database_folder/$newDBName/vsearch/AllCOGs.normalized.excludedPairs $new_database_folder/$newDBName/vsearch/combined.normalized.new.m8
 
@@ -191,7 +191,7 @@ done < $new_database_folder/$newDBName/genomes.filtered.list  > $new_database_fo
 
 
 
-python $scriptDir/postprocess_min1.py $mOTU_folder/ $new_database_folder/$newDBName/ $scriptDir/cutoffs_fscore_specIAsRef.csv $threads
+python $scriptDir/postprocess_min1.py $mOTU_folder/ $new_database_folder/$newDBName/$newDBName.padded $new_database_folder/$newDBName/ $scriptDir/cutoffs_fscore_specIAsRef.csv $threads
 
 python $scriptDir/extend_mOTUs_getClusterTaxonomy.py $new_database_folder/$newDBName/vsearch/$newDBName.clustering $taxonomyFile > $new_database_folder/$newDBName/vsearch/$newDBName.taxonomy
 
@@ -220,7 +220,7 @@ cat $mOTU_folder/db_mOTU/db_mOTU_DB_CEN.fasta.annotations $new_database_folder/$
 cat $mOTU_folder/db_mOTU/db_mOTU_DB_NR.fasta $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_DB_NR.fasta $new_database_folder/$newDBName/$newDBName.padded > $new_database_folder/$newDBName/db_mOTU/db_mOTU_DB_NR.fasta
 cat $mOTU_folder/db_mOTU/db_mOTU_genes_length_NR $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_genes_length_NR $new_database_folder/$newDBName/${newDBName}.new.len > $new_database_folder/$newDBName/db_mOTU/db_mOTU_genes_length_NR
 cat $mOTU_folder/db_mOTU/db_mOTU_MAP_genes_to_MGCs.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_MAP_genes_to_MGCs.tsv $new_database_folder/$newDBName/${newDBName}.map > $new_database_folder/$newDBName/db_mOTU/db_mOTU_MAP_genes_to_MGCs.tsv
-cat $mOTU_folder/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv $new_database_folder/$newDBName/vsearch/$newDBName.mOTU-LG.map.line.tsv > $new_database_folder/$newDBName/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv
+cat $mOTU_folder/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv <(echo) $new_database_folder/$newDBName/vsearch/$newDBName.mOTU-LG.map.line.tsv > $new_database_folder/$newDBName/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv
 cat $mOTU_folder/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs.tsv $new_database_folder/$newDBName//$newDBName.mOTU-LG.map.tsv > $new_database_folder/$newDBName/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs.tsv
 cat $mOTU_folder/db_mOTU/db_mOTU_padding_coordinates_CEN.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_padding_coordinates_CEN.tsv > $new_database_folder/$newDBName/db_mOTU/db_mOTU_padding_coordinates_CEN.tsv
 cat $mOTU_folder/db_mOTU/db_mOTU_padding_coordinates_NR.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_padding_coordinates_NR.tsv $new_database_folder/$newDBName/$newDBName.padded.coords > $new_database_folder/$newDBName/db_mOTU/db_mOTU_padding_coordinates_NR.tsv

diff --git a/mOTUs-extender/fake_taxonomy.py b/mOTUs-extender/fake_taxonomy.py
@@ -0,0 +1,18 @@
+import sys
+
+
+novel = set()
+with open(sys.argv[1]) as handle:
+    for line in handle:
+        if 'Novel' in line:
+            genome = line.strip().split('\t')[0]
+            novel.add(genome)
+#with open('test/genomes.tax') as handle:
+#    for line in handle:
+#        line = line.strip().split('\t')
+#        print(line)
+
+for n in sorted(list(novel)):
+    tmp = [n] + ['0 NA'] * 7
+    tmp = '\t'.join(tmp)
+    print(tmp)
diff --git a/mOTUs-extender/load_and_pad_10MGs.py b/mOTUs-extender/load_and_pad_10MGs.py
@@ -34,7 +34,7 @@ def get_sequences(seq_file):
     sequences = []
     with open(seq_file) as handle:
         for (header, sequence) in FastaIO.SimpleFastaParser(handle):
-            sequences.append((header, sequence.upper()))
+            sequences.append((header.split()[0], sequence.upper()))
     return sequences
 
 def revcomp(sequence):

diff --git a/mOTUs-extender/mOTU.v3.1.mOTU-LG.map.gz b/mOTUs-extender/mOTU.v3.1.mOTU-LG.map.gz
diff --git a/mOTUs-extender/map2genome_v6.py b/mOTUs-extender/map2genome_v6.py
@@ -0,0 +1,22 @@
+import sys
+
+input_file = sys.argv[1]
+output_file = sys.argv[2]
+mgs = ['COG0172','COG0012','COG0215','COG0525','COG0495','COG0541','COG0533','COG0552','COG0016','COG0018']
+with open(input_file) as inhandle, open(output_file, 'w') as outhandle:
+    for line in inhandle:
+
+        splits = line.strip().split()
+        gene = splits[0]
+        genome = splits[1]
+        thismg = []
+        for mg in mgs:
+            if mg in gene:
+                thismg.append(mg)
+        if len(thismg) != 1:
+            print('Either no MG or multiple MGs are found. SHould find exactly one. Quitting')
+            exit(1)
+        cog = thismg[0]
+        #cog = 'COG' + gene.split('.COG')[1].split('.')[0].split('-')[0]
+        #print(gene, genome, cog)
+        outhandle.write(f'{gene}\t{genome}\t{cog}\n')
diff --git a/mOTUs-extender/motus-extender.py b/mOTUs-extender/motus-extender.py
@@ -8,7 +8,7 @@
 
 
 
-MOTUS_EXTENDER_VERSION = '3.0.1'
+MOTUS_EXTENDER_VERSION = '3.1.0'
 SCRIPT_DIR = pathlib.Path(__file__).parent.resolve()
 
 ########################################################################################################################################################################################################
@@ -117,10 +117,10 @@ def execute_motus_prepare(args):
     logging.info(f'Downloading mOTUs {MOTUS_EXTENDER_VERSION} database')
     orig_db_folder = input_workfolder.joinpath('orig_db')
     orig_db_folder.mkdir(parents=True, exist_ok=True)
-    download_command = f'curl https://zenodo.org/record/5140350/files/db_mOTU_v3.0.1.tar.gz -o {orig_db_folder}/db_mOTU_v3.0.1.tar.gz'
+    download_command = f'curl https://zenodo.org/record/7778108/files/db_mOTU_v3.1.0.tar.gz -o {orig_db_folder}/db_mOTU_v3.1.0.tar.gz'
     check_call(download_command)
     logging.info(f'Uncompressing mOTUs {MOTUS_EXTENDER_VERSION} database')
-    untar_command = f'tar -xzvf {orig_db_folder}/db_mOTU_v3.0.1.tar.gz -C {orig_db_folder}'
+    untar_command = f'tar -xzvf {orig_db_folder}/db_mOTU_v3.1.0.tar.gz -C {orig_db_folder}'
     check_call(untar_command)
     logging.info(f'Preparing mOTUs {MOTUS_EXTENDER_VERSION} database for extension')
     prepare_db_command = f'python {SCRIPT_DIR}/prepare_mOTUs.py {orig_db_folder}/db_mOTU/ {input_workfolder}/temp_db_folder'
@@ -187,8 +187,8 @@ def execute_motus_membership(args):
         shutdown(0)
 
     # copy mOTU.v3.0.mOTU-LG.map.gz to workfolder
-    motu_map_file = f'{input_workfolder}/mOTU.v3.0.mOTU-LG.map'
-    unzip_command = f'gunzip -c {SCRIPT_DIR}/mOTU.v3.0.mOTU-LG.map.gz > {motu_map_file}'
+    motu_map_file = f'{input_workfolder}/mOTU.v3.1.mOTU-LG.map'
+    unzip_command = f'gunzip -c {SCRIPT_DIR}/mOTU.v3.1.mOTU-LG.map.gz > {motu_map_file}'
     check_call(unzip_command)
     # check the number of genomes
 
@@ -345,8 +345,8 @@ def execute_motus_createdb(args):
         logging.error(f'Taxonomy file {taxonomy_file} does not exist. Please provide a valid file')
         shutdown(1)
 
-    # copy mOTU.v3.0.mOTU-LG.map.gz to workfolder
-    motu_map_file = f'{input_workfolder}/mOTU.v3.0.mOTU-LG.map'
+    # copy mOTU.v3.1.mOTU-LG.map.gz to workfolder
+    motu_map_file = f'{input_workfolder}/mOTU.v3.1.mOTU-LG.map'
     membership_file = input_workfolder.joinpath('mOTUs.membership.tsv')
     novel_genomes  = [line.strip().split('\t')[0] for line in membership_file.read_text().splitlines() if 'Novel' in line]
     all_taxonomy_entries = {line.strip().split('\t')[0]: line.strip().split('\t')[1:]  for line in taxonomy_file.read_text().splitlines() if not line.startswith('#')}

diff --git a/mOTUs-extender/postprocess_min1.py b/mOTUs-extender/postprocess_min1.py
@@ -14,6 +14,7 @@
 def main():
     parser = argparse.ArgumentParser(description='Postprocess the new motus database by adding -1 MGs that are distant enough to new ext-MGs', add_help = True)
     parser.add_argument('db', action="store", help='The updated database folder (same as in prepare_mOTUs.py)')
+    parser.add_argument('reffasta', action="store", help='The padded fasta file with sequences that will be extended.')
     parser.add_argument('newdbfolder', action="store", help='The folder with the file with the new ext-MGs')
     parser.add_argument('cutoffs', action="store", help='10MGs and their cutoff as a file')
     parser.add_argument('threads', action="store", help='threads')
@@ -23,18 +24,18 @@ def main():
     newdbfolder = args.newdbfolder
     threads = int(args.threads)
     cutoffs_file = args.cutoffs
-
+    maxaccepts = 1000
+    maxrejects = 1000
     min_coverage = 0.8
 
-    ext_mgs_file = glob.glob(newdbfolder + '/*padded')[0]
+    ext_mgs_file = args.reffasta
 
-    min1_vs_ext_mgs_file = ext_mgs_file + '_min1.m8'
+    min1_vs_ext_mgs_file = newdbfolder + '/min1_vs_padded.m8'
 
     min1_unpadded_fasta_file = db_folder + '/min1_db_mOTU/db_mOTU_DB_NR_unpadded.fasta'
 
-    if not pathlib.Path(min1_vs_ext_mgs_file).exists():
-        print(f'vsearch --threads {threads} --usearch_global {min1_unpadded_fasta_file} --strand both --db {ext_mgs_file} --id 0.8  --maxaccepts 100 --maxrejects 100 --mincols 20 --alnout {min1_vs_ext_mgs_file}.aln --blast6out {min1_vs_ext_mgs_file}')
-        subprocess.check_call(f'vsearch --threads {threads} --usearch_global {min1_unpadded_fasta_file} --strand both --db {ext_mgs_file} --id 0.8  --maxaccepts 100 --maxrejects 100 --mincols 20 --alnout {min1_vs_ext_mgs_file}.aln --blast6out {min1_vs_ext_mgs_file}',shell=True)
+    #if not pathlib.Path(min1_vs_ext_mgs_file).exists():
+    subprocess.check_call(f'vsearch --threads {threads} --usearch_global {min1_unpadded_fasta_file} --db {ext_mgs_file} --id 0.8  --maxaccepts {maxaccepts} --maxrejects {maxrejects} --mincols 20 --alnout {min1_vs_ext_mgs_file}.aln --blast6out {min1_vs_ext_mgs_file}',shell=True)
 
 
     mg_2_cutoff = {}
@@ -53,7 +54,7 @@ def main():
             splits = line.strip().split()
 
             query = splits[0]
-            cog = query.rsplit('.', 1)[-1]
+            cog = query.split('.')[1].split('-')[0]#query.rsplit('.', 1)[-1]
             cutoff = mg_2_cutoff[cog]
             ref = splits[1]
             percid = float(splits[2])
@@ -191,7 +192,7 @@ def main():
         for line in inhandle:
             allmgcs = line.strip().split()[1].split(';')
             mgcs = ';'.join([mgc for mgc in allmgcs if mgc not in mgcs_to_remove])
-        outhandle.write(f'unassigned\t{mgcs}\n')
+        outhandle.write(f'unassigned\t{mgcs}')
 
     print('################################################################')
 
@@ -283,8 +284,8 @@ def main():
     with open(db_mOTU_padding_coordinates_CEN_file) as inhandle, open(updated_db_mOTU_padding_coordinates_CEN_file, 'w') as outhandle:
         for line in inhandle:
             splits = line.strip().split()
-            mg = splits[0].replace('NA.', '')
-            mg_orig_name = mg.replace('_C', '.C')
+            mg = splits[0]#.replace('NA.', '')
+            mg_orig_name = mg[::-1].replace('_', '.', 1)[::-1]
             if mg_orig_name not in mgs_to_remove:
                 outhandle.write(line)
                 written_mgs += 1
@@ -307,8 +308,8 @@ def main():
     with open(db_mOTU_DB_CEN_fastaannotations_file) as inhandle, open(updated_db_mOTU_DB_CEN_fastaannotations_file, 'w') as outhandle:
         for line in inhandle:
             splits = line.strip().split()
-            mg = splits[1].replace('NA.', '')
-            mg_orig_name = mg.replace('_C', '.C')
+            mg = splits[1]#.replace('NA.', '')
+            mg_orig_name = mg[::-1].replace('_', '.', 1)[::-1]#mg.replace('_C', '.C')
             if mg_orig_name not in mgs_to_remove:
                 outhandle.write(line)
                 written_mgs += 1
@@ -323,14 +324,14 @@ def main():
 
 
     updated_db_mOTU_DB_CEN_file = outputfolder + '/db_mOTU_DB_CEN.fasta'
-    print(f'Writing {db_mOTU_DB_CEN_file}')
+    print(f'Reading {db_mOTU_DB_CEN_file}')
     print(f'Writing {updated_db_mOTU_DB_CEN_file}')
     written_mgs = 0
     removed_mgs = 0
     with open(db_mOTU_DB_CEN_file) as inhandle, open(updated_db_mOTU_DB_CEN_file, 'w') as outhandle:
         for cnt, (header, sequence) in enumerate(FastaIO.SimpleFastaParser(inhandle), 1):
-            mg = header.replace('NA.', '')
-            mg_orig_name = mg.replace('_C', '.C')
+            mg = header#.replace('NA.', '')
+            mg_orig_name = mg[::-1].replace('_', '.', 1)[::-1]#mg.replace('_C', '.C')
             if mg_orig_name not in mgs_to_remove:
                 written_mgs += 1
                 outhandle.write(f'>{header}\n{sequence}\n')