Skip to content

Commit

Permalink
push to version 3.1.0 of mOTUs. Fixed some issues with newlines in a …
Browse files Browse the repository at this point in the history
…mOTUs db file. Added script that creates a fake taxonomy for added genomes
  • Loading branch information
Hans-Joachim Ruscheweyh committed Jun 23, 2023
1 parent dd15f79 commit 8061921
Show file tree
Hide file tree
Showing 15 changed files with 4,611 additions and 40 deletions.
7 changes: 4 additions & 3 deletions mOTUs-extender/0prod_fetchMGs.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,17 @@
mkdir -p {params.outdir}genome/
mkdir -p {params.outdir}prodigal/
mkdir -p {params.outdir}fetchMGs/
rsync {input.fa} {params.outdir}genome/
prodigal -i {input.fa} -a {params.faa} -d {params.fna} -f gff -o {params.gff} -c -m -g 11 -p single
python {script_folder}/reformat_genome.py {input.fa} {params.outdir}genome/{wildcards.sample}.fa
#rsync {input.fa} {params.outdir}genome/
prodigal -i {params.outdir}genome/{wildcards.sample}.fa -a {params.faa} -d {params.fna} -f gff -o {params.gff} -c -m -g 11 -p single
rm -rf {params.outdir}fetchMGs/{wildcards.sample}-bestMGs/
fetchMGs.pl -m extraction -v -i -d {params.fna} -o {params.outdir}fetchMGs/{wildcards.sample}-bestMGs/ {params.faa} -t {threads}
mkdir -p {params.outdir}fetchMGs/{wildcards.sample}-bestMGs-renamed/
python {script_folder}/parse_fetchMGs.py {params.outdir}fetchMGs/{wildcards.sample}-bestMGs/ {params.outdir}fetchMGs/{wildcards.sample}-bestMGs-renamed/ {wildcards.sample}
rm -r {params.outdir}fetchMGs/{wildcards.sample}-bestMGs/hmmResults/
rm -r {params.outdir}fetchMGs/{wildcards.sample}-bestMGs/temp/
if [ -f {params.outdir}fetchMGs/{wildcards.sample}-bestMGs-renamed/motus.mgs.count.ok ]; then
sh {script_folder}/extend_mOTUs_addMarkerGenes.sh {input.fa} {wildcards.sample} {dest_path_extension} {script_folder} {temp_db_folder_path} {params.outdir}fetchMGs/{wildcards.sample}-bestMGs-renamed/ {mapfile}
sh {script_folder}/extend_mOTUs_addMarkerGenes.sh {params.outdir}genome/{wildcards.sample}.fa {wildcards.sample} {dest_path_extension} {script_folder} {temp_db_folder_path} {params.outdir}fetchMGs/{wildcards.sample}-bestMGs-renamed/ {mapfile}
fi
";
Expand Down
674 changes: 674 additions & 0 deletions mOTUs-extender/LICENSE

Large diffs are not rendered by default.

3,845 changes: 3,845 additions & 0 deletions mOTUs-extender/README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion mOTUs-extender/combineDistances_5.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def fillDistMatrix(distFileName, dictGene2Genome, dictGene2OG, dictGeneLengths,
OG_2 = dictGene2OG[strGeneID_2]

if (OG_1 != OG_2):
sys.stderr.write(str(strGeneID_1) + " " + str(OG_1) + " " + str(strGeneID_2) + " " + str(OG_2) + " have a distance but are annotated to different OGs.\n")
sys.stderr.write(str(strGeneID_1) + " cog1=" + str(OG_1) + " " + str(strGeneID_2) + " cog2=" + str(OG_2) + " have a distance but are annotated to different OGs.\n")

#########
# if (strGeneID_1.count("|") > 1):
Expand Down
2 changes: 1 addition & 1 deletion mOTUs-extender/extend_mOTUs_addGenome.sh
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ while read line;
do
if [ -s "$new_database_folder/dbs/${genome_ID}/${genome_ID}_markerGenes2/$line.notab.fna" ]
then
vsearch --threads 1 --usearch_global $new_database_folder/dbs/${genome_ID}/${genome_ID}_markerGenes2/$line.notab.fna --db ${unpadded_mOTUseqsFile}.udb --id 0.8 --maxaccepts 100000 --mincols 20 --blast6out $new_database_folder/dbs/$genome_ID/vsearch/$line.distances_vs_db.m8
vsearch --threads 1 --usearch_global $new_database_folder/dbs/${genome_ID}/${genome_ID}_markerGenes2/$line.notab.fna --db ${unpadded_mOTUseqsFile}.udb --id 0.8 --maxaccepts 100000 --mincols 20 --strand both --blast6out $new_database_folder/dbs/$genome_ID/vsearch/$line.distances_vs_db.m8
else
touch $new_database_folder/dbs/$genome_ID/vsearch/$line.distances_vs_db.m8
fi
Expand Down
4 changes: 2 additions & 2 deletions mOTUs-extender/extend_mOTUs_addMarkerGenes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ cat $new_database_folder/dbs/${genome_ID}/${genome_ID}.genes.len $db_mOTU_genes_
#make global map file

cat $new_database_folder/dbs/${genome_ID}/${genome_ID}.map2genome $mapfile > $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome
python $scriptDir/map2genome.py $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome_v5
python $scriptDir/map2genome_v6.py $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome_v6

#make global genome IDs file
cut -f2 $new_database_folder/dbs/${genome_ID}/${genome_ID}.map2genome > $new_database_folder/dbs/$genome_ID/${genome_ID}.genomeIDs
Expand All @@ -60,7 +60,7 @@ cat $new_database_folder/dbs/$genome_ID/vsearch/normalized//COG*.distances_vs_db

ls $new_database_folder/dbs/${genome_ID}/vsearch/normalized/COG*.distances_vs_db.50p.20n.m8 > $new_database_folder/dbs/${genome_ID}/vsearch/normalized//files_normalized.txt

python $scriptDir/combineDistances_5.py -d 55.0 -c 3 $new_database_folder/dbs/${genome_ID}/vsearch/normalized//files_normalized.txt $new_database_folder/dbs/${genome_ID}/${genome_ID}.genes.all.len $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.genomeIDs $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome_v5 $new_database_folder/dbs/${genome_ID}/vsearch/normalized/AllCOGs.distances_vs_db.excludedPairs $new_database_folder/dbs/${genome_ID}/vsearch/combined.normalized.distances_vs_db.m8
python $scriptDir/combineDistances_5.py -d 55.0 -c 3 $new_database_folder/dbs/${genome_ID}/vsearch/normalized//files_normalized.txt $new_database_folder/dbs/${genome_ID}/${genome_ID}.genes.all.len $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.genomeIDs $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome_v6 $new_database_folder/dbs/${genome_ID}/vsearch/normalized/AllCOGs.distances_vs_db.excludedPairs $new_database_folder/dbs/${genome_ID}/vsearch/combined.normalized.distances_vs_db.m8

#python $scriptDir/combineDistances_4_useMap.py $new_database_folder/dbs/${genome_ID}/vsearch/normalized//files_normalized.txt $new_database_folder/dbs/${genome_ID}/${genome_ID}.genes.all.len $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.genomeIDs $new_database_folder/dbs/${genome_ID}/${genome_ID}.all.map2genome $new_database_folder/dbs/${genome_ID}/vsearch/normalized/AllCOGs.distances_vs_db.excludedPairs $new_database_folder/dbs/${genome_ID}/vsearch/combined.normalized.distances_vs_db.m8

Expand Down
10 changes: 5 additions & 5 deletions mOTUs-extender/extend_mOTUs_generateDB.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#set -euxo pipefail
set -euxo pipefail
#filter genomes from mOTU DB


Expand Down Expand Up @@ -73,7 +73,7 @@ cat $new_database_folder/dbs/${genome_ID}/${genome_ID}.map2genome
done < $new_database_folder/$newDBName/genomes.filtered.list > $new_database_folder/$newDBName/${newDBName}.new.map2genome
cut -f2 $new_database_folder/$newDBName/${newDBName}.new.map2genome | sort -u > $new_database_folder/$newDBName/${newDBName}.new.genomeIDs

python $scriptDir/map2genome.py $new_database_folder/$newDBName/${newDBName}.new.map2genome $new_database_folder/$newDBName/${newDBName}.new.map2genome_v5
python $scriptDir/map2genome_v6.py $new_database_folder/$newDBName/${newDBName}.new.map2genome $new_database_folder/$newDBName/${newDBName}.new.map2genome_v6

while read genome_ID;
do
Expand Down Expand Up @@ -139,7 +139,7 @@ echo "Duplicated entry in new_database_folder/$newDBName/${newDBName}.new.map2ge
exit 1
fi

python $scriptDir/combineDistances_5.py -d 55.0 -c 3 $new_database_folder/$newDBName/vsearch/files_normalized.txt $new_database_folder/$newDBName/${newDBName}.new.len $new_database_folder/$newDBName/${newDBName}.new.genomeIDs $new_database_folder/$newDBName/${newDBName}.new.map2genome_v5 $new_database_folder/$newDBName/vsearch/AllCOGs.normalized.excludedPairs $new_database_folder/$newDBName/vsearch/combined.normalized.new.m8
python $scriptDir/combineDistances_5.py -d 55.0 -c 3 $new_database_folder/$newDBName/vsearch/files_normalized.txt $new_database_folder/$newDBName/${newDBName}.new.len $new_database_folder/$newDBName/${newDBName}.new.genomeIDs $new_database_folder/$newDBName/${newDBName}.new.map2genome_v6 $new_database_folder/$newDBName/vsearch/AllCOGs.normalized.excludedPairs $new_database_folder/$newDBName/vsearch/combined.normalized.new.m8

#python $scriptDir/combineDistances_4_useMap.py $new_database_folder/$newDBName/vsearch/files_normalized.txt $new_database_folder/$newDBName/${newDBName}.new.len $new_database_folder/$newDBName/${newDBName}.new.genomeIDs $new_database_folder/$newDBName/${newDBName}.new.map2genome $new_database_folder/$newDBName/vsearch/AllCOGs.normalized.excludedPairs $new_database_folder/$newDBName/vsearch/combined.normalized.new.m8

Expand Down Expand Up @@ -191,7 +191,7 @@ done < $new_database_folder/$newDBName/genomes.filtered.list > $new_database_fo



python $scriptDir/postprocess_min1.py $mOTU_folder/ $new_database_folder/$newDBName/ $scriptDir/cutoffs_fscore_specIAsRef.csv $threads
python $scriptDir/postprocess_min1.py $mOTU_folder/ $new_database_folder/$newDBName/$newDBName.padded $new_database_folder/$newDBName/ $scriptDir/cutoffs_fscore_specIAsRef.csv $threads

python $scriptDir/extend_mOTUs_getClusterTaxonomy.py $new_database_folder/$newDBName/vsearch/$newDBName.clustering $taxonomyFile > $new_database_folder/$newDBName/vsearch/$newDBName.taxonomy

Expand Down Expand Up @@ -220,7 +220,7 @@ cat $mOTU_folder/db_mOTU/db_mOTU_DB_CEN.fasta.annotations $new_database_folder/$
cat $mOTU_folder/db_mOTU/db_mOTU_DB_NR.fasta $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_DB_NR.fasta $new_database_folder/$newDBName/$newDBName.padded > $new_database_folder/$newDBName/db_mOTU/db_mOTU_DB_NR.fasta
cat $mOTU_folder/db_mOTU/db_mOTU_genes_length_NR $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_genes_length_NR $new_database_folder/$newDBName/${newDBName}.new.len > $new_database_folder/$newDBName/db_mOTU/db_mOTU_genes_length_NR
cat $mOTU_folder/db_mOTU/db_mOTU_MAP_genes_to_MGCs.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_MAP_genes_to_MGCs.tsv $new_database_folder/$newDBName/${newDBName}.map > $new_database_folder/$newDBName/db_mOTU/db_mOTU_MAP_genes_to_MGCs.tsv
cat $mOTU_folder/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv $new_database_folder/$newDBName/vsearch/$newDBName.mOTU-LG.map.line.tsv > $new_database_folder/$newDBName/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv
cat $mOTU_folder/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv <(echo) $new_database_folder/$newDBName/vsearch/$newDBName.mOTU-LG.map.line.tsv > $new_database_folder/$newDBName/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs_in-line.tsv
cat $mOTU_folder/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs.tsv $new_database_folder/$newDBName//$newDBName.mOTU-LG.map.tsv > $new_database_folder/$newDBName/db_mOTU/db_mOTU_MAP_MGCs_to_mOTUs.tsv
cat $mOTU_folder/db_mOTU/db_mOTU_padding_coordinates_CEN.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_padding_coordinates_CEN.tsv > $new_database_folder/$newDBName/db_mOTU/db_mOTU_padding_coordinates_CEN.tsv
cat $mOTU_folder/db_mOTU/db_mOTU_padding_coordinates_NR.tsv $new_database_folder/$newDBName/updated_min1_db_mOTU/db_mOTU_padding_coordinates_NR.tsv $new_database_folder/$newDBName/$newDBName.padded.coords > $new_database_folder/$newDBName/db_mOTU/db_mOTU_padding_coordinates_NR.tsv
Expand Down
18 changes: 18 additions & 0 deletions mOTUs-extender/fake_taxonomy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import sys


novel = set()
with open(sys.argv[1]) as handle:
for line in handle:
if 'Novel' in line:
genome = line.strip().split('\t')[0]
novel.add(genome)
#with open('test/genomes.tax') as handle:
# for line in handle:
# line = line.strip().split('\t')
# print(line)

for n in sorted(list(novel)):
tmp = [n] + ['0 NA'] * 7
tmp = '\t'.join(tmp)
print(tmp)
2 changes: 1 addition & 1 deletion mOTUs-extender/load_and_pad_10MGs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def get_sequences(seq_file):
sequences = []
with open(seq_file) as handle:
for (header, sequence) in FastaIO.SimpleFastaParser(handle):
sequences.append((header, sequence.upper()))
sequences.append((header.split()[0], sequence.upper()))
return sequences

def revcomp(sequence):
Expand Down
Binary file added mOTUs-extender/mOTU.v3.1.mOTU-LG.map.gz
Binary file not shown.
22 changes: 22 additions & 0 deletions mOTUs-extender/map2genome_v6.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]
mgs = ['COG0172','COG0012','COG0215','COG0525','COG0495','COG0541','COG0533','COG0552','COG0016','COG0018']
with open(input_file) as inhandle, open(output_file, 'w') as outhandle:
for line in inhandle:

splits = line.strip().split()
gene = splits[0]
genome = splits[1]
thismg = []
for mg in mgs:
if mg in gene:
thismg.append(mg)
if len(thismg) != 1:
print('Either no MG or multiple MGs are found. SHould find exactly one. Quitting')
exit(1)
cog = thismg[0]
#cog = 'COG' + gene.split('.COG')[1].split('.')[0].split('-')[0]
#print(gene, genome, cog)
outhandle.write(f'{gene}\t{genome}\t{cog}\n')
14 changes: 7 additions & 7 deletions mOTUs-extender/motus-extender.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@



MOTUS_EXTENDER_VERSION = '3.0.1'
MOTUS_EXTENDER_VERSION = '3.1.0'
SCRIPT_DIR = pathlib.Path(__file__).parent.resolve()

########################################################################################################################################################################################################
Expand Down Expand Up @@ -117,10 +117,10 @@ def execute_motus_prepare(args):
logging.info(f'Downloading mOTUs {MOTUS_EXTENDER_VERSION} database')
orig_db_folder = input_workfolder.joinpath('orig_db')
orig_db_folder.mkdir(parents=True, exist_ok=True)
download_command = f'curl https://zenodo.org/record/5140350/files/db_mOTU_v3.0.1.tar.gz -o {orig_db_folder}/db_mOTU_v3.0.1.tar.gz'
download_command = f'curl https://zenodo.org/record/7778108/files/db_mOTU_v3.1.0.tar.gz -o {orig_db_folder}/db_mOTU_v3.1.0.tar.gz'
check_call(download_command)
logging.info(f'Uncompressing mOTUs {MOTUS_EXTENDER_VERSION} database')
untar_command = f'tar -xzvf {orig_db_folder}/db_mOTU_v3.0.1.tar.gz -C {orig_db_folder}'
untar_command = f'tar -xzvf {orig_db_folder}/db_mOTU_v3.1.0.tar.gz -C {orig_db_folder}'
check_call(untar_command)
logging.info(f'Preparing mOTUs {MOTUS_EXTENDER_VERSION} database for extension')
prepare_db_command = f'python {SCRIPT_DIR}/prepare_mOTUs.py {orig_db_folder}/db_mOTU/ {input_workfolder}/temp_db_folder'
Expand Down Expand Up @@ -187,8 +187,8 @@ def execute_motus_membership(args):
shutdown(0)

# copy mOTU.v3.0.mOTU-LG.map.gz to workfolder
motu_map_file = f'{input_workfolder}/mOTU.v3.0.mOTU-LG.map'
unzip_command = f'gunzip -c {SCRIPT_DIR}/mOTU.v3.0.mOTU-LG.map.gz > {motu_map_file}'
motu_map_file = f'{input_workfolder}/mOTU.v3.1.mOTU-LG.map'
unzip_command = f'gunzip -c {SCRIPT_DIR}/mOTU.v3.1.mOTU-LG.map.gz > {motu_map_file}'
check_call(unzip_command)
# check the number of genomes

Expand Down Expand Up @@ -345,8 +345,8 @@ def execute_motus_createdb(args):
logging.error(f'Taxonomy file {taxonomy_file} does not exist. Please provide a valid file')
shutdown(1)

# copy mOTU.v3.0.mOTU-LG.map.gz to workfolder
motu_map_file = f'{input_workfolder}/mOTU.v3.0.mOTU-LG.map'
# copy mOTU.v3.1.mOTU-LG.map.gz to workfolder
motu_map_file = f'{input_workfolder}/mOTU.v3.1.mOTU-LG.map'
membership_file = input_workfolder.joinpath('mOTUs.membership.tsv')
novel_genomes = [line.strip().split('\t')[0] for line in membership_file.read_text().splitlines() if 'Novel' in line]
all_taxonomy_entries = {line.strip().split('\t')[0]: line.strip().split('\t')[1:] for line in taxonomy_file.read_text().splitlines() if not line.startswith('#')}
Expand Down
31 changes: 16 additions & 15 deletions mOTUs-extender/postprocess_min1.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
def main():
parser = argparse.ArgumentParser(description='Postprocess the new motus database by adding -1 MGs that are distant enough to new ext-MGs', add_help = True)
parser.add_argument('db', action="store", help='The updated database folder (same as in prepare_mOTUs.py)')
parser.add_argument('reffasta', action="store", help='The padded fasta file with sequences that will be extended.')
parser.add_argument('newdbfolder', action="store", help='The folder with the file with the new ext-MGs')
parser.add_argument('cutoffs', action="store", help='10MGs and their cutoff as a file')
parser.add_argument('threads', action="store", help='threads')
Expand All @@ -23,18 +24,18 @@ def main():
newdbfolder = args.newdbfolder
threads = int(args.threads)
cutoffs_file = args.cutoffs

maxaccepts = 1000
maxrejects = 1000
min_coverage = 0.8

ext_mgs_file = glob.glob(newdbfolder + '/*padded')[0]
ext_mgs_file = args.reffasta

min1_vs_ext_mgs_file = ext_mgs_file + '_min1.m8'
min1_vs_ext_mgs_file = newdbfolder + '/min1_vs_padded.m8'

min1_unpadded_fasta_file = db_folder + '/min1_db_mOTU/db_mOTU_DB_NR_unpadded.fasta'

if not pathlib.Path(min1_vs_ext_mgs_file).exists():
print(f'vsearch --threads {threads} --usearch_global {min1_unpadded_fasta_file} --strand both --db {ext_mgs_file} --id 0.8 --maxaccepts 100 --maxrejects 100 --mincols 20 --alnout {min1_vs_ext_mgs_file}.aln --blast6out {min1_vs_ext_mgs_file}')
subprocess.check_call(f'vsearch --threads {threads} --usearch_global {min1_unpadded_fasta_file} --strand both --db {ext_mgs_file} --id 0.8 --maxaccepts 100 --maxrejects 100 --mincols 20 --alnout {min1_vs_ext_mgs_file}.aln --blast6out {min1_vs_ext_mgs_file}',shell=True)
#if not pathlib.Path(min1_vs_ext_mgs_file).exists():
subprocess.check_call(f'vsearch --threads {threads} --usearch_global {min1_unpadded_fasta_file} --db {ext_mgs_file} --id 0.8 --maxaccepts {maxaccepts} --maxrejects {maxrejects} --mincols 20 --alnout {min1_vs_ext_mgs_file}.aln --blast6out {min1_vs_ext_mgs_file}',shell=True)


mg_2_cutoff = {}
Expand All @@ -53,7 +54,7 @@ def main():
splits = line.strip().split()

query = splits[0]
cog = query.rsplit('.', 1)[-1]
cog = query.split('.')[1].split('-')[0]#query.rsplit('.', 1)[-1]
cutoff = mg_2_cutoff[cog]
ref = splits[1]
percid = float(splits[2])
Expand Down Expand Up @@ -191,7 +192,7 @@ def main():
for line in inhandle:
allmgcs = line.strip().split()[1].split(';')
mgcs = ';'.join([mgc for mgc in allmgcs if mgc not in mgcs_to_remove])
outhandle.write(f'unassigned\t{mgcs}\n')
outhandle.write(f'unassigned\t{mgcs}')

print('################################################################')

Expand Down Expand Up @@ -283,8 +284,8 @@ def main():
with open(db_mOTU_padding_coordinates_CEN_file) as inhandle, open(updated_db_mOTU_padding_coordinates_CEN_file, 'w') as outhandle:
for line in inhandle:
splits = line.strip().split()
mg = splits[0].replace('NA.', '')
mg_orig_name = mg.replace('_C', '.C')
mg = splits[0]#.replace('NA.', '')
mg_orig_name = mg[::-1].replace('_', '.', 1)[::-1]
if mg_orig_name not in mgs_to_remove:
outhandle.write(line)
written_mgs += 1
Expand All @@ -307,8 +308,8 @@ def main():
with open(db_mOTU_DB_CEN_fastaannotations_file) as inhandle, open(updated_db_mOTU_DB_CEN_fastaannotations_file, 'w') as outhandle:
for line in inhandle:
splits = line.strip().split()
mg = splits[1].replace('NA.', '')
mg_orig_name = mg.replace('_C', '.C')
mg = splits[1]#.replace('NA.', '')
mg_orig_name = mg[::-1].replace('_', '.', 1)[::-1]#mg.replace('_C', '.C')
if mg_orig_name not in mgs_to_remove:
outhandle.write(line)
written_mgs += 1
Expand All @@ -323,14 +324,14 @@ def main():


updated_db_mOTU_DB_CEN_file = outputfolder + '/db_mOTU_DB_CEN.fasta'
print(f'Writing {db_mOTU_DB_CEN_file}')
print(f'Reading {db_mOTU_DB_CEN_file}')
print(f'Writing {updated_db_mOTU_DB_CEN_file}')
written_mgs = 0
removed_mgs = 0
with open(db_mOTU_DB_CEN_file) as inhandle, open(updated_db_mOTU_DB_CEN_file, 'w') as outhandle:
for cnt, (header, sequence) in enumerate(FastaIO.SimpleFastaParser(inhandle), 1):
mg = header.replace('NA.', '')
mg_orig_name = mg.replace('_C', '.C')
mg = header#.replace('NA.', '')
mg_orig_name = mg[::-1].replace('_', '.', 1)[::-1]#mg.replace('_C', '.C')
if mg_orig_name not in mgs_to_remove:
written_mgs += 1
outhandle.write(f'>{header}\n{sequence}\n')
Expand Down
Loading

0 comments on commit 8061921

Please sign in to comment.