diff --git a/src/virmet/cli.py b/src/virmet/cli.py
index 6d92431..737e6be 100755
--- a/src/virmet/cli.py
+++ b/src/virmet/cli.py
@@ -98,6 +98,7 @@ def main():
     parser_fetch.add_argument('--bact', help='bacterial (RefSeq)', action='store_true')
     parser_fetch.add_argument('--fungal', help='fungal (RefSeq)', action='store_true')
     parser_fetch.add_argument('--bovine', help='bovine (Bos taurus)', action='store_true')
+    parser_fetch.add_argument('--no_db_compression', help='do not compress the viral database', action='store_true', default=False)
     parser_fetch.set_defaults(func=fetch_db)

     # create the parser for command "update"
@@ -106,6 +107,7 @@ def main():
     parser_update.add_argument('--bact', help='update bacterial database', action='store_true', default=False)
     parser_update.add_argument('--fungal', help='update fungal database', action='store_true', default=False)
     parser_update.add_argument('--picked', help='file with additional sequences, one GI per line', default=None)
+    parser_update.add_argument('--update_min_date', help='update viral [n]ucleotide/[p]rotein database with sequences published after the date YYYY/MM/DD (e.g. 2022/09/01)', default=None)
     parser_update.set_defaults(func=update_db)

     # create the parser for command "index"
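Below, a minimal standalone sketch of how the two new options parse. The subcommand layout mirrors cli.py, but the parsers are stripped down and the `--viral` choices are an assumption, since that option's definition is not part of this hunk:

```python
import argparse

parser = argparse.ArgumentParser(prog='virmet')
subparsers = parser.add_subparsers(dest='command')

parser_fetch = subparsers.add_parser('fetch')
parser_fetch.add_argument('--viral', choices=['n', 'p'])  # assumed signature
parser_fetch.add_argument('--no_db_compression', action='store_true', default=False)

parser_update = subparsers.add_parser('update')
parser_update.add_argument('--viral', choices=['n', 'p'])  # assumed signature
parser_update.add_argument('--update_min_date', default=None)

args = parser.parse_args(['update', '--viral', 'n', '--update_min_date', '2022/09/01'])
assert args.update_min_date == '2022/09/01'
```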
diff --git a/src/virmet/common.py b/src/virmet/common.py
index 75e2135..4e19078 100644
--- a/src/virmet/common.py
+++ b/src/virmet/common.py
@@ -10,8 +10,10 @@
 import multiprocessing as mp
 import pandas as pd

-DB_DIR = '/data/virmet_databases/'
-N_FILES_BACT = 5
+DB_DIR = '/data/virmet_databases'
+DB_DIR_UPDATE = '/data/virmet_databases_update'
+N_FILES_BACT = 16
+MAX_TAXID = 10000  # max number of sequences kept per taxid in the compressed viral database

 # decorator taken from RepoPhlan
 # https://bitbucket.org/nsegata/repophlan/src/5804db9d341165f72c4f7e448691ce90a1465764/repophlan_get_microbes.py?at=ncbi2017&fileviewer=file-view-default
@@ -98,7 +102,7 @@ def ftp_down(remote_url, local_url=None):
         else:
             outhandle = open(outname, 'wb')
         logging.debug('Downloading %s', remote_url)
-        with urllib.request.urlopen(remote_url, timeout=30) as f:
+        with urllib.request.urlopen(remote_url, timeout=300) as f:
             outhandle.write(f.read())

     # uncompressed to uncompressed
@@ -115,28 +119,97 @@ def ftp_down(remote_url, local_url=None):
     # outhandle.close()
     return outhandle

+def random_reduction(viral_mode):
+    # Subsample the sequences of any taxid represented by more than MAX_TAXID
+    # (10,000) entries, always keeping RefSeq sequences. This is needed
+    # because SARS-CoV-2 alone accounted for more than 90% of the viral
+    # database at the beginning of 2023.
+    if viral_mode == 'n':
+        logging.info('compressing viral nuccore sequences')
+        target_dir = os.path.join(DB_DIR_UPDATE, 'viral_nuccore')
+    elif viral_mode == 'p':
+        logging.info('compressing viral protein sequences')
+        target_dir = os.path.join(DB_DIR_UPDATE, 'viral_protein')
+    logging.info('Database real path for compression: %s', os.path.realpath(target_dir))
+    os.chdir(target_dir)
+
+    viral_info_file = os.path.join(target_dir, 'viral_seqs_info.tsv')
+    viral_fasta_file = os.path.join(target_dir, 'viral_database.fasta')
+
+    viral_info = pd.read_table(viral_info_file, names=['accn', 'TaxId', 'seq_len', 'Organism', 'Title', 'accn_version'])
+    viral_info.drop_duplicates(subset=['accn_version'], inplace=True)
+    # count sequences per taxid and single out the taxids above the threshold
+    # (note: the column names below assume pandas < 2.0, where
+    # value_counts().reset_index() yields the columns 'index' and 'TaxId')
+    TaxId_to_counter_df = viral_info['TaxId'].value_counts().reset_index()
+    TaxId_to_counter_df.rename(columns={"TaxId": "TaxId_count", "index": "TaxId_num"}, inplace=True)
+    TaxId_to_percentage = viral_info['TaxId'].value_counts(normalize=True).reset_index()
+    TaxId_to_counter_df['percentage'] = TaxId_to_percentage['TaxId']
+    TaxId_to_counter_filtered_df = TaxId_to_counter_df[TaxId_to_counter_df["TaxId_count"] > MAX_TAXID]
+
+    viral_info_subsampled = viral_info.copy()
+    random.seed(100)
+    removed_set = set()
+    for _, row in TaxId_to_counter_filtered_df.iterrows():
+        taxid_to_subsample = int(row['TaxId_num'])
+        viral_info_to_subsample_df = viral_info_subsampled[viral_info_subsampled['TaxId'] == taxid_to_subsample]
+        accn_set = set(viral_info_to_subsample_df['accn_version'])
+        # draw MAX_TAXID accessions for this taxid; random.sample needs a
+        # sequence, and sorting keeps the draw reproducible given the seed
+        selected_accn_ls = random.sample(sorted(accn_set), MAX_TAXID)
+        # keep other taxids, the sampled accessions and all RefSeq entries
+        # (RefSeq accessions contain an underscore, e.g. NC_045512.2)
+        viral_info_subsampled = viral_info_subsampled.loc[(viral_info_subsampled['TaxId'] != taxid_to_subsample) | viral_info_subsampled['accn_version'].isin(selected_accn_ls) | viral_info_subsampled['accn_version'].str.contains('_')]
+        # remember the discarded accessions as extra information
+        removed_set = removed_set | (accn_set - set(selected_accn_ls))
+
+    viral_info_subsampled.drop_duplicates(subset=['accn_version'], inplace=True)
+    viral_info_subsampled.accn_version.to_csv('outfile.csv', sep='\n', index=False, header=False)
+    # extract the selected accessions from the fasta file with seqtk
+    subsample_fasta_command = 'seqtk subseq %s outfile.csv > viral_database_subsampled.fasta' % viral_fasta_file
+    run_child(subsample_fasta_command)
+    os.rename('viral_database.fasta', 'viral_database_original_rmdup.fasta')
+    os.rename('viral_database_subsampled.fasta', 'viral_database.fasta')
+    TaxId_to_counter_filtered_df.to_csv('filtered_taxids.csv', sep=',', index=False)
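A self-contained toy version of the per-taxid cap that random_reduction applies. Accessions and taxids are made up, and MAX_TAXID is shrunk to 3 so the subsampling is visible on seven rows:

```python
import random
import pandas as pd

MAX_TAXID = 3  # the real cap is 10000

# toy stand-in for viral_seqs_info.tsv (accessions are invented)
viral_info = pd.DataFrame({
    'accn_version': ['NC_045512.2', 'OX000001.1', 'OX000002.1', 'OX000003.1',
                     'OX000004.1', 'OX000005.1', 'MN000001.1'],
    'TaxId': [2697049, 2697049, 2697049, 2697049, 2697049, 2697049, 11676],
})

random.seed(100)
kept = viral_info.copy()
counts = viral_info['TaxId'].value_counts()
for taxid in counts[counts > MAX_TAXID].index:
    accs = set(kept.loc[kept['TaxId'] == taxid, 'accn_version'])
    chosen = set(random.sample(sorted(accs), MAX_TAXID))
    # keep other taxids, the sampled accessions and RefSeq entries ('_' in accn)
    kept = kept[(kept['TaxId'] != taxid)
                | kept['accn_version'].isin(chosen)
                | kept['accn_version'].str.contains('_')]

print(kept['accn_version'].tolist())  # NC_045512.2 always survives
```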
-def viral_query(viral_db):
+def viral_query(viral_db, update_min_date=None):
     # Viruses, Taxonomy ID: 10239
     # Human adenovirus A, Taxonomy ID: 129875 (only for testing, 7 hits)
     # Mastadenovirus, Taxonomy ID: 10509 (only for testing, 440 hits)
     # Alphatorquevirus Taxonomy ID: 687331
     # Cellular organisms, Taxonomy ID: 131567 (to avoid chimeras)
     txid = '10239'  # change here for viruses or smaller taxa
-    query_text = '-query \"txid%s [orgn] AND (\\"complete genome\\" [Title] OR srcdb_refseq[prop])' % txid
-    query_text += ' NOT wgs[PROP] NOT \\"cellular organisms\\"[Organism] NOT AC_000001[PACC] : AC_999999[PACC]\"'
+    query_text = '-query \"txid%s [orgn] AND (\\"complete genome\\" [Title] OR \\"complete segment\\" [Title] OR srcdb_refseq[prop])' % txid
+    query_text += ' NOT \\"cellular organisms\\"[Organism] NOT AC_000001[PACC] : AC_999999[PACC]\"'
+
+    if update_min_date:
+        logging.info('Viral database update restricted to sequences published after %s', update_min_date)
+        query_text += ' -datetype PDAT -mindate %s' % str(update_min_date)
+    query_text += ' > ncbi_search'
+
     if viral_db == 'n':
-        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
+        target_dir = os.path.join(DB_DIR_UPDATE, 'viral_nuccore')
         search_text = '-db nuccore ' + query_text
     elif viral_db == 'p':
-        target_dir = os.path.join(DB_DIR, 'viral_protein')
+        target_dir = os.path.join(DB_DIR_UPDATE, 'viral_protein')
         search_text = '-db protein ' + query_text
     try:
         os.mkdir(target_dir)
     except FileExistsError:
         pass
     os.chdir(target_dir)
+    logging.info('Database real path: %s', os.path.realpath(target_dir))
     return 'esearch ' + search_text

@@ -158,7 +231,7 @@ def bact_fung_query(query_type=None, download=True, info_file=None):
         with urllib.request.urlopen(url) as f:
             print(f.read().decode('utf-8'), file=bh)
         bh.close()
-    querinfo = pd.read_csv(info_file, sep='\t', header=0, skiprows=1)
+    querinfo = pd.read_csv(info_file, sep='\t', header=0, skiprows=1, dtype={'excluded_from_refseq': str})
     querinfo.rename(columns={'# assembly_accession': 'assembly_accession'}, inplace=True)
     if query_type == 'bacteria':
         gb = querinfo[(querinfo.assembly_level == 'Complete Genome') &
@@ -169,7 +242,8 @@ def bact_fung_query(query_type=None, download=True, info_file=None):
                       (querinfo.version_status == 'latest') &
                       (querinfo.genome_rep == 'Full') &
                       (querinfo.release_type == 'Major')]
-    gb.set_index('assembly_accession')
+    gb.set_index('assembly_accession', inplace=True)
+    gb = gb[gb['ftp_path'] != 'na']
     x = gb['ftp_path'].apply(lambda col: col + '/' + col.split('/')[-1] + '_genomic.fna.gz')
     gb = gb.assign(ftp_genome_path=x)
     all_urls = list(gb['ftp_genome_path'])
@@ -196,8 +270,8 @@ def download_genomes(all_urls, prefix, n_files=1):
     dl_pairs = []
     for i, seqs in enumerate(seqs_urls):
         fasta_out = 'fasta/%s%d.fasta' % (prefix, i + 1)
-        if os.path.exists(fasta_out):
-            os.remove(fasta_out)
+        # keep an existing output file: fetch_bacterial now calls this twice
+        # and relies on ftp_down appending the second batch to the same files
        dl_pairs.append((fasta_out, seqs))

     # run download in parallel
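Two fixes in this file (`set_index(..., inplace=True)` and the reassignment on the `ftp_path` filter), plus the `viral_info.drop` fix in wolfpack.py further down, revolve around the same pandas pitfall: these methods return a new DataFrame and leave the original untouched unless the result is kept. A minimal demonstration:

```python
import pandas as pd

df = pd.DataFrame({'assembly_accession': ['GCF_1', 'GCF_2'],
                   'ftp_path': ['na', 'ftp://host/genome']})

df.set_index('assembly_accession')                 # result discarded: silent no-op
assert list(df.index) == [0, 1]

df.set_index('assembly_accession', inplace=True)   # the form used above
assert list(df.index) == ['GCF_1', 'GCF_2']

# same idea as gb = gb[gb['ftp_path'] != 'na']: keep rows with a usable path
df = df[df['ftp_path'] != 'na']
assert len(df) == 1
```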
diff --git a/src/virmet/fetch.py b/src/virmet/fetch.py
index ccc1d2f..2d76bfc 100644
--- a/src/virmet/fetch.py
+++ b/src/virmet/fetch.py
@@ -4,12 +4,13 @@
 import logging
 from virmet.common import viral_query, bact_fung_query, ftp_down, run_child, \
-download_genomes, get_accs, DB_DIR, N_FILES_BACT
+download_genomes, get_accs, DB_DIR_UPDATE, N_FILES_BACT, random_reduction

+DB_DIR = DB_DIR_UPDATE

-def fetch_viral(viral_mode):
+
+def fetch_viral(viral_mode, no_compression=False):
     """Download nucleotide or protein database."""
     # define the search nuccore/protein
     if viral_mode == 'n':
         logging.info('downloading viral nuccore sequences')
         target_dir = os.path.join(DB_DIR, 'viral_nuccore')
@@ -19,12 +20,13 @@ def fetch_viral(viral_mode):
         target_dir = os.path.join(DB_DIR, 'viral_protein')
         cml_search = viral_query('p')
     # run the search and download
+    logging.info('Database real path: %s', os.path.realpath(target_dir))
     os.chdir(target_dir)
     run_child(cml_search)
     cml_fetch_fasta = 'efetch -format fasta < ncbi_search > viral_database.fasta'
     run_child(cml_fetch_fasta)
     cml_efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
-    cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
+    cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title AccessionVersion > viral_seqs_info.tsv'
     run_child(cml_efetch_xtract)
     logging.info('downloaded viral seqs info in %s', target_dir)
     logging.info('saving viral taxonomy')
@@ -35,6 +37,17 @@ def fetch_viral(viral_mode):
     accs_2 = set([l.split()[0] for l in open('viral_accn_taxid.dmp')])
     assert accs_1 == accs_2, accs_1 ^ accs_2
     logging.info('taxonomy and fasta sequences match')
+
+    # remove entries with duplicate accessions before the optional compression
+    rmdup_cmd = 'cat viral_database.fasta | seqkit rmdup -i -o viral_database_rmdup.fasta -D duplicated_names.txt'
+    run_child(rmdup_cmd)
+    os.rename('viral_database.fasta', 'viral_database_original.fasta')
+    os.rename('viral_database_rmdup.fasta', 'viral_database.fasta')
+
+    if not no_compression:
+        logging.info('Compressing the viral database')
+        random_reduction(viral_mode)

     os.chdir(DB_DIR)
     logging.info('downloading taxonomy databases')
@@ -50,22 +63,31 @@ def fetch_viral(viral_mode):
             os.remove(ftd)
         except OSError:
             logging.warning('Could not find file %s', ftd)
+    try:
+        run_child('bgzip names.dmp')
+        run_child('bgzip nodes.dmp')
+    except Exception:
+        logging.warning('Could not compress names.dmp, nodes.dmp')


 def fetch_bacterial():
     """Download the bacterial sequence databases."""
     target_dir = os.path.join(DB_DIR, 'bacteria')
+    logging.info('Database real path: %s', os.path.realpath(target_dir))
     try:
         os.mkdir(target_dir)
     except FileExistsError:
         pass
     os.chdir(target_dir)

     # first download summary file with all ftp paths and return urls
     all_urls = bact_fung_query(query_type='bacteria')
     logging.info('%d bacterial genomes were found', len(all_urls))
-    # then download genomic_fna.gz files
-    download_genomes(all_urls, prefix='bact', n_files=N_FILES_BACT)
+    # then download genomic_fna.gz files in two batches; the second batch
+    # appends to the fasta files written by the first (see download_genomes)
+    mid = len(all_urls) // 2
+    download_genomes(all_urls[:mid], prefix='bact', n_files=N_FILES_BACT)
+    logging.info('second half of the bacterial genomes starts')
+    download_genomes(all_urls[mid:], prefix='bact', n_files=N_FILES_BACT)

     for j in range(1, N_FILES_BACT+1):
         run_child('bgzip fasta/bact%d.fasta' % j)
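Dropping the os.remove call in download_genomes is what lets fetch_bacterial download in two batches: ftp_down opens an existing output in append mode, so the second half of the URL list extends the same bact1..bact16 files. A sketch of that append behaviour (file name and records are illustrative):

```python
import os

def write_batch(fasta_out, text):
    # mirrors ftp_down: append when the output already exists, create otherwise
    mode = 'a' if os.path.exists(fasta_out) else 'w'
    with open(fasta_out, mode) as out:
        out.write(text)

write_batch('bact1.fasta', '>seq1\nACGT\n')  # first batch creates the file
write_batch('bact1.fasta', '>seq2\nTTAA\n')  # second batch appends to it
assert open('bact1.fasta').read().count('>') == 2
```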
@@ -73,6 +95,7 @@
 def fetch_human():
     """Download human genome and annotations."""
     target_dir = os.path.join(DB_DIR, 'human')
+    logging.info('Database real path: %s', os.path.realpath(target_dir))
     try:
         os.mkdir(target_dir)
     except FileExistsError:
@@ -99,6 +122,7 @@
 def fetch_fungal():
     """Download fungal sequences."""
     target_dir = os.path.join(DB_DIR, 'fungi')
+    logging.info('Database real path: %s', os.path.realpath(target_dir))
     try:
         os.mkdir(target_dir)
     except FileExistsError:
@@ -115,6 +139,7 @@
 def fetch_bovine():
     """Download cow genome and annotations."""
     target_dir = os.path.join(DB_DIR, 'bovine')
+    logging.info('Database real path: %s', os.path.realpath(target_dir))
     try:
         os.mkdir(target_dir)
     except FileExistsError:
@@ -128,27 +153,27 @@
     chromosomes = ['chr%d' % chrom for chrom in range(1, 30)]
     chromosomes.extend(['chrX'])  # chrY is missing
     logging.info('Downloading bovine genome')
-    local_file_name = os.path.join(target_dir, 'fasta', 'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta')
+    local_file_name = os.path.join(target_dir, 'fasta', 'ref_Bos_taurus_GCF_002263795.2_ARS-UCD1.3.fasta')
     if os.path.exists(local_file_name):
         os.remove(local_file_name)
     for chrom in chromosomes:
         logging.debug('Downloading bovine chromosome %s', chrom)
-        fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/%s.fna.gz' % chrom
+        fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/%s.fna.gz' % chrom
         download_handle = ftp_down(fasta_url, local_file_name)
         download_handle.close()
         logging.debug('Downloaded bovine chromosome %s', chrom)
-    fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz'
+    fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz'
     download_handle = ftp_down(fasta_url, local_file_name)
     download_handle.close()
     logging.debug('Downloaded bovine chromosome MT')
-    fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz'
+    fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz'
     download_handle = ftp_down(fasta_url, local_file_name)
     download_handle.close()
     logging.debug('Downloaded bovine chromosome unplaced')
     run_child('bgzip %s' % local_file_name)
     logging.info('Downloading gff annotation file')
-    gff_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_genomic.gff.gz'
+    gff_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_genomic.gff.gz'
     download_handle = ftp_down(gff_url)
     download_handle.close()

@@ -157,7 +182,8 @@
 def main(args):
     """What the main does."""
     logging.info('now in fetch_data')
     if args.viral:
-        fetch_viral(args.viral)
+        fetch_viral(args.viral, args.no_db_compression)
     if args.bact:
         fetch_bacterial()
     elif args.human:
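One way to trace how --no_db_compression travels from the CLI into fetch_viral without starting a download is to hand main a Namespace directly; a sketch, with the field set assumed to match the argparse options defined in cli.py:

```python
from argparse import Namespace

# stand-in for `virmet fetch --viral n --no_db_compression`
args = Namespace(viral='n', bact=False, human=False, fungal=False,
                 bovine=False, no_db_compression=True)

if args.viral:
    # main(args) forwards the flag: fetch_viral(args.viral, args.no_db_compression),
    # which then skips random_reduction because no_compression is True
    print('fetch_viral(%r, no_compression=%r)' % (args.viral, args.no_db_compression))
```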
diff --git a/src/virmet/index.py b/src/virmet/index.py
index 27000ca..2c03671 100644
--- a/src/virmet/index.py
+++ b/src/virmet/index.py
@@ -9,9 +9,9 @@
 import datetime
 from shutil import rmtree
 import multiprocessing as mp

-from virmet.common import run_child, DB_DIR, N_FILES_BACT
-
+from virmet.common import run_child, DB_DIR_UPDATE, N_FILES_BACT

+DB_DIR = DB_DIR_UPDATE

 def single_bwa_index(index_params):
     '''run a single bwa indexing job'''
     in_fasta, index_prefix = index_params
@@ -23,7 +23,7 @@
 def main(args):
     '''only function doing all the indexing'''
     logging.info('now in index')
-
+    logging.info('Database real path: %s', os.path.realpath(DB_DIR))
     if args.viral == 'n':
         target_dir = os.path.join(DB_DIR, 'viral_nuccore')
         os.chdir(target_dir)
@@ -82,7 +82,7 @@
         os.mkdir(bwa_dir)
     except FileExistsError as err:
         logging.warning('FileExistsError: %s' % err)
-    fasta_file = os.path.join(DB_DIR, 'bovine', 'fasta', 'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta.gz')
+    fasta_file = os.path.join(DB_DIR, 'bovine', 'fasta', 'ref_Bos_taurus_GCF_002263795.2_ARS-UCD1.3.fasta.gz')
     index_prefix = os.path.join(bwa_dir, 'bt_ref')
     index_pairs.append((fasta_file, index_prefix))

diff --git a/src/virmet/update.py b/src/virmet/update.py
index ee6d27c..719c56d 100644
--- a/src/virmet/update.py
+++ b/src/virmet/update.py
@@ -8,8 +8,9 @@
 import logging
 from collections import Counter
 import pandas as pd

-from virmet.common import run_child, viral_query, bact_fung_query, get_accs, download_genomes, DB_DIR, N_FILES_BACT
+from virmet.common import run_child, viral_query, bact_fung_query, get_accs, download_genomes, DB_DIR_UPDATE, N_FILES_BACT

+DB_DIR = DB_DIR_UPDATE

 def bact_fung_update(query_type=None, picked=None):
     """
@@ -73,7 +74,7 @@
     run_child('bgzip -r fasta/fungi1.fasta.gz')


-def virupdate(viral_type, picked=None):
+def virupdate(viral_type, picked=None, update_min_date=None):
     if viral_type == 'n':
         db_type = 'nuccore'
     elif viral_type == 'p':
@@ -83,13 +84,13 @@
     # this query downloads a new viral_seqs_info.tsv and parses the GI
     logging.info('interrogating NCBI again')
     os.chdir(viral_dir)
-    cml_search = viral_query(viral_type)
+    cml_search = viral_query(viral_type, update_min_date)
     run_child(cml_search)
     efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
-    efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
+    efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title AccessionVersion > viral_seqs_info.tsv'
     run_child(efetch_xtract)
     info_file = os.path.join(viral_dir, 'viral_seqs_info.tsv')
-    info_seqs = pd.read_csv(info_file, sep='\t', names=['Caption', 'TaxId', 'Slen', 'Organism', 'Title'])
+    info_seqs = pd.read_csv(info_file, sep='\t', names=['Caption', 'TaxId', 'Slen', 'Organism', 'Title', 'AccessionVersion'])
     new_ids = [str(acc) for acc in info_seqs['Caption'].tolist()]
     logging.info('NCBI reports %d sequences', len(new_ids))
@@ -154,11 +155,12 @@
 def main(args):
     logging.info('now in update_db')
+    logging.info('Database real path: %s', os.path.realpath(DB_DIR))
     if bool(args.viral) + args.bact + args.fungal > 1:
         logging.error('update either viral or bacterial or fungal in a single call')
         sys.exit('update either viral or bacterial or fungal in a single call')
     if args.viral:
-        virupdate(args.viral, args.picked)
+        virupdate(args.viral, args.picked, args.update_min_date)
     elif args.bact:
         bact_fung_update(query_type='bacteria', picked=args.picked)
     elif args.fungal:
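index.py fans (fasta, prefix) pairs out to a multiprocessing pool through single_bwa_index. A self-contained sketch of that pattern, with run_child stubbed out, placeholder paths, and the exact bwa invocation assumed (only the start of single_bwa_index is visible in this hunk):

```python
import multiprocessing as mp

def run_child(cmd):
    # stand-in for virmet.common.run_child; only echo the command here
    print(cmd)

def single_bwa_index(index_params):
    '''run a single bwa indexing job'''
    in_fasta, index_prefix = index_params
    run_child('bwa index -p %s %s' % (index_prefix, in_fasta))  # assumed command

if __name__ == '__main__':
    # placeholder paths; the real pairs cover viral, bacterial, human, fungal, bovine
    index_pairs = [('/tmp/bact%d.fasta.gz' % i, '/tmp/bwa/bact%d' % i)
                   for i in range(1, 4)]
    with mp.Pool() as pool:
        pool.map(single_bwa_index, index_pairs)
```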
diff --git a/src/virmet/wolfpack.py b/src/virmet/wolfpack.py
index fb52b4e..31ed54d 100755
--- a/src/virmet/wolfpack.py
+++ b/src/virmet/wolfpack.py
@@ -25,6 +25,17 @@
     '/data/virmet_databases/bacteria/bwa/bact3',
     '/data/virmet_databases/bacteria/bwa/bact4',
     '/data/virmet_databases/bacteria/bwa/bact5',
+    '/data/virmet_databases/bacteria/bwa/bact6',
+    '/data/virmet_databases/bacteria/bwa/bact7',
+    '/data/virmet_databases/bacteria/bwa/bact8',
+    '/data/virmet_databases/bacteria/bwa/bact9',
+    '/data/virmet_databases/bacteria/bwa/bact10',
+    '/data/virmet_databases/bacteria/bwa/bact11',
+    '/data/virmet_databases/bacteria/bwa/bact12',
+    '/data/virmet_databases/bacteria/bwa/bact13',
+    '/data/virmet_databases/bacteria/bwa/bact14',
+    '/data/virmet_databases/bacteria/bwa/bact15',
+    '/data/virmet_databases/bacteria/bwa/bact16',
     '/data/virmet_databases/fungi/bwa/fungi1',
     '/data/virmet_databases/bovine/bwa/bt_ref']
 ref_map = {
@@ -34,6 +45,17 @@
     'bact3': '/data/virmet_databases/bacteria/fasta/bact3.fasta.gz',
     'bact4': '/data/virmet_databases/bacteria/fasta/bact4.fasta.gz',
     'bact5': '/data/virmet_databases/bacteria/fasta/bact5.fasta.gz',
+    'bact6': '/data/virmet_databases/bacteria/fasta/bact6.fasta.gz',
+    'bact7': '/data/virmet_databases/bacteria/fasta/bact7.fasta.gz',
+    'bact8': '/data/virmet_databases/bacteria/fasta/bact8.fasta.gz',
+    'bact9': '/data/virmet_databases/bacteria/fasta/bact9.fasta.gz',
+    'bact10': '/data/virmet_databases/bacteria/fasta/bact10.fasta.gz',
+    'bact11': '/data/virmet_databases/bacteria/fasta/bact11.fasta.gz',
+    'bact12': '/data/virmet_databases/bacteria/fasta/bact12.fasta.gz',
+    'bact13': '/data/virmet_databases/bacteria/fasta/bact13.fasta.gz',
+    'bact14': '/data/virmet_databases/bacteria/fasta/bact14.fasta.gz',
+    'bact15': '/data/virmet_databases/bacteria/fasta/bact15.fasta.gz',
+    'bact16': '/data/virmet_databases/bacteria/fasta/bact16.fasta.gz',
     'fungi1': '/data/virmet_databases/fungi/fasta/fungi1.fasta.gz',
     'bt_ref': '/data/virmet_databases/bovine/fasta/bt_ref_Bos_taurus_UMD_3.1.1.fasta.gz'
 }
@@ -265,6 +287,8 @@ def victor(input_reads, contaminant):
         return clean_name

     # alignment with bwa
+    cont_real_link = os.path.realpath(contaminant)
+    logging.info('Database real path: %s', cont_real_link)
     cml = 'bwa mem -t %d -R \'@RG\\tID:foo\\tSM:bar\\tLB:library1\' -T 75 -M %s %s 2> \
     %s | samtools view -h -F 4 - > %s' % (n_proc, contaminant, input_reads, err_name, sam_name)
     logging.debug('running bwa %s %s on %d cores', cont_name, rf_head, n_proc)
@@ -361,6 +385,8 @@ def viral_blast(file_in, n_proc, nodes, names):
         logging.info('could not detect system platform: running on %d cores', n_proc)
         xargs_thread = n_proc
     # if Darwin then xargs_thread must be n_proc
+    DB_real_path = os.path.realpath(os.path.join(DB_DIR, 'viral_nuccore/viral_db'))
+    logging.info('Database real path: %s', DB_real_path)
     cml = 'seq 0 %s | xargs -P %d -I {} blastn -task megablast \
     -query splitted_clean_{}.fasta -db %s \
     -out tmp_{}.tsv \
@@ -409,7 +435,8 @@
     good_hits = good_hits.rename(columns={'staxid': 'tax_id'})

     viral_info_file = os.path.join(DB_DIR, 'viral_nuccore/viral_seqs_info.tsv')
-    viral_info = pd.read_table(viral_info_file, names=['accn', 'TaxId', 'seq_len', 'Organism', 'Title'])
+    viral_info = pd.read_table(viral_info_file, names=['accn', 'TaxId', 'seq_len', 'Organism', 'Title', 'accn_version'])
+    # drop() returns a copy, so the result must be reassigned
+    viral_info = viral_info.drop(columns=['accn_version'])
     good_hits = pd.merge(good_hits, viral_info, on='accn')
     # if blastn gives no taxid and scientific name, fill these col from viral_seqs_info.tsv file
     good_hits.loc[:, 'ssciname'] = good_hits.loc[:, 'ssciname'].fillna(good_hits['Organism']).astype(str)
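The hard-coded bact1..bact16 entries, and their fasta counterparts in ref_map, could instead be derived from N_FILES_BACT, so the next change to the file count touches a single constant and the bwa/fasta paths cannot drift apart. A possible refactor, not part of this patch:

```python
N_FILES_BACT = 16  # imported from virmet.common in practice

bact_db = ['/data/virmet_databases/bacteria/bwa/bact%d' % i
           for i in range(1, N_FILES_BACT + 1)]
bact_fastas = {'bact%d' % i: '/data/virmet_databases/bacteria/fasta/bact%d.fasta.gz' % i
               for i in range(1, N_FILES_BACT + 1)}

assert len(bact_db) == len(bact_fastas) == N_FILES_BACT
```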