diff --git a/README.md b/README.md
index 5f429c1..8ad9958 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ AMBER is an evaluation package for the comparative assessment of genome reconstr

 ## Requirements

-AMBER 2.0.4 has been tested with Python 3.11.
+AMBER 2.0.7 has been tested with Python 3.11.

 See [requirements.txt](requirements.txt) for all dependencies.

@@ -145,7 +145,7 @@ Binnings of datasets with multiple samples are supported by AMBER. For each binn
 ## Running _amber.py_

 ~~~BASH
-usage: AMBER [-h] -g GOLD_STANDARD_FILE [-l LABELS] [-p FILTER] [-n MIN_LENGTH] -o OUTPUT_DIR [--stdout] [-d DESC] [--colors COLORS] [--silent] [-v] [-x MIN_COMPLETENESS]
+usage: AMBER [-h] -g GOLD_STANDARD_FILE [-l LABELS] [-p FILTER] [-n MIN_LENGTH] -o OUTPUT_DIR [--stdout] [-d DESC] [--colors COLORS] [--silent] [--skip_gs] [-v] [-x MIN_COMPLETENESS]
              [-y MAX_CONTAMINATION] [-r REMOVE_GENOMES] [-k KEYWORD] [--genome_coverage GENOME_COVERAGE] [--ncbi_dir NCBI_DIR]
              bin_files [bin_files ...]

@@ -169,6 +169,7 @@ options:
   --stdout              Print summary to stdout
   -d DESC, --desc DESC  Description for HTML page
   --silent              Silent mode
+  --skip_gs             Skip gold standard evaluation vs itself
   -v, --version         show program's version number and exit

 genome binning-specific arguments:
diff --git a/amber.py b/amber.py
index 7b247fa..39a6491 100755
--- a/amber.py
+++ b/amber.py
@@ -23,7 +23,6 @@
 from cami_amber.utils import argparse_parents
 from cami_amber.utils import labels as utils_labels
 from version import __version__
-from collections import defaultdict
 import argparse
 import errno
 import logging
@@ -56,10 +55,13 @@ def make_sure_path_exists(path):
             raise


-def create_output_directories(output_dir, sample_id_to_queries_list):
+def create_output_directories(output_dir, sample_id_to_g_queries_list, sample_id_to_t_queries_list):
     logging.getLogger('amber').info('Creating output directories')
-    for sample_id in sample_id_to_queries_list:
-        for query in sample_id_to_queries_list[sample_id]:
+    for sample_id in sample_id_to_g_queries_list:
+        for query in sample_id_to_g_queries_list[sample_id]:
+            make_sure_path_exists(os.path.join(output_dir, query.binning_type, query.label))
+    for sample_id in sample_id_to_t_queries_list:
+        for query in sample_id_to_t_queries_list[sample_id]:
             make_sure_path_exists(os.path.join(output_dir, query.binning_type, query.label))


@@ -77,7 +79,7 @@ def get_labels(labels, bin_files):
     return tool_id


-def save_metrics(sample_id_to_queries_list, df_summary, pd_bins, output_dir, stdout):
+def save_metrics(sample_id_to_g_queries_list, df_summary, pd_bins, output_dir, stdout):
     logging.getLogger('amber').info('Saving computed metrics')
     df_summary.to_csv(os.path.join(output_dir, 'results.tsv'), sep='\t', index=False)
     pd_bins.to_csv(os.path.join(output_dir, 'bin_metrics.tsv'), index=False, sep='\t')
@@ -95,12 +97,11 @@
             table.to_csv(os.path.join(output_dir, 'taxonomic', tool, 'metrics_per_bin.tsv'), sep='\t', index=False)

     pd_genomes_all = pd.DataFrame()
-    for sample_id in sample_id_to_queries_list:
+    for sample_id in sample_id_to_g_queries_list:
         pd_genomes_sample = pd.DataFrame()
-        for query in sample_id_to_queries_list[sample_id]:
-            if isinstance(query, binning_classes.GenomeQuery):
-                query.recall_df_cami1[utils_labels.TOOL] = query.label
-                pd_genomes_sample = pd.concat([pd_genomes_sample, query.recall_df_cami1], ignore_index=True, sort=False)
+        for query in sample_id_to_g_queries_list[sample_id]:
+            query.recall_df_cami1[utils_labels.TOOL] = query.label
+            pd_genomes_sample = pd.concat([pd_genomes_sample, query.recall_df_cami1], ignore_index=True, sort=False)
         pd_genomes_sample['sample_id'] = sample_id
         pd_genomes_all = pd.concat([pd_genomes_all, pd_genomes_sample], ignore_index=True, sort=False)
     if not pd_genomes_all.empty:
@@ -166,17 +167,11 @@

     coverages_pd = load_data.open_coverages(args.genome_coverage)

-    sample_id_to_queries_list = defaultdict(list)
-    for sample_id in sample_id_to_g_queries_list:
-        sample_id_to_queries_list[sample_id] += sample_id_to_g_queries_list[sample_id]
-    for sample_id in sample_id_to_t_queries_list:
-        sample_id_to_queries_list[sample_id] += sample_id_to_t_queries_list[sample_id]
-
-    create_output_directories(output_dir, sample_id_to_queries_list)
+    create_output_directories(output_dir, sample_id_to_g_queries_list, sample_id_to_t_queries_list)

     df_summary, pd_bins = evaluate.evaluate_samples_queries(sample_id_to_g_queries_list, sample_id_to_t_queries_list)

-    save_metrics(sample_id_to_queries_list, df_summary, pd_bins, output_dir, args.stdout)
+    save_metrics(sample_id_to_g_queries_list, df_summary, pd_bins, output_dir, args.stdout)

     plots.plot_genome_binning(args.colors,
                               sample_id_to_g_queries_list,
diff --git a/cami_amber/binning_classes.py b/cami_amber/binning_classes.py
index f157de4..6af6502 100755
--- a/cami_amber/binning_classes.py
+++ b/cami_amber/binning_classes.py
@@ -637,7 +637,8 @@ def safe_divide(x, y):
         self.precision_df['sample_id'] = self.sample_id
         self.recall_df = recall_df

-        self.heatmap_sdf = precision_recall_per_bin.transform_confusion_matrix2(query_w_length, confusion_df, precision_df, gs_df, log_scale=True)
+        if not self.options.skip_heatmap:
+            self.heatmap_sdf = precision_recall_per_bin.transform_confusion_matrix2(query_w_length, confusion_df, precision_df, gs_df, log_scale=True)

         self.eval_success = True

@@ -690,7 +691,7 @@ def plot_recall_vs_genome_size(self):
         plt.close(fig)

     def plot_heat_maps(self):
-        if self.label == utils_labels.GS:
+        if self.label == utils_labels.GS or self.options.skip_heatmap:
             return
         plots.plot_heatmap(self.heatmap_sdf, self.sample_id, self.options.output_dir, self.label, log_scale=True)

@@ -947,7 +948,7 @@ def compute_metrics(self, gs_rank_to_df):
 class Options:
     def __init__(self, filter_tail_percentage=0, genome_to_unique_common=None, filter_keyword=None, min_length=0,
                  rank_as_genome_binning=None, output_dir=None, min_completeness=None, max_contamination=None,
-                 ncbi_dir=None, skip_gs=False):
+                 ncbi_dir=None, skip_gs=False, skip_heatmap=False):
         self.__filter_tail_percentage = float(filter_tail_percentage) if filter_tail_percentage else .0
         self.__genome_to_unique_common = genome_to_unique_common
         self.__filter_keyword = filter_keyword
@@ -967,6 +968,7 @@ def __init__(self, filter_tail_percentage=0, genome_to_unique_common=None, filte
         else:
             self.__max_contamination = [.1, .05]
         self.__skip_gs = skip_gs
+        self.__skip_heatmap = skip_heatmap
         self.__ncbi_dir = ncbi_dir

     @property
@@ -1017,6 +1019,10 @@ def ncbi_dir(self):
     def skip_gs(self):
         return self.__skip_gs

+    @property
+    def skip_heatmap(self):
+        return self.__skip_heatmap
+
     @filter_tail_percentage.setter
     def filter_tail_percentage(self, filter_tail_percentage):
         self.__filter_tail_percentage = filter_tail_percentage
@@ -1064,3 +1070,7 @@ def ncbi_dir(self, ncbi_dir):
     @skip_gs.setter
     def skip_gs(self, skip_gs):
         self.__skip_gs = skip_gs
+
+    @skip_heatmap.setter
+    def skip_heatmap(self, skip_heatmap):
+        self.__skip_heatmap = skip_heatmap
diff --git a/cami_amber/evaluate.py b/cami_amber/evaluate.py
index a66bced..c6bf95d 100644
--- a/cami_amber/evaluate.py
+++ b/cami_amber/evaluate.py
@@ -22,7 +22,6 @@ def evaluate_sample(queries_list):
     gs_data = query1.gold_standard_data
     for query in queries_list:
         query.compute_metrics(gs_data)
-    del gs_data


 def evaluate_samples_queries(sample_id_to_g_queries_list, sample_id_to_t_queries_list):
diff --git a/cami_amber/utils/load_data.py b/cami_amber/utils/load_data.py
index fd65f47..1d56c28 100755
--- a/cami_amber/utils/load_data.py
+++ b/cami_amber/utils/load_data.py
@@ -22,6 +22,7 @@
 import io
 import tarfile
 import zipfile
+import itertools
 from multiprocessing.pool import ThreadPool
 from collections import defaultdict
 from collections import OrderedDict
@@ -167,13 +168,27 @@ def read_metadata(path_label_tuple):

 def load_sample(metadata):
     columns = ['SEQUENCEID', 'BINID', 'TAXID', 'LENGTH', '_LENGTH']
     logging.getLogger('amber').info('Loading %s of %s' % (metadata[2]['SAMPLEID'], metadata[5]))
-    nrows = metadata[1] - metadata[0] + 1
     usecols = [v for v in metadata[3] if v in columns]
-    df = pd.read_csv(metadata[4], sep='\t', comment='#', skiprows=metadata[0], nrows=nrows, header=None,
-                     names=metadata[3],
-                     usecols=usecols,
-                     dtype={'SEQUENCEID': pd.StringDtype(), 'BINID': pd.StringDtype(), 'TAXID': pd.UInt32Dtype(),
-                            'LENGTH': pd.UInt32Dtype(), '_LENGTH': pd.UInt32Dtype()})
+
+    if metadata[0] < 1000:
+        nrows = metadata[1] - metadata[0] + 1
+        df = pd.read_csv(metadata[4], sep='\t', comment='#', skiprows=metadata[0], nrows=nrows, header=None,
+                         names=metadata[3],
+                         usecols=usecols,
+                         dtype={'SEQUENCEID': pd.StringDtype(), 'BINID': pd.StringDtype(), 'TAXID': pd.UInt32Dtype(),
+                                'LENGTH': pd.UInt32Dtype(), '_LENGTH': pd.UInt32Dtype()})
+    else:
+        # Avoid high memory peak by using StringIO due to possible pandas bug
+        text = io.StringIO()
+        with open_generic(metadata[4]) as f:
+            for line in itertools.islice(f, metadata[0], metadata[1] + 1):
+                text.write(line)
+        text.seek(0)
+        df = pd.read_csv(text, sep='\t', comment='#', header=None,
+                         names=metadata[3],
+                         usecols=usecols,
+                         dtype={'SEQUENCEID': pd.StringDtype(), 'BINID': pd.StringDtype(), 'TAXID': pd.UInt32Dtype(),
+                                'LENGTH': pd.UInt32Dtype(), '_LENGTH': pd.UInt32Dtype()})
     df.rename(columns={'_LENGTH': 'LENGTH'}, inplace=True)
     return df
diff --git a/setup.py b/setup.py
index 0ce187f..dc25452 100644
--- a/setup.py
+++ b/setup.py
@@ -12,6 +12,8 @@ def dependencies():
     name = 'cami-amber',
     version = __version__,
     description = 'AMBER: Assessment of Metagenome BinnERs',
+    long_description = open('README.md').read(),
+    long_description_content_type="text/markdown",
     author = 'CAMI',
     author_email = 'support@cami-challenge.org',
     url = 'http://cami-challenge.org',
diff --git a/version.py b/version.py
index 13ce17d..4b259db 100644
--- a/version.py
+++ b/version.py
@@ -1 +1 @@
-__version__ = '2.0.6'
+__version__ = '2.0.7'
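
Note on the `load_sample` change: with a large `skiprows`, `pandas.read_csv` can buffer the entire skipped prefix, so for offsets of 1000 lines or more the patch streams only the requested line range into an in-memory `StringIO` before handing it to pandas. A minimal, self-contained sketch of the same pattern; the function name `read_tsv_slice`, the plain `open()` call, and the example file and column names are illustrative, not part of the patch:

~~~python
import io
import itertools

import pandas as pd


def read_tsv_slice(path, start, end, names):
    # Stream lines [start, end] (0-based, inclusive) into a buffer.
    # islice consumes the file lazily, so the skipped prefix is never stored.
    buf = io.StringIO()
    with open(path) as f:  # hypothetical plain-text file; the patch uses open_generic()
        for line in itertools.islice(f, start, end + 1):
            buf.write(line)
    buf.seek(0)
    # pandas only ever sees the slice it actually has to parse.
    return pd.read_csv(buf, sep='\t', comment='#', header=None, names=names)


# Hypothetical usage: parse rows 5000-5999 of a binning file.
# df = read_tsv_slice('bins.tsv', 5000, 5999, ['SEQUENCEID', 'BINID', 'LENGTH'])
~~~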
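
On the `create_output_directories` refactor: amber.py no longer builds the merged `sample_id_to_queries_list` (the removed `defaultdict(list)` block) and instead passes the genome and taxonomic query dicts separately, duplicating the directory loop for each. An equivalent, slightly more compact shape is sketched below, assuming only that both dicts map sample IDs to lists of query objects exposing `binning_type` and `label` as in the patch; `os.makedirs(..., exist_ok=True)` stands in for the patch's `make_sure_path_exists` helper:

~~~python
import itertools
import os


def create_output_directories(output_dir, g_queries, t_queries):
    # Chain both sample_id -> [query, ...] dicts and create one
    # <output_dir>/<binning_type>/<label> directory per query.
    for queries in itertools.chain(g_queries.values(), t_queries.values()):
        for query in queries:
            os.makedirs(os.path.join(output_dir, query.binning_type, query.label), exist_ok=True)
~~~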