From 9033d6dcf60e8ceb638d7fcea5fc892c45b569bb Mon Sep 17 00:00:00 2001
From: "C. Titus Brown"
Date: Tue, 30 Jan 2024 09:35:46 -0800
Subject: [PATCH] MRG: Fix `tax metagenome` to work on gather output created with `--estimate-ani-ci` (#2952)

The `tax metagenome` code errors out with a confusing error message when gather results containing confidence intervals for the ANI calculations are output.

This PR adds a test and fixes the problem.

## Details

The error message is:
```
ERROR: '/var/folders/6s/_f373w1d6hdfjc2kjstq97s80000gp/T/sourmashtest_rs5l3b23/gather.csv' is missing columns needed for taxonomic summarization. Please run gather with sourmash >= 4.4.
```
and it is caused by `GatherRow` running across the various extra columns added by `--estimate-ani-ci`, such as `query_containment_ani_low`.

The fix is to add these columns in as optional/unused columns in the `GatherRow` dataclass.
---
 src/sourmash/tax/tax_utils.py |  4 ++++
 tests/test_tax.py             | 43 +++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 4bd7ddd8d9..df69f0ee6a 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -1540,6 +1540,10 @@ class GatherRow:
     n_unique_weighted_found: int = None
     sum_weighted_found: int = None
     total_weighted_hashes: int = None
+    query_containment_ani_low: float = None
+    query_containment_ani_high: float = None
+    match_containment_ani_low: float = None
+    match_containment_ani_high: float = None
 
 
 @dataclass
diff --git a/tests/test_tax.py b/tests/test_tax.py
index 1e82aebf85..b37e8eaf6f 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -2201,6 +2201,49 @@ def test_genome_ani_lemonade_classify(runtmp):
     assert 'MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis' in output
 
 
+def test_genome_ani_lemonade_classify_estimate_ani_ci(runtmp):
+    # test a complete MAG classification with lemonade MAG from STAMPS 2022
+    # (real data!)
+    c = runtmp
+
+    ## first run gather
+    genome = utils.get_test_data('tax/lemonade-MAG3.sig.gz')
+    matches = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.zip')
+
+    c.run_sourmash('gather', genome, matches,
+                   '--threshold-bp=5000', '-o', 'gather.csv', '--estimate-ani')
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+
+    this_gather_file = c.output('gather.csv')
+    this_gather = Path(this_gather_file).read_text().splitlines()
+
+    assert len(this_gather) == 4
+
+    ## now run 'tax genome' with human output
+    taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')
+    c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file,
+                   '--ani', '0.8', '-F', 'human')
+
+    output = c.last_result.out
+    assert 'MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis' in output
+
+    # aaand classify to lineage_csv
+    c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file,
+                   '--ani', '0.8', '-F', 'lineage_csv')
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+    output = c.last_result.out
+    assert 'ident,superkingdom,phylum,class,order,family,genus,species' in output
+    assert 'MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis' in output
+
+
 def test_metagenome_no_gather_csv(runtmp):
     # test tax metagenome with no -g
     taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')