From 6ae9cd32d9ac2841c98415bc1c043597536a5470 Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Thu, 24 Oct 2024 13:09:07 -0700 Subject: [PATCH] MRG: when lingroups are provided, use them for `csv_summary` (#3311) Currently, when we generate a `csv_summary` with LINs, we get a summary at every single LIN rank, which is a lot of results and not very helpful. LINgroups are our way of linking the LINs (e.g. `14;1;0;0;0;0;0;0;0;0`) to a known name/taxonomic group (e.g. "Phylotype I"). This PR changes the behavior of `csv_summary` when a `lingroup` file is provided, limiting summarized reporting to just the named lingroups. While the output is very similar to the `lingroup` output we already have, the most important difference is that the sample name is included in the output, meaning that we get intelligible results when running `tax metagenome` on more than one sample. Prior `tax metagenome` behavior was to always generate a `lingroup` output file when a `lingroups` file is provided. Here, I disable that for multiple queries, since the results wouldn't make sense. I do not replace it with another default, but I did add a recommendation to the help + doc. In the future, we could consider changing the default `lingroup` output to `csv_summary`, since it's actually useful for multiple files. Or, we could modify the `lingroup` output to include query information. - Also fixes #3315 --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/command-line.md | 29 +++- src/sourmash/cli/tax/metagenome.py | 4 +- src/sourmash/tax/__main__.py | 24 ++- src/sourmash/tax/tax_utils.py | 37 ++++- tests/test_tax.py | 256 ++++++++++++++++++++++++++++- 5 files changed, 321 insertions(+), 29 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 05f7414753..d6370a307c 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -637,13 +637,16 @@ sourmash tax metagenome --taxonomy gtdb-rs202.taxonomy.v2.csv ``` -The possible output formats are: -- `human` -- `csv_summary` -- `lineage_summary` -- `krona` -- `kreport` -- `lingroup_report` +The possible output formats are listed below, followed by the file extension used when writing to a file rather than stdout. When using more than one output format, you must provide an output basename (`--output-base`) that will be used to name the output files. If an `--output-dir` is provided, files will output to that directory. + +- `human`: ".human.txt", +- `csv_summary`: ".summarized.csv", +- `lineage_summary`: ".lineage_summary.tsv", +- `krona`: ".krona.tsv", +- `kreport`: ".kreport.txt", +- `lingroup`: ".lingroup.tsv", +- `bioboxes`: ".bioboxes.profile", + #### `csv_summary` output format @@ -672,6 +675,9 @@ o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus ``` The `query_md5` and `query_filename` columns are omitted here for brevity. +Note: When using `--lins` with a `--lingroup` file, the `csv_summary` file will report +summarization for each specified `lingroup`, rather than all possible `lin` ranks (v4.8.12+). + #### `krona` output format `krona` format is a tab-separated list of these results at a specific rank. @@ -842,6 +848,8 @@ lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 Related lingroup subpaths will be grouped in output, but exact ordering may change between runs. +Note: this output format requires a single sample only. For a similar output with multiple query samples, provide the `lingroup` file and use the 'csv_summary' output format. + #### `bioboxes` output format When using standard taxonomic ranks (not lins), you can choose to output a 'bioboxes' profile, `{base}.bioboxes.profile`, where `{base}` is the name provided via the `-o/--output-base` option. This output is organized according to the [bioboxes profile specifications](https://github.com/bioboxes/rfc/tree/master/data-format) so that this file can be used for CAMI challenges. @@ -971,7 +979,12 @@ sourmash tax genome > This command uses the default classification strategy, which uses a containment threshold of 0.1 (10%). -There are two possible output formats, `csv_summary` and `krona`. +`sourmash tax genome` can produce the following output formats: + +- `human`: ".human.txt", +- `csv_summary`: ".classifications.csv", +- `krona`: ".krona.tsv", +- `lineage_summary`: ".lineage_summary.tsv", #### `csv_summary` output format diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index f8e31902e8..2f04665dc8 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -114,14 +114,14 @@ def subparser(subparsers): "--lingroups", metavar="FILE", default=None, - help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group.", + help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. For 'tax metagenome' runs with a single 'gather' file (single query), providing this file will allow us to output a 'lingroup' report containing taxonomic summarization for each group. For multiple queries, we recommend the 'csv_summary' output format.", ) subparser.add_argument( "--ictv", "--ictv-taxonomy", action="store_true", default=False, - help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.", + help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.", ) add_rank_arg(subparser) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index f0c8b4f2df..c4e22d07fa 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -126,7 +126,7 @@ def metagenome(args): notify("No gather results loaded. Exiting.") sys.exit(-1) - single_query_output_formats = ["csv_summary", "kreport"] + single_query_output_formats = ["kreport", "lingroup", "bioboxes"] desired_single_outputs = [] if len(query_gather_results) > 1: # working with multiple queries desired_single_outputs = [ @@ -154,6 +154,15 @@ def metagenome(args): error(f"ERROR: {str(exc)}") sys.exit(-1) + # if lingroup file is passed in, read it + lingroups = None + if args.lingroup is not None: + try: + lingroups = tax_utils.read_lingroups(args.lingroup) + except ValueError as exc: + error(f"ERROR: {str(exc)}") + sys.exit(-1) + # write summarized output in human-readable format if "lineage_summary" in args.output_format: lineage_outfile, limit_float = make_outfile( @@ -202,7 +211,10 @@ def metagenome(args): ) with FileOutputCSV(summary_outfile) as out_fp: tax_utils.write_summary( - query_gather_results, out_fp, limit_float_decimals=limit_float + query_gather_results, + out_fp, + limit_float_decimals=limit_float, + lingroups=lingroups, ) # write summarized --> kreport output tsv @@ -218,13 +230,7 @@ def metagenome(args): ) # write summarized --> LINgroup output tsv - if "lingroup" in args.output_format: - try: - lingroups = tax_utils.read_lingroups(args.lingroup) - except ValueError as exc: - error(f"ERROR: {str(exc)}") - sys.exit(-1) - + if "lingroup" in args.output_format and lingroups is not None: lingroupfile, limit_float = make_outfile( args.output_base, "lingroup", output_dir=args.output_dir ) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 5af08f91fc..60a967c6b4 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1116,6 +1116,7 @@ def write_summary( sep=",", limit_float_decimals=False, classification=False, + lingroups=None, ): """ Write taxonomy-summarized gather results for each rank. @@ -1123,7 +1124,9 @@ def write_summary( w = None for q_res in query_gather_results: header, summary = q_res.make_full_summary( - limit_float=limit_float_decimals, classification=classification + limit_float=limit_float_decimals, + classification=classification, + lingroups=lingroups, ) if w is None: w = csv.DictWriter(csv_fp, header, delimiter=sep) @@ -2073,9 +2076,18 @@ def as_lineage_dict(self, query_info, ranks): lD[rank] = lin_name return lD - def as_summary_dict(self, query_info, limit_float=False): + def as_summary_dict(self, query_info, limit_float=False, lingroups=None): sD = asdict(self) sD["lineage"] = self.lineage.display_lineage(null_as_unclassified=True) + # if lingroups, convert lingroup number to lingroup name + if lingroups is not None and sD["lineage"] in lingroups.keys(): + sD["lineage"] = lingroups[sD["lineage"]] + elif ( + lingroups + and sD["lineage"] != "unclassified" + and sD["lineage"] not in lingroups.keys() + ): + return None sD["query_name"] = query_info.query_name sD["query_md5"] = query_info.query_md5 sD["query_filename"] = query_info.query_filename @@ -2553,7 +2565,9 @@ def make_human_summary(self, display_rank, classification=False): results.append(res.as_human_friendly_dict(query_info=self.query_info)) return results - def make_full_summary(self, classification=False, limit_float=False): + def make_full_summary( + self, classification=False, limit_float=False, lingroups=None + ): results = [] rD = {} if classification: @@ -2590,7 +2604,18 @@ def make_full_summary(self, classification=False, limit_float=False): "total_weighted_hashes", ] + lingroup_ranks = set() + if lingroups is not None: + for lin in lingroups.keys(): + # e.g. "14;1;0;0;0;0;0;0;0;0" => 9 + lin_rank = len(lin.split(";")) - 1 + lingroup_ranks.add(lin_rank) + for rank in self.summarized_ranks[::-1]: # descending + # if lingroups are provided, only report summary for specified lingroups + if lingroup_ranks: + if int(rank) not in lingroup_ranks: + continue unclassified = [] rank_results = self.summarized_lineage_results[rank] rank_results.sort( @@ -2598,8 +2623,12 @@ def make_full_summary(self, classification=False, limit_float=False): ) # v5?: f_weighted_at_rank) for res in rank_results: rD = res.as_summary_dict( - query_info=self.query_info, limit_float=limit_float + query_info=self.query_info, + limit_float=limit_float, + lingroups=lingroups, ) + if rD is None: + continue # save unclassified for the end if rD["lineage"] == "unclassified": unclassified.append(rD) diff --git a/tests/test_tax.py b/tests/test_tax.py index 389a536619..72889aaede 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -1966,6 +1966,139 @@ def test_metagenome_two_queries_human_output(runtmp): assert "test2 1.6% 89.1% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" +def test_metagenome_two_queries_csv_summary_output(runtmp): + # remove single-query outputs when working with multiple queries + c = runtmp + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") + + # make a second query with same output + g_res2 = runtmp.output("test2.gather.csv") + with open(g_res2, "w") as fp: + for line in Path(g_res).read_text().splitlines(): + line = line.replace("test1", "test2") + "\n" + fp.write(line) + + csv_summary_out = runtmp.output("tst.summarized.csv") + + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "-F", + "csv_summary", + "--rank", + "phylum", + "-o", + "tst", + ) + + assert os.path.exists(csv_summary_out) + + assert c.last_result.status == 0 + assert "loaded results for 2 queries from 2 gather CSVs" in c.last_result.err + assert ( + f"saving 'csv_summary' output to '{os.path.basename(csv_summary_out)}'" + in runtmp.last_result.err + ) + sum_gather_results = [ + x.rstrip() for x in Path(csv_summary_out).read_text().splitlines() + ] + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in sum_gather_results[0] + ) + # check both queries exist in csv_summary results; check several + assert ( + "test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000,0.9500482567175479,0" + in sum_gather_results[1] + ) + assert ( + "test2,superkingdom,0.2042281611487834,d__Bacteria,md5,test2.sig,0.13080306238801107,1024000,0.9500482567175479,0" + in sum_gather_results[23] + ) + assert ( + "test2,phylum,0.11607499002792182,d__Bacteria;p__Bacteroidota,md5,test2.sig,0.07265026877341586,582000" + in sum_gather_results[25] + ) + assert ( + "test2,phylum,0.08815317112086159,d__Bacteria;p__Proteobacteria,md5,test2.sig,0.05815279361459521,442000" + in sum_gather_results[26] + ) + assert ( + "test2,phylum,0.7957718388512166,unclassified,md5,test2.sig,0.8691969376119889,3990000" + in sum_gather_results[27] + ) + assert ( + "test2,class,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test2.sig,0.07265026877341586,582000" + in sum_gather_results[28] + ) + assert ( + "test2,class,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test2.sig,0.05815279361459521,442000" + in sum_gather_results[29] + ) + assert ( + "test2,class,0.7957718388512166,unclassified,md5,test2.sig,0.8691969376119889,3990000" + in sum_gather_results[30] + ) + assert ( + "test2,order,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test2.sig,0.07265026877341586,582000" + in sum_gather_results[31] + ) + assert ( + "test2,order,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test2.sig,0.05815279361459521,442000" + in sum_gather_results[32] + ) + assert ( + "test2,order,0.7957718388512166,unclassified,md5,test2.sig,0.8691969376119889,3990000" + in sum_gather_results[33] + ) + assert ( + "test2,family,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test2.sig,0.07265026877341586,582000" + in sum_gather_results[34] + ) + assert ( + "test2,family,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test2.sig,0.05815279361459521,442000" + in sum_gather_results[35] + ) + assert ( + "test2,family,0.7957718388512166,unclassified,md5,test2.sig,0.8691969376119889,3990000" + in sum_gather_results[36] + ) + assert ( + "test2,genus,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test2.sig,0.05701254275940707,444000" + in sum_gather_results[37] + ) + assert ( + "test2,genus,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test2.sig,0.05815279361459521,442000" + in sum_gather_results[38] + ) + assert ( + "test2,genus,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test2.sig,0.015637726014008795,138000" + in sum_gather_results[39] + ) + assert ( + "test2,genus,0.7957718388512166,unclassified,md5,test2.sig,0.8691969376119889,3990000" + in sum_gather_results[40] + ) + assert ( + "test2,species,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.05701254275940707,444000" + in sum_gather_results[41] + ) + assert ( + "test2,species,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test2.sig,0.05815279361459521,442000" + in sum_gather_results[42] + ) + assert ( + "test2,species,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test2.sig,0.015637726014008795,138000" + in sum_gather_results[43] + ) + + def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp): # fail on multiple queries with single query output formats c = runtmp @@ -1979,7 +2112,8 @@ def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp): line = line.replace("test1", "test2") + "\n" fp.write(line) - csv_summary_out = runtmp.output("tst.summarized.csv") + runtmp.output("tst.summarized.csv") + bioboxes_out = runtmp.output("tst.bioboxes.out") kreport_out = runtmp.output("tst.kreport.txt") with pytest.raises(SourmashCommandFailed) as exc: @@ -1992,7 +2126,7 @@ def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp): "--taxonomy-csv", taxonomy_csv, "-F", - "csv_summary", + "bioboxes", "kreport", "--rank", "phylum", @@ -2001,13 +2135,13 @@ def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp): ) print(str(exc.value)) - assert not os.path.exists(csv_summary_out) + assert not os.path.exists(bioboxes_out) assert not os.path.exists(kreport_out) assert c.last_result.status == -1 assert "loaded results for 2 queries from 2 gather CSVs" in c.last_result.err assert ( - "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport" + "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping bioboxes, kreport" in c.last_result.err ) assert "ERROR: No output formats remaining." in c.last_result.err @@ -2028,6 +2162,7 @@ def test_metagenome_two_queries_skip_single_query_output_formats(runtmp): csv_summary_out = runtmp.output("tst.summarized.csv") kreport_out = runtmp.output("tst.kreport.txt") + bioboxes_out = runtmp.output("tst.bioboxes.txt") lineage_summary_out = runtmp.output("tst.lineage_summary.tsv") c.run_sourmash( @@ -2040,6 +2175,7 @@ def test_metagenome_two_queries_skip_single_query_output_formats(runtmp): taxonomy_csv, "-F", "csv_summary", + "bioboxes", "kreport", "lineage_summary", "--rank", @@ -2048,17 +2184,39 @@ def test_metagenome_two_queries_skip_single_query_output_formats(runtmp): "tst", ) - assert not os.path.exists(csv_summary_out) assert not os.path.exists(kreport_out) + assert not os.path.exists(bioboxes_out) + assert os.path.exists(csv_summary_out) assert os.path.exists(lineage_summary_out) assert c.last_result.status == 0 assert "loaded results for 2 queries from 2 gather CSVs" in c.last_result.err assert ( - "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport" + "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping bioboxes, kreport" in c.last_result.err ) + assert ( + f"saving 'csv_summary' output to '{os.path.basename(csv_summary_out)}'" + in runtmp.last_result.err + ) + sum_gather_results = [ + x.rstrip() for x in Path(csv_summary_out).read_text().splitlines() + ] + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in sum_gather_results[0] + ) + # check both queries exist in csv_summary results + assert ( + "test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000,0.9500482567175479,0" + in sum_gather_results[1] + ) + assert ( + "test2,superkingdom,0.2042281611487834,d__Bacteria,md5,test2.sig,0.13080306238801107,1024000,0.9500482567175479,0" + in sum_gather_results[23] + ) + def test_metagenome_two_queries_krona(runtmp): # for now, we enable multi-query krona. Is this desired? @@ -6061,6 +6219,92 @@ def test_metagenome_LIN_lingroups(runtmp): ) +def test_metagenome_LIN_lingroups_summary(runtmp): + # test lingroups summary file. Can no longer output stdout, b/c will produce 2 files + c = runtmp + csv_base = "out" + sum_csv = csv_base + ".summarized.csv" + csvout = runtmp.output(sum_csv) + outdir = os.path.dirname(csvout) + + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") + + lg_file = runtmp.output("test.lg.csv") + with open(lg_file, "w") as out: + out.write("lin,name\n") + out.write("0;0;0,lg1\n") + out.write("1;0;0,lg2\n") + out.write("2;0;0,lg3\n") + out.write("1;0;1,lg3\n") + # write a 19 so we can check the end + out.write("1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n") + + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--lins", + "--lingroup", + lg_file, + "-o", + csv_base, + "--output-dir", + outdir, + "-F", + "csv_summary", + ) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert ( + "Read 5 lingroup rows and found 5 distinct lingroup prefixes." + in c.last_result.err + ) + assert os.path.exists(csvout) + sum_gather_results = [x.rstrip() for x in Path(csvout).read_text().splitlines()] + print(sum_gather_results) + assert f"saving 'csv_summary' output to '{csvout}'" in runtmp.last_result.err + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in sum_gather_results[0] + ) + assert ( + "test1,2,0.08815317112086159,lg1,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.05815279361459521,442000,0.9246458342627294,6139" + in sum_gather_results[1] + ) + assert ( + "test1,2,0.07778220981252493,lg2,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.050496823586903404,390000,0.920920083987624,6139" + in sum_gather_results[2] + ) + assert ( + "test1,2,0.027522935779816515,lg3,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.015637726014008795,138000,0.8905689983332759,6139" + in sum_gather_results[3] + ) + assert ( + "test1,2,0.010769844435580374,lg3,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.006515719172503665,54000,0.8640181883213995,6139" + in sum_gather_results[4] + ) + assert ( + "test1,2,0.7957718388512166,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.8691969376119889,3990000,,6139" + in sum_gather_results[5] + ) + assert ( + "test1,19,0.010769844435580374,lg4,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.006515719172503665,54000,0.8640181883213995,6139" + in sum_gather_results[6] + ) + assert ( + "test1,19,0.7957718388512166,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.8691969376119889,3990000,,6139" + in sum_gather_results[7] + ) + + def test_metagenome_LIN_human_summary_no_lin_position(runtmp): c = runtmp