MRG: when lingroups are provided, use them for csv_summary (#3311)

Currently, when we generate a `csv_summary` with LINs, we get a summary at every single LIN rank, which is a lot of results and not very helpful. LINgroups are our way of linking the LINs (e.g. `14;1;0;0;0;0;0;0;0;0`) to a known name/taxonomic group (e.g. "Phylotype I"). This PR changes the behavior of `csv_summary` when a `lingroup` file is provided, limiting summarized reporting to just the named lingroups. While the output is very similar to the `lingroup` output we already have, the most important difference is that the sample name is included in the output, meaning that we get intelligible results when running `tax metagenome` on more than one sample. Prior `tax metagenome` behavior was to always generate a `lingroup` output file when a `lingroups` file is provided. Here, I disable that for multiple queries, since the results wouldn't make sense. I do not replace it with another default, but I did add a recommendation to the help + doc. In the future, we could consider changing the default `lingroup` output to `csv_summary`, since it's actually useful for multiple files. Or, we could modify the `lingroup` output to include query information. - Also fixes #3315 --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
sourmash-bio · Oct 24, 2024 · 6ae9cd3 · 6ae9cd3
1 parent 5acf698
commit 6ae9cd3
Show file tree

Hide file tree

Showing 5 changed files with 321 additions and 29 deletions.
diff --git a/doc/command-line.md b/doc/command-line.md
@@ -637,13 +637,16 @@ sourmash tax metagenome
     --taxonomy gtdb-rs202.taxonomy.v2.csv
 ```
 
-The possible output formats are:
-- `human`
-- `csv_summary`
-- `lineage_summary`
-- `krona`
-- `kreport`
-- `lingroup_report`
+The possible output formats are listed below, followed by the file extension used when writing to a file rather than stdout. When using more than one output format, you must provide an output basename (`--output-base`) that will be used to name the output files. If an `--output-dir` is provided, files will output to that directory.
+
+- `human`: ".human.txt",
+- `csv_summary`: ".summarized.csv",
+- `lineage_summary`: ".lineage_summary.tsv",
+- `krona`: ".krona.tsv",
+- `kreport`: ".kreport.txt",
+- `lingroup`: ".lingroup.tsv",
+- `bioboxes`: ".bioboxes.profile",
+
 
 #### `csv_summary` output format
 
@@ -672,6 +675,9 @@ o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus
 ```
 The `query_md5` and `query_filename` columns are omitted here for brevity.
 
+Note: When using `--lins` with a `--lingroup` file, the `csv_summary` file will report
+summarization for each specified `lingroup`, rather than all possible `lin` ranks (v4.8.12+).
+
 #### `krona` output format
 
 `krona` format is a tab-separated list of these results at a specific rank.
@@ -842,6 +848,8 @@ lg4	1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0	0.65	80000
 
 Related lingroup subpaths will be grouped in output, but exact ordering may change between runs.
 
+Note: this output format requires a single sample only. For a similar output with multiple query samples, provide the `lingroup` file and use the 'csv_summary' output format.
+
 #### `bioboxes` output format
 
 When using standard taxonomic ranks (not lins), you can choose to output a 'bioboxes' profile, `{base}.bioboxes.profile`, where `{base}` is the name provided via the `-o/--output-base` option. This output is organized according to the [bioboxes profile specifications](https://github.com/bioboxes/rfc/tree/master/data-format) so that this file can be used for CAMI challenges.
@@ -971,7 +979,12 @@ sourmash tax genome
 > This command uses the default classification strategy, which uses a
 containment threshold of 0.1 (10%).
 
-There are two possible output formats, `csv_summary` and `krona`.
+`sourmash tax genome` can produce the following output formats:
+
+- `human`: ".human.txt",
+- `csv_summary`: ".classifications.csv",
+- `krona`: ".krona.tsv",
+- `lineage_summary`: ".lineage_summary.tsv",
 
 #### `csv_summary` output format
 

diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py
@@ -114,14 +114,14 @@ def subparser(subparsers):
         "--lingroups",
         metavar="FILE",
         default=None,
-        help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group.",
+        help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. For 'tax metagenome' runs with a single 'gather' file (single query), providing this file will allow us to output a 'lingroup' report containing taxonomic summarization for each group. For multiple queries, we recommend the 'csv_summary' output format.",
     )
     subparser.add_argument(
         "--ictv",
         "--ictv-taxonomy",
         action="store_true",
         default=False,
-        help="use ICTV taxonomy in place of standard taxonomic ranks.  Note that the taxonomy CSV must contain ICTV ranks.",
+        help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.",
     )
     add_rank_arg(subparser)
 

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
@@ -126,7 +126,7 @@ def metagenome(args):
         notify("No gather results loaded. Exiting.")
         sys.exit(-1)
 
-    single_query_output_formats = ["csv_summary", "kreport"]
+    single_query_output_formats = ["kreport", "lingroup", "bioboxes"]
     desired_single_outputs = []
     if len(query_gather_results) > 1:  # working with multiple queries
         desired_single_outputs = [
@@ -154,6 +154,15 @@ def metagenome(args):
             error(f"ERROR: {str(exc)}")
             sys.exit(-1)
 
+    # if lingroup file is passed in, read it
+    lingroups = None
+    if args.lingroup is not None:
+        try:
+            lingroups = tax_utils.read_lingroups(args.lingroup)
+        except ValueError as exc:
+            error(f"ERROR: {str(exc)}")
+            sys.exit(-1)
+
     # write summarized output in human-readable format
     if "lineage_summary" in args.output_format:
         lineage_outfile, limit_float = make_outfile(
@@ -202,7 +211,10 @@ def metagenome(args):
         )
         with FileOutputCSV(summary_outfile) as out_fp:
             tax_utils.write_summary(
-                query_gather_results, out_fp, limit_float_decimals=limit_float
+                query_gather_results,
+                out_fp,
+                limit_float_decimals=limit_float,
+                lingroups=lingroups,
             )
 
     # write summarized --> kreport output tsv
@@ -218,13 +230,7 @@ def metagenome(args):
             )
 
     # write summarized --> LINgroup output tsv
-    if "lingroup" in args.output_format:
-        try:
-            lingroups = tax_utils.read_lingroups(args.lingroup)
-        except ValueError as exc:
-            error(f"ERROR: {str(exc)}")
-            sys.exit(-1)
-
+    if "lingroup" in args.output_format and lingroups is not None:
         lingroupfile, limit_float = make_outfile(
             args.output_base, "lingroup", output_dir=args.output_dir
         )

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
@@ -1116,14 +1116,17 @@ def write_summary(
     sep=",",
     limit_float_decimals=False,
     classification=False,
+    lingroups=None,
 ):
     """
     Write taxonomy-summarized gather results for each rank.
     """
     w = None
     for q_res in query_gather_results:
         header, summary = q_res.make_full_summary(
-            limit_float=limit_float_decimals, classification=classification
+            limit_float=limit_float_decimals,
+            classification=classification,
+            lingroups=lingroups,
         )
         if w is None:
             w = csv.DictWriter(csv_fp, header, delimiter=sep)
@@ -2073,9 +2076,18 @@ def as_lineage_dict(self, query_info, ranks):
             lD[rank] = lin_name
         return lD
 
-    def as_summary_dict(self, query_info, limit_float=False):
+    def as_summary_dict(self, query_info, limit_float=False, lingroups=None):
         sD = asdict(self)
         sD["lineage"] = self.lineage.display_lineage(null_as_unclassified=True)
+        # if lingroups, convert lingroup number to lingroup name
+        if lingroups is not None and sD["lineage"] in lingroups.keys():
+            sD["lineage"] = lingroups[sD["lineage"]]
+        elif (
+            lingroups
+            and sD["lineage"] != "unclassified"
+            and sD["lineage"] not in lingroups.keys()
+        ):
+            return None
         sD["query_name"] = query_info.query_name
         sD["query_md5"] = query_info.query_md5
         sD["query_filename"] = query_info.query_filename
@@ -2553,7 +2565,9 @@ def make_human_summary(self, display_rank, classification=False):
             results.append(res.as_human_friendly_dict(query_info=self.query_info))
         return results
 
-    def make_full_summary(self, classification=False, limit_float=False):
+    def make_full_summary(
+        self, classification=False, limit_float=False, lingroups=None
+    ):
         results = []
         rD = {}
         if classification:
@@ -2590,16 +2604,31 @@ def make_full_summary(self, classification=False, limit_float=False):
                 "total_weighted_hashes",
             ]
 
+            lingroup_ranks = set()
+            if lingroups is not None:
+                for lin in lingroups.keys():
+                    # e.g. "14;1;0;0;0;0;0;0;0;0" => 9
+                    lin_rank = len(lin.split(";")) - 1
+                    lingroup_ranks.add(lin_rank)
+
             for rank in self.summarized_ranks[::-1]:  # descending
+                # if lingroups are provided, only report summary for specified lingroups
+                if lingroup_ranks:
+                    if int(rank) not in lingroup_ranks:
+                        continue
                 unclassified = []
                 rank_results = self.summarized_lineage_results[rank]
                 rank_results.sort(
                     key=lambda res: -res.fraction
                 )  # v5?: f_weighted_at_rank)
                 for res in rank_results:
                     rD = res.as_summary_dict(
-                        query_info=self.query_info, limit_float=limit_float
+                        query_info=self.query_info,
+                        limit_float=limit_float,
+                        lingroups=lingroups,
                     )
+                    if rD is None:
+                        continue
                     # save unclassified for the end
                     if rD["lineage"] == "unclassified":
                         unclassified.append(rD)