Feature: subset of coocurrating mutations

- Fix variants attribution when cooccurrence are subset/superset of other variants: if var 1 has mutations A, B and C and var 2 has muations A and B amplicon with mutations A and B is flagged to both var 1 and 2
cbg-ethz · Oct 9, 2024 · 3cf5acd · 3cf5acd
1 parent 2287d89
commit 3cf5acd
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -57,9 +57,12 @@ Options:
                                   revert category)
   -b, --bedfile BED               bedfile defining the amplicons, with format:
                                   ref\tstart\tstop\tamp_num\tpool\tstrand
-  --sort / --no-sort             sort the bedfile by 'reference name' and
+  --sort / --no-sort              sort the bedfile by 'reference name' and
                                   'start position' (default: sorted)
   -#, --cooc COOC                 minimum number of cooccurrences to search for
+  --fix-subset, --fs / --no-fix-subset, --no-fs
+                                  Fix variants attribution when cooccurrence
+                                  are subset/superset of other variants
   -Q, --amplicons, --in-amp, --in-amplicons YAML
                                   use the supplied YAML file to query
                                   amplicons instead of building it from BED +
@@ -73,7 +76,7 @@ Options:
   -y, --yaml YAML                 output results to a yaml file
   -t, --tsv TSV                   output results to a (raw) tsv file
   -d, --dump                      dump the python object to the terminal
-  --help                          Show this message and exit.
+  -h, --help                      Show this message and exit.
 
   @listfile can be used to pass a long list of parameters (e.g.: a large
   number of BAMs) in a file instead of command line

diff --git a/cojac/cooc_mutbamscan.py b/cojac/cooc_mutbamscan.py
@@ -269,7 +269,7 @@ def bed_load(bedfile, sort=True):
     return amp_bed
 
 
-def make_all_amplicons(amp_bed, vocs, revert=False, n_cooc=2):
+def make_all_amplicons(amp_bed, vocs, revert=False, n_cooc=2, subset_fix=True):
     """given a .BED file and a directory with YAMLs decribing VOCs,
     generates all amplicons to search that have at least n_cooc mutations
     """
@@ -307,22 +307,86 @@ def make_all_amplicons(amp_bed, vocs, revert=False, n_cooc=2):
                 # search same amplicon number
                 if npart[0] != ampname:
                     continue
+
                 # compare amplicon definitions
                 if d1 != d2:
+                    # if we're only interested in matches
                     continue
-                print(f"{k1} is identical to {k2}")
+
+                print(
+                    f"{k1} is identical to {k2}: {  ','.join([ f'{p}{b}' for p, b in d1[4].items() ])}"
+                )
                 # sort the variant names part of the list, to be consistent between calls, no matter the filesystem's on-disk order
                 # (i.e.: no 76_AY42_IN2 vs 76_IN2_AY42)
-                amplicons["_".join([npart[0]] + sorted(npart[1:] + [yam["name"]]))] = (
-                    amplicons.pop(k2)
-                )
+                newk2 = "_".join([npart[0]] + sorted(npart[1:] + [yam["name"]]))
+                print(f"\t{k2} => {newk2}")
+                amplicons[newk2] = amplicons.pop(k2)
                 del amp_dict[k1]
                 break
 
         # merge dicts
         if len(amp_dict):
             amplicons.update(amp_dict)
 
+    # now check for subsets situations
+    if subset_fix:
+        # build look-up map
+        # mutations sets back to amplicons
+        rlum = {}
+        for k, d in amplicons.items():
+            # nothing can be a subset of a set of one
+            if len(d[4]) == 1:
+                continue
+
+            mutset = {f"{p}{b}" for p, b in d[4].items()}
+            varset = set(k.split("_")[1:])
+            ampname = k.split("_")[0]
+
+            if ampname not in rlum:
+                rlum[ampname] = []
+
+            rlum[ampname] += [(k, mutset, varset)]
+
+        #  now examine each amplicon in succession in search of subsets
+        for k1, d in list(amplicons.items()):
+            origk1 = k1
+            ampmutset = {f"{p}{b}" for p, b in d[4].items()}
+            ampname = k1.split("_")[0]
+
+            # are there *any* longer than 1 set of which we could be subsets?
+            if ampname not in rlum:
+                continue
+
+            ampmutnum = len(ampmutset)
+            # compare against the whole set (inefficient, but well)
+            for k2, mutset, varset in rlum[ampname]:
+                # skip self
+                if k2 == origk1:
+                    # sanity check
+                    assert (
+                        mutset == ampmutset
+                    ), f"something went wrong while building index: wrong self '{mutset}' vs '{ampmutset}'"
+                    continue
+
+                # it must be a larger set if we have any chance of being a *sub*set
+                if len(mutset) <= ampmutnum:
+                    continue
+
+                if 0 == len(ampmutset - mutset):
+                    print(f"{origk1}'s '{ampmutset}' is a subset of {k2}'s '{mutset}'")
+
+                    ampvset = set(k1.split("_")[1:])
+
+                    newk1 = "_".join([ampname] + sorted(list(ampvset.union(varset))))
+                    # only update the label
+                    k1 = newk1
+                    continue
+
+            # rename the entry using the final label
+            if k1 != origk1:
+                print(f"\t{origk1} => {k1}")
+                amplicons[k1] = amplicons.pop(origk1)
+
     return amplicons
 
 
@@ -498,6 +562,13 @@ def flowseq_rep(dumper, data):
     type=int,
     help="minimum number of cooccurrences to search for",
 )
+@click.option(
+    "--fix-subset/--no-fix-subset",
+    "--fs/--no-fs",
+    "subset_fix",
+    default=False,
+    help="Fix variants attribution when cooccurrence are subset/superset of other variants",
+)
 # TODO: use mutually exclusive groups
 @click.option(
     "-Q",
@@ -576,6 +647,7 @@ def cooc_mutbamscan(
     bedfile,
     sort,
     cooc,
+    subset_fix,
     inamp,
     outamp,
     comment,
@@ -602,7 +674,9 @@ def cooc_mutbamscan(
             len(voc) > 0
         ), "Neither --voc nor --vocdir provided. Please provide variants of concern definition yamls."
         amp_bed = bed_load(bedfile, sort)
-        amplicons = make_all_amplicons(amp_bed, voc, revert=revert, n_cooc=cooc)
+        amplicons = make_all_amplicons(
+            amp_bed, voc, revert=revert, n_cooc=cooc, subset_fix=subset_fix
+        )
         # and save them for future reference
         if outamp:
             write_all_amplicons(amplicons, outamp, amp_bed if comment else None)