MontgomeryLab · AlexTate · May 29, 2024 · May 29, 2024 · May 29, 2024 · May 29, 2024
diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml
@@ -226,6 +226,9 @@ counter_normalize_by_feature_hits: True
 ##-- If True: a decollapsed copy of each SAM file will be produced (useful for IGV) --##
 counter_decollapse: False
 
+##-- Require a specific edit pattern for mismatches. Choices: ADAR, TUT. Default: None --##
+counter_mismatch_pattern:
+
 ##-- Select the StepVector implementation that is used. Options: HTSeq or Cython --##
 counter_stepvector: 'Cython'
 

diff --git a/doc/Configuration.md b/doc/Configuration.md
@@ -170,7 +170,7 @@ Examples:
 - **Mixed** <sup>§</sup>: `19, 21-23, 25-30` 
 
 <sup>†</sup> the `Strand` selector also supports `both`<br>
-<sup>§</sup> only supported by the `Length` selector
+<sup>§</sup> only supported by the `Mismatches` and `Length` selectors
 
 ### Case Sensitivity
 All selectors are case-insensitive.

diff --git a/doc/Parameters.md b/doc/Parameters.md
@@ -84,6 +84,17 @@ By default, tiny-count will increment feature counts by a normalized amount to a
 
 By default, tiny-count will increment feature counts by a normalized amount to avoid overcounting. Each sequence alignment locus is allocated a portion of the sequence's original read count (depending on `counter_normalize_by_genomic_hits`), and once selection is complete the allocated count is divided by the number of selected features, or _feature hits_, at the alignment. The resulting value is added to the totals for each matching feature. By disabling this normalization step, each selected feature will receive the full amount allocated to the locus rather than the normalized portion.
 
+### Mismatch Pattern
+| Run Config Key            | Commandline Argument            |
+|---------------------------|---------------------------------|
+| counter_mismatch_pattern: | `--mismatch-pattern {ADAR,TUT}` |
+
+Rules that define a mismatch requirement can be extended to require a specific edit pattern. Two choices are available for this parameter:
+- **ADAR**: all mismatches must follow the A → I edit pattern which is characteristic of the double-stranded RNA-specific adenosine deaminase (ADAR) enzyme family. Inosene is recognized as guanosine by reverse transcriptase and therefore represented as G when sequenced, so this pattern is represented as A → G in sequencing data.
+- **TUT**: all mismatches must follow the N → U edit pattern at the 3' terminus which is characteristic of the Terminal Uridylyl Transferase (TUT) enzyme family. Mismatches must be consecutive from the 3' end. Reverse transcription prior to sequencing means this pattern is represented as N → T in sequencing data.
+
+This option applies globally to all rules except those that lack a mismatch requirement. Rules without the requirement will continue to allow any number of mismatches following any edit pattern.
+
 ### Decollapse
 | Run Config Key      | Commandline Argument   |
 |---------------------|------------------------|
@@ -98,7 +109,7 @@ The SAM files produced by the tinyRNA pipeline are collapsed by default; alignme
 
 A custom Cython implementation of HTSeq's StepVector is used for finding features that overlap each alignment interval. While the core C++ component of the StepVector is the same, we have found that our Cython implementation can result in runtimes up to 50% faster than HTSeq's implementation. This parameter allows you to use HTSeq's StepVector if you wish.
 
-### Is Pipeline
+### In Pipeline
 | Run Config Key | Commandline Argument |
 |----------------|----------------------|
 |                | `--in-pipeline`      |
@@ -115,7 +126,8 @@ Diagnostic information will include intermediate alignment files for each librar
 ### Full tiny-count Help String
 ```
 tiny-count (-pf FILE | --get-templates) [-o PREFIX] [-ng T/F] [-nf T/F]
-                  [-vs T/F] [-dc] [-sv {Cython,HTSeq}] [-p] [-d]
+                  [-vs T/F] [-mp {ADAR,TUT}] [-dc] [-sv {Cython,HTSeq}] [-p]
+                  [-d]
 
 tiny-count is a precision counting tool for hierarchical classification and
 quantification of small RNA-seq reads
@@ -135,15 +147,20 @@ Optional arguments:
   argument mentioned above.
 
   -o PREFIX, --out-prefix PREFIX
-                        The output prefix to use for file names. (default:
-                        None)
+                        The output prefix to use for file names. (default: )
   -ng T/F, --normalize-by-genomic-hits T/F
                         Normalize counts by genomic hits. (default: T)
   -nf T/F, --normalize-by-feature-hits T/F
                         Normalize counts by feature hits. (default: T)
   -vs T/F, --verify-stats T/F
                         Verify that all reported stats are internally
                         consistent. (default: T)
+  -mp {ADAR,TUT}, --mismatch-pattern {ADAR,TUT}
+                        Require a specific editing pattern for reads that
+                          contain mismatches.
+                        • ADAR: A-to-I editing pattern (A -> G)
+                        • TUT: 3' uridylation pattern (N -> U)
+                         (default: None)
   -dc, --decollapse     Create a decollapsed SAM copy of all files listed in
                         your Samples Sheet. This option is ignored for non-
                         collapsed inputs. (default: False)

diff --git a/doc/tiny-count.md b/doc/tiny-count.md
@@ -96,10 +96,10 @@ selector, M, N
  If these features match rules with `5' anchored` and `3' anchored` overlap selectors, they will be downgraded to `anchored` selectors. Alignments overlapping these features are evaluated for shared start and/or end coordinates, but 5' and 3' ends are not distinguished.
 
 ### Mismatches
-The Mismatches column allows you to place constraints the edit distance, or the number of mismatches and indels, from the alignment to the reference. The Mismatch definition is explicit, i.e., a value of 3 means exactly 3, not 3 or less. Definitions support ranges (e.g., 0-3), lists (e.g., 1, 3), wildcards, and single values.
+The Mismatches column allows you to place constraints the edit distance, or the number of mismatches and indels, from the alignment to the reference. The Mismatch definition is explicit, i.e., a value of 3 means exactly 3, not 3 or less. Definitions support ranges (e.g., 0-3), lists (e.g., 1, 3), wildcards, and single values. This functionality can be extended to require a [specific edit pattern](Parameters.md#mismatch-pattern).
 
 #### Edit Distance Determination
-An alignment's edit distance is determined from its NM tag. If the first alignment in a SAM file doesn't have an NM tag, then the edit distance is calculated from the CIGAR string for all subsequent alignments in the file. If the first alignment has an NM tag then any subsequent alignments missing the tag will have a default edit distance of 0.
+An alignment's edit distance is determined from its NM tag. Currently, alignments that lack an NM tag are treated as an error.
 
 ### Hierarchy
 Each rule must be assigned a hierarchy value. This value is used to sort Stage 2 matches so that matches with smaller hierarchy values take precedence in Stage 3.

diff --git a/tests/testdata/config_files/run_config_template.yml b/tests/testdata/config_files/run_config_template.yml
@@ -226,6 +226,9 @@ counter_normalize_by_feature_hits: True
 ##-- If True: a decollapsed copy of each SAM file will be produced (useful for IGV) --##
 counter_decollapse: False
 
+##-- Require a specific edit pattern for mismatches. Choices: ADAR, TUT. Default: None --##
+counter_mismatch_pattern:
+
 ##-- Select the StepVector implementation that is used. Options: HTSeq or Cython --##
 counter_stepvector: 'Cython'
 

diff --git a/tests/testdata/counter/sam/identity_choice_test.sam b/tests/testdata/counter/sam/identity_choice_test.sam
@@ -1,4 +1,6 @@
-@SQ	SN:I	LN:21
+@SQ	SN:I	LN:42
+@SQ	SN:V	LN:97
+@SQ	SN:MtDNA	LN:17
 1_count=1	16	I	1	255	21M	*	0	0	CAAGACAGAGCTTCACCGTTC	IIIIIIIIIIIIIIIIIIIII	XA:i:0	MD:Z:21	NM:i:0	XM:i:2
 2_count=22	0	V	50	255	22M	*	0	0	GAAGGTTCTAGAACAATCCAGA	IIIIIIIIIIIIIIIIIIIIII	XA:i:0	MD:Z:22	NM:i:0	XM:i:52
 2_count=22	0	V	70	255	22M	*	0	0	GAAGGTTCTAGAACAATCCAGA	IIIIIIIIIIIIIIIIIIIIII	XA:i:0	MD:Z:22	NM:i:0	XM:i:52

diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py
@@ -11,6 +11,7 @@
 from tiny.rna.counter.features import FeatureSelector
 from tiny.rna.counter.matching import *
 from tiny.rna.counter.hts_parsing import *
+from tiny.rna.counter.parsing.alignments import _validate_alignment
 
 import unit_test_helpers as helpers
 
@@ -178,25 +179,27 @@ def test_AlignmentReader_gather_metadata(self):
         self.assertEqual(reader.collapser_type, 'tiny-collapse')
         self.assertDictEqual(reader._header_dict, expected_header_dict)
         self.assertEqual(reader.references, ('I',))
-        self.assertTrue(reader.has_nm)
+        self.assertIn("NM", reader.expected_tags)
 
     """Does AlignmentReader._write_decollapsed_sam() write the correct number of duplicates to the decollapsed file?"""
 
     def test_AlignmentReader_write_decollapsed_sam(self):
         header = pysam.AlignmentHeader()
-        alignment = pysam.AlignedSegment(header)
-        alignment.query_name = "0_count=5"
+        alignment_in = pysam.AlignedSegment(header)
+        alignment_in.query_name = "0_count=5"
+        alignment_out = pysam.AlignedSegment(header)
+        alignment_out.query_name = "0_count"
 
         reader = AlignmentReader(decollapse=True)
         reader.collapser_type = "tiny-collapse"
         reader._collapser_token = "="
-        reader._decollapsed_reads = [alignment]
+        reader._decollapsed_reads = [alignment_in]
         reader._decollapsed_filename = "mock_outfile_name.sam"
 
         expected_writelines = [
             call('mock_outfile_name.sam', 'a'),
             call().__enter__(),
-            call().writelines([alignment.to_string() + '\n'] * 5),
+            call().writelines([alignment_out.to_string() + '\n'] * 5),
             call().__exit__(None, None, None)
         ]
 
@@ -220,8 +223,8 @@ def test_AlignmentReader_parse_alignments_decollapse(self):
             sam_in = pysam.AlignmentFile(reader.file)
             callback = reader._write_decollapsed_sam
             first_aln_offset = sam_in.tell()
-            has_nm = True
-            aln_iter = AlignmentIter(sam_in, has_nm, callback, buffer)
+            expected_tags = ("NM",)
+            aln_iter = AlignmentIter(sam_in, expected_tags, callback, buffer)
 
             # Add 100,000th alignment to the buffer
             self.exhaust_iterator(aln_iter)
@@ -325,6 +328,24 @@ def test_AlignmentReader_incompatible_PG_header(self):
             with self.assertRaisesRegex(ValueError, expected_error):
                 reader._check_for_incompatible_order()
 
+    """Does AlignmentIter reject alignments that contain unsupported CIGAR operators?"""
+
+    def test_AlignmentIter_incompatible_cigar_ops(self):
+        good = ["1M", "1D", "1I", "1=", "1X"]
+        bad = ["1N", "1S", "1H", "1P"]
+
+        header = pysam.AlignmentHeader()
+        test_aln = pysam.AlignedSegment(header)
+
+        for cigar in good:
+            test_aln.cigarstring = cigar
+            _validate_alignment(test_aln)
+
+        for cigar in bad:
+            test_aln.cigarstring = cigar
+            with self.assertRaisesRegex(ValueError, "not supported at this time"):
+                _validate_alignment(test_aln)
+
 
 class ReferenceFeaturesTests(unittest.TestCase):
 

diff --git a/tests/unit_tests_matching.py b/tests/unit_tests_matching.py
@@ -103,8 +103,8 @@ def test_NtMatch_rule_validation(self):
     """Does NumericalMatch properly validate the rule definition?"""
 
     def test_NumericalMatch_rule_validation(self):
-        good_defs = ["10", "10,11", "10-12", "12-14,   16"]
-        bad_defs = [",5", "5 5", " "]
+        good_defs = ["10", "10,11", "1-1", "10-12", "1 -2 ", "12 -14,  16"]
+        bad_defs = [",5", "5 5", " ", "1-2-3"]
 
         for defn in good_defs:
             NumericalMatch(defn)
@@ -142,6 +142,103 @@ def test_IntervalSelector_illegal_shift(self):
         with self.assertRaisesRegex(IllegalShiftError, "negative start interval"):
             IntervalSelector.get_shifted_interval("-1, 0", iv)
 
+    """Does AdarEditMatch work as expected?"""
+
+    def test_AdarEditMatch(self):
+        good_alns = [                                                  # Reference Sequence:
+            {'Seq': "G",    'Strand': True,  'MD': "0A0",     'NM': 1},  # A (+)
+            {'Seq': "C",    'Strand': False, 'MD': "0T0",     'NM': 1},  # A (-)
+            {'Seq': "GGG",  'Strand': True,  'MD': "0A0A0A0", 'NM': 3},  # AAA (+)
+            {'Seq': "CCC",  'Strand': False, 'MD': "0T0T0T0", 'NM': 3},  # AAA (-)
+            {'Seq': "GTGC", 'Strand': True,  'MD': "0A3",     'NM': 1},  # ATGC (+)
+            {'Seq': "GCAC", 'Strand': False, 'MD': "3T0",     'NM': 1},  # ATGC (-)
+        ]
+
+        bad_alns = [                                                   # Problem:
+            {'Seq': "G",    'Strand': True,  'MD': "1",       'NM': 0},  # No mismatch (+)
+            {'Seq': "C",    'Strand': False, 'MD': "1",       'NM': 0},  # No mismatch (-)
+            {'Seq': "G",    'Strand': True,  'MD': "0T0",     'NM': 1},  # Mismatch, but not from A (+)
+            {'Seq': "C",    'Strand': False, 'MD': "0A0",     'NM': 1},  # Mismatch, but not from A (-)
+            {'Seq': "C",    'Strand': True,  'MD': "0T0",     'NM': 1},  # A -> G but wrong strand (+)
+            {'Seq': "G",    'Strand': False, 'MD': "0A0",     'NM': 1},  # A -> G but wrong strand (-)
+            {'Seq': "GTGC", 'Strand': True,  'MD': "4",       'NM': 0},  # No mismatch (+)
+            {'Seq': "GCAC", 'Strand': False, 'MD': "4",       'NM': 0},  # No mismatch (-)
+            {'Seq': "GTGC", 'Strand': True,  'MD': "0A2T0",   'NM': 2},  # Mismatch from A and other (+)
+            {'Seq': "GCAC", 'Strand': False, 'MD': "0A2T0",   'NM': 2},  # Mismatch from A and other (-)
+            {'Seq': "G",    'Strand': True,  'MD': "0A0^T0",  'NM': 2},  # Mismatch from A and deletion (+)
+            {'Seq': "C",    'Strand': False, 'MD': "0T0^C0",  'NM': 2},  # Mismatch from A and deletion (-)
+        ]
+
+        # Test range
+        try:
+            range_match = AdarEditMatch("1-3")
+            for aln in good_alns:
+                self.assertTrue(aln in range_match)
+            for aln in bad_alns:
+                self.assertFalse(aln in range_match)
+        except Exception as e:
+            print("Failing alignment: \n" + str(aln))
+            raise e
+
+        # Test single value
+        val_match_lo = AdarEditMatch("1")
+        val_match_hi = AdarEditMatch("3")
+        aln_lo, aln_hi = good_alns[0], good_alns[2]
+        self.assertTrue(aln_lo['NM'] == 1 and aln_hi['NM'] == 3)  # sanity check
+        self.assertTrue(aln_lo in val_match_lo and aln_lo not in val_match_hi)
+        self.assertTrue(aln_hi in val_match_hi and aln_hi not in val_match_lo)
+
+    """Does TutEditMatch work as expected?"""
+
+    def test_TutEditMatch(self):
+        good_alns = [                                                  # Reference Sequence:
+            {'Seq': "T",    'Strand': True,  'MD': "0G0",     'NM': 1},  # G (+)
+            {'Seq': "A",    'Strand': False, 'MD': "0C0",     'NM': 1},  # C (-)
+            {'Seq': "TTT",  'Strand': True,  'MD': "0C0C0A0", 'NM': 3},  # CCA (+)
+            {'Seq': "AAA",  'Strand': False, 'MD': "0G0C0T0", 'NM': 3},  # AGC (-)
+            {'Seq': "ATTT", 'Strand': True,  'MD': "2C0C0",   'NM': 2},  # ATCC (+)
+            {'Seq': "AAGC", 'Strand': False, 'MD': "0G3",     'NM': 1},  # GCTC (-)
+        ]
+
+        bad_alns = [                                                   # Problem:
+            {'Seq': "T",    'Strand': True,  'MD': "1",       'NM': 0},  # No mismatch (+)
+            {'Seq': "A",    'Strand': False, 'MD': "1",       'NM': 0},  # No mismatch (-)
+            {'Seq': "G",    'Strand': True,  'MD': "0T0",     'NM': 1},  # Mismatch, but not to U (+)
+            {'Seq': "G",    'Strand': False, 'MD': "0A0",     'NM': 1},  # Mismatch, but not to U (-)
+            {'Seq': "A",    'Strand': True,  'MD': "0G0",     'NM': 1},  # N -> U but wrong strand (+)
+            {'Seq': "T",    'Strand': False, 'MD': "0G0",     'NM': 1},  # N -> U but wrong strand (-)
+            {'Seq': "GTGC", 'Strand': True,  'MD': "4",       'NM': 0},  # No mismatch (+)
+            {'Seq': "GCAC", 'Strand': False, 'MD': "4",       'NM': 0},  # No mismatch (-)
+            {'Seq': "GTTT", 'Strand': True,  'MD': "0T1G0A0", 'NM': 3},  # Mismatch to U and other (+)
+            {'Seq': "AAAG", 'Strand': False, 'MD': "0C2T0",   'NM': 2},  # Mismatch to U and other (-)
+            {'Seq': "T",    'Strand': True,  'MD': "0C0^T0",  'NM': 2},  # Mismatch to U and deletion (+)
+            {'Seq': "A",    'Strand': False, 'MD': "0G0^A0",  'NM': 2},  # Mismatch to U and deletion (-)
+        ]
+
+        # Test range
+        try:
+            range_match = TutEditMatch("1-3")
+            for aln in good_alns:
+                self.assertTrue(aln in range_match)
+            for aln in bad_alns:
+                self.assertFalse(aln in range_match)
+        except Exception as e:
+            print("Failing alignment: \n" + str(aln))
+            raise e
+
+        # Test single value
+        val_match_lo = TutEditMatch("1")
+        val_match_hi = TutEditMatch("3")
+        aln_lo, aln_hi = good_alns[0], good_alns[2]
+        self.assertTrue(aln_lo['NM'] == 1 and aln_hi['NM'] == 3)
+        self.assertTrue(aln_lo in val_match_lo and aln_lo not in val_match_hi)
+        self.assertTrue(aln_hi in val_match_hi and aln_hi not in val_match_lo)
+
+        # Test terminal run of desired length, but below mismatch threshold
+        tricky = TutEditMatch("3")
+        self.assertTrue(good_alns[4]['Seq'] == "ATTT" and good_alns[4]['NM'] == 2)
+        self.assertFalse(good_alns[4] in tricky)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tiny/cwl/tools/tiny-count.cwl b/tiny/cwl/tools/tiny-count.cwl
@@ -28,7 +28,7 @@ inputs:
     inputBinding:
       prefix: --out-prefix
 
-  # Optional inputs
+  # === Optional inputs ===
 
   normalize_by_feature_hits:
     type: string?
@@ -40,6 +40,11 @@ inputs:
     inputBinding:
       prefix: --normalize-by-genomic-hits
 
+  mismatch_pattern:
+    type: string?
+    inputBinding:
+      prefix: --mismatch-pattern
+
   decollapse:
     type: boolean?
     inputBinding:
@@ -60,7 +65,7 @@ inputs:
     inputBinding:
       prefix: --report-diags
 
-  # The following optional inputs are for staging InitialWorkingDir files for pipeline execution
+  # === The following optional inputs are for staging InitialWorkingDir files for pipeline execution ===
 
   samples_csv:
     type: File

diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl
@@ -87,6 +87,7 @@ inputs:
   counter_diags: boolean?
   counter_decollapse: boolean?
   counter_stepvector: string?
+  counter_mismatch_pattern: string?
   counter_normalize_by_feature_hits: boolean?
   counter_normalize_by_genomic_hits: boolean?
 
@@ -221,6 +222,7 @@ steps:
       normalize_by_genomic_hits:
         source: counter_normalize_by_genomic_hits
         valueFrom: $(String(self))  # convert boolean -> string
+      mismatch_pattern: counter_mismatch_pattern
       decollapse: counter_decollapse
       stepvector: counter_stepvector
       in_pipeline: {default: true}