From ebd5a8dd550efdcfa90af90e6a922a8b0420df8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= <gordon@koehn.net>
Date: Thu, 31 Oct 2024 10:39:38 +0000
Subject: [PATCH 01/17] add original scripts from David Gicev @davidgicev

---
 scripts/dgicev/README.md         |  38 +++++++
 scripts/dgicev/add_padding.py    |  26 +++++
 scripts/dgicev/read.py           | 174 +++++++++++++++++++++++++++++
 scripts/dgicev/readAndSort.py    | 180 ++++++++++++++++++++++++++++++
 scripts/dgicev/read_outputsam.py | 185 +++++++++++++++++++++++++++++++
 5 files changed, 603 insertions(+)
 create mode 100644 scripts/dgicev/README.md
 create mode 100644 scripts/dgicev/add_padding.py
 create mode 100644 scripts/dgicev/read.py
 create mode 100644 scripts/dgicev/readAndSort.py
 create mode 100644 scripts/dgicev/read_outputsam.py

diff --git a/scripts/dgicev/README.md b/scripts/dgicev/README.md
new file mode 100644
index 0000000..65187a6
--- /dev/null
+++ b/scripts/dgicev/README.md
@@ -0,0 +1,38 @@
+### add_padding.py
+
+Used for adding padding to an example fasta file, mainly used for testing main
+
+#### Usage:
+
+`cat input_file.fasta | python add_padding.py > output_file.fasta`
+
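+`input_file`: fasta file with one header line followed by one sequence line per record; the header holds the position offset as its second `|`-delimited field (e.g. `>read_id|offset`)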
+
+`output_file`: normal fasta file, each sequence padded with N's on the left (by its offset) and on the right up to a total length of 29904 (position info in the header is not in the output)
+
+### read.py
+
+Used for merging pairs of reads
+
+#### Usage:
+`cat input_file.sam | python read.py`
+`samtools view input_file.bam | python read.py`
+
+`input_file`: sam file contents, could also read from bam with samtools
+
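+`output_file`: none is passed; the script itself writes two files to the current directory, `merged.fasta` and `nuc_insertions.txt`. The former holds the merged reads, with the position/offset stored in `|`-delimited fasta headers (`>QNAME|POS`); the latter lists the insertions of each read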
+
+### read_outputsam.py
+
+Used mainly for testing with IGV
+
+same usage as read.py, except the output it makes is merged.sam, it doesn't store the insertions
+
+the sam entries it outputs are the actual sequences you would find in the merged.fasta when running read.py - the cigar is just M's
+
+### readAndSort.py
+Reorders the merged reads by position, grouping identical sequences via hashing; not really efficient, more of a proof of concept
+
+same usage as read.py
+
+Note: the script opens `reference_genome.fasta` from the working directory, but I don't think I use the reference genome here, so feel free to take it out of the code
diff --git a/scripts/dgicev/add_padding.py b/scripts/dgicev/add_padding.py
new file mode 100644
index 0000000..05e8a9d
--- /dev/null
+++ b/scripts/dgicev/add_padding.py
@@ -0,0 +1,26 @@
+import sys
+
+
+def transform_fasta():
+    while True:
+        header = sys.stdin.readline().strip()
+        sequence = sys.stdin.readline().strip()
+
+        if not header or not sequence:
+            break  # End of file
+
+        # Parse the header and extract the position offset
+        header_parts = header.split("|")
+        position_offset = int(header_parts[1])  # Get the position offset
+
+        # Add Ns before and after the sequence
+        left_padding = "N" * position_offset
+        right_padding = "N" * (29904 - len(sequence) - position_offset)
+
+        # Write the transformed sequence to stdout
+        sys.stdout.write(f"{header_parts[0]}\n")
+        sys.stdout.write(f"{left_padding}{sequence}{right_padding}\n")
+
+
+# Execute the function
+transform_fasta()
diff --git a/scripts/dgicev/read.py b/scripts/dgicev/read.py
new file mode 100644
index 0000000..7918670
--- /dev/null
+++ b/scripts/dgicev/read.py
@@ -0,0 +1,174 @@
+import sys
+import re
+
+def parse_cigar(cigar):
+    pattern = re.compile(r'(\d+)([MIDNSHP=X])')
+    
+    parsed_cigar = pattern.findall(cigar)
+    
+    return [(op, int(length)) for length, op in parsed_cigar]
+
+
+unpaired = dict()
+
+with open('merged.fasta', 'w') as output_fasta, open('nuc_insertions.txt', 'w') as output_insertions:
+    for line in sys.stdin:
+        if line.startswith('@'):
+            continue
+        
+        fields = line.strip().split('\t')
+        
+        QNAME = fields[0]                   # Query template NAME
+        FLAG = int(fields[1])               # bitwise FLAG
+        RNAME = fields[2]                   # Reference sequence NAME
+        POS = int(fields[3])                # 1-based leftmost mapping POSition
+        MAPQ = int(fields[4])               # MAPping Quality
+        CIGAR = parse_cigar(fields[5])      # CIGAR string
+        RNEXT = fields[6]                   # Ref. name of the mate/next read
+        PNEXT = int(fields[7])              # Position of the mate/next read
+        TLEN = int(fields[8])               # observed Template LENgth
+        SEQ = fields[9]                     # segment SEQuence
+        QUAL = fields[10]                   # ASCII of Phred-scaled base QUALity + 33
+
+        result_sequence = ''
+        result_qual = ''
+        index = 0
+        inserts = []
+
+        for operation in CIGAR:
+            type, count = operation
+            if type == 'S':
+                index += count
+                continue
+            if type == 'M':
+                result_sequence += SEQ[index:index + count]
+                result_qual     += QUAL[index:index + count]
+                index += count
+                continue
+            if type == 'D':
+                result_sequence += '-' * count
+                result_qual     += '!' * count
+                continue
+            if type == 'I':
+                inserts.append((index + POS, SEQ[index:index + count]))
+                index += count
+                continue
+
+        read = {
+            # "QNAME": QNAME,
+            # "FLAG": FLAG,
+            # "RNAME": RNAME,
+            "POS": POS,
+            # "MAPQ": MAPQ,
+            "CIGAR": CIGAR,
+            # "RNEXT": RNEXT,
+            # "PNEXT": PNEXT,
+            # "TLEN": TLEN,
+            # "SEQ": SEQ,
+            # "QUAL": QUAL,
+            "RESULT_SEQUENCE": result_sequence,
+            "RESULT_QUAL": result_qual,
+            "insertions": inserts,
+        }
+
+        if QNAME in unpaired:
+            read1 = unpaired.pop(QNAME)
+            read2 = read
+
+            # print(read1)
+            # print(read2)
+            
+            if read1['POS'] > read2['POS']:
+                read1, read2 = read2, read1
+            
+            index = read1['POS']
+            read1len = len(read1['RESULT_SEQUENCE'])
+            merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])]
+
+            # do deletions cause a problem here?
+            gaplen = read1['POS'] + read1len - read2['POS']
+            if gaplen < 0:
+                merged += 'N' * (-gaplen)
+                merged += read2['RESULT_SEQUENCE']
+            else:
+                # read1_insertions = [read for read in read1['insertions'] if read[0] >= read2['POS']]
+                # read2_insertions = [read for read in read2['insertions'] if read[0] < read1['POS'] + read1len]
+                # if str(read1_insertions) != str(read2_insertions):
+                #     print("\n\nInsertions don't match")
+                #     print(QNAME)
+                #     print("insertions1: ", read1_insertions)
+                #     print("insertions2: ", read2_insertions)
+                # if len(read1_insertions) != len(read2_insertions):
+                #     print("Number of insertions doesn't match")
+                # else:
+                #     for i in range(len(read1_insertions)):
+                #         if read1_insertions[i][0] != read2_insertions[i][0]:
+                #             print("Insertion index doesn't match")
+                #             print(read1_insertions[i][0], read2_insertions[i][0], " = ", read1_insertions[i][0] - read2_insertions[i][0])
+                #             print("pos2 - pos1", read2['POS'] - read1['POS'])
+                #             print("cigar1", read1['CIGAR'])
+                #             print("cigar2", read2['CIGAR'])len(overlap_result) 
+
+                #         if read1_insertions[i][1] != read2_insertions[i][1]:
+                #             print("Insertion sequence doesn't match")
+                #             print(read1_insertions[i][1], read2_insertions[i][1])
+                overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:]
+                overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)]
+
+                overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:]
+                overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)]
+
+                # let's set the read1's version by default
+                overlap_result = list(overlap_read1)
+
+                if len(overlap_result) and overlap_read1 != overlap_read2:
+                    # print("", QNAME)
+                    if len(overlap_read1) != len(overlap_read2):
+                        print("overlaps don't match in size")
+                    number_of_diffs = 0
+                    for i in range(len(overlap_read1)):
+                        if overlap_read1[i] != overlap_read2[i]:
+                            if overlap_qual1[i] == '-' and overlap_read2 != '-':
+                                overlap_result[i] = overlap_read2[i]
+                            if overlap_qual1[i] > overlap_qual2[i]:
+                                overlap_result[i] = overlap_read2[i]
+                            # print("diff in position ", i, ": ", overlap_read1[i], "/", overlap_read2[i])
+                            number_of_diffs += 1
+                            # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i])
+                    # print("read1pos", read1['POS'])
+                    # print("read2pos", read2['POS'])
+                    # print("diff", read2['POS'] - read1['POS'])
+                    # print("read1len", read1len)
+                    # print("gap", gaplen)
+                    # print("\nread1")
+                    # print(overlap_read1)
+                    # print(overlap_qual1)
+                    # print("\nread2")
+                    # print(overlap_read2)
+                    # print(overlap_qual2)
+
+                    # print("\nreconcilled")
+                    # print("".join(overlap_result))
+
+                merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):]
+
+
+
+            if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']:
+                raise Exception("Length mismatch")
+            
+            output_fasta.write(f">{QNAME}|{read1['POS']}\n{merged}\n")
+
+            merged_insertions = read1['insertions'].copy()
+            insertion_index = read1['POS'] + read1len
+            merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index]
+
+            output_insertions.write(f"{QNAME}\t{merged_insertions}\n")
+
+
+            
+        else:
+            unpaired[QNAME] = read
+    for id in unpaired:
+        output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n")
+        output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n")
\ No newline at end of file
diff --git a/scripts/dgicev/readAndSort.py b/scripts/dgicev/readAndSort.py
new file mode 100644
index 0000000..efb7f9d
--- /dev/null
+++ b/scripts/dgicev/readAndSort.py
@@ -0,0 +1,180 @@
+import sys
+import re
+
+def parse_cigar(cigar):
+    pattern = re.compile(r'(\d+)([MIDNSHP=X])')
+    
+    parsed_cigar = pattern.findall(cigar)
+    
+    return [(op, int(length)) for length, op in parsed_cigar]
+
+
+unpaired = dict()
+
+sequence_graph = dict()
+
+# have sequence_graph contain nodes forming clusters
+# graph['79'] would be the starting node for all samples taken at position 79
+# from that point on treat it as a prefix tree
+# will have to compare to reference sequence to get diff array
+
+
+known_sequences = dict()
+
+with open('merged.fasta', 'w') as output_fasta, open('nuc_insertions.txt', 'w') as output_insertions, open('reference_genome.fasta', 'r') as reference_sequence_file:
+    reference_sequence = reference_sequence_file.readlines()[1]
+    # print("reference sequence: ", reference_sequence)
+    for line in sys.stdin:
+        if line.startswith('@'):
+            continue
+        
+        fields = line.strip().split('\t')
+        
+        QNAME = fields[0]                   # Query template NAME
+        FLAG = int(fields[1])               # bitwise FLAG
+        RNAME = fields[2]                   # Reference sequence NAME
+        POS = int(fields[3])                # 1-based leftmost mapping POSition
+        MAPQ = int(fields[4])               # MAPping Quality
+        CIGAR = parse_cigar(fields[5])      # CIGAR string
+        RNEXT = fields[6]                   # Ref. name of the mate/next read
+        PNEXT = int(fields[7])              # Position of the mate/next read
+        TLEN = int(fields[8])               # observed Template LENgth
+        SEQ = fields[9]                     # segment SEQuence
+        QUAL = fields[10]                   # ASCII of Phred-scaled base QUALity + 33
+
+        result_sequence = ''
+        result_qual = ''
+        index = 0
+        inserts = []
+
+        for operation in CIGAR:
+            type, count = operation
+            if type == 'S':
+                index += count
+                continue
+            if type == 'M':
+                result_sequence += SEQ[index:index + count]
+                result_qual     += QUAL[index:index + count]
+                index += count
+                continue
+            if type == 'D':
+                result_sequence += '-' * count
+                result_qual     += '!' * count
+                continue
+            if type == 'I':
+                inserts.append((index + POS, SEQ[index:index + count]))
+                index += count
+                continue
+
+        read = {
+            # "QNAME": QNAME,
+            # "FLAG": FLAG,
+            # "RNAME": RNAME,
+            "POS": POS,
+            # "MAPQ": MAPQ,
+            "CIGAR": CIGAR,
+            # "RNEXT": RNEXT,
+            # "PNEXT": PNEXT,
+            # "TLEN": TLEN,
+            # "SEQ": SEQ,
+            # "QUAL": QUAL,
+            "RESULT_SEQUENCE": result_sequence,
+            "RESULT_QUAL": result_qual,
+            "insertions": inserts,
+        }
+
+        if QNAME in unpaired:
+            read1 = unpaired.pop(QNAME)
+            read2 = read
+
+            # print(read1)
+            # print(read2)
+            
+            if read1['POS'] > read2['POS']:
+                read1, read2 = read2, read1
+            
+            index = read1['POS']
+            read1len = len(read1['RESULT_SEQUENCE'])
+            merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])]
+
+            gaplen = read1['POS'] + read1len - read2['POS']
+            if gaplen < 0:
+                merged += 'N' * (-gaplen)
+                merged += read2['RESULT_SEQUENCE']
+            else:
+                overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:]
+                overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)]
+
+                overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:]
+                overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)]
+
+                overlap_result = list(overlap_read1)
+
+                if len(overlap_result) and overlap_read1 != overlap_read2:
+                    # print("", QNAME)
+                    if len(overlap_read1) != len(overlap_read2):
+                        print("overlaps don't match in size")
+                    number_of_diffs = 0
+                    for i in range(len(overlap_read1)):
+                        if overlap_read1[i] != overlap_read2[i]:
+                            if overlap_qual1[i] == '-' and overlap_read2 != '-':
+                                overlap_result[i] = overlap_read2[i]
+                            if overlap_qual1[i] > overlap_qual2[i]:
+                                overlap_result[i] = overlap_read2[i]
+                            # print("diff in position ", i, ": ", overlap_read1[i], "/", overlap_read2[i])
+                            number_of_diffs += 1
+                            # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i])
+
+                merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):]
+
+
+
+            if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']:
+                raise Exception("Length mismatch")
+            
+            # output_fasta.write(f">{QNAME}|{read1['POS']}\n{merged}\n")
+
+            merged_insertions = read1['insertions'].copy()
+            insertion_index = read1['POS'] + read1len
+            merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index]
+
+            output_insertions.write(f"{QNAME}\t{merged_insertions}\n")
+
+            # time to add it to the graph
+            reference_offset = read1['POS'] - 1
+
+            # for i in range(len(merged)):
+                # if merged[i] != reference_sequence[reference_offset + i]:
+                #     # print("id: ", QNAME, "mutation at ", i, " from ", reference_sequence[reference_offset + i], " to ", merged[i])
+            fingerprint = hash(merged)
+            if fingerprint in known_sequences:
+                groups = known_sequences[fingerprint]
+                found = False
+                for group in groups:
+                    if group[0] == merged:
+                        found = True
+                        group[2].append(QNAME)
+                if not found:
+                    known_sequences[fingerprint].append((merged, read1['POS'], [QNAME]))
+                
+            else:
+                known_sequences[fingerprint] = [(merged, read1['POS'], [QNAME])]
+
+        else:
+            unpaired[QNAME] = read
+
+    print("Number of unique sequences: ", sum([len(groups) for groups in known_sequences.values()]))
+
+    flattened = [group for groups in known_sequences.values() for group in groups]
+    flattened.sort(key=lambda x: x[1])
+
+    sort_number_prefix = 0
+
+    for group in flattened:
+        for id in group[2]:
+            output_fasta.write(f">{sort_number_prefix:010}.{id}|{group[1]}\n{group[0]}\n")
+            sort_number_prefix += 1
+
+    for id in unpaired:
+        output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n")
+        output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n")
\ No newline at end of file
diff --git a/scripts/dgicev/read_outputsam.py b/scripts/dgicev/read_outputsam.py
new file mode 100644
index 0000000..c186c25
--- /dev/null
+++ b/scripts/dgicev/read_outputsam.py
@@ -0,0 +1,185 @@
+import sys
+import re
+
+def parse_cigar(cigar):
+    pattern = re.compile(r'(\d+)([MIDNSHP=X])')
+    
+    parsed_cigar = pattern.findall(cigar)
+    
+    return [(op, int(length)) for length, op in parsed_cigar]
+
+
+unpaired = dict()
+
+with open('merged.sam', 'w') as output_sam:
+    for line in sys.stdin:
+        if line.startswith('@'):
+            continue
+        
+        fields = line.strip().split('\t')
+        
+        QNAME = fields[0]                   # Query template NAME
+        FLAG = int(fields[1])               # bitwise FLAG
+        RNAME = fields[2]                   # Reference sequence NAME
+        POS = int(fields[3])                # 1-based leftmost mapping POSition
+        MAPQ = int(fields[4])               # MAPping Quality
+        CIGAR = parse_cigar(fields[5])      # CIGAR string
+        RNEXT = fields[6]                   # Ref. name of the mate/next read
+        PNEXT = int(fields[7])              # Position of the mate/next read
+        TLEN = int(fields[8])               # observed Template LENgth
+        SEQ = fields[9]                     # segment SEQuence
+        QUAL = fields[10]                   # ASCII of Phred-scaled base QUALity + 33
+
+        result_sequence = ''
+        result_qual = ''
+        index = 0
+        inserts = []
+
+        for operation in CIGAR:
+            type, count = operation
+            if type == 'S':
+                index += count
+                continue
+            if type == 'M':
+                result_sequence += SEQ[index:index + count]
+                result_qual     += QUAL[index:index + count]
+                index += count
+                continue
+            if type == 'D':
+                result_sequence += '-' * count
+                result_qual     += '!' * count
+                continue
+            if type == 'I':
+                inserts.append((index + POS, SEQ[index:index + count]))
+                index += count
+                continue
+
+        read = {
+            # "QNAME": QNAME,
+            # "FLAG": FLAG,
+            # "RNAME": RNAME,
+            "POS": POS,
+            # "MAPQ": MAPQ,
+            "CIGAR": CIGAR,
+            # "RNEXT": RNEXT,
+            # "PNEXT": PNEXT,
+            # "TLEN": TLEN,
+            # "SEQ": SEQ,
+            # "QUAL": QUAL,
+            "RESULT_SEQUENCE": result_sequence,
+            "RESULT_QUAL": result_qual,
+            "insertions": inserts,
+        }
+
+        if QNAME in unpaired:
+            read1 = unpaired.pop(QNAME)
+            read2 = read
+
+            # print(read1)
+            # print(read2)
+            
+            if read1['POS'] > read2['POS']:
+                read1, read2 = read2, read1
+            
+            index = read1['POS']
+            read1len = len(read1['RESULT_SEQUENCE'])
+            merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])]
+            merged_qual = read1['RESULT_QUAL'][:min(read1len, read2['POS'] - read1['POS'])]
+
+            # do deletions cause a problem here?
+            gaplen = read1['POS'] + read1len - read2['POS']
+            if gaplen < 0:
+                merged += 'N' * (-gaplen)
+                merged += read2['RESULT_SEQUENCE']
+                merged_qual += 'C' * (-gaplen)
+                merged_qual += read2['RESULT_SEQUENCE']
+            else:
+                # read1_insertions = [read for read in read1['insertions'] if read[0] >= read2['POS']]
+                # read2_insertions = [read for read in read2['insertions'] if read[0] < read1['POS'] + read1len]
+                # if str(read1_insertions) != str(read2_insertions):
+                #     print("\n\nInsertions don't match")
+                #     print(QNAME)
+                #     print("insertions1: ", read1_insertions)
+                #     print("insertions2: ", read2_insertions)
+                # if len(read1_insertions) != len(read2_insertions):
+                #     print("Number of insertions doesn't match")
+                # else:
+                #     for i in range(len(read1_insertions)):
+                #         if read1_insertions[i][0] != read2_insertions[i][0]:
+                #             print("Insertion index doesn't match")
+                #             print(read1_insertions[i][0], read2_insertions[i][0], " = ", read1_insertions[i][0] - read2_insertions[i][0])
+                #             print("pos2 - pos1", read2['POS'] - read1['POS'])
+                #             print("cigar1", read1['CIGAR'])
+                #             print("cigar2", read2['CIGAR'])len(overlap_result) 
+
+                #         if read1_insertions[i][1] != read2_insertions[i][1]:
+                #             print("Insertion sequence doesn't match")
+                #             print(read1_insertions[i][1], read2_insertions[i][1])
+                overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:]
+                overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)]
+
+                overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:]
+                overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)]
+
+                # let's set the read1's version by default
+                overlap_result = list(overlap_read1)
+                overlap_qual = list(overlap_qual1)
+
+                if len(overlap_result) and overlap_read1 != overlap_read2:
+                    # print("", QNAME)
+                    if len(overlap_read1) != len(overlap_read2):
+                        print("overlaps don't match in size")
+                    number_of_diffs = 0
+                    for i in range(len(overlap_read1)):
+                        if overlap_read1[i] != overlap_read2[i]:
+                            if overlap_qual1[i] == '-' and overlap_read2 != '-':
+                                overlap_result[i] = overlap_read2[i]
+                                overlap_qual[i] = overlap_qual[i]
+
+                            if overlap_qual1[i] > overlap_qual2[i]:
+                                overlap_result[i] = overlap_read2[i]
+                                overlap_qual[i] = overlap_qual[i]
+                            # print("diff in position ", i, ": ", overlap_read1[i], "/", overlap_read2[i])
+                            number_of_diffs += 1
+                            # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i])
+                    # print("read1pos", read1['POS'])
+                    # print("read2pos", read2['POS'])
+                    # print("diff", read2['POS'] - read1['POS'])
+                    # print("read1len", read1len)
+                    # print("gap", gaplen)
+                    # print("\nread1")
+                    # print(overlap_read1)
+                    # print(overlap_qual1)
+                    # print("\nread2")
+                    # print(overlap_read2)
+                    # print(overlap_qual2)
+
+                    # print("\nreconcilled")
+                    # print("".join(overlap_result))
+
+                merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):]
+                merged_qual += "".join(overlap_qual) + read2['RESULT_QUAL'][max(0, gaplen):]
+
+
+
+            if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']:
+                raise Exception("Length mismatch")
+            
+            flag = 0
+            output_cigar = str(len(merged)) + "M"
+
+            output_sam.write(f"{QNAME}\t{flag}\t{RNAME}\t{read1['POS']}\t{MAPQ}\t{output_cigar}\t*\t0\t{abs(TLEN)}\t{merged}\t{merged_qual}\n")
+
+            merged_insertions = read1['insertions'].copy()
+            insertion_index = read1['POS'] + read1len
+            merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index]
+
+            # output_insertions.write(f"{QNAME}\t{merged_insertions}\n")
+
+
+            
+        else:
+            unpaired[QNAME] = read
+    # for id in unpaired:
+    #     output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n")
+    #     output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n")
\ No newline at end of file

From 95c6e696ae72b4e7d8d58c30b220899973aa4a30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= <gordon@koehn.net>
Date: Thu, 31 Oct 2024 10:58:17 +0000
Subject: [PATCH 02/17] add credits

---
 scripts/dgicev/add_padding.py    |   6 ++
 scripts/dgicev/read.py           | 117 ++++++++++++++--------------
 scripts/dgicev/readAndSort.py    | 127 ++++++++++++++++--------------
 scripts/dgicev/read_outputsam.py | 129 ++++++++++++++++---------------
 4 files changed, 200 insertions(+), 179 deletions(-)

diff --git a/scripts/dgicev/add_padding.py b/scripts/dgicev/add_padding.py
index 05e8a9d..a42b289 100644
--- a/scripts/dgicev/add_padding.py
+++ b/scripts/dgicev/add_padding.py
@@ -1,3 +1,9 @@
+"""
+Author: David Gicev  (@davidgicev / david.gicev@gmail.com)
+Supervisor:Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch)
+Date: 2024-10-30
+"""
+
 import sys
 
 
diff --git a/scripts/dgicev/read.py b/scripts/dgicev/read.py
index 7918670..1968c04 100644
--- a/scripts/dgicev/read.py
+++ b/scripts/dgicev/read.py
@@ -1,56 +1,63 @@
+"""
+Author: David Gicev  (@davidgicev / david.gicev@gmail.com)
+Supervisor:Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch)
+Date: 2024-10-30
+"""
+
 import sys
 import re
 
+
 def parse_cigar(cigar):
-    pattern = re.compile(r'(\d+)([MIDNSHP=X])')
-    
+    pattern = re.compile(r"(\d+)([MIDNSHP=X])")
+
     parsed_cigar = pattern.findall(cigar)
-    
+
     return [(op, int(length)) for length, op in parsed_cigar]
 
 
 unpaired = dict()
 
-with open('merged.fasta', 'w') as output_fasta, open('nuc_insertions.txt', 'w') as output_insertions:
+with open("merged.fasta", "w") as output_fasta, open("nuc_insertions.txt", "w") as output_insertions:
     for line in sys.stdin:
-        if line.startswith('@'):
+        if line.startswith("@"):
             continue
-        
-        fields = line.strip().split('\t')
-        
-        QNAME = fields[0]                   # Query template NAME
-        FLAG = int(fields[1])               # bitwise FLAG
-        RNAME = fields[2]                   # Reference sequence NAME
-        POS = int(fields[3])                # 1-based leftmost mapping POSition
-        MAPQ = int(fields[4])               # MAPping Quality
-        CIGAR = parse_cigar(fields[5])      # CIGAR string
-        RNEXT = fields[6]                   # Ref. name of the mate/next read
-        PNEXT = int(fields[7])              # Position of the mate/next read
-        TLEN = int(fields[8])               # observed Template LENgth
-        SEQ = fields[9]                     # segment SEQuence
-        QUAL = fields[10]                   # ASCII of Phred-scaled base QUALity + 33
-
-        result_sequence = ''
-        result_qual = ''
+
+        fields = line.strip().split("\t")
+
+        QNAME = fields[0]  # Query template NAME
+        FLAG = int(fields[1])  # bitwise FLAG
+        RNAME = fields[2]  # Reference sequence NAME
+        POS = int(fields[3])  # 1-based leftmost mapping POSition
+        MAPQ = int(fields[4])  # MAPping Quality
+        CIGAR = parse_cigar(fields[5])  # CIGAR string
+        RNEXT = fields[6]  # Ref. name of the mate/next read
+        PNEXT = int(fields[7])  # Position of the mate/next read
+        TLEN = int(fields[8])  # observed Template LENgth
+        SEQ = fields[9]  # segment SEQuence
+        QUAL = fields[10]  # ASCII of Phred-scaled base QUALity + 33
+
+        result_sequence = ""
+        result_qual = ""
         index = 0
         inserts = []
 
         for operation in CIGAR:
             type, count = operation
-            if type == 'S':
+            if type == "S":
                 index += count
                 continue
-            if type == 'M':
-                result_sequence += SEQ[index:index + count]
-                result_qual     += QUAL[index:index + count]
+            if type == "M":
+                result_sequence += SEQ[index : index + count]
+                result_qual += QUAL[index : index + count]
                 index += count
                 continue
-            if type == 'D':
-                result_sequence += '-' * count
-                result_qual     += '!' * count
+            if type == "D":
+                result_sequence += "-" * count
+                result_qual += "!" * count
                 continue
-            if type == 'I':
-                inserts.append((index + POS, SEQ[index:index + count]))
+            if type == "I":
+                inserts.append((index + POS, SEQ[index : index + count]))
                 index += count
                 continue
 
@@ -77,19 +84,19 @@ def parse_cigar(cigar):
 
             # print(read1)
             # print(read2)
-            
-            if read1['POS'] > read2['POS']:
+
+            if read1["POS"] > read2["POS"]:
                 read1, read2 = read2, read1
-            
-            index = read1['POS']
-            read1len = len(read1['RESULT_SEQUENCE'])
-            merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])]
+
+            index = read1["POS"]
+            read1len = len(read1["RESULT_SEQUENCE"])
+            merged = read1["RESULT_SEQUENCE"][: min(read1len, read2["POS"] - read1["POS"])]
 
             # do deletions cause a problem here?
-            gaplen = read1['POS'] + read1len - read2['POS']
+            gaplen = read1["POS"] + read1len - read2["POS"]
             if gaplen < 0:
-                merged += 'N' * (-gaplen)
-                merged += read2['RESULT_SEQUENCE']
+                merged += "N" * (-gaplen)
+                merged += read2["RESULT_SEQUENCE"]
             else:
                 # read1_insertions = [read for read in read1['insertions'] if read[0] >= read2['POS']]
                 # read2_insertions = [read for read in read2['insertions'] if read[0] < read1['POS'] + read1len]
@@ -107,16 +114,16 @@ def parse_cigar(cigar):
                 #             print(read1_insertions[i][0], read2_insertions[i][0], " = ", read1_insertions[i][0] - read2_insertions[i][0])
                 #             print("pos2 - pos1", read2['POS'] - read1['POS'])
                 #             print("cigar1", read1['CIGAR'])
-                #             print("cigar2", read2['CIGAR'])len(overlap_result) 
+                #             print("cigar2", read2['CIGAR'])len(overlap_result)
 
                 #         if read1_insertions[i][1] != read2_insertions[i][1]:
                 #             print("Insertion sequence doesn't match")
                 #             print(read1_insertions[i][1], read2_insertions[i][1])
-                overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:]
-                overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)]
+                overlap_read1 = read1["RESULT_SEQUENCE"][read2["POS"] - read1["POS"] :]
+                overlap_read2 = read2["RESULT_SEQUENCE"][0 : max(0, gaplen)]
 
-                overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:]
-                overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)]
+                overlap_qual1 = read1["RESULT_QUAL"][read2["POS"] - read1["POS"] :]
+                overlap_qual2 = read2["RESULT_QUAL"][0 : max(0, gaplen)]
 
                 # let's set the read1's version by default
                 overlap_result = list(overlap_read1)
@@ -128,7 +135,7 @@ def parse_cigar(cigar):
                     number_of_diffs = 0
                     for i in range(len(overlap_read1)):
                         if overlap_read1[i] != overlap_read2[i]:
-                            if overlap_qual1[i] == '-' and overlap_read2 != '-':
+                            if overlap_qual1[i] == "-" and overlap_read2 != "-":
                                 overlap_result[i] = overlap_read2[i]
                             if overlap_qual1[i] > overlap_qual2[i]:
                                 overlap_result[i] = overlap_read2[i]
@@ -150,25 +157,21 @@ def parse_cigar(cigar):
                     # print("\nreconcilled")
                     # print("".join(overlap_result))
 
-                merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):]
-
-
+                merged += "".join(overlap_result) + read2["RESULT_SEQUENCE"][max(0, gaplen) :]
 
-            if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']:
+            if len(merged) != read2["POS"] + len(read2["RESULT_SEQUENCE"]) - read1["POS"]:
                 raise Exception("Length mismatch")
-            
+
             output_fasta.write(f">{QNAME}|{read1['POS']}\n{merged}\n")
 
-            merged_insertions = read1['insertions'].copy()
-            insertion_index = read1['POS'] + read1len
-            merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index]
+            merged_insertions = read1["insertions"].copy()
+            insertion_index = read1["POS"] + read1len
+            merged_insertions += [insert for insert in read2["insertions"] if insert[0] > insertion_index]
 
             output_insertions.write(f"{QNAME}\t{merged_insertions}\n")
 
-
-            
         else:
             unpaired[QNAME] = read
     for id in unpaired:
         output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n")
-        output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n")
\ No newline at end of file
+        output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n")
diff --git a/scripts/dgicev/readAndSort.py b/scripts/dgicev/readAndSort.py
index efb7f9d..4754c57 100644
--- a/scripts/dgicev/readAndSort.py
+++ b/scripts/dgicev/readAndSort.py
@@ -1,11 +1,18 @@
+"""
+Author: David Gicev (@davidgicev / david.gicev@gmail.com)
+Supervisor: Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch)
+Date: 2024-10-30
+"""
+
 import sys
 import re
 
+
 def parse_cigar(cigar):
-    pattern = re.compile(r'(\d+)([MIDNSHP=X])')
-    
+    pattern = re.compile(r"(\d+)([MIDNSHP=X])")
+
     parsed_cigar = pattern.findall(cigar)
-    
+
     return [(op, int(length)) for length, op in parsed_cigar]
 
 
@@ -21,48 +28,50 @@ def parse_cigar(cigar):
 
 known_sequences = dict()
 
-with open('merged.fasta', 'w') as output_fasta, open('nuc_insertions.txt', 'w') as output_insertions, open('reference_genome.fasta', 'r') as reference_sequence_file:
+with open("merged.fasta", "w") as output_fasta, open("nuc_insertions.txt", "w") as output_insertions, open(
+    "reference_genome.fasta", "r"
+) as reference_sequence_file:
     reference_sequence = reference_sequence_file.readlines()[1]
     # print("reference sequence: ", reference_sequence)
     for line in sys.stdin:
-        if line.startswith('@'):
+        if line.startswith("@"):
             continue
-        
-        fields = line.strip().split('\t')
-        
-        QNAME = fields[0]                   # Query template NAME
-        FLAG = int(fields[1])               # bitwise FLAG
-        RNAME = fields[2]                   # Reference sequence NAME
-        POS = int(fields[3])                # 1-based leftmost mapping POSition
-        MAPQ = int(fields[4])               # MAPping Quality
-        CIGAR = parse_cigar(fields[5])      # CIGAR string
-        RNEXT = fields[6]                   # Ref. name of the mate/next read
-        PNEXT = int(fields[7])              # Position of the mate/next read
-        TLEN = int(fields[8])               # observed Template LENgth
-        SEQ = fields[9]                     # segment SEQuence
-        QUAL = fields[10]                   # ASCII of Phred-scaled base QUALity + 33
-
-        result_sequence = ''
-        result_qual = ''
+
+        fields = line.strip().split("\t")
+
+        QNAME = fields[0]  # Query template NAME
+        FLAG = int(fields[1])  # bitwise FLAG
+        RNAME = fields[2]  # Reference sequence NAME
+        POS = int(fields[3])  # 1-based leftmost mapping POSition
+        MAPQ = int(fields[4])  # MAPping Quality
+        CIGAR = parse_cigar(fields[5])  # CIGAR string
+        RNEXT = fields[6]  # Ref. name of the mate/next read
+        PNEXT = int(fields[7])  # Position of the mate/next read
+        TLEN = int(fields[8])  # observed Template LENgth
+        SEQ = fields[9]  # segment SEQuence
+        QUAL = fields[10]  # ASCII of Phred-scaled base QUALity + 33
+
+        result_sequence = ""
+        result_qual = ""
         index = 0
         inserts = []
 
         for operation in CIGAR:
             type, count = operation
-            if type == 'S':
+            if type == "S":
                 index += count
                 continue
-            if type == 'M':
-                result_sequence += SEQ[index:index + count]
-                result_qual     += QUAL[index:index + count]
+            if type == "M":
+                result_sequence += SEQ[index : index + count]
+                result_qual += QUAL[index : index + count]
                 index += count
                 continue
-            if type == 'D':
-                result_sequence += '-' * count
-                result_qual     += '!' * count
+            if type == "D":
+                result_sequence += "-" * count
+                result_qual += "!" * count
                 continue
-            if type == 'I':
-                inserts.append((index + POS, SEQ[index:index + count]))
+            if type == "I":
+                inserts.append((index + POS, SEQ[index : index + count]))
                 index += count
                 continue
 
@@ -89,24 +98,24 @@ def parse_cigar(cigar):
 
             # print(read1)
             # print(read2)
-            
-            if read1['POS'] > read2['POS']:
+
+            if read1["POS"] > read2["POS"]:
                 read1, read2 = read2, read1
-            
-            index = read1['POS']
-            read1len = len(read1['RESULT_SEQUENCE'])
-            merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])]
 
-            gaplen = read1['POS'] + read1len - read2['POS']
+            index = read1["POS"]
+            read1len = len(read1["RESULT_SEQUENCE"])
+            merged = read1["RESULT_SEQUENCE"][: min(read1len, read2["POS"] - read1["POS"])]
+
+            gaplen = read1["POS"] + read1len - read2["POS"]
             if gaplen < 0:
-                merged += 'N' * (-gaplen)
-                merged += read2['RESULT_SEQUENCE']
+                merged += "N" * (-gaplen)
+                merged += read2["RESULT_SEQUENCE"]
             else:
-                overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:]
-                overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)]
+                overlap_read1 = read1["RESULT_SEQUENCE"][read2["POS"] - read1["POS"] :]
+                overlap_read2 = read2["RESULT_SEQUENCE"][0 : max(0, gaplen)]
 
-                overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:]
-                overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)]
+                overlap_qual1 = read1["RESULT_QUAL"][read2["POS"] - read1["POS"] :]
+                overlap_qual2 = read2["RESULT_QUAL"][0 : max(0, gaplen)]
 
                 overlap_result = list(overlap_read1)
 
@@ -117,7 +126,7 @@ def parse_cigar(cigar):
                     number_of_diffs = 0
                     for i in range(len(overlap_read1)):
                         if overlap_read1[i] != overlap_read2[i]:
-                            if overlap_qual1[i] == '-' and overlap_read2 != '-':
+                            if overlap_qual1[i] == "-" and overlap_read2 != "-":
                                 overlap_result[i] = overlap_read2[i]
                             if overlap_qual1[i] > overlap_qual2[i]:
                                 overlap_result[i] = overlap_read2[i]
@@ -125,27 +134,25 @@ def parse_cigar(cigar):
                             number_of_diffs += 1
                             # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i])
 
-                merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):]
-
-
+                merged += "".join(overlap_result) + read2["RESULT_SEQUENCE"][max(0, gaplen) :]
 
-            if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']:
+            if len(merged) != read2["POS"] + len(read2["RESULT_SEQUENCE"]) - read1["POS"]:
                 raise Exception("Length mismatch")
-            
+
             # output_fasta.write(f">{QNAME}|{read1['POS']}\n{merged}\n")
 
-            merged_insertions = read1['insertions'].copy()
-            insertion_index = read1['POS'] + read1len
-            merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index]
+            merged_insertions = read1["insertions"].copy()
+            insertion_index = read1["POS"] + read1len
+            merged_insertions += [insert for insert in read2["insertions"] if insert[0] > insertion_index]
 
             output_insertions.write(f"{QNAME}\t{merged_insertions}\n")
 
             # time to add it to the graph
-            reference_offset = read1['POS'] - 1
+            reference_offset = read1["POS"] - 1
 
             # for i in range(len(merged)):
-                # if merged[i] != reference_sequence[reference_offset + i]:
-                #     # print("id: ", QNAME, "mutation at ", i, " from ", reference_sequence[reference_offset + i], " to ", merged[i])
+            # if merged[i] != reference_sequence[reference_offset + i]:
+            #     # print("id: ", QNAME, "mutation at ", i, " from ", reference_sequence[reference_offset + i], " to ", merged[i])
             fingerprint = hash(merged)
             if fingerprint in known_sequences:
                 groups = known_sequences[fingerprint]
@@ -155,10 +162,10 @@ def parse_cigar(cigar):
                         found = True
                         group[2].append(QNAME)
                 if not found:
-                    known_sequences[fingerprint].append((merged, read1['POS'], [QNAME]))
-                
+                    known_sequences[fingerprint].append((merged, read1["POS"], [QNAME]))
+
             else:
-                known_sequences[fingerprint] = [(merged, read1['POS'], [QNAME])]
+                known_sequences[fingerprint] = [(merged, read1["POS"], [QNAME])]
 
         else:
             unpaired[QNAME] = read
@@ -177,4 +184,4 @@ def parse_cigar(cigar):
 
     for id in unpaired:
         output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n")
-        output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n")
\ No newline at end of file
+        output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n")
diff --git a/scripts/dgicev/read_outputsam.py b/scripts/dgicev/read_outputsam.py
index c186c25..f8ac9de 100644
--- a/scripts/dgicev/read_outputsam.py
+++ b/scripts/dgicev/read_outputsam.py
@@ -1,56 +1,63 @@
+"""
+Author: David Gicev (@davidgicev / david.gicev@gmail.com)
+Supervisor: Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch)
+Date: 2024-10-30
+"""
+
 import sys
 import re
 
+
 def parse_cigar(cigar):
-    pattern = re.compile(r'(\d+)([MIDNSHP=X])')
-    
+    pattern = re.compile(r"(\d+)([MIDNSHP=X])")
+
     parsed_cigar = pattern.findall(cigar)
-    
+
     return [(op, int(length)) for length, op in parsed_cigar]
 
 
 unpaired = dict()
 
-with open('merged.sam', 'w') as output_sam:
+with open("merged.sam", "w") as output_sam:
     for line in sys.stdin:
-        if line.startswith('@'):
+        if line.startswith("@"):
             continue
-        
-        fields = line.strip().split('\t')
-        
-        QNAME = fields[0]                   # Query template NAME
-        FLAG = int(fields[1])               # bitwise FLAG
-        RNAME = fields[2]                   # Reference sequence NAME
-        POS = int(fields[3])                # 1-based leftmost mapping POSition
-        MAPQ = int(fields[4])               # MAPping Quality
-        CIGAR = parse_cigar(fields[5])      # CIGAR string
-        RNEXT = fields[6]                   # Ref. name of the mate/next read
-        PNEXT = int(fields[7])              # Position of the mate/next read
-        TLEN = int(fields[8])               # observed Template LENgth
-        SEQ = fields[9]                     # segment SEQuence
-        QUAL = fields[10]                   # ASCII of Phred-scaled base QUALity + 33
-
-        result_sequence = ''
-        result_qual = ''
+
+        fields = line.strip().split("\t")
+
+        QNAME = fields[0]  # Query template NAME
+        FLAG = int(fields[1])  # bitwise FLAG
+        RNAME = fields[2]  # Reference sequence NAME
+        POS = int(fields[3])  # 1-based leftmost mapping POSition
+        MAPQ = int(fields[4])  # MAPping Quality
+        CIGAR = parse_cigar(fields[5])  # CIGAR string
+        RNEXT = fields[6]  # Ref. name of the mate/next read
+        PNEXT = int(fields[7])  # Position of the mate/next read
+        TLEN = int(fields[8])  # observed Template LENgth
+        SEQ = fields[9]  # segment SEQuence
+        QUAL = fields[10]  # ASCII of Phred-scaled base QUALity + 33
+
+        result_sequence = ""
+        result_qual = ""
         index = 0
         inserts = []
 
         for operation in CIGAR:
             type, count = operation
-            if type == 'S':
+            if type == "S":
                 index += count
                 continue
-            if type == 'M':
-                result_sequence += SEQ[index:index + count]
-                result_qual     += QUAL[index:index + count]
+            if type == "M":
+                result_sequence += SEQ[index : index + count]
+                result_qual += QUAL[index : index + count]
                 index += count
                 continue
-            if type == 'D':
-                result_sequence += '-' * count
-                result_qual     += '!' * count
+            if type == "D":
+                result_sequence += "-" * count
+                result_qual += "!" * count
                 continue
-            if type == 'I':
-                inserts.append((index + POS, SEQ[index:index + count]))
+            if type == "I":
+                inserts.append((index + POS, SEQ[index : index + count]))
                 index += count
                 continue
 
@@ -77,22 +84,22 @@ def parse_cigar(cigar):
 
             # print(read1)
             # print(read2)
-            
-            if read1['POS'] > read2['POS']:
+
+            if read1["POS"] > read2["POS"]:
                 read1, read2 = read2, read1
-            
-            index = read1['POS']
-            read1len = len(read1['RESULT_SEQUENCE'])
-            merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])]
-            merged_qual = read1['RESULT_QUAL'][:min(read1len, read2['POS'] - read1['POS'])]
+
+            index = read1["POS"]
+            read1len = len(read1["RESULT_SEQUENCE"])
+            merged = read1["RESULT_SEQUENCE"][: min(read1len, read2["POS"] - read1["POS"])]
+            merged_qual = read1["RESULT_QUAL"][: min(read1len, read2["POS"] - read1["POS"])]
 
             # do deletions cause a problem here?
-            gaplen = read1['POS'] + read1len - read2['POS']
+            gaplen = read1["POS"] + read1len - read2["POS"]
             if gaplen < 0:
-                merged += 'N' * (-gaplen)
-                merged += read2['RESULT_SEQUENCE']
-                merged_qual += 'C' * (-gaplen)
-                merged_qual += read2['RESULT_SEQUENCE']
+                merged += "N" * (-gaplen)
+                merged += read2["RESULT_SEQUENCE"]
+                merged_qual += "C" * (-gaplen)
+                merged_qual += read2["RESULT_SEQUENCE"]
             else:
                 # read1_insertions = [read for read in read1['insertions'] if read[0] >= read2['POS']]
                 # read2_insertions = [read for read in read2['insertions'] if read[0] < read1['POS'] + read1len]
@@ -110,16 +117,16 @@ def parse_cigar(cigar):
                 #             print(read1_insertions[i][0], read2_insertions[i][0], " = ", read1_insertions[i][0] - read2_insertions[i][0])
                 #             print("pos2 - pos1", read2['POS'] - read1['POS'])
                 #             print("cigar1", read1['CIGAR'])
-                #             print("cigar2", read2['CIGAR'])len(overlap_result) 
+                #             print("cigar2", read2['CIGAR'])len(overlap_result)
 
                 #         if read1_insertions[i][1] != read2_insertions[i][1]:
                 #             print("Insertion sequence doesn't match")
                 #             print(read1_insertions[i][1], read2_insertions[i][1])
-                overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:]
-                overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)]
+                overlap_read1 = read1["RESULT_SEQUENCE"][read2["POS"] - read1["POS"] :]
+                overlap_read2 = read2["RESULT_SEQUENCE"][0 : max(0, gaplen)]
 
-                overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:]
-                overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)]
+                overlap_qual1 = read1["RESULT_QUAL"][read2["POS"] - read1["POS"] :]
+                overlap_qual2 = read2["RESULT_QUAL"][0 : max(0, gaplen)]
 
                 # let's set the read1's version by default
                 overlap_result = list(overlap_read1)
@@ -132,7 +139,7 @@ def parse_cigar(cigar):
                     number_of_diffs = 0
                     for i in range(len(overlap_read1)):
                         if overlap_read1[i] != overlap_read2[i]:
-                            if overlap_qual1[i] == '-' and overlap_read2 != '-':
+                            if overlap_qual1[i] == "-" and overlap_read2 != "-":
                                 overlap_result[i] = overlap_read2[i]
                                 overlap_qual[i] = overlap_qual[i]
 
@@ -157,29 +164,27 @@ def parse_cigar(cigar):
                     # print("\nreconcilled")
                     # print("".join(overlap_result))
 
-                merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):]
-                merged_qual += "".join(overlap_qual) + read2['RESULT_QUAL'][max(0, gaplen):]
-
-
+                merged += "".join(overlap_result) + read2["RESULT_SEQUENCE"][max(0, gaplen) :]
+                merged_qual += "".join(overlap_qual) + read2["RESULT_QUAL"][max(0, gaplen) :]
 
-            if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']:
+            if len(merged) != read2["POS"] + len(read2["RESULT_SEQUENCE"]) - read1["POS"]:
                 raise Exception("Length mismatch")
-            
+
             flag = 0
             output_cigar = str(len(merged)) + "M"
 
-            output_sam.write(f"{QNAME}\t{flag}\t{RNAME}\t{read1['POS']}\t{MAPQ}\t{output_cigar}\t*\t0\t{abs(TLEN)}\t{merged}\t{merged_qual}\n")
+            output_sam.write(
+                f"{QNAME}\t{flag}\t{RNAME}\t{read1['POS']}\t{MAPQ}\t{output_cigar}\t*\t0\t{abs(TLEN)}\t{merged}\t{merged_qual}\n"
+            )
 
-            merged_insertions = read1['insertions'].copy()
-            insertion_index = read1['POS'] + read1len
-            merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index]
+            merged_insertions = read1["insertions"].copy()
+            insertion_index = read1["POS"] + read1len
+            merged_insertions += [insert for insert in read2["insertions"] if insert[0] > insertion_index]
 
             # output_insertions.write(f"{QNAME}\t{merged_insertions}\n")
 
-
-            
         else:
             unpaired[QNAME] = read
     # for id in unpaired:
     #     output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n")
-    #     output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n")
\ No newline at end of file
+    #     output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n")

From d114254b021644343376b9708c126bd73db02774 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 09:01:33 +0000
Subject: [PATCH 03/17] fix: bump saadmk11/github-actions-version-updater from
 0.7.4 to 0.8.1

Bumps [saadmk11/github-actions-version-updater](https://github.com/saadmk11/github-actions-version-updater) from 0.7.4 to 0.8.1.
- [Release notes](https://github.com/saadmk11/github-actions-version-updater/releases)
- [Changelog](https://github.com/saadmk11/github-actions-version-updater/blob/main/CHANGELOG.md)
- [Commits](https://github.com/saadmk11/github-actions-version-updater/compare/v0.7.4...v0.8.1)

---
updated-dependencies:
- dependency-name: saadmk11/github-actions-version-updater
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/schedule-update-actions.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/schedule-update-actions.yml b/.github/workflows/schedule-update-actions.yml
index f4c30b6..4773301 100644
--- a/.github/workflows/schedule-update-actions.yml
+++ b/.github/workflows/schedule-update-actions.yml
@@ -18,7 +18,7 @@ jobs:
           token: ${{ secrets.PAT }}
 
       - name: Run GitHub Actions Version Updater
-        uses: saadmk11/github-actions-version-updater@v0.7.4
+        uses: saadmk11/github-actions-version-updater@v0.8.1
         with:
           # [Required] Access token with `workflow` scope.
           token: ${{ secrets.PAT }}

From 92301a667af929aeee576abfa8e66459f89fbc5d Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 09:01:35 +0000
Subject: [PATCH 04/17] fix: bump amannn/action-semantic-pull-request from
 5.2.0 to 5.5.3

Bumps [amannn/action-semantic-pull-request](https://github.com/amannn/action-semantic-pull-request) from 5.2.0 to 5.5.3.
- [Release notes](https://github.com/amannn/action-semantic-pull-request/releases)
- [Changelog](https://github.com/amannn/action-semantic-pull-request/blob/main/CHANGELOG.md)
- [Commits](https://github.com/amannn/action-semantic-pull-request/compare/v5.2.0...v5.5.3)

---
updated-dependencies:
- dependency-name: amannn/action-semantic-pull-request
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/semantic-pr-check.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/semantic-pr-check.yml b/.github/workflows/semantic-pr-check.yml
index 3a1158d..7fdd599 100644
--- a/.github/workflows/semantic-pr-check.yml
+++ b/.github/workflows/semantic-pr-check.yml
@@ -12,6 +12,6 @@ jobs:
     name: Validate PR title
     runs-on: ubuntu-latest
     steps:
-      - uses: amannn/action-semantic-pull-request@v5.2.0
+      - uses: amannn/action-semantic-pull-request@v5.5.3
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

From 240eaf908f4861e078134b88d9436b79dcbe4f80 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 09:01:38 +0000
Subject: [PATCH 05/17] fix: bump microsoft/action-python from 0.6.4 to 0.7.3

Bumps [microsoft/action-python](https://github.com/microsoft/action-python) from 0.6.4 to 0.7.3.
- [Release notes](https://github.com/microsoft/action-python/releases)
- [Commits](https://github.com/microsoft/action-python/compare/0.6.4...0.7.3)

---
updated-dependencies:
- dependency-name: microsoft/action-python
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/CI.yml      | 4 ++--
 .github/workflows/publish.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index eb8ae3e..c94f448 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -10,12 +10,12 @@ on:
 
 jobs:
   validation:
-    uses: microsoft/action-python/.github/workflows/validation.yml@0.6.4
+    uses: microsoft/action-python/.github/workflows/validation.yml@0.7.3
     with:
       workdir: '.'
 
   publish:
-    uses: microsoft/action-python/.github/workflows/publish.yml@0.6.4
+    uses: microsoft/action-python/.github/workflows/publish.yml@0.7.3
     secrets:
       PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
       TEST_PYPI_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD  }}
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 9d71560..0264afd 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -4,7 +4,7 @@ on:
 
 jobs:
   publish:
-    uses: microsoft/action-python/.github/workflows/publish.yml@0.6.4
+    uses: microsoft/action-python/.github/workflows/publish.yml@0.7.3
     secrets:
       PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
       TEST_PYPI_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD  }}

From 73078c27ae2a21e147c426da02527374bf3b7e99 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 09:01:40 +0000
Subject: [PATCH 06/17] fix: bump actions/checkout from 3.5.2 to 4.2.2

Bumps [actions/checkout](https://github.com/actions/checkout) from 3.5.2 to 4.2.2.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v3.5.2...v4.2.2)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/schedule-update-actions.yml | 2 +-
 .github/workflows/template-sync.yml           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/schedule-update-actions.yml b/.github/workflows/schedule-update-actions.yml
index 4773301..fb6c720 100644
--- a/.github/workflows/schedule-update-actions.yml
+++ b/.github/workflows/schedule-update-actions.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v3.5.2
+      - uses: actions/checkout@v4.2.2
         with:
           # [Required] Access token with `workflow` scope.
           token: ${{ secrets.PAT }}
diff --git a/.github/workflows/template-sync.yml b/.github/workflows/template-sync.yml
index 49666bc..d46d002 100644
--- a/.github/workflows/template-sync.yml
+++ b/.github/workflows/template-sync.yml
@@ -5,7 +5,7 @@ jobs:
   sync:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3.5.2 # important!
+      - uses: actions/checkout@v4.2.2 # important!
       - uses: euphoricsystems/action-sync-template-repository@v2.5.1
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}

From ef6927a73aeaef72ee7081775843388e23e256ee Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 09:01:45 +0000
Subject: [PATCH 07/17] fix: bump pytest-cov from 4.0.0 to 6.0.0

Bumps [pytest-cov](https://github.com/pytest-dev/pytest-cov) from 4.0.0 to 6.0.0.
- [Changelog](https://github.com/pytest-dev/pytest-cov/blob/master/CHANGELOG.rst)
- [Commits](https://github.com/pytest-dev/pytest-cov/compare/v4.0.0...v6.0.0)

---
updated-dependencies:
- dependency-name: pytest-cov
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9e77fc7..98110f3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,7 @@ test = [
     "pre-commit==3.3.1",
     "pylint==2.17.4",
     "pylint_junit",
-    "pytest-cov==4.0.0",
+    "pytest-cov==6.0.0",
     "pytest-mock<3.10.1",
     "pytest-runner",
     "pytest==7.3.1",

From 01c9817bb48b9cda9c45803601eaaf32fba20d43 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 09:01:50 +0000
Subject: [PATCH 08/17] fix: bump black from 23.3.0 to 24.10.0

Bumps [black](https://github.com/psf/black) from 23.3.0 to 24.10.0.
- [Release notes](https://github.com/psf/black/releases)
- [Changelog](https://github.com/psf/black/blob/main/CHANGES.md)
- [Commits](https://github.com/psf/black/compare/23.3.0...24.10.0)

---
updated-dependencies:
- dependency-name: black
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 98110f3..8bda9e3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ spark = [
 ]
 test = [
     "bandit[toml]==1.7.5",
-    "black==23.3.0",
+    "black==24.10.0",
     "check-manifest==0.49",
     "flake8-bugbear==23.5.9",
     "flake8-docstrings",

From 91b4dc2969aa0e041da89727b26f4178633a70fe Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 11:07:28 +0000
Subject: [PATCH 09/17] fix: bump check-manifest from 0.49 to 0.50

Bumps [check-manifest](https://github.com/mgedmin/check-manifest) from 0.49 to 0.50.
- [Changelog](https://github.com/mgedmin/check-manifest/blob/master/CHANGES.rst)
- [Commits](https://github.com/mgedmin/check-manifest/compare/0.49...0.50)

---
updated-dependencies:
- dependency-name: check-manifest
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8bda9e3..4d864f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,7 @@ spark = [
 test = [
     "bandit[toml]==1.7.5",
     "black==24.10.0",
-    "check-manifest==0.49",
+    "check-manifest==0.50",
     "flake8-bugbear==23.5.9",
     "flake8-docstrings",
     "flake8-formatter_junit_xml",

From 085f288308107263149f804b0000af8a4e5eb28a Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 11:07:32 +0000
Subject: [PATCH 10/17] fix: bump bandit[toml] from 1.7.5 to 1.7.10

Bumps [bandit[toml]](https://github.com/PyCQA/bandit) from 1.7.5 to 1.7.10.
- [Release notes](https://github.com/PyCQA/bandit/releases)
- [Commits](https://github.com/PyCQA/bandit/compare/1.7.5...1.7.10)

---
updated-dependencies:
- dependency-name: bandit[toml]
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4d864f7..4bf6cb1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,7 @@ spark = [
     "pyspark>=3.0.0"
 ]
 test = [
-    "bandit[toml]==1.7.5",
+    "bandit[toml]==1.7.10",
     "black==24.10.0",
     "check-manifest==0.50",
     "flake8-bugbear==23.5.9",

From 4d77c12a390593acfc00623a33d6fb5454349868 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 11:07:00 +0000
Subject: [PATCH 11/17] fix: bump pre-commit from 3.3.1 to 4.0.1

Bumps [pre-commit](https://github.com/pre-commit/pre-commit) from 3.3.1 to 4.0.1.
- [Release notes](https://github.com/pre-commit/pre-commit/releases)
- [Changelog](https://github.com/pre-commit/pre-commit/blob/main/CHANGELOG.md)
- [Commits](https://github.com/pre-commit/pre-commit/compare/v3.3.1...v4.0.1)

---
updated-dependencies:
- dependency-name: pre-commit
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4bf6cb1..6f6172a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,7 +35,7 @@ test = [
     "flake8-formatter_junit_xml",
     "flake8",
     "flake8-pyproject",
-    "pre-commit==3.3.1",
+    "pre-commit==4.0.1",
     "pylint==2.17.4",
     "pylint_junit",
     "pytest-cov==6.0.0",

From 906fd47b74244048d9ff28e6422f0ac887d93da5 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 11:07:26 +0000
Subject: [PATCH 12/17] fix: bump pytest from 7.3.1 to 8.3.3

Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.3.1 to 8.3.3.
- [Release notes](https://github.com/pytest-dev/pytest/releases)
- [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pytest-dev/pytest/compare/7.3.1...8.3.3)

---
updated-dependencies:
- dependency-name: pytest
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6f6172a..d828afc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,7 @@ test = [
     "pytest-cov==6.0.0",
     "pytest-mock<3.10.1",
     "pytest-runner",
-    "pytest==7.3.1",
+    "pytest==8.3.3",
     "pytest-github-actions-annotate-failures",
     "shellcheck-py==0.9.0.2"
 ]

From 99d6efad1a3a4f966115b23dde4fb7ab0f831509 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 11:07:19 +0000
Subject: [PATCH 13/17] fix: update pytest-mock requirement from <3.10.1 to
 <3.14.1

Updates the requirements on [pytest-mock](https://github.com/pytest-dev/pytest-mock) to permit the latest version.
- [Release notes](https://github.com/pytest-dev/pytest-mock/releases)
- [Changelog](https://github.com/pytest-dev/pytest-mock/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pytest-dev/pytest-mock/compare/v0.1.0...v3.14.0)

---
updated-dependencies:
- dependency-name: pytest-mock
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d828afc..466feb7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,7 @@ test = [
     "pylint==2.17.4",
     "pylint_junit",
     "pytest-cov==6.0.0",
-    "pytest-mock<3.10.1",
+    "pytest-mock<3.14.1",
     "pytest-runner",
     "pytest==8.3.3",
     "pytest-github-actions-annotate-failures",

From 7906bdead132df4de9403c2c62d8c1ca7c52c0ab Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 12:09:23 +0000
Subject: [PATCH 14/17] fix: bump shellcheck-py from 0.9.0.2 to 0.10.0.1

Bumps [shellcheck-py](https://github.com/ryanrhee/shellcheck-py) from 0.9.0.2 to 0.10.0.1.
- [Commits](https://github.com/ryanrhee/shellcheck-py/compare/v0.9.0.2...v0.10.0.1)

---
updated-dependencies:
- dependency-name: shellcheck-py
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 466feb7..f0924aa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,7 +43,7 @@ test = [
     "pytest-runner",
     "pytest==8.3.3",
     "pytest-github-actions-annotate-failures",
-    "shellcheck-py==0.9.0.2"
+    "shellcheck-py==0.10.0.1"
 ]
 
 [project.urls]

From 35ef6c7c113c042cb7d4412d567bfb17ae3c5ce5 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 12:08:59 +0000
Subject: [PATCH 15/17] fix: bump pylint from 2.17.4 to 3.3.1

Bumps [pylint](https://github.com/pylint-dev/pylint) from 2.17.4 to 3.3.1.
- [Release notes](https://github.com/pylint-dev/pylint/releases)
- [Commits](https://github.com/pylint-dev/pylint/compare/v2.17.4...v3.3.1)

---
updated-dependencies:
- dependency-name: pylint
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f0924aa..b53e28b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,7 @@ test = [
     "flake8",
     "flake8-pyproject",
     "pre-commit==4.0.1",
-    "pylint==2.17.4",
+    "pylint==3.3.1",
     "pylint_junit",
     "pytest-cov==6.0.0",
     "pytest-mock<3.14.1",

From 8dc8cc943771e1fe748c7f690d21a9cc1c509013 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 12:08:34 +0000
Subject: [PATCH 16/17] fix: bump flake8-bugbear from 23.5.9 to 24.8.19

Bumps [flake8-bugbear](https://github.com/PyCQA/flake8-bugbear) from 23.5.9 to 24.8.19.
- [Release notes](https://github.com/PyCQA/flake8-bugbear/releases)
- [Commits](https://github.com/PyCQA/flake8-bugbear/compare/23.5.9...24.8.19)

---
updated-dependencies:
- dependency-name: flake8-bugbear
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b53e28b..c113062 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,7 @@ test = [
     "bandit[toml]==1.7.10",
     "black==24.10.0",
     "check-manifest==0.50",
-    "flake8-bugbear==23.5.9",
+    "flake8-bugbear==24.8.19",
     "flake8-docstrings",
     "flake8-formatter_junit_xml",
     "flake8",

From bb32a8445e7730da77b86a67cca45a8d98fb5103 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= <gordon@koehn.net>
Date: Thu, 31 Oct 2024 12:45:10 +0000
Subject: [PATCH 17/17] skip QA

---
 scripts/dgicev/add_padding.py    | 4 ++++
 scripts/dgicev/read.py           | 4 ++++
 scripts/dgicev/readAndSort.py    | 4 ++++
 scripts/dgicev/read_outputsam.py | 4 ++++
 4 files changed, 16 insertions(+)

diff --git a/scripts/dgicev/add_padding.py b/scripts/dgicev/add_padding.py
index a42b289..c1f4c1c 100644
--- a/scripts/dgicev/add_padding.py
+++ b/scripts/dgicev/add_padding.py
@@ -4,6 +4,10 @@
 Date: 2024-10-30
 """
 
+# TODO:  integrate into package, with QA and testing
+# pylint: skip-file
+# flake8: noqa
+
 import sys
 
 
diff --git a/scripts/dgicev/read.py b/scripts/dgicev/read.py
index 1968c04..a5016b1 100644
--- a/scripts/dgicev/read.py
+++ b/scripts/dgicev/read.py
@@ -4,6 +4,10 @@
 Date: 2024-10-30
 """
 
+# TODO:  integrate into package, with QA and testing
+# pylint: skip-file
+# flake8: noqa
+
 import sys
 import re
 
diff --git a/scripts/dgicev/readAndSort.py b/scripts/dgicev/readAndSort.py
index 4754c57..674f557 100644
--- a/scripts/dgicev/readAndSort.py
+++ b/scripts/dgicev/readAndSort.py
@@ -4,6 +4,10 @@
 Date: 2024-10-30
 """
 
+# TODO:  integrate into package, with QA and testing
+# pylint: skip-file
+# flake8: noqa
+
 import sys
 import re
 
diff --git a/scripts/dgicev/read_outputsam.py b/scripts/dgicev/read_outputsam.py
index f8ac9de..738a896 100644
--- a/scripts/dgicev/read_outputsam.py
+++ b/scripts/dgicev/read_outputsam.py
@@ -4,6 +4,10 @@
 Date: 2024-10-30
 """
 
+# TODO:  integrate into package, with QA and testing
+# pylint: skip-file
+# flake8: noqa
+
 import sys
 import re