From ebd5a8dd550efdcfa90af90e6a922a8b0420df8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= Date: Thu, 31 Oct 2024 10:39:38 +0000 Subject: [PATCH 01/17] add original scripts from David Gicev @davidgicev --- scripts/dgicev/README.md | 38 +++++++ scripts/dgicev/add_padding.py | 26 +++++ scripts/dgicev/read.py | 174 +++++++++++++++++++++++++++++ scripts/dgicev/readAndSort.py | 180 ++++++++++++++++++++++++++++++ scripts/dgicev/read_outputsam.py | 185 +++++++++++++++++++++++++++++++ 5 files changed, 603 insertions(+) create mode 100644 scripts/dgicev/README.md create mode 100644 scripts/dgicev/add_padding.py create mode 100644 scripts/dgicev/read.py create mode 100644 scripts/dgicev/readAndSort.py create mode 100644 scripts/dgicev/read_outputsam.py diff --git a/scripts/dgicev/README.md b/scripts/dgicev/README.md new file mode 100644 index 0000000..65187a6 --- /dev/null +++ b/scripts/dgicev/README.md @@ -0,0 +1,38 @@ +### add_padding.py + +Used for adding padding to an example fasta file, mainly used for testing main + +#### Usage: + +`cat input_file.fasta | python add_padding.py > output_file.fasta` + +`input_file`: fasta file with header containing offset | delimited + +`output_file`: normal fasta file, each sequence padded by the respective amount of N's to the left/right (position info in the header is not in the output) + +### read.py + +Used for merging pairs of reads + +#### Usage: +`cat input_file.fasta | python read.py` +`samtools view input_file.bam | python read.py` + +`input_file`: sam file contents, could also read from bam with samtools + +`output_file`: it makes two output files by itself, merged.fasta and nuc_insertions.txt, the former has the merged reads and it uses the fasta with | headers to describe the position/offset + +### read_outputsam.py + +Used mainly for testing with IGV + +same usage as read.py, except the output it makes is merged.sam, it doesn't store the insertions + +the sam entries it outputs are the actual sequences you would find in the merged.fasta when running read.py - the cigar is just M's + +### readAndSort.py +for reordering, using hashing, not really efficient more of a proof of concept + +same usage as read.py + +Note: I don't think I use the reference genome here so feel free to take it out of the code diff --git a/scripts/dgicev/add_padding.py b/scripts/dgicev/add_padding.py new file mode 100644 index 0000000..05e8a9d --- /dev/null +++ b/scripts/dgicev/add_padding.py @@ -0,0 +1,26 @@ +import sys + + +def transform_fasta(): + while True: + header = sys.stdin.readline().strip() + sequence = sys.stdin.readline().strip() + + if not header or not sequence: + break # End of file + + # Parse the header and extract the position offset + header_parts = header.split("|") + position_offset = int(header_parts[1]) # Get the position offset + + # Add Ns before and after the sequence + left_padding = "N" * position_offset + right_padding = "N" * (29904 - len(sequence) - position_offset) + + # Write the transformed sequence to stdout + sys.stdout.write(f"{header_parts[0]}\n") + sys.stdout.write(f"{left_padding}{sequence}{right_padding}\n") + + +# Execute the function +transform_fasta() diff --git a/scripts/dgicev/read.py b/scripts/dgicev/read.py new file mode 100644 index 0000000..7918670 --- /dev/null +++ b/scripts/dgicev/read.py @@ -0,0 +1,174 @@ +import sys +import re + +def parse_cigar(cigar): + pattern = re.compile(r'(\d+)([MIDNSHP=X])') + + parsed_cigar = pattern.findall(cigar) + + return [(op, int(length)) for length, op in parsed_cigar] + + +unpaired = dict() + +with open('merged.fasta', 'w') as output_fasta, open('nuc_insertions.txt', 'w') as output_insertions: + for line in sys.stdin: + if line.startswith('@'): + continue + + fields = line.strip().split('\t') + + QNAME = fields[0] # Query template NAME + FLAG = int(fields[1]) # bitwise FLAG + RNAME = fields[2] # Reference sequence NAME + POS = int(fields[3]) # 1-based leftmost mapping POSition + MAPQ = int(fields[4]) # MAPping Quality + CIGAR = parse_cigar(fields[5]) # CIGAR string + RNEXT = fields[6] # Ref. name of the mate/next read + PNEXT = int(fields[7]) # Position of the mate/next read + TLEN = int(fields[8]) # observed Template LENgth + SEQ = fields[9] # segment SEQuence + QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 + + result_sequence = '' + result_qual = '' + index = 0 + inserts = [] + + for operation in CIGAR: + type, count = operation + if type == 'S': + index += count + continue + if type == 'M': + result_sequence += SEQ[index:index + count] + result_qual += QUAL[index:index + count] + index += count + continue + if type == 'D': + result_sequence += '-' * count + result_qual += '!' * count + continue + if type == 'I': + inserts.append((index + POS, SEQ[index:index + count])) + index += count + continue + + read = { + # "QNAME": QNAME, + # "FLAG": FLAG, + # "RNAME": RNAME, + "POS": POS, + # "MAPQ": MAPQ, + "CIGAR": CIGAR, + # "RNEXT": RNEXT, + # "PNEXT": PNEXT, + # "TLEN": TLEN, + # "SEQ": SEQ, + # "QUAL": QUAL, + "RESULT_SEQUENCE": result_sequence, + "RESULT_QUAL": result_qual, + "insertions": inserts, + } + + if QNAME in unpaired: + read1 = unpaired.pop(QNAME) + read2 = read + + # print(read1) + # print(read2) + + if read1['POS'] > read2['POS']: + read1, read2 = read2, read1 + + index = read1['POS'] + read1len = len(read1['RESULT_SEQUENCE']) + merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])] + + # do deletions cause a problem here? + gaplen = read1['POS'] + read1len - read2['POS'] + if gaplen < 0: + merged += 'N' * (-gaplen) + merged += read2['RESULT_SEQUENCE'] + else: + # read1_insertions = [read for read in read1['insertions'] if read[0] >= read2['POS']] + # read2_insertions = [read for read in read2['insertions'] if read[0] < read1['POS'] + read1len] + # if str(read1_insertions) != str(read2_insertions): + # print("\n\nInsertions don't match") + # print(QNAME) + # print("insertions1: ", read1_insertions) + # print("insertions2: ", read2_insertions) + # if len(read1_insertions) != len(read2_insertions): + # print("Number of insertions doesn't match") + # else: + # for i in range(len(read1_insertions)): + # if read1_insertions[i][0] != read2_insertions[i][0]: + # print("Insertion index doesn't match") + # print(read1_insertions[i][0], read2_insertions[i][0], " = ", read1_insertions[i][0] - read2_insertions[i][0]) + # print("pos2 - pos1", read2['POS'] - read1['POS']) + # print("cigar1", read1['CIGAR']) + # print("cigar2", read2['CIGAR'])len(overlap_result) + + # if read1_insertions[i][1] != read2_insertions[i][1]: + # print("Insertion sequence doesn't match") + # print(read1_insertions[i][1], read2_insertions[i][1]) + overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:] + overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)] + + overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:] + overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)] + + # let's set the read1's version by default + overlap_result = list(overlap_read1) + + if len(overlap_result) and overlap_read1 != overlap_read2: + # print("", QNAME) + if len(overlap_read1) != len(overlap_read2): + print("overlaps don't match in size") + number_of_diffs = 0 + for i in range(len(overlap_read1)): + if overlap_read1[i] != overlap_read2[i]: + if overlap_qual1[i] == '-' and overlap_read2 != '-': + overlap_result[i] = overlap_read2[i] + if overlap_qual1[i] > overlap_qual2[i]: + overlap_result[i] = overlap_read2[i] + # print("diff in position ", i, ": ", overlap_read1[i], "/", overlap_read2[i]) + number_of_diffs += 1 + # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i]) + # print("read1pos", read1['POS']) + # print("read2pos", read2['POS']) + # print("diff", read2['POS'] - read1['POS']) + # print("read1len", read1len) + # print("gap", gaplen) + # print("\nread1") + # print(overlap_read1) + # print(overlap_qual1) + # print("\nread2") + # print(overlap_read2) + # print(overlap_qual2) + + # print("\nreconcilled") + # print("".join(overlap_result)) + + merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):] + + + + if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']: + raise Exception("Length mismatch") + + output_fasta.write(f">{QNAME}|{read1['POS']}\n{merged}\n") + + merged_insertions = read1['insertions'].copy() + insertion_index = read1['POS'] + read1len + merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index] + + output_insertions.write(f"{QNAME}\t{merged_insertions}\n") + + + + else: + unpaired[QNAME] = read + for id in unpaired: + output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n") + output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n") \ No newline at end of file diff --git a/scripts/dgicev/readAndSort.py b/scripts/dgicev/readAndSort.py new file mode 100644 index 0000000..efb7f9d --- /dev/null +++ b/scripts/dgicev/readAndSort.py @@ -0,0 +1,180 @@ +import sys +import re + +def parse_cigar(cigar): + pattern = re.compile(r'(\d+)([MIDNSHP=X])') + + parsed_cigar = pattern.findall(cigar) + + return [(op, int(length)) for length, op in parsed_cigar] + + +unpaired = dict() + +sequence_graph = dict() + +# have sequence_graph contain nodes forming clusters +# graph['79'] would be the starting node for all samples taken at position 79 +# from that point on treat it as a prefix tree +# will have to compare to reference sequence to get diff array + + +known_sequences = dict() + +with open('merged.fasta', 'w') as output_fasta, open('nuc_insertions.txt', 'w') as output_insertions, open('reference_genome.fasta', 'r') as reference_sequence_file: + reference_sequence = reference_sequence_file.readlines()[1] + # print("reference sequence: ", reference_sequence) + for line in sys.stdin: + if line.startswith('@'): + continue + + fields = line.strip().split('\t') + + QNAME = fields[0] # Query template NAME + FLAG = int(fields[1]) # bitwise FLAG + RNAME = fields[2] # Reference sequence NAME + POS = int(fields[3]) # 1-based leftmost mapping POSition + MAPQ = int(fields[4]) # MAPping Quality + CIGAR = parse_cigar(fields[5]) # CIGAR string + RNEXT = fields[6] # Ref. name of the mate/next read + PNEXT = int(fields[7]) # Position of the mate/next read + TLEN = int(fields[8]) # observed Template LENgth + SEQ = fields[9] # segment SEQuence + QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 + + result_sequence = '' + result_qual = '' + index = 0 + inserts = [] + + for operation in CIGAR: + type, count = operation + if type == 'S': + index += count + continue + if type == 'M': + result_sequence += SEQ[index:index + count] + result_qual += QUAL[index:index + count] + index += count + continue + if type == 'D': + result_sequence += '-' * count + result_qual += '!' * count + continue + if type == 'I': + inserts.append((index + POS, SEQ[index:index + count])) + index += count + continue + + read = { + # "QNAME": QNAME, + # "FLAG": FLAG, + # "RNAME": RNAME, + "POS": POS, + # "MAPQ": MAPQ, + "CIGAR": CIGAR, + # "RNEXT": RNEXT, + # "PNEXT": PNEXT, + # "TLEN": TLEN, + # "SEQ": SEQ, + # "QUAL": QUAL, + "RESULT_SEQUENCE": result_sequence, + "RESULT_QUAL": result_qual, + "insertions": inserts, + } + + if QNAME in unpaired: + read1 = unpaired.pop(QNAME) + read2 = read + + # print(read1) + # print(read2) + + if read1['POS'] > read2['POS']: + read1, read2 = read2, read1 + + index = read1['POS'] + read1len = len(read1['RESULT_SEQUENCE']) + merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])] + + gaplen = read1['POS'] + read1len - read2['POS'] + if gaplen < 0: + merged += 'N' * (-gaplen) + merged += read2['RESULT_SEQUENCE'] + else: + overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:] + overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)] + + overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:] + overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)] + + overlap_result = list(overlap_read1) + + if len(overlap_result) and overlap_read1 != overlap_read2: + # print("", QNAME) + if len(overlap_read1) != len(overlap_read2): + print("overlaps don't match in size") + number_of_diffs = 0 + for i in range(len(overlap_read1)): + if overlap_read1[i] != overlap_read2[i]: + if overlap_qual1[i] == '-' and overlap_read2 != '-': + overlap_result[i] = overlap_read2[i] + if overlap_qual1[i] > overlap_qual2[i]: + overlap_result[i] = overlap_read2[i] + # print("diff in position ", i, ": ", overlap_read1[i], "/", overlap_read2[i]) + number_of_diffs += 1 + # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i]) + + merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):] + + + + if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']: + raise Exception("Length mismatch") + + # output_fasta.write(f">{QNAME}|{read1['POS']}\n{merged}\n") + + merged_insertions = read1['insertions'].copy() + insertion_index = read1['POS'] + read1len + merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index] + + output_insertions.write(f"{QNAME}\t{merged_insertions}\n") + + # time to add it to the graph + reference_offset = read1['POS'] - 1 + + # for i in range(len(merged)): + # if merged[i] != reference_sequence[reference_offset + i]: + # # print("id: ", QNAME, "mutation at ", i, " from ", reference_sequence[reference_offset + i], " to ", merged[i]) + fingerprint = hash(merged) + if fingerprint in known_sequences: + groups = known_sequences[fingerprint] + found = False + for group in groups: + if group[0] == merged: + found = True + group[2].append(QNAME) + if not found: + known_sequences[fingerprint].append((merged, read1['POS'], [QNAME])) + + else: + known_sequences[fingerprint] = [(merged, read1['POS'], [QNAME])] + + else: + unpaired[QNAME] = read + + print("Number of unique sequences: ", sum([len(groups) for groups in known_sequences.values()])) + + flattened = [group for groups in known_sequences.values() for group in groups] + flattened.sort(key=lambda x: x[1]) + + sort_number_prefix = 0 + + for group in flattened: + for id in group[2]: + output_fasta.write(f">{sort_number_prefix:010}.{id}|{group[1]}\n{group[0]}\n") + sort_number_prefix += 1 + + for id in unpaired: + output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n") + output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n") \ No newline at end of file diff --git a/scripts/dgicev/read_outputsam.py b/scripts/dgicev/read_outputsam.py new file mode 100644 index 0000000..c186c25 --- /dev/null +++ b/scripts/dgicev/read_outputsam.py @@ -0,0 +1,185 @@ +import sys +import re + +def parse_cigar(cigar): + pattern = re.compile(r'(\d+)([MIDNSHP=X])') + + parsed_cigar = pattern.findall(cigar) + + return [(op, int(length)) for length, op in parsed_cigar] + + +unpaired = dict() + +with open('merged.sam', 'w') as output_sam: + for line in sys.stdin: + if line.startswith('@'): + continue + + fields = line.strip().split('\t') + + QNAME = fields[0] # Query template NAME + FLAG = int(fields[1]) # bitwise FLAG + RNAME = fields[2] # Reference sequence NAME + POS = int(fields[3]) # 1-based leftmost mapping POSition + MAPQ = int(fields[4]) # MAPping Quality + CIGAR = parse_cigar(fields[5]) # CIGAR string + RNEXT = fields[6] # Ref. name of the mate/next read + PNEXT = int(fields[7]) # Position of the mate/next read + TLEN = int(fields[8]) # observed Template LENgth + SEQ = fields[9] # segment SEQuence + QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 + + result_sequence = '' + result_qual = '' + index = 0 + inserts = [] + + for operation in CIGAR: + type, count = operation + if type == 'S': + index += count + continue + if type == 'M': + result_sequence += SEQ[index:index + count] + result_qual += QUAL[index:index + count] + index += count + continue + if type == 'D': + result_sequence += '-' * count + result_qual += '!' * count + continue + if type == 'I': + inserts.append((index + POS, SEQ[index:index + count])) + index += count + continue + + read = { + # "QNAME": QNAME, + # "FLAG": FLAG, + # "RNAME": RNAME, + "POS": POS, + # "MAPQ": MAPQ, + "CIGAR": CIGAR, + # "RNEXT": RNEXT, + # "PNEXT": PNEXT, + # "TLEN": TLEN, + # "SEQ": SEQ, + # "QUAL": QUAL, + "RESULT_SEQUENCE": result_sequence, + "RESULT_QUAL": result_qual, + "insertions": inserts, + } + + if QNAME in unpaired: + read1 = unpaired.pop(QNAME) + read2 = read + + # print(read1) + # print(read2) + + if read1['POS'] > read2['POS']: + read1, read2 = read2, read1 + + index = read1['POS'] + read1len = len(read1['RESULT_SEQUENCE']) + merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])] + merged_qual = read1['RESULT_QUAL'][:min(read1len, read2['POS'] - read1['POS'])] + + # do deletions cause a problem here? + gaplen = read1['POS'] + read1len - read2['POS'] + if gaplen < 0: + merged += 'N' * (-gaplen) + merged += read2['RESULT_SEQUENCE'] + merged_qual += 'C' * (-gaplen) + merged_qual += read2['RESULT_SEQUENCE'] + else: + # read1_insertions = [read for read in read1['insertions'] if read[0] >= read2['POS']] + # read2_insertions = [read for read in read2['insertions'] if read[0] < read1['POS'] + read1len] + # if str(read1_insertions) != str(read2_insertions): + # print("\n\nInsertions don't match") + # print(QNAME) + # print("insertions1: ", read1_insertions) + # print("insertions2: ", read2_insertions) + # if len(read1_insertions) != len(read2_insertions): + # print("Number of insertions doesn't match") + # else: + # for i in range(len(read1_insertions)): + # if read1_insertions[i][0] != read2_insertions[i][0]: + # print("Insertion index doesn't match") + # print(read1_insertions[i][0], read2_insertions[i][0], " = ", read1_insertions[i][0] - read2_insertions[i][0]) + # print("pos2 - pos1", read2['POS'] - read1['POS']) + # print("cigar1", read1['CIGAR']) + # print("cigar2", read2['CIGAR'])len(overlap_result) + + # if read1_insertions[i][1] != read2_insertions[i][1]: + # print("Insertion sequence doesn't match") + # print(read1_insertions[i][1], read2_insertions[i][1]) + overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:] + overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)] + + overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:] + overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)] + + # let's set the read1's version by default + overlap_result = list(overlap_read1) + overlap_qual = list(overlap_qual1) + + if len(overlap_result) and overlap_read1 != overlap_read2: + # print("", QNAME) + if len(overlap_read1) != len(overlap_read2): + print("overlaps don't match in size") + number_of_diffs = 0 + for i in range(len(overlap_read1)): + if overlap_read1[i] != overlap_read2[i]: + if overlap_qual1[i] == '-' and overlap_read2 != '-': + overlap_result[i] = overlap_read2[i] + overlap_qual[i] = overlap_qual[i] + + if overlap_qual1[i] > overlap_qual2[i]: + overlap_result[i] = overlap_read2[i] + overlap_qual[i] = overlap_qual[i] + # print("diff in position ", i, ": ", overlap_read1[i], "/", overlap_read2[i]) + number_of_diffs += 1 + # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i]) + # print("read1pos", read1['POS']) + # print("read2pos", read2['POS']) + # print("diff", read2['POS'] - read1['POS']) + # print("read1len", read1len) + # print("gap", gaplen) + # print("\nread1") + # print(overlap_read1) + # print(overlap_qual1) + # print("\nread2") + # print(overlap_read2) + # print(overlap_qual2) + + # print("\nreconcilled") + # print("".join(overlap_result)) + + merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):] + merged_qual += "".join(overlap_qual) + read2['RESULT_QUAL'][max(0, gaplen):] + + + + if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']: + raise Exception("Length mismatch") + + flag = 0 + output_cigar = str(len(merged)) + "M" + + output_sam.write(f"{QNAME}\t{flag}\t{RNAME}\t{read1['POS']}\t{MAPQ}\t{output_cigar}\t*\t0\t{abs(TLEN)}\t{merged}\t{merged_qual}\n") + + merged_insertions = read1['insertions'].copy() + insertion_index = read1['POS'] + read1len + merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index] + + # output_insertions.write(f"{QNAME}\t{merged_insertions}\n") + + + + else: + unpaired[QNAME] = read + # for id in unpaired: + # output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n") + # output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n") \ No newline at end of file From 95c6e696ae72b4e7d8d58c30b220899973aa4a30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= Date: Thu, 31 Oct 2024 10:58:17 +0000 Subject: [PATCH 02/17] add credits --- scripts/dgicev/add_padding.py | 6 ++ scripts/dgicev/read.py | 117 ++++++++++++++-------------- scripts/dgicev/readAndSort.py | 127 ++++++++++++++++-------------- scripts/dgicev/read_outputsam.py | 129 ++++++++++++++++--------------- 4 files changed, 200 insertions(+), 179 deletions(-) diff --git a/scripts/dgicev/add_padding.py b/scripts/dgicev/add_padding.py index 05e8a9d..a42b289 100644 --- a/scripts/dgicev/add_padding.py +++ b/scripts/dgicev/add_padding.py @@ -1,3 +1,9 @@ +""" +Author: David Gicev (@davidgicev / david.gicev@gmail.com) +Supervisor:Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch) +Date: 2024-10-30 +""" + import sys diff --git a/scripts/dgicev/read.py b/scripts/dgicev/read.py index 7918670..1968c04 100644 --- a/scripts/dgicev/read.py +++ b/scripts/dgicev/read.py @@ -1,56 +1,63 @@ +""" +Author: David Gicev (@davidgicev / david.gicev@gmail.com) +Supervisor:Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch) +Date: 2024-10-30 +""" + import sys import re + def parse_cigar(cigar): - pattern = re.compile(r'(\d+)([MIDNSHP=X])') - + pattern = re.compile(r"(\d+)([MIDNSHP=X])") + parsed_cigar = pattern.findall(cigar) - + return [(op, int(length)) for length, op in parsed_cigar] unpaired = dict() -with open('merged.fasta', 'w') as output_fasta, open('nuc_insertions.txt', 'w') as output_insertions: +with open("merged.fasta", "w") as output_fasta, open("nuc_insertions.txt", "w") as output_insertions: for line in sys.stdin: - if line.startswith('@'): + if line.startswith("@"): continue - - fields = line.strip().split('\t') - - QNAME = fields[0] # Query template NAME - FLAG = int(fields[1]) # bitwise FLAG - RNAME = fields[2] # Reference sequence NAME - POS = int(fields[3]) # 1-based leftmost mapping POSition - MAPQ = int(fields[4]) # MAPping Quality - CIGAR = parse_cigar(fields[5]) # CIGAR string - RNEXT = fields[6] # Ref. name of the mate/next read - PNEXT = int(fields[7]) # Position of the mate/next read - TLEN = int(fields[8]) # observed Template LENgth - SEQ = fields[9] # segment SEQuence - QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 - - result_sequence = '' - result_qual = '' + + fields = line.strip().split("\t") + + QNAME = fields[0] # Query template NAME + FLAG = int(fields[1]) # bitwise FLAG + RNAME = fields[2] # Reference sequence NAME + POS = int(fields[3]) # 1-based leftmost mapping POSition + MAPQ = int(fields[4]) # MAPping Quality + CIGAR = parse_cigar(fields[5]) # CIGAR string + RNEXT = fields[6] # Ref. name of the mate/next read + PNEXT = int(fields[7]) # Position of the mate/next read + TLEN = int(fields[8]) # observed Template LENgth + SEQ = fields[9] # segment SEQuence + QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 + + result_sequence = "" + result_qual = "" index = 0 inserts = [] for operation in CIGAR: type, count = operation - if type == 'S': + if type == "S": index += count continue - if type == 'M': - result_sequence += SEQ[index:index + count] - result_qual += QUAL[index:index + count] + if type == "M": + result_sequence += SEQ[index : index + count] + result_qual += QUAL[index : index + count] index += count continue - if type == 'D': - result_sequence += '-' * count - result_qual += '!' * count + if type == "D": + result_sequence += "-" * count + result_qual += "!" * count continue - if type == 'I': - inserts.append((index + POS, SEQ[index:index + count])) + if type == "I": + inserts.append((index + POS, SEQ[index : index + count])) index += count continue @@ -77,19 +84,19 @@ def parse_cigar(cigar): # print(read1) # print(read2) - - if read1['POS'] > read2['POS']: + + if read1["POS"] > read2["POS"]: read1, read2 = read2, read1 - - index = read1['POS'] - read1len = len(read1['RESULT_SEQUENCE']) - merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])] + + index = read1["POS"] + read1len = len(read1["RESULT_SEQUENCE"]) + merged = read1["RESULT_SEQUENCE"][: min(read1len, read2["POS"] - read1["POS"])] # do deletions cause a problem here? - gaplen = read1['POS'] + read1len - read2['POS'] + gaplen = read1["POS"] + read1len - read2["POS"] if gaplen < 0: - merged += 'N' * (-gaplen) - merged += read2['RESULT_SEQUENCE'] + merged += "N" * (-gaplen) + merged += read2["RESULT_SEQUENCE"] else: # read1_insertions = [read for read in read1['insertions'] if read[0] >= read2['POS']] # read2_insertions = [read for read in read2['insertions'] if read[0] < read1['POS'] + read1len] @@ -107,16 +114,16 @@ def parse_cigar(cigar): # print(read1_insertions[i][0], read2_insertions[i][0], " = ", read1_insertions[i][0] - read2_insertions[i][0]) # print("pos2 - pos1", read2['POS'] - read1['POS']) # print("cigar1", read1['CIGAR']) - # print("cigar2", read2['CIGAR'])len(overlap_result) + # print("cigar2", read2['CIGAR'])len(overlap_result) # if read1_insertions[i][1] != read2_insertions[i][1]: # print("Insertion sequence doesn't match") # print(read1_insertions[i][1], read2_insertions[i][1]) - overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:] - overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)] + overlap_read1 = read1["RESULT_SEQUENCE"][read2["POS"] - read1["POS"] :] + overlap_read2 = read2["RESULT_SEQUENCE"][0 : max(0, gaplen)] - overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:] - overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)] + overlap_qual1 = read1["RESULT_QUAL"][read2["POS"] - read1["POS"] :] + overlap_qual2 = read2["RESULT_QUAL"][0 : max(0, gaplen)] # let's set the read1's version by default overlap_result = list(overlap_read1) @@ -128,7 +135,7 @@ def parse_cigar(cigar): number_of_diffs = 0 for i in range(len(overlap_read1)): if overlap_read1[i] != overlap_read2[i]: - if overlap_qual1[i] == '-' and overlap_read2 != '-': + if overlap_qual1[i] == "-" and overlap_read2 != "-": overlap_result[i] = overlap_read2[i] if overlap_qual1[i] > overlap_qual2[i]: overlap_result[i] = overlap_read2[i] @@ -150,25 +157,21 @@ def parse_cigar(cigar): # print("\nreconcilled") # print("".join(overlap_result)) - merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):] - - + merged += "".join(overlap_result) + read2["RESULT_SEQUENCE"][max(0, gaplen) :] - if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']: + if len(merged) != read2["POS"] + len(read2["RESULT_SEQUENCE"]) - read1["POS"]: raise Exception("Length mismatch") - + output_fasta.write(f">{QNAME}|{read1['POS']}\n{merged}\n") - merged_insertions = read1['insertions'].copy() - insertion_index = read1['POS'] + read1len - merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index] + merged_insertions = read1["insertions"].copy() + insertion_index = read1["POS"] + read1len + merged_insertions += [insert for insert in read2["insertions"] if insert[0] > insertion_index] output_insertions.write(f"{QNAME}\t{merged_insertions}\n") - - else: unpaired[QNAME] = read for id in unpaired: output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n") - output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n") \ No newline at end of file + output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n") diff --git a/scripts/dgicev/readAndSort.py b/scripts/dgicev/readAndSort.py index efb7f9d..4754c57 100644 --- a/scripts/dgicev/readAndSort.py +++ b/scripts/dgicev/readAndSort.py @@ -1,11 +1,18 @@ +""" +Author: David Gicev (@davidgicev / david.gicev@gmail.com) +Supervisor:Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch) +Date: 2024-10-30 +""" + import sys import re + def parse_cigar(cigar): - pattern = re.compile(r'(\d+)([MIDNSHP=X])') - + pattern = re.compile(r"(\d+)([MIDNSHP=X])") + parsed_cigar = pattern.findall(cigar) - + return [(op, int(length)) for length, op in parsed_cigar] @@ -21,48 +28,50 @@ def parse_cigar(cigar): known_sequences = dict() -with open('merged.fasta', 'w') as output_fasta, open('nuc_insertions.txt', 'w') as output_insertions, open('reference_genome.fasta', 'r') as reference_sequence_file: +with open("merged.fasta", "w") as output_fasta, open("nuc_insertions.txt", "w") as output_insertions, open( + "reference_genome.fasta", "r" +) as reference_sequence_file: reference_sequence = reference_sequence_file.readlines()[1] # print("reference sequence: ", reference_sequence) for line in sys.stdin: - if line.startswith('@'): + if line.startswith("@"): continue - - fields = line.strip().split('\t') - - QNAME = fields[0] # Query template NAME - FLAG = int(fields[1]) # bitwise FLAG - RNAME = fields[2] # Reference sequence NAME - POS = int(fields[3]) # 1-based leftmost mapping POSition - MAPQ = int(fields[4]) # MAPping Quality - CIGAR = parse_cigar(fields[5]) # CIGAR string - RNEXT = fields[6] # Ref. name of the mate/next read - PNEXT = int(fields[7]) # Position of the mate/next read - TLEN = int(fields[8]) # observed Template LENgth - SEQ = fields[9] # segment SEQuence - QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 - - result_sequence = '' - result_qual = '' + + fields = line.strip().split("\t") + + QNAME = fields[0] # Query template NAME + FLAG = int(fields[1]) # bitwise FLAG + RNAME = fields[2] # Reference sequence NAME + POS = int(fields[3]) # 1-based leftmost mapping POSition + MAPQ = int(fields[4]) # MAPping Quality + CIGAR = parse_cigar(fields[5]) # CIGAR string + RNEXT = fields[6] # Ref. name of the mate/next read + PNEXT = int(fields[7]) # Position of the mate/next read + TLEN = int(fields[8]) # observed Template LENgth + SEQ = fields[9] # segment SEQuence + QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 + + result_sequence = "" + result_qual = "" index = 0 inserts = [] for operation in CIGAR: type, count = operation - if type == 'S': + if type == "S": index += count continue - if type == 'M': - result_sequence += SEQ[index:index + count] - result_qual += QUAL[index:index + count] + if type == "M": + result_sequence += SEQ[index : index + count] + result_qual += QUAL[index : index + count] index += count continue - if type == 'D': - result_sequence += '-' * count - result_qual += '!' * count + if type == "D": + result_sequence += "-" * count + result_qual += "!" * count continue - if type == 'I': - inserts.append((index + POS, SEQ[index:index + count])) + if type == "I": + inserts.append((index + POS, SEQ[index : index + count])) index += count continue @@ -89,24 +98,24 @@ def parse_cigar(cigar): # print(read1) # print(read2) - - if read1['POS'] > read2['POS']: + + if read1["POS"] > read2["POS"]: read1, read2 = read2, read1 - - index = read1['POS'] - read1len = len(read1['RESULT_SEQUENCE']) - merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])] - gaplen = read1['POS'] + read1len - read2['POS'] + index = read1["POS"] + read1len = len(read1["RESULT_SEQUENCE"]) + merged = read1["RESULT_SEQUENCE"][: min(read1len, read2["POS"] - read1["POS"])] + + gaplen = read1["POS"] + read1len - read2["POS"] if gaplen < 0: - merged += 'N' * (-gaplen) - merged += read2['RESULT_SEQUENCE'] + merged += "N" * (-gaplen) + merged += read2["RESULT_SEQUENCE"] else: - overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:] - overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)] + overlap_read1 = read1["RESULT_SEQUENCE"][read2["POS"] - read1["POS"] :] + overlap_read2 = read2["RESULT_SEQUENCE"][0 : max(0, gaplen)] - overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:] - overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)] + overlap_qual1 = read1["RESULT_QUAL"][read2["POS"] - read1["POS"] :] + overlap_qual2 = read2["RESULT_QUAL"][0 : max(0, gaplen)] overlap_result = list(overlap_read1) @@ -117,7 +126,7 @@ def parse_cigar(cigar): number_of_diffs = 0 for i in range(len(overlap_read1)): if overlap_read1[i] != overlap_read2[i]: - if overlap_qual1[i] == '-' and overlap_read2 != '-': + if overlap_qual1[i] == "-" and overlap_read2 != "-": overlap_result[i] = overlap_read2[i] if overlap_qual1[i] > overlap_qual2[i]: overlap_result[i] = overlap_read2[i] @@ -125,27 +134,25 @@ def parse_cigar(cigar): number_of_diffs += 1 # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i]) - merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):] - - + merged += "".join(overlap_result) + read2["RESULT_SEQUENCE"][max(0, gaplen) :] - if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']: + if len(merged) != read2["POS"] + len(read2["RESULT_SEQUENCE"]) - read1["POS"]: raise Exception("Length mismatch") - + # output_fasta.write(f">{QNAME}|{read1['POS']}\n{merged}\n") - merged_insertions = read1['insertions'].copy() - insertion_index = read1['POS'] + read1len - merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index] + merged_insertions = read1["insertions"].copy() + insertion_index = read1["POS"] + read1len + merged_insertions += [insert for insert in read2["insertions"] if insert[0] > insertion_index] output_insertions.write(f"{QNAME}\t{merged_insertions}\n") # time to add it to the graph - reference_offset = read1['POS'] - 1 + reference_offset = read1["POS"] - 1 # for i in range(len(merged)): - # if merged[i] != reference_sequence[reference_offset + i]: - # # print("id: ", QNAME, "mutation at ", i, " from ", reference_sequence[reference_offset + i], " to ", merged[i]) + # if merged[i] != reference_sequence[reference_offset + i]: + # # print("id: ", QNAME, "mutation at ", i, " from ", reference_sequence[reference_offset + i], " to ", merged[i]) fingerprint = hash(merged) if fingerprint in known_sequences: groups = known_sequences[fingerprint] @@ -155,10 +162,10 @@ def parse_cigar(cigar): found = True group[2].append(QNAME) if not found: - known_sequences[fingerprint].append((merged, read1['POS'], [QNAME])) - + known_sequences[fingerprint].append((merged, read1["POS"], [QNAME])) + else: - known_sequences[fingerprint] = [(merged, read1['POS'], [QNAME])] + known_sequences[fingerprint] = [(merged, read1["POS"], [QNAME])] else: unpaired[QNAME] = read @@ -177,4 +184,4 @@ def parse_cigar(cigar): for id in unpaired: output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n") - output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n") \ No newline at end of file + output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n") diff --git a/scripts/dgicev/read_outputsam.py b/scripts/dgicev/read_outputsam.py index c186c25..f8ac9de 100644 --- a/scripts/dgicev/read_outputsam.py +++ b/scripts/dgicev/read_outputsam.py @@ -1,56 +1,63 @@ +""" +Author: David Gicev (@davidgicev / david.gicev@gmail.com) +Supervisor:Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch) +Date: 2024-10-30 +""" + import sys import re + def parse_cigar(cigar): - pattern = re.compile(r'(\d+)([MIDNSHP=X])') - + pattern = re.compile(r"(\d+)([MIDNSHP=X])") + parsed_cigar = pattern.findall(cigar) - + return [(op, int(length)) for length, op in parsed_cigar] unpaired = dict() -with open('merged.sam', 'w') as output_sam: +with open("merged.sam", "w") as output_sam: for line in sys.stdin: - if line.startswith('@'): + if line.startswith("@"): continue - - fields = line.strip().split('\t') - - QNAME = fields[0] # Query template NAME - FLAG = int(fields[1]) # bitwise FLAG - RNAME = fields[2] # Reference sequence NAME - POS = int(fields[3]) # 1-based leftmost mapping POSition - MAPQ = int(fields[4]) # MAPping Quality - CIGAR = parse_cigar(fields[5]) # CIGAR string - RNEXT = fields[6] # Ref. name of the mate/next read - PNEXT = int(fields[7]) # Position of the mate/next read - TLEN = int(fields[8]) # observed Template LENgth - SEQ = fields[9] # segment SEQuence - QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 - - result_sequence = '' - result_qual = '' + + fields = line.strip().split("\t") + + QNAME = fields[0] # Query template NAME + FLAG = int(fields[1]) # bitwise FLAG + RNAME = fields[2] # Reference sequence NAME + POS = int(fields[3]) # 1-based leftmost mapping POSition + MAPQ = int(fields[4]) # MAPping Quality + CIGAR = parse_cigar(fields[5]) # CIGAR string + RNEXT = fields[6] # Ref. name of the mate/next read + PNEXT = int(fields[7]) # Position of the mate/next read + TLEN = int(fields[8]) # observed Template LENgth + SEQ = fields[9] # segment SEQuence + QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 + + result_sequence = "" + result_qual = "" index = 0 inserts = [] for operation in CIGAR: type, count = operation - if type == 'S': + if type == "S": index += count continue - if type == 'M': - result_sequence += SEQ[index:index + count] - result_qual += QUAL[index:index + count] + if type == "M": + result_sequence += SEQ[index : index + count] + result_qual += QUAL[index : index + count] index += count continue - if type == 'D': - result_sequence += '-' * count - result_qual += '!' * count + if type == "D": + result_sequence += "-" * count + result_qual += "!" * count continue - if type == 'I': - inserts.append((index + POS, SEQ[index:index + count])) + if type == "I": + inserts.append((index + POS, SEQ[index : index + count])) index += count continue @@ -77,22 +84,22 @@ def parse_cigar(cigar): # print(read1) # print(read2) - - if read1['POS'] > read2['POS']: + + if read1["POS"] > read2["POS"]: read1, read2 = read2, read1 - - index = read1['POS'] - read1len = len(read1['RESULT_SEQUENCE']) - merged = read1['RESULT_SEQUENCE'][:min(read1len, read2['POS'] - read1['POS'])] - merged_qual = read1['RESULT_QUAL'][:min(read1len, read2['POS'] - read1['POS'])] + + index = read1["POS"] + read1len = len(read1["RESULT_SEQUENCE"]) + merged = read1["RESULT_SEQUENCE"][: min(read1len, read2["POS"] - read1["POS"])] + merged_qual = read1["RESULT_QUAL"][: min(read1len, read2["POS"] - read1["POS"])] # do deletions cause a problem here? - gaplen = read1['POS'] + read1len - read2['POS'] + gaplen = read1["POS"] + read1len - read2["POS"] if gaplen < 0: - merged += 'N' * (-gaplen) - merged += read2['RESULT_SEQUENCE'] - merged_qual += 'C' * (-gaplen) - merged_qual += read2['RESULT_SEQUENCE'] + merged += "N" * (-gaplen) + merged += read2["RESULT_SEQUENCE"] + merged_qual += "C" * (-gaplen) + merged_qual += read2["RESULT_SEQUENCE"] else: # read1_insertions = [read for read in read1['insertions'] if read[0] >= read2['POS']] # read2_insertions = [read for read in read2['insertions'] if read[0] < read1['POS'] + read1len] @@ -110,16 +117,16 @@ def parse_cigar(cigar): # print(read1_insertions[i][0], read2_insertions[i][0], " = ", read1_insertions[i][0] - read2_insertions[i][0]) # print("pos2 - pos1", read2['POS'] - read1['POS']) # print("cigar1", read1['CIGAR']) - # print("cigar2", read2['CIGAR'])len(overlap_result) + # print("cigar2", read2['CIGAR'])len(overlap_result) # if read1_insertions[i][1] != read2_insertions[i][1]: # print("Insertion sequence doesn't match") # print(read1_insertions[i][1], read2_insertions[i][1]) - overlap_read1 = read1['RESULT_SEQUENCE'][read2['POS'] - read1['POS']:] - overlap_read2 = read2['RESULT_SEQUENCE'][0: max(0, gaplen)] + overlap_read1 = read1["RESULT_SEQUENCE"][read2["POS"] - read1["POS"] :] + overlap_read2 = read2["RESULT_SEQUENCE"][0 : max(0, gaplen)] - overlap_qual1 = read1['RESULT_QUAL'][read2['POS'] - read1['POS']:] - overlap_qual2 = read2['RESULT_QUAL'][0: max(0, gaplen)] + overlap_qual1 = read1["RESULT_QUAL"][read2["POS"] - read1["POS"] :] + overlap_qual2 = read2["RESULT_QUAL"][0 : max(0, gaplen)] # let's set the read1's version by default overlap_result = list(overlap_read1) @@ -132,7 +139,7 @@ def parse_cigar(cigar): number_of_diffs = 0 for i in range(len(overlap_read1)): if overlap_read1[i] != overlap_read2[i]: - if overlap_qual1[i] == '-' and overlap_read2 != '-': + if overlap_qual1[i] == "-" and overlap_read2 != "-": overlap_result[i] = overlap_read2[i] overlap_qual[i] = overlap_qual[i] @@ -157,29 +164,27 @@ def parse_cigar(cigar): # print("\nreconcilled") # print("".join(overlap_result)) - merged += "".join(overlap_result) + read2['RESULT_SEQUENCE'][max(0, gaplen):] - merged_qual += "".join(overlap_qual) + read2['RESULT_QUAL'][max(0, gaplen):] - - + merged += "".join(overlap_result) + read2["RESULT_SEQUENCE"][max(0, gaplen) :] + merged_qual += "".join(overlap_qual) + read2["RESULT_QUAL"][max(0, gaplen) :] - if len(merged) != read2['POS'] + len(read2['RESULT_SEQUENCE']) - read1['POS']: + if len(merged) != read2["POS"] + len(read2["RESULT_SEQUENCE"]) - read1["POS"]: raise Exception("Length mismatch") - + flag = 0 output_cigar = str(len(merged)) + "M" - output_sam.write(f"{QNAME}\t{flag}\t{RNAME}\t{read1['POS']}\t{MAPQ}\t{output_cigar}\t*\t0\t{abs(TLEN)}\t{merged}\t{merged_qual}\n") + output_sam.write( + f"{QNAME}\t{flag}\t{RNAME}\t{read1['POS']}\t{MAPQ}\t{output_cigar}\t*\t0\t{abs(TLEN)}\t{merged}\t{merged_qual}\n" + ) - merged_insertions = read1['insertions'].copy() - insertion_index = read1['POS'] + read1len - merged_insertions += [insert for insert in read2['insertions'] if insert[0] > insertion_index] + merged_insertions = read1["insertions"].copy() + insertion_index = read1["POS"] + read1len + merged_insertions += [insert for insert in read2["insertions"] if insert[0] > insertion_index] # output_insertions.write(f"{QNAME}\t{merged_insertions}\n") - - else: unpaired[QNAME] = read # for id in unpaired: # output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n") - # output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n") \ No newline at end of file + # output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n") From d114254b021644343376b9708c126bd73db02774 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 09:01:33 +0000 Subject: [PATCH 03/17] fix: bump saadmk11/github-actions-version-updater from 0.7.4 to 0.8.1 Bumps [saadmk11/github-actions-version-updater](https://github.com/saadmk11/github-actions-version-updater) from 0.7.4 to 0.8.1. - [Release notes](https://github.com/saadmk11/github-actions-version-updater/releases) - [Changelog](https://github.com/saadmk11/github-actions-version-updater/blob/main/CHANGELOG.md) - [Commits](https://github.com/saadmk11/github-actions-version-updater/compare/v0.7.4...v0.8.1) --- updated-dependencies: - dependency-name: saadmk11/github-actions-version-updater dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/schedule-update-actions.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/schedule-update-actions.yml b/.github/workflows/schedule-update-actions.yml index f4c30b6..4773301 100644 --- a/.github/workflows/schedule-update-actions.yml +++ b/.github/workflows/schedule-update-actions.yml @@ -18,7 +18,7 @@ jobs: token: ${{ secrets.PAT }} - name: Run GitHub Actions Version Updater - uses: saadmk11/github-actions-version-updater@v0.7.4 + uses: saadmk11/github-actions-version-updater@v0.8.1 with: # [Required] Access token with `workflow` scope. token: ${{ secrets.PAT }} From 92301a667af929aeee576abfa8e66459f89fbc5d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 09:01:35 +0000 Subject: [PATCH 04/17] fix: bump amannn/action-semantic-pull-request from 5.2.0 to 5.5.3 Bumps [amannn/action-semantic-pull-request](https://github.com/amannn/action-semantic-pull-request) from 5.2.0 to 5.5.3. - [Release notes](https://github.com/amannn/action-semantic-pull-request/releases) - [Changelog](https://github.com/amannn/action-semantic-pull-request/blob/main/CHANGELOG.md) - [Commits](https://github.com/amannn/action-semantic-pull-request/compare/v5.2.0...v5.5.3) --- updated-dependencies: - dependency-name: amannn/action-semantic-pull-request dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/semantic-pr-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/semantic-pr-check.yml b/.github/workflows/semantic-pr-check.yml index 3a1158d..7fdd599 100644 --- a/.github/workflows/semantic-pr-check.yml +++ b/.github/workflows/semantic-pr-check.yml @@ -12,6 +12,6 @@ jobs: name: Validate PR title runs-on: ubuntu-latest steps: - - uses: amannn/action-semantic-pull-request@v5.2.0 + - uses: amannn/action-semantic-pull-request@v5.5.3 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 240eaf908f4861e078134b88d9436b79dcbe4f80 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 09:01:38 +0000 Subject: [PATCH 05/17] fix: bump microsoft/action-python from 0.6.4 to 0.7.3 Bumps [microsoft/action-python](https://github.com/microsoft/action-python) from 0.6.4 to 0.7.3. - [Release notes](https://github.com/microsoft/action-python/releases) - [Commits](https://github.com/microsoft/action-python/compare/0.6.4...0.7.3) --- updated-dependencies: - dependency-name: microsoft/action-python dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/CI.yml | 4 ++-- .github/workflows/publish.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index eb8ae3e..c94f448 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -10,12 +10,12 @@ on: jobs: validation: - uses: microsoft/action-python/.github/workflows/validation.yml@0.6.4 + uses: microsoft/action-python/.github/workflows/validation.yml@0.7.3 with: workdir: '.' publish: - uses: microsoft/action-python/.github/workflows/publish.yml@0.6.4 + uses: microsoft/action-python/.github/workflows/publish.yml@0.7.3 secrets: PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} TEST_PYPI_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 9d71560..0264afd 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -4,7 +4,7 @@ on: jobs: publish: - uses: microsoft/action-python/.github/workflows/publish.yml@0.6.4 + uses: microsoft/action-python/.github/workflows/publish.yml@0.7.3 secrets: PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} TEST_PYPI_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }} From 73078c27ae2a21e147c426da02527374bf3b7e99 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 09:01:40 +0000 Subject: [PATCH 06/17] fix: bump actions/checkout from 3.5.2 to 4.2.2 Bumps [actions/checkout](https://github.com/actions/checkout) from 3.5.2 to 4.2.2. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3.5.2...v4.2.2) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/schedule-update-actions.yml | 2 +- .github/workflows/template-sync.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/schedule-update-actions.yml b/.github/workflows/schedule-update-actions.yml index 4773301..fb6c720 100644 --- a/.github/workflows/schedule-update-actions.yml +++ b/.github/workflows/schedule-update-actions.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3.5.2 + - uses: actions/checkout@v4.2.2 with: # [Required] Access token with `workflow` scope. token: ${{ secrets.PAT }} diff --git a/.github/workflows/template-sync.yml b/.github/workflows/template-sync.yml index 49666bc..d46d002 100644 --- a/.github/workflows/template-sync.yml +++ b/.github/workflows/template-sync.yml @@ -5,7 +5,7 @@ jobs: sync: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3.5.2 # important! + - uses: actions/checkout@v4.2.2 # important! - uses: euphoricsystems/action-sync-template-repository@v2.5.1 with: github-token: ${{ secrets.GITHUB_TOKEN }} From ef6927a73aeaef72ee7081775843388e23e256ee Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 09:01:45 +0000 Subject: [PATCH 07/17] fix: bump pytest-cov from 4.0.0 to 6.0.0 Bumps [pytest-cov](https://github.com/pytest-dev/pytest-cov) from 4.0.0 to 6.0.0. - [Changelog](https://github.com/pytest-dev/pytest-cov/blob/master/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest-cov/compare/v4.0.0...v6.0.0) --- updated-dependencies: - dependency-name: pytest-cov dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9e77fc7..98110f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ test = [ "pre-commit==3.3.1", "pylint==2.17.4", "pylint_junit", - "pytest-cov==4.0.0", + "pytest-cov==6.0.0", "pytest-mock<3.10.1", "pytest-runner", "pytest==7.3.1", From 01c9817bb48b9cda9c45803601eaaf32fba20d43 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 09:01:50 +0000 Subject: [PATCH 08/17] fix: bump black from 23.3.0 to 24.10.0 Bumps [black](https://github.com/psf/black) from 23.3.0 to 24.10.0. - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/23.3.0...24.10.0) --- updated-dependencies: - dependency-name: black dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 98110f3..8bda9e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ spark = [ ] test = [ "bandit[toml]==1.7.5", - "black==23.3.0", + "black==24.10.0", "check-manifest==0.49", "flake8-bugbear==23.5.9", "flake8-docstrings", From 91b4dc2969aa0e041da89727b26f4178633a70fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:07:28 +0000 Subject: [PATCH 09/17] fix: bump check-manifest from 0.49 to 0.50 Bumps [check-manifest](https://github.com/mgedmin/check-manifest) from 0.49 to 0.50. - [Changelog](https://github.com/mgedmin/check-manifest/blob/master/CHANGES.rst) - [Commits](https://github.com/mgedmin/check-manifest/compare/0.49...0.50) --- updated-dependencies: - dependency-name: check-manifest dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8bda9e3..4d864f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ spark = [ test = [ "bandit[toml]==1.7.5", "black==24.10.0", - "check-manifest==0.49", + "check-manifest==0.50", "flake8-bugbear==23.5.9", "flake8-docstrings", "flake8-formatter_junit_xml", From 085f288308107263149f804b0000af8a4e5eb28a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:07:32 +0000 Subject: [PATCH 10/17] fix: bump bandit[toml] from 1.7.5 to 1.7.10 Bumps [bandit[toml]](https://github.com/PyCQA/bandit) from 1.7.5 to 1.7.10. - [Release notes](https://github.com/PyCQA/bandit/releases) - [Commits](https://github.com/PyCQA/bandit/compare/1.7.5...1.7.10) --- updated-dependencies: - dependency-name: bandit[toml] dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4d864f7..4bf6cb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ spark = [ "pyspark>=3.0.0" ] test = [ - "bandit[toml]==1.7.5", + "bandit[toml]==1.7.10", "black==24.10.0", "check-manifest==0.50", "flake8-bugbear==23.5.9", From 4d77c12a390593acfc00623a33d6fb5454349868 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:07:00 +0000 Subject: [PATCH 11/17] fix: bump pre-commit from 3.3.1 to 4.0.1 Bumps [pre-commit](https://github.com/pre-commit/pre-commit) from 3.3.1 to 4.0.1. - [Release notes](https://github.com/pre-commit/pre-commit/releases) - [Changelog](https://github.com/pre-commit/pre-commit/blob/main/CHANGELOG.md) - [Commits](https://github.com/pre-commit/pre-commit/compare/v3.3.1...v4.0.1) --- updated-dependencies: - dependency-name: pre-commit dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4bf6cb1..6f6172a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ test = [ "flake8-formatter_junit_xml", "flake8", "flake8-pyproject", - "pre-commit==3.3.1", + "pre-commit==4.0.1", "pylint==2.17.4", "pylint_junit", "pytest-cov==6.0.0", From 906fd47b74244048d9ff28e6422f0ac887d93da5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:07:26 +0000 Subject: [PATCH 12/17] fix: bump pytest from 7.3.1 to 8.3.3 Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.3.1 to 8.3.3. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.3.1...8.3.3) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6f6172a..d828afc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ test = [ "pytest-cov==6.0.0", "pytest-mock<3.10.1", "pytest-runner", - "pytest==7.3.1", + "pytest==8.3.3", "pytest-github-actions-annotate-failures", "shellcheck-py==0.9.0.2" ] From 99d6efad1a3a4f966115b23dde4fb7ab0f831509 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:07:19 +0000 Subject: [PATCH 13/17] fix: update pytest-mock requirement from <3.10.1 to <3.14.1 Updates the requirements on [pytest-mock](https://github.com/pytest-dev/pytest-mock) to permit the latest version. - [Release notes](https://github.com/pytest-dev/pytest-mock/releases) - [Changelog](https://github.com/pytest-dev/pytest-mock/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest-mock/compare/v0.1.0...v3.14.0) --- updated-dependencies: - dependency-name: pytest-mock dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d828afc..466feb7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ test = [ "pylint==2.17.4", "pylint_junit", "pytest-cov==6.0.0", - "pytest-mock<3.10.1", + "pytest-mock<3.14.1", "pytest-runner", "pytest==8.3.3", "pytest-github-actions-annotate-failures", From 7906bdead132df4de9403c2c62d8c1ca7c52c0ab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 12:09:23 +0000 Subject: [PATCH 14/17] fix: bump shellcheck-py from 0.9.0.2 to 0.10.0.1 Bumps [shellcheck-py](https://github.com/ryanrhee/shellcheck-py) from 0.9.0.2 to 0.10.0.1. - [Commits](https://github.com/ryanrhee/shellcheck-py/compare/v0.9.0.2...v0.10.0.1) --- updated-dependencies: - dependency-name: shellcheck-py dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 466feb7..f0924aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ test = [ "pytest-runner", "pytest==8.3.3", "pytest-github-actions-annotate-failures", - "shellcheck-py==0.9.0.2" + "shellcheck-py==0.10.0.1" ] [project.urls] From 35ef6c7c113c042cb7d4412d567bfb17ae3c5ce5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 12:08:59 +0000 Subject: [PATCH 15/17] fix: bump pylint from 2.17.4 to 3.3.1 Bumps [pylint](https://github.com/pylint-dev/pylint) from 2.17.4 to 3.3.1. - [Release notes](https://github.com/pylint-dev/pylint/releases) - [Commits](https://github.com/pylint-dev/pylint/compare/v2.17.4...v3.3.1) --- updated-dependencies: - dependency-name: pylint dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f0924aa..b53e28b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ test = [ "flake8", "flake8-pyproject", "pre-commit==4.0.1", - "pylint==2.17.4", + "pylint==3.3.1", "pylint_junit", "pytest-cov==6.0.0", "pytest-mock<3.14.1", From 8dc8cc943771e1fe748c7f690d21a9cc1c509013 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 12:08:34 +0000 Subject: [PATCH 16/17] fix: bump flake8-bugbear from 23.5.9 to 24.8.19 Bumps [flake8-bugbear](https://github.com/PyCQA/flake8-bugbear) from 23.5.9 to 24.8.19. - [Release notes](https://github.com/PyCQA/flake8-bugbear/releases) - [Commits](https://github.com/PyCQA/flake8-bugbear/compare/23.5.9...24.8.19) --- updated-dependencies: - dependency-name: flake8-bugbear dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b53e28b..c113062 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ test = [ "bandit[toml]==1.7.10", "black==24.10.0", "check-manifest==0.50", - "flake8-bugbear==23.5.9", + "flake8-bugbear==24.8.19", "flake8-docstrings", "flake8-formatter_junit_xml", "flake8", From bb32a8445e7730da77b86a67cca45a8d98fb5103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= Date: Thu, 31 Oct 2024 12:45:10 +0000 Subject: [PATCH 17/17] skip QA --- scripts/dgicev/add_padding.py | 4 ++++ scripts/dgicev/read.py | 4 ++++ scripts/dgicev/readAndSort.py | 4 ++++ scripts/dgicev/read_outputsam.py | 4 ++++ 4 files changed, 16 insertions(+) diff --git a/scripts/dgicev/add_padding.py b/scripts/dgicev/add_padding.py index a42b289..c1f4c1c 100644 --- a/scripts/dgicev/add_padding.py +++ b/scripts/dgicev/add_padding.py @@ -4,6 +4,10 @@ Date: 2024-10-30 """ +# TODO: integrate into package, with QA and testing +# pylint: skip-file +# flake8: noqa + import sys diff --git a/scripts/dgicev/read.py b/scripts/dgicev/read.py index 1968c04..a5016b1 100644 --- a/scripts/dgicev/read.py +++ b/scripts/dgicev/read.py @@ -4,6 +4,10 @@ Date: 2024-10-30 """ +# TODO: integrate into package, with QA and testing +# pylint: skip-file +# flake8: noqa + import sys import re diff --git a/scripts/dgicev/readAndSort.py b/scripts/dgicev/readAndSort.py index 4754c57..674f557 100644 --- a/scripts/dgicev/readAndSort.py +++ b/scripts/dgicev/readAndSort.py @@ -4,6 +4,10 @@ Date: 2024-10-30 """ +# TODO: integrate into package, with QA and testing +# pylint: skip-file +# flake8: noqa + import sys import re diff --git a/scripts/dgicev/read_outputsam.py b/scripts/dgicev/read_outputsam.py index f8ac9de..738a896 100644 --- a/scripts/dgicev/read_outputsam.py +++ b/scripts/dgicev/read_outputsam.py @@ -4,6 +4,10 @@ Date: 2024-10-30 """ +# TODO: integrate into package, with QA and testing +# pylint: skip-file +# flake8: noqa + import sys import re