diff --git a/scripts/dgicev/README.md b/scripts/dgicev/README.md new file mode 100644 index 0000000..65187a6 --- /dev/null +++ b/scripts/dgicev/README.md @@ -0,0 +1,38 @@ +### add_padding.py + +Used for adding padding to an example fasta file, mainly used for testing main + +#### Usage: + +`cat input_file.fasta | python add_padding.py > output_file.fasta` + +`input_file`: fasta file with header containing offset | delimited + +`output_file`: normal fasta file, each sequence padded by the respective amount of N's to the left/right (position info in the header is not in the output) + +### read.py + +Used for merging pairs of reads + +#### Usage: +`cat input_file.fasta | python read.py` +`samtools view input_file.bam | python read.py` + +`input_file`: sam file contents, could also read from bam with samtools + +`output_file`: it makes two output files by itself, merged.fasta and nuc_insertions.txt, the former has the merged reads and it uses the fasta with | headers to describe the position/offset + +### read_outputsam.py + +Used mainly for testing with IGV + +same usage as read.py, except the output it makes is merged.sam, it doesn't store the insertions + +the sam entries it outputs are the actual sequences you would find in the merged.fasta when running read.py - the cigar is just M's + +### readAndSort.py +for reordering, using hashing, not really efficient more of a proof of concept + +same usage as read.py + +Note: I don't think I use the reference genome here so feel free to take it out of the code diff --git a/scripts/dgicev/add_padding.py b/scripts/dgicev/add_padding.py new file mode 100644 index 0000000..c1f4c1c --- /dev/null +++ b/scripts/dgicev/add_padding.py @@ -0,0 +1,36 @@ +""" +Author: David Gicev (@davidgicev / david.gicev@gmail.com) +Supervisor:Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch) +Date: 2024-10-30 +""" + +# TODO: integrate into package, with QA and testing +# pylint: skip-file +# flake8: noqa + +import sys + + +def transform_fasta(): + while True: + header = sys.stdin.readline().strip() + sequence = sys.stdin.readline().strip() + + if not header or not sequence: + break # End of file + + # Parse the header and extract the position offset + header_parts = header.split("|") + position_offset = int(header_parts[1]) # Get the position offset + + # Add Ns before and after the sequence + left_padding = "N" * position_offset + right_padding = "N" * (29904 - len(sequence) - position_offset) + + # Write the transformed sequence to stdout + sys.stdout.write(f"{header_parts[0]}\n") + sys.stdout.write(f"{left_padding}{sequence}{right_padding}\n") + + +# Execute the function +transform_fasta() diff --git a/scripts/dgicev/read.py b/scripts/dgicev/read.py new file mode 100644 index 0000000..a5016b1 --- /dev/null +++ b/scripts/dgicev/read.py @@ -0,0 +1,181 @@ +""" +Author: David Gicev (@davidgicev / david.gicev@gmail.com) +Supervisor:Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch) +Date: 2024-10-30 +""" + +# TODO: integrate into package, with QA and testing +# pylint: skip-file +# flake8: noqa + +import sys +import re + + +def parse_cigar(cigar): + pattern = re.compile(r"(\d+)([MIDNSHP=X])") + + parsed_cigar = pattern.findall(cigar) + + return [(op, int(length)) for length, op in parsed_cigar] + + +unpaired = dict() + +with open("merged.fasta", "w") as output_fasta, open("nuc_insertions.txt", "w") as output_insertions: + for line in sys.stdin: + if line.startswith("@"): + continue + + fields = line.strip().split("\t") + + QNAME = fields[0] # Query template NAME + FLAG = int(fields[1]) # bitwise FLAG + RNAME = fields[2] # Reference sequence NAME + POS = int(fields[3]) # 1-based leftmost mapping POSition + MAPQ = int(fields[4]) # MAPping Quality + CIGAR = parse_cigar(fields[5]) # CIGAR string + RNEXT = fields[6] # Ref. name of the mate/next read + PNEXT = int(fields[7]) # Position of the mate/next read + TLEN = int(fields[8]) # observed Template LENgth + SEQ = fields[9] # segment SEQuence + QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 + + result_sequence = "" + result_qual = "" + index = 0 + inserts = [] + + for operation in CIGAR: + type, count = operation + if type == "S": + index += count + continue + if type == "M": + result_sequence += SEQ[index : index + count] + result_qual += QUAL[index : index + count] + index += count + continue + if type == "D": + result_sequence += "-" * count + result_qual += "!" * count + continue + if type == "I": + inserts.append((index + POS, SEQ[index : index + count])) + index += count + continue + + read = { + # "QNAME": QNAME, + # "FLAG": FLAG, + # "RNAME": RNAME, + "POS": POS, + # "MAPQ": MAPQ, + "CIGAR": CIGAR, + # "RNEXT": RNEXT, + # "PNEXT": PNEXT, + # "TLEN": TLEN, + # "SEQ": SEQ, + # "QUAL": QUAL, + "RESULT_SEQUENCE": result_sequence, + "RESULT_QUAL": result_qual, + "insertions": inserts, + } + + if QNAME in unpaired: + read1 = unpaired.pop(QNAME) + read2 = read + + # print(read1) + # print(read2) + + if read1["POS"] > read2["POS"]: + read1, read2 = read2, read1 + + index = read1["POS"] + read1len = len(read1["RESULT_SEQUENCE"]) + merged = read1["RESULT_SEQUENCE"][: min(read1len, read2["POS"] - read1["POS"])] + + # do deletions cause a problem here? + gaplen = read1["POS"] + read1len - read2["POS"] + if gaplen < 0: + merged += "N" * (-gaplen) + merged += read2["RESULT_SEQUENCE"] + else: + # read1_insertions = [read for read in read1['insertions'] if read[0] >= read2['POS']] + # read2_insertions = [read for read in read2['insertions'] if read[0] < read1['POS'] + read1len] + # if str(read1_insertions) != str(read2_insertions): + # print("\n\nInsertions don't match") + # print(QNAME) + # print("insertions1: ", read1_insertions) + # print("insertions2: ", read2_insertions) + # if len(read1_insertions) != len(read2_insertions): + # print("Number of insertions doesn't match") + # else: + # for i in range(len(read1_insertions)): + # if read1_insertions[i][0] != read2_insertions[i][0]: + # print("Insertion index doesn't match") + # print(read1_insertions[i][0], read2_insertions[i][0], " = ", read1_insertions[i][0] - read2_insertions[i][0]) + # print("pos2 - pos1", read2['POS'] - read1['POS']) + # print("cigar1", read1['CIGAR']) + # print("cigar2", read2['CIGAR'])len(overlap_result) + + # if read1_insertions[i][1] != read2_insertions[i][1]: + # print("Insertion sequence doesn't match") + # print(read1_insertions[i][1], read2_insertions[i][1]) + overlap_read1 = read1["RESULT_SEQUENCE"][read2["POS"] - read1["POS"] :] + overlap_read2 = read2["RESULT_SEQUENCE"][0 : max(0, gaplen)] + + overlap_qual1 = read1["RESULT_QUAL"][read2["POS"] - read1["POS"] :] + overlap_qual2 = read2["RESULT_QUAL"][0 : max(0, gaplen)] + + # let's set the read1's version by default + overlap_result = list(overlap_read1) + + if len(overlap_result) and overlap_read1 != overlap_read2: + # print("", QNAME) + if len(overlap_read1) != len(overlap_read2): + print("overlaps don't match in size") + number_of_diffs = 0 + for i in range(len(overlap_read1)): + if overlap_read1[i] != overlap_read2[i]: + if overlap_qual1[i] == "-" and overlap_read2 != "-": + overlap_result[i] = overlap_read2[i] + if overlap_qual1[i] > overlap_qual2[i]: + overlap_result[i] = overlap_read2[i] + # print("diff in position ", i, ": ", overlap_read1[i], "/", overlap_read2[i]) + number_of_diffs += 1 + # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i]) + # print("read1pos", read1['POS']) + # print("read2pos", read2['POS']) + # print("diff", read2['POS'] - read1['POS']) + # print("read1len", read1len) + # print("gap", gaplen) + # print("\nread1") + # print(overlap_read1) + # print(overlap_qual1) + # print("\nread2") + # print(overlap_read2) + # print(overlap_qual2) + + # print("\nreconcilled") + # print("".join(overlap_result)) + + merged += "".join(overlap_result) + read2["RESULT_SEQUENCE"][max(0, gaplen) :] + + if len(merged) != read2["POS"] + len(read2["RESULT_SEQUENCE"]) - read1["POS"]: + raise Exception("Length mismatch") + + output_fasta.write(f">{QNAME}|{read1['POS']}\n{merged}\n") + + merged_insertions = read1["insertions"].copy() + insertion_index = read1["POS"] + read1len + merged_insertions += [insert for insert in read2["insertions"] if insert[0] > insertion_index] + + output_insertions.write(f"{QNAME}\t{merged_insertions}\n") + + else: + unpaired[QNAME] = read + for id in unpaired: + output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n") + output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n") diff --git a/scripts/dgicev/readAndSort.py b/scripts/dgicev/readAndSort.py new file mode 100644 index 0000000..674f557 --- /dev/null +++ b/scripts/dgicev/readAndSort.py @@ -0,0 +1,191 @@ +""" +Author: David Gicev (@davidgicev / david.gicev@gmail.com) +Supervisor:Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch) +Date: 2024-10-30 +""" + +# TODO: integrate into package, with QA and testing +# pylint: skip-file +# flake8: noqa + +import sys +import re + + +def parse_cigar(cigar): + pattern = re.compile(r"(\d+)([MIDNSHP=X])") + + parsed_cigar = pattern.findall(cigar) + + return [(op, int(length)) for length, op in parsed_cigar] + + +unpaired = dict() + +sequence_graph = dict() + +# have sequence_graph contain nodes forming clusters +# graph['79'] would be the starting node for all samples taken at position 79 +# from that point on treat it as a prefix tree +# will have to compare to reference sequence to get diff array + + +known_sequences = dict() + +with open("merged.fasta", "w") as output_fasta, open("nuc_insertions.txt", "w") as output_insertions, open( + "reference_genome.fasta", "r" +) as reference_sequence_file: + reference_sequence = reference_sequence_file.readlines()[1] + # print("reference sequence: ", reference_sequence) + for line in sys.stdin: + if line.startswith("@"): + continue + + fields = line.strip().split("\t") + + QNAME = fields[0] # Query template NAME + FLAG = int(fields[1]) # bitwise FLAG + RNAME = fields[2] # Reference sequence NAME + POS = int(fields[3]) # 1-based leftmost mapping POSition + MAPQ = int(fields[4]) # MAPping Quality + CIGAR = parse_cigar(fields[5]) # CIGAR string + RNEXT = fields[6] # Ref. name of the mate/next read + PNEXT = int(fields[7]) # Position of the mate/next read + TLEN = int(fields[8]) # observed Template LENgth + SEQ = fields[9] # segment SEQuence + QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 + + result_sequence = "" + result_qual = "" + index = 0 + inserts = [] + + for operation in CIGAR: + type, count = operation + if type == "S": + index += count + continue + if type == "M": + result_sequence += SEQ[index : index + count] + result_qual += QUAL[index : index + count] + index += count + continue + if type == "D": + result_sequence += "-" * count + result_qual += "!" * count + continue + if type == "I": + inserts.append((index + POS, SEQ[index : index + count])) + index += count + continue + + read = { + # "QNAME": QNAME, + # "FLAG": FLAG, + # "RNAME": RNAME, + "POS": POS, + # "MAPQ": MAPQ, + "CIGAR": CIGAR, + # "RNEXT": RNEXT, + # "PNEXT": PNEXT, + # "TLEN": TLEN, + # "SEQ": SEQ, + # "QUAL": QUAL, + "RESULT_SEQUENCE": result_sequence, + "RESULT_QUAL": result_qual, + "insertions": inserts, + } + + if QNAME in unpaired: + read1 = unpaired.pop(QNAME) + read2 = read + + # print(read1) + # print(read2) + + if read1["POS"] > read2["POS"]: + read1, read2 = read2, read1 + + index = read1["POS"] + read1len = len(read1["RESULT_SEQUENCE"]) + merged = read1["RESULT_SEQUENCE"][: min(read1len, read2["POS"] - read1["POS"])] + + gaplen = read1["POS"] + read1len - read2["POS"] + if gaplen < 0: + merged += "N" * (-gaplen) + merged += read2["RESULT_SEQUENCE"] + else: + overlap_read1 = read1["RESULT_SEQUENCE"][read2["POS"] - read1["POS"] :] + overlap_read2 = read2["RESULT_SEQUENCE"][0 : max(0, gaplen)] + + overlap_qual1 = read1["RESULT_QUAL"][read2["POS"] - read1["POS"] :] + overlap_qual2 = read2["RESULT_QUAL"][0 : max(0, gaplen)] + + overlap_result = list(overlap_read1) + + if len(overlap_result) and overlap_read1 != overlap_read2: + # print("", QNAME) + if len(overlap_read1) != len(overlap_read2): + print("overlaps don't match in size") + number_of_diffs = 0 + for i in range(len(overlap_read1)): + if overlap_read1[i] != overlap_read2[i]: + if overlap_qual1[i] == "-" and overlap_read2 != "-": + overlap_result[i] = overlap_read2[i] + if overlap_qual1[i] > overlap_qual2[i]: + overlap_result[i] = overlap_read2[i] + # print("diff in position ", i, ": ", overlap_read1[i], "/", overlap_read2[i]) + number_of_diffs += 1 + # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i]) + + merged += "".join(overlap_result) + read2["RESULT_SEQUENCE"][max(0, gaplen) :] + + if len(merged) != read2["POS"] + len(read2["RESULT_SEQUENCE"]) - read1["POS"]: + raise Exception("Length mismatch") + + # output_fasta.write(f">{QNAME}|{read1['POS']}\n{merged}\n") + + merged_insertions = read1["insertions"].copy() + insertion_index = read1["POS"] + read1len + merged_insertions += [insert for insert in read2["insertions"] if insert[0] > insertion_index] + + output_insertions.write(f"{QNAME}\t{merged_insertions}\n") + + # time to add it to the graph + reference_offset = read1["POS"] - 1 + + # for i in range(len(merged)): + # if merged[i] != reference_sequence[reference_offset + i]: + # # print("id: ", QNAME, "mutation at ", i, " from ", reference_sequence[reference_offset + i], " to ", merged[i]) + fingerprint = hash(merged) + if fingerprint in known_sequences: + groups = known_sequences[fingerprint] + found = False + for group in groups: + if group[0] == merged: + found = True + group[2].append(QNAME) + if not found: + known_sequences[fingerprint].append((merged, read1["POS"], [QNAME])) + + else: + known_sequences[fingerprint] = [(merged, read1["POS"], [QNAME])] + + else: + unpaired[QNAME] = read + + print("Number of unique sequences: ", sum([len(groups) for groups in known_sequences.values()])) + + flattened = [group for groups in known_sequences.values() for group in groups] + flattened.sort(key=lambda x: x[1]) + + sort_number_prefix = 0 + + for group in flattened: + for id in group[2]: + output_fasta.write(f">{sort_number_prefix:010}.{id}|{group[1]}\n{group[0]}\n") + sort_number_prefix += 1 + + for id in unpaired: + output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n") + output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n") diff --git a/scripts/dgicev/read_outputsam.py b/scripts/dgicev/read_outputsam.py new file mode 100644 index 0000000..738a896 --- /dev/null +++ b/scripts/dgicev/read_outputsam.py @@ -0,0 +1,194 @@ +""" +Author: David Gicev (@davidgicev / david.gicev@gmail.com) +Supervisor:Alexander Taepper (@Taepper / alexander.taepper@bsse.ethz.ch) +Date: 2024-10-30 +""" + +# TODO: integrate into package, with QA and testing +# pylint: skip-file +# flake8: noqa + +import sys +import re + + +def parse_cigar(cigar): + pattern = re.compile(r"(\d+)([MIDNSHP=X])") + + parsed_cigar = pattern.findall(cigar) + + return [(op, int(length)) for length, op in parsed_cigar] + + +unpaired = dict() + +with open("merged.sam", "w") as output_sam: + for line in sys.stdin: + if line.startswith("@"): + continue + + fields = line.strip().split("\t") + + QNAME = fields[0] # Query template NAME + FLAG = int(fields[1]) # bitwise FLAG + RNAME = fields[2] # Reference sequence NAME + POS = int(fields[3]) # 1-based leftmost mapping POSition + MAPQ = int(fields[4]) # MAPping Quality + CIGAR = parse_cigar(fields[5]) # CIGAR string + RNEXT = fields[6] # Ref. name of the mate/next read + PNEXT = int(fields[7]) # Position of the mate/next read + TLEN = int(fields[8]) # observed Template LENgth + SEQ = fields[9] # segment SEQuence + QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33 + + result_sequence = "" + result_qual = "" + index = 0 + inserts = [] + + for operation in CIGAR: + type, count = operation + if type == "S": + index += count + continue + if type == "M": + result_sequence += SEQ[index : index + count] + result_qual += QUAL[index : index + count] + index += count + continue + if type == "D": + result_sequence += "-" * count + result_qual += "!" * count + continue + if type == "I": + inserts.append((index + POS, SEQ[index : index + count])) + index += count + continue + + read = { + # "QNAME": QNAME, + # "FLAG": FLAG, + # "RNAME": RNAME, + "POS": POS, + # "MAPQ": MAPQ, + "CIGAR": CIGAR, + # "RNEXT": RNEXT, + # "PNEXT": PNEXT, + # "TLEN": TLEN, + # "SEQ": SEQ, + # "QUAL": QUAL, + "RESULT_SEQUENCE": result_sequence, + "RESULT_QUAL": result_qual, + "insertions": inserts, + } + + if QNAME in unpaired: + read1 = unpaired.pop(QNAME) + read2 = read + + # print(read1) + # print(read2) + + if read1["POS"] > read2["POS"]: + read1, read2 = read2, read1 + + index = read1["POS"] + read1len = len(read1["RESULT_SEQUENCE"]) + merged = read1["RESULT_SEQUENCE"][: min(read1len, read2["POS"] - read1["POS"])] + merged_qual = read1["RESULT_QUAL"][: min(read1len, read2["POS"] - read1["POS"])] + + # do deletions cause a problem here? + gaplen = read1["POS"] + read1len - read2["POS"] + if gaplen < 0: + merged += "N" * (-gaplen) + merged += read2["RESULT_SEQUENCE"] + merged_qual += "C" * (-gaplen) + merged_qual += read2["RESULT_SEQUENCE"] + else: + # read1_insertions = [read for read in read1['insertions'] if read[0] >= read2['POS']] + # read2_insertions = [read for read in read2['insertions'] if read[0] < read1['POS'] + read1len] + # if str(read1_insertions) != str(read2_insertions): + # print("\n\nInsertions don't match") + # print(QNAME) + # print("insertions1: ", read1_insertions) + # print("insertions2: ", read2_insertions) + # if len(read1_insertions) != len(read2_insertions): + # print("Number of insertions doesn't match") + # else: + # for i in range(len(read1_insertions)): + # if read1_insertions[i][0] != read2_insertions[i][0]: + # print("Insertion index doesn't match") + # print(read1_insertions[i][0], read2_insertions[i][0], " = ", read1_insertions[i][0] - read2_insertions[i][0]) + # print("pos2 - pos1", read2['POS'] - read1['POS']) + # print("cigar1", read1['CIGAR']) + # print("cigar2", read2['CIGAR'])len(overlap_result) + + # if read1_insertions[i][1] != read2_insertions[i][1]: + # print("Insertion sequence doesn't match") + # print(read1_insertions[i][1], read2_insertions[i][1]) + overlap_read1 = read1["RESULT_SEQUENCE"][read2["POS"] - read1["POS"] :] + overlap_read2 = read2["RESULT_SEQUENCE"][0 : max(0, gaplen)] + + overlap_qual1 = read1["RESULT_QUAL"][read2["POS"] - read1["POS"] :] + overlap_qual2 = read2["RESULT_QUAL"][0 : max(0, gaplen)] + + # let's set the read1's version by default + overlap_result = list(overlap_read1) + overlap_qual = list(overlap_qual1) + + if len(overlap_result) and overlap_read1 != overlap_read2: + # print("", QNAME) + if len(overlap_read1) != len(overlap_read2): + print("overlaps don't match in size") + number_of_diffs = 0 + for i in range(len(overlap_read1)): + if overlap_read1[i] != overlap_read2[i]: + if overlap_qual1[i] == "-" and overlap_read2 != "-": + overlap_result[i] = overlap_read2[i] + overlap_qual[i] = overlap_qual[i] + + if overlap_qual1[i] > overlap_qual2[i]: + overlap_result[i] = overlap_read2[i] + overlap_qual[i] = overlap_qual[i] + # print("diff in position ", i, ": ", overlap_read1[i], "/", overlap_read2[i]) + number_of_diffs += 1 + # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i]) + # print("read1pos", read1['POS']) + # print("read2pos", read2['POS']) + # print("diff", read2['POS'] - read1['POS']) + # print("read1len", read1len) + # print("gap", gaplen) + # print("\nread1") + # print(overlap_read1) + # print(overlap_qual1) + # print("\nread2") + # print(overlap_read2) + # print(overlap_qual2) + + # print("\nreconcilled") + # print("".join(overlap_result)) + + merged += "".join(overlap_result) + read2["RESULT_SEQUENCE"][max(0, gaplen) :] + merged_qual += "".join(overlap_qual) + read2["RESULT_QUAL"][max(0, gaplen) :] + + if len(merged) != read2["POS"] + len(read2["RESULT_SEQUENCE"]) - read1["POS"]: + raise Exception("Length mismatch") + + flag = 0 + output_cigar = str(len(merged)) + "M" + + output_sam.write( + f"{QNAME}\t{flag}\t{RNAME}\t{read1['POS']}\t{MAPQ}\t{output_cigar}\t*\t0\t{abs(TLEN)}\t{merged}\t{merged_qual}\n" + ) + + merged_insertions = read1["insertions"].copy() + insertion_index = read1["POS"] + read1len + merged_insertions += [insert for insert in read2["insertions"] if insert[0] > insertion_index] + + # output_insertions.write(f"{QNAME}\t{merged_insertions}\n") + + else: + unpaired[QNAME] = read + # for id in unpaired: + # output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n") + # output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n")