feat: adding proof-of-concept works #15

Merged
merged 18 commits into from
Oct 31, 2024
38 changes: 38 additions & 0 deletions scripts/dgicev/README.md
@@ -0,0 +1,38 @@
### add_padding.py

Used for adding padding to an example FASTA file; mainly used for testing the main pipeline.

#### Usage:

`cat input_file.fasta | python add_padding.py > output_file.fasta`

`input_file`: FASTA file whose headers contain the offset, `|`-delimited

`output_file`: normal FASTA file; each sequence is padded with the respective number of `N`s on the left/right (the position info from the header is not carried over to the output)
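
To illustrate the format, here is a hypothetical record and its padded counterpart (read name, offset, and sequence are made up; the total length of 29904 is the constant hard-coded in `add_padding.py`):

```
input:    >read_1|5
          ACGTACGT

output:   >read_1
          NNNNNACGTACGTNN...N    (Ns added until the total length is 29904)
```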

### read.py

Used for merging pairs of reads

#### Usage:

`cat input_file.sam | python read.py`

`samtools view input_file.bam | python read.py`

`input_file`: SAM file contents read from stdin; a BAM file can also be read by piping through `samtools view`

`output_file`: the script writes two output files itself, `merged.fasta` and `nuc_insertions.txt`. The former contains the merged reads and uses `|`-delimited FASTA headers to describe the position/offset; the latter records the nucleotide insertions per read.
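
For reference, hypothetical example lines from the two outputs (read name, position, and insertion are made up; `nuc_insertions.txt` is tab-separated, with the insertions written as a Python-style list of `(position, sequence)` tuples, matching what the script writes):

```
merged.fasta:         >read_42|671
                      ACGTTT...AGGA

nuc_insertions.txt:   read_42    [(705, 'ACT')]
```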

### read_outputsam.py

Used mainly for testing with IGV

Same usage as `read.py`, except the output it produces is `merged.sam`, and it does not store the insertions.

The SAM entries it outputs contain the same sequences you would find in `merged.fasta` when running `read.py`; the CIGAR is just `M`s.
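
For orientation, a hypothetical `merged.sam` record in that style (read name, reference name, and mapping quality are made up, the sequence is truncated for display, and the fields are shown space-separated; `read_outputsam.py` itself is not part of this diff, so details may differ):

```
read_42  0  reference  671  60  3530M  *  0  0  ACGTTT...AGGA  *
```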

### readAndSort.py
For reordering reads using hashing; not really efficient, more of a proof of concept.

Same usage as `read.py`.

Note: I don't think the reference genome is actually used here, so feel free to take it out of the code.
36 changes: 36 additions & 0 deletions scripts/dgicev/add_padding.py
@@ -0,0 +1,36 @@
"""
Author: David Gicev (@davidgicev / [email protected])
Supervisor: Alexander Taepper (@Taepper / [email protected])
Date: 2024-10-30
"""

# TODO: integrate into package, with QA and testing
# pylint: skip-file
# flake8: noqa

import sys


def transform_fasta():
    while True:
        header = sys.stdin.readline().strip()
        sequence = sys.stdin.readline().strip()

        if not header or not sequence:
            break  # End of file

        # Parse the header and extract the position offset
        header_parts = header.split("|")
        position_offset = int(header_parts[1])  # Get the position offset

        # Add Ns before and after the sequence
        left_padding = "N" * position_offset
        right_padding = "N" * (29904 - len(sequence) - position_offset)

        # Write the transformed sequence to stdout
        sys.stdout.write(f"{header_parts[0]}\n")
        sys.stdout.write(f"{left_padding}{sequence}{right_padding}\n")


# Execute the function
transform_fasta()
181 changes: 181 additions & 0 deletions scripts/dgicev/read.py
@@ -0,0 +1,181 @@
"""
Author: David Gicev (@davidgicev / [email protected])
Supervisor: Alexander Taepper (@Taepper / [email protected])
Date: 2024-10-30
"""

# TODO: integrate into package, with QA and testing
# pylint: skip-file
# flake8: noqa

import sys
import re


def parse_cigar(cigar):
    """Parse a CIGAR string into a list of (operation, length) tuples."""
    pattern = re.compile(r"(\d+)([MIDNSHP=X])")

    parsed_cigar = pattern.findall(cigar)

    return [(op, int(length)) for length, op in parsed_cigar]


# reads seen once so far, keyed by QNAME, waiting for their mate
unpaired = dict()

with open("merged.fasta", "w") as output_fasta, open("nuc_insertions.txt", "w") as output_insertions:
for line in sys.stdin:
if line.startswith("@"):
continue

fields = line.strip().split("\t")

QNAME = fields[0] # Query template NAME
FLAG = int(fields[1]) # bitwise FLAG
RNAME = fields[2] # Reference sequence NAME
POS = int(fields[3]) # 1-based leftmost mapping POSition
MAPQ = int(fields[4]) # MAPping Quality
CIGAR = parse_cigar(fields[5]) # CIGAR string
RNEXT = fields[6] # Ref. name of the mate/next read
PNEXT = int(fields[7]) # Position of the mate/next read
TLEN = int(fields[8]) # observed Template LENgth
SEQ = fields[9] # segment SEQuence
QUAL = fields[10] # ASCII of Phred-scaled base QUALity + 33

result_sequence = ""
result_qual = ""
index = 0
inserts = []

for operation in CIGAR:
type, count = operation
if type == "S":
index += count
continue
if type == "M":
result_sequence += SEQ[index : index + count]
result_qual += QUAL[index : index + count]
index += count
continue
if type == "D":
result_sequence += "-" * count
result_qual += "!" * count
continue
if type == "I":
inserts.append((index + POS, SEQ[index : index + count]))
index += count
continue

        read = {
            # "QNAME": QNAME,
            # "FLAG": FLAG,
            # "RNAME": RNAME,
            "POS": POS,
            # "MAPQ": MAPQ,
            "CIGAR": CIGAR,
            # "RNEXT": RNEXT,
            # "PNEXT": PNEXT,
            # "TLEN": TLEN,
            # "SEQ": SEQ,
            # "QUAL": QUAL,
            "RESULT_SEQUENCE": result_sequence,
            "RESULT_QUAL": result_qual,
            "insertions": inserts,
        }

        if QNAME in unpaired:
            read1 = unpaired.pop(QNAME)
            read2 = read

            # print(read1)
            # print(read2)

            # make sure read1 starts at or before read2
            if read1["POS"] > read2["POS"]:
                read1, read2 = read2, read1

            index = read1["POS"]
            read1len = len(read1["RESULT_SEQUENCE"])
            merged = read1["RESULT_SEQUENCE"][: min(read1len, read2["POS"] - read1["POS"])]

            # do deletions cause a problem here?
            gaplen = read1["POS"] + read1len - read2["POS"]
            if gaplen < 0:
                # reads don't overlap: fill the gap with Ns
                merged += "N" * (-gaplen)
                merged += read2["RESULT_SEQUENCE"]
            else:
                # read1_insertions = [read for read in read1['insertions'] if read[0] >= read2['POS']]
                # read2_insertions = [read for read in read2['insertions'] if read[0] < read1['POS'] + read1len]
                # if str(read1_insertions) != str(read2_insertions):
                #     print("\n\nInsertions don't match")
                #     print(QNAME)
                #     print("insertions1: ", read1_insertions)
                #     print("insertions2: ", read2_insertions)
                #     if len(read1_insertions) != len(read2_insertions):
                #         print("Number of insertions doesn't match")
                #     else:
                #         for i in range(len(read1_insertions)):
                #             if read1_insertions[i][0] != read2_insertions[i][0]:
                #                 print("Insertion index doesn't match")
                #                 print(read1_insertions[i][0], read2_insertions[i][0], " = ", read1_insertions[i][0] - read2_insertions[i][0])
                #                 print("pos2 - pos1", read2['POS'] - read1['POS'])
                #                 print("cigar1", read1['CIGAR'])
                #                 print("cigar2", read2['CIGAR'])
                #             if read1_insertions[i][1] != read2_insertions[i][1]:
                #                 print("Insertion sequence doesn't match")
                #                 print(read1_insertions[i][1], read2_insertions[i][1])

                # the reads overlap: reconcile the overlapping stretch base by base
                overlap_read1 = read1["RESULT_SEQUENCE"][read2["POS"] - read1["POS"] :]
                overlap_read2 = read2["RESULT_SEQUENCE"][0 : max(0, gaplen)]

                overlap_qual1 = read1["RESULT_QUAL"][read2["POS"] - read1["POS"] :]
                overlap_qual2 = read2["RESULT_QUAL"][0 : max(0, gaplen)]

                # let's set the read1's version by default
                overlap_result = list(overlap_read1)

                if len(overlap_result) and overlap_read1 != overlap_read2:
                    # print("", QNAME)
                    if len(overlap_read1) != len(overlap_read2):
                        print("overlaps don't match in size")
                    number_of_diffs = 0
                    for i in range(len(overlap_read1)):
                        if overlap_read1[i] != overlap_read2[i]:
                            if overlap_read1[i] == "-" and overlap_read2[i] != "-":
                                # read1 has a deletion here but read2 has a base: keep the base
                                overlap_result[i] = overlap_read2[i]
                            if overlap_qual1[i] < overlap_qual2[i]:
                                # read2's base has the higher quality: prefer it
                                overlap_result[i] = overlap_read2[i]
                            # print("diff in position ", i, ": ", overlap_read1[i], "/", overlap_read2[i])
                            number_of_diffs += 1
                            # print("corresponding qs ", i, ": ", overlap_qual1[i], "/", overlap_qual2[i])
                    # print("read1pos", read1['POS'])
                    # print("read2pos", read2['POS'])
                    # print("diff", read2['POS'] - read1['POS'])
                    # print("read1len", read1len)
                    # print("gap", gaplen)
                    # print("\nread1")
                    # print(overlap_read1)
                    # print(overlap_qual1)
                    # print("\nread2")
                    # print(overlap_read2)
                    # print(overlap_qual2)

                    # print("\nreconcilled")
                    # print("".join(overlap_result))

                merged += "".join(overlap_result) + read2["RESULT_SEQUENCE"][max(0, gaplen) :]

            if len(merged) != read2["POS"] + len(read2["RESULT_SEQUENCE"]) - read1["POS"]:
                raise Exception("Length mismatch")

            output_fasta.write(f">{QNAME}|{read1['POS']}\n{merged}\n")

            # keep read1's insertions plus those of read2 that fall beyond read1's end
            merged_insertions = read1["insertions"].copy()
            insertion_index = read1["POS"] + read1len
            merged_insertions += [insert for insert in read2["insertions"] if insert[0] > insertion_index]

            output_insertions.write(f"{QNAME}\t{merged_insertions}\n")

        else:
            unpaired[QNAME] = read

    # write out reads whose mate never showed up
    for id in unpaired:
        output_fasta.write(f">{id}|{unpaired[id]['POS']}\n{unpaired[id]['RESULT_SEQUENCE']}\n")
        output_insertions.write(f"{id}\t{unpaired[id]['insertions']}\n")