Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: infer library type relationship #157

Merged
merged 15 commits into from
Jan 21, 2024
Merged
142 changes: 94 additions & 48 deletions htsinfer/get_library_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def _evaluate_mate_relationship(
ids_2: As `ids_1` but for the putative second mate file.
"""
self.results.relationship = StatesTypeRelationship.not_mates
if ids_1 == ids_2:
if ids_1 and ids_2 and ids_1 == ids_2:
if (
self.results.file_1 == StatesType.first_mate and
self.results.file_2 == StatesType.second_mate
Expand All @@ -127,13 +127,23 @@ def _evaluate_mate_relationship(
self.mapping.library_type.relationship = (
StatesTypeRelationship.split_mates
)
else:
elif (
self.library_source.file_1.short_name is not None or
self.library_source.file_2.short_name is not None
):
LOGGER.debug("Determining mate relationship by alignment...")
self.mapping.library_type.relationship \
= StatesTypeRelationship.not_available
self.mapping.library_source = self.library_source
self.mapping.paths = self.path_1, self.path_2
self.mapping.evaluate()
self._align_mates()
else:
self.results.relationship = StatesTypeRelationship.not_available
LOGGER.debug(
"Sequence IDs and library source are not determined, "
"mate relationship cannot be inferred."
)

def _align_mates(self):
"""Decide mate relationship by alignment."""
Expand All @@ -144,19 +154,24 @@ def _align_mates(self):
samfile1 = pysam.AlignmentFile(str(alignment_1), 'r')
samfile2 = pysam.AlignmentFile(str(alignment_2), 'r')

seq_id1 = None
seq_id2 = None
previous_seq_id1 = None
previous_seq_id2 = None

reads1 = [] # List to store alignments for one read from file1
mate1 = [] # List to store alignments for each read
mate1 = [] # List to store alignments for each read from file1
reads2 = [] # List to store alignments for one read from file2
mate2 = [] # List to store alignments for each read from file2

concordant = 0

for read1 in samfile1:
seq_id1 = read1.query_name
if seq_id1 != previous_seq_id1 \
and previous_seq_id1 is not None:
if (
seq_id1 != previous_seq_id1 and
previous_seq_id1 is not None
):
mate1.append(reads1.copy())
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
reads1.clear()
if read1.reference_end:
Expand All @@ -167,35 +182,63 @@ def _align_mates(self):
read_counter = 0
for read2 in samfile2:
seq_id2 = read2.query_name
if seq_id2 != previous_seq_id2 \
and previous_seq_id2 is not None:
if self._compare_alignments(mate1[read_counter], reads2):
if (
seq_id2 != previous_seq_id2 and
previous_seq_id2 is not None
):
mate2.append(reads2.copy())
if self._compare_alignments(
mate1[read_counter], reads2
):
concordant += 1
reads2.clear()
read_counter += 1
if read2.reference_end:
reads2.append(read2)
previous_seq_id2 = seq_id2

if self._compare_alignments(mate1[read_counter], reads2):
mate2.append(reads2.copy())
if self._compare_alignments(
mate1[read_counter], reads2
):
concordant += 1

if (concordant / read_counter) >= self.cutoff:
self.results.relationship = (
StatesTypeRelationship.split_mates
)
self.mapping.library_type.relationship \
= StatesTypeRelationship.split_mates
self.mapping.mapped = False
self.mapping.star_dirs = []
else:
self.results.relationship = (
StatesTypeRelationship.not_mates
)
aligned_mate1 = len(list(filter(None, mate1)))
aligned_mate2 = len(list(filter(None, mate2)))

LOGGER.debug(f"Number of aligned reads file 1: {aligned_mate1}")
LOGGER.debug(f"Number of aligned reads file 2: {aligned_mate2}")
LOGGER.debug(f"Number of concordant reads: {concordant}")

self._update_relationship(
concordant, min(aligned_mate1, aligned_mate2)
)

samfile1.close()
samfile2.close()

def _update_relationship(self, concordant, aligned_reads):
"""Helper function to update relationship based on alignment."""
try:
ratio = concordant / aligned_reads
except ZeroDivisionError:
self.results.relationship = (
StatesTypeRelationship.not_available
)
else:
if ratio >= self.cutoff:
self.results.relationship = (
StatesTypeRelationship.split_mates
)
self.mapping.library_type.relationship = (
StatesTypeRelationship.split_mates
)
self.mapping.mapped = False
self.mapping.star_dirs = []
else:
self.results.relationship = (
StatesTypeRelationship.not_mates
)

class AlignedSegment:
"""Placeholder class for mypy "Missing attribute"
error in _compare_alignments(), the actual object used
Expand Down Expand Up @@ -302,44 +345,47 @@ def evaluate(self) -> None:
self.result = StatesType.not_available
raise FileProblem(f"File is empty: {self.path}") from exc

if self.seq_id_format is None:
if self.seq_id_format is not None:
LOGGER.debug(
"Sequence identifier format: "
f"{self.seq_id_format.name}"
)
else:
self.result = StatesType.not_available
raise MetadataWarning(
LOGGER.debug(
"Could not determine sequence identifier format."
)
LOGGER.debug(
f"Sequence identifier format: {self.seq_id_format.name}"
)

# Ensure that remaining records are compatible with sequence
# identifier format and library type determined from first
# record
LOGGER.debug(
"Checking consistency of remaining reads with initially "
"determined identifier format and library type..."
)
for record in seq_iter:
records += 1
try:
self._get_read_type(
seq_id=record[0],
regex=self.seq_id_format.value,
)
except (
InconsistentFastqIdentifiers,
UnknownFastqIdentifier,
) as exc:
self.result = StatesType.not_available
raise MetadataWarning(
f"{type(exc).__name__}: {str(exc)}"
) from exc
if self.seq_id_format is not None:
LOGGER.debug(
"Checking consistency of remaining reads with "
"initially determined identifier format "
"and library type..."
)
for record in seq_iter:
records += 1
try:
self._get_read_type(
seq_id=record[0],
regex=self.seq_id_format.value,
)
except (
InconsistentFastqIdentifiers,
UnknownFastqIdentifier,
) as exc:
self.result = StatesType.not_available
raise MetadataWarning(
f"{type(exc).__name__}: {str(exc)}"
) from exc
LOGGER.debug(f"Total records processed: {records}")

except (OSError, ValueError) as exc:
self.result = StatesType.not_available
raise FileProblem(f"{type(exc).__name__}: {str(exc)}") from exc

LOGGER.debug(f"Total records processed: {records}")

def _get_read_type(
self,
seq_id: str,
Expand Down
6 changes: 5 additions & 1 deletion htsinfer/get_read_orientation.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,11 @@ def evaluate(self) -> ResultsOrientation:
self.mapping.transcripts_file = self.transcripts_file
self.mapping.tmp_dir = self.tmp_dir

if not self.mapping.mapped:
if not self.mapping.mapped and (
self.library_source.file_1.short_name is not None or
self.library_source.file_2.short_name is not None
):
LOGGER.debug("Determining read relationship by alignment...")
self.mapping.evaluate()

return self.process_alignments(star_dirs=self.mapping.star_dirs)
Expand Down
11 changes: 6 additions & 5 deletions htsinfer/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,7 @@ def __init__(
self.star_dirs: List[Path] = []

def evaluate(self):
"""Infer read orientation.

Returns:
Orientation results object.
"""
"""Align FASTQ files to reference transcripts with STAR."""

# get transcripts for current organims
transcripts = self.subset_transcripts_by_organism()
Expand Down Expand Up @@ -270,6 +266,10 @@ def prepare_star_alignment_commands(
) -> Dict[Path, List[str]]:
"""Prepare STAR alignment commands.

Input FASTQ files are assumed to be sorted according to reference names
or coordinates, the order of input reads is kept with the option
"PairedKeepInputOrder", no additional sorting of aligned reads is done.

Args:
index_dir: Path to directory containing STAR index.

Expand Down Expand Up @@ -299,6 +299,7 @@ def build_star_command(
"--runThreadN", f"{str(self.threads_star)}",
"--genomeDir", f"{str(index_dir)}",
"--outFilterMultimapNmax", "50",
"--outSAMorder", "PairedKeepInputOrder",
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
"--outSAMunmapped", "Within", "KeepPairs",
]
cmd: List[str] = cmd_base[:]
Expand Down
Loading
Loading