From a081c35a1fe9f439e60261c8f43e70399b800bd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Balajti?= <51365402+balajtimate@users.noreply.github.com> Date: Wed, 9 Oct 2024 08:59:08 +0200 Subject: [PATCH] feat: output read orientation fractions to json (#169) * feat: add read orient single json output * feat: add read orient single json output * update single orientation json * feat: add read orient paired json output * refactor: simplify read orientation functions * add pylint exception * minor refactor get_read_orientation --- htsinfer/get_read_orientation.py | 122 ++++++++++++++++++++++++++++--- 1 file changed, 112 insertions(+), 10 deletions(-) diff --git a/htsinfer/get_read_orientation.py b/htsinfer/get_read_orientation.py index 4b36e8b..d5bac0f 100644 --- a/htsinfer/get_read_orientation.py +++ b/htsinfer/get_read_orientation.py @@ -6,6 +6,7 @@ from typing import (Any, DefaultDict, Dict, List) import pysam # type: ignore +import pandas as pd # type: ignore from htsinfer.exceptions import ( FileProblem, @@ -58,6 +59,7 @@ def __init__( self.library_source = config.results.library_source self.transcripts_file = config.args.t_file_processed self.tmp_dir = config.args.tmp_dir + self.out_dir = config.args.out_dir self.min_mapped_reads = config.args.read_orientation_min_mapped_reads self.min_fraction = config.args.read_orientation_min_fraction self.mapping = mapping @@ -176,13 +178,20 @@ def process_single( else: orientation = StatesOrientation.unstranded - # write log messages and return result + orient_df = self.create_orient_df( + reads, fractions_all_states, orientation, paired=False + ) + LOGGER.debug( - f"Required number of mapped reads pairs: {self.min_mapped_reads}" + f"Required number of mapped reads: {self.min_mapped_reads}" ) - LOGGER.debug(f"Number of reads mapped: {reads}") - LOGGER.debug(f"Fraction of states: {fractions_all_states}") - LOGGER.debug(f"Orientation: {orientation}") + LOGGER.debug(f"Number of mapped reads: {orient_df.iloc[0, 0]}") + LOGGER.debug(f"Fraction of SF: {orient_df.iloc[0, 1]}") + LOGGER.debug(f"Fraction of SR: {orient_df.iloc[0, 2]}") + LOGGER.debug(f"Orientation: {orient_df.iloc[0, 3]}") + + self.write_orientation_to_json(orient_df, self.paths[0].name) + return orientation def process_paired( # pylint: disable=R0912,R0915 @@ -293,13 +302,32 @@ def process_paired( # pylint: disable=R0912,R0915 orientation.file_1 = StatesOrientation.unstranded orientation.file_2 = StatesOrientation.unstranded - # write log messages and return result + orient_df_1 = self.create_orient_df( + reads, fractions_all_states, orientation, paired=True, file_index=1 + ) + orient_df_2 = self.create_orient_df( + reads, fractions_all_states, orientation, paired=True, file_index=2 + ) + LOGGER.debug( - f"Required number of mapped read pairs: {self.min_mapped_reads}" + f"Required number of mapped reads: {self.min_mapped_reads}" ) - LOGGER.debug(f"Number of reads mapped: {reads}") - LOGGER.debug(f"Fraction of states: {fractions_all_states}") - LOGGER.debug(f"Orientation: {orientation}") + LOGGER.debug(f"Number of mapped reads: {orient_df_1.iloc[0, 0]}") + LOGGER.debug(f"Fraction of ISF: {orient_df_1.iloc[0, 1]}") + LOGGER.debug(f"Fraction of ISR: {orient_df_1.iloc[0, 2]}") + LOGGER.debug(f"Orientation file 1: {orient_df_1.iloc[0, 3]}") + LOGGER.debug(f"Orientation file 2: {orient_df_2.iloc[0, 3]}") + LOGGER.debug( + f"Orientation relationship: {orient_df_1.iloc[0, 4]}" + ) + + self.write_orientation_to_json( + orient_df_1, getattr(self.paths[0], 'name') + ) + self.write_orientation_to_json( + orient_df_2, getattr(self.paths[1], 'name') + ) + return orientation @staticmethod @@ -338,3 +366,77 @@ def sum_dicts(*dicts: Dict[Any, float]) -> Dict[Any, float]: for key, num in dct.items(): result[key] += num return dict(result) + + @staticmethod + def create_orient_df( + reads, + fractions_all_states, + orientation, + paired: bool, + file_index=None + ): + """Prepare DataFrame for orientation details. + + Constructs a DataFrame with information about read orientation for + single or paired-end sequencing data. + + Args: + reads: Number of mapped reads. + fractions_all_states: Dictionary containing the fraction + of each orientation state. + orientation: Orientation states. + paired: Indicates if the sequencing data is paired-end. + file_index: Specifies the index of the file for paired-end data + (1 or 2). Ignored for single-end data. + + Returns: + pd.DataFrame: A DataFrame containing orientation details. + """ + if paired: + data = { + 'Number of mapped reads': reads, + 'Fraction ISF': fractions_all_states.get( + StatesOrientationRelationship.inward_stranded_forward + ), + 'Fraction ISR': fractions_all_states.get( + StatesOrientationRelationship.inward_stranded_reverse + ), + 'Orientation': getattr( + orientation.file_1 + if file_index == 1 else orientation.file_2, + 'value', + None + ), + 'Relationship': getattr( + orientation.relationship, 'value', None + ) + } + else: + data = { + 'Number of mapped reads': reads, + 'Fraction SF': fractions_all_states.get( + StatesOrientation.stranded_forward + ), + 'Fraction SR': fractions_all_states.get( + StatesOrientation.stranded_reverse + ), + 'Orientation': orientation.value + } + return pd.DataFrame([data]) + + def write_orientation_to_json(self, orient_df, filename): + """Write orientation dataframe to a JSON file. + + Serializes the provided orientation dataframe to a JSON file + with indentation. + + Args: + orient_df: The dataframe containing orientation details. + filename: Name of the file to save the JSON data. + + Returns: + None + """ + file_path = Path(self.out_dir) / f"read_orientation_{filename}.json" + LOGGER.debug(f"Writing results to file: {file_path}") + orient_df.to_json(file_path, orient='split', index=False, indent=True)