zavolanlab · balajtimate · Nov 15, 2023 · Sep 8, 2023 · Sep 14, 2023 · Oct 31, 2023
diff --git a/.gitignore b/.gitignore
@@ -117,3 +117,4 @@ tests/.DS_Store
 results_htsinfer
 .snakemake
 tests/cluster_tests/results_sra_downloads
+*.out
diff --git a/README.md b/README.md
@@ -109,6 +109,7 @@ htsinfer [--output-directory PATH]
          [--library-type-mates-cutoff FLOAT]
          [--read-orientation-min-mapped-reads INT]
          [--read-orientation-min-fraction FLOAT]
+         [--org-id INT]
          [--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}]
          [-h] [--version]
          PATH [PATH]

diff --git a/htsinfer/cli.py b/htsinfer/cli.py
@@ -262,6 +262,17 @@ def __call__(
             "be reported. Must be above 0.5"
         )
     )
+    parser.add_argument(
+        '--org-id',
+        dest="org_id",
+        metavar="INT",
+        type=int,
+        default=None,
+        help=(
+            "source organism of the sequencing library; if provided, will not "
+            "not be inferred by the application"
+        )
+    )
     parser.add_argument(
         "--verbosity",
         choices=[e.name for e in LogLevels],

diff --git a/htsinfer/get_library_source.py b/htsinfer/get_library_source.py
@@ -5,6 +5,8 @@
 import subprocess as sp
 import tempfile
 
+from typing import Optional
+from Bio import SeqIO  # type: ignore
 import pandas as pd  # type: ignore
 from pandas import DataFrame  # type: ignore
 
@@ -50,6 +52,7 @@ class GetLibSource:
         min_freq_ratio: Minimum frequency ratio between the first and second
             most frequent source in order for the former to be considered the
             library's source.
+        org_id: Taxonomy ID of the organism.
     """
     def __init__(  # pylint: disable=E1101
         self,
@@ -63,6 +66,7 @@ def __init__(  # pylint: disable=E1101
         self.tmp_dir = config.args.tmp_dir
         self.min_match_pct = config.args.lib_source_min_match_pct
         self.min_freq_ratio = config.args.lib_source_min_freq_ratio
+        self.org_id = config.args.org_id
 
     def evaluate(self) -> ResultsSource:
         """Infer read source.
@@ -71,16 +75,59 @@ def evaluate(self) -> ResultsSource:
             Source results object.
         """
         source = ResultsSource()
-        index = self.create_kallisto_index()
-        source.file_1 = self.get_source(
-            fastq=self.paths[0],
-            index=index,
-        )
-        if self.paths[1] is not None:
-            source.file_2 = self.get_source(
-                fastq=self.paths[1],
+        # Check if library_source is provided, otherwise infer it
+        if self.org_id is not None:
+            source.file_1.taxon_id = self.org_id
+            org_name = self.get_organism_name(
+                self.org_id,
+                self.transcripts_file
+            )
+
+            if org_name is not None:
+                source.file_1.short_name = org_name
+
+                if self.paths[1] is not None:
+                    source.file_2.taxon_id = self.org_id
+                    source.file_2.short_name = source.file_1.short_name
+
+            else:
+                LOGGER.warning(
+                    f"Taxon ID '{self.org_id}' not found in "
+                    "organism dictionary, inferring source organism..."
+                )
+                index = self.create_kallisto_index()
+                library_source = self.get_source(
+                    fastq=self.paths[0],
+                    index=index,
+                )
+                source.file_1.short_name = library_source.short_name
+                source.file_1.taxon_id = library_source.taxon_id
+
+                if self.paths[1] is not None:
+                    library_source = self.get_source(
+                        fastq=self.paths[1],
+                        index=index,
+                    )
+                    source.file_2.short_name = library_source.short_name
+                    source.file_2.taxon_id = library_source.taxon_id
+
+        else:
+            index = self.create_kallisto_index()
+            library_source = self.get_source(
+                fastq=self.paths[0],
                 index=index,
             )
+            source.file_1.short_name = library_source.short_name
+            source.file_1.taxon_id = library_source.taxon_id
+
+            if self.paths[1] is not None:
+                library_source = self.get_source(
+                    fastq=self.paths[1],
+                    index=index,
+                )
+                source.file_2.short_name = library_source.short_name
+                source.file_2.taxon_id = library_source.taxon_id
+
         return source
 
     def create_kallisto_index(self) -> Path:
@@ -281,3 +328,45 @@ def get_source_expression(
 
         # return as dictionary
         return dat_agg.sort_values(["tpm"], ascending=False)
+
+    @staticmethod
+    def get_organism_name(
+        taxon_id: int,
+        transcripts_file: Path,
+    ) -> Optional[str]:
+        """Return name of the organism, based on tax ID.
+
+        Args:
+            taxon_id: Taxonomy ID of a given organism (int).
+            transcripts_file: Path to fasta file containing transcripts.
+
+        Returns:
+            Short name of the organism belonging to the given tax ID.
+
+        Raises:
+            Could not process input FASTA file.
+        """
+        org_dict = {}
+        # Construct dictionary of organism ID's and names
+        try:
+            for record in SeqIO.parse(
+                    handle=transcripts_file,
+                    format='fasta',
+            ):
+                org_id = int(record.description.split("|")[4])
+                org_name = record.description.split("|")[3]
+
+                if org_id not in org_dict:
+                    org_dict[org_id] = org_name
+                else:
+                    org_dict[org_id] = org_name
+
+        except OSError as exc:
+            raise FileProblem(
+                f"Could not process file '{transcripts_file}'"
+            ) from exc
+
+        if taxon_id in org_dict:
+            return org_dict[taxon_id]
+
+        return None
diff --git a/htsinfer/get_read_layout.py b/htsinfer/get_read_layout.py
@@ -221,7 +221,7 @@ def evaluate(self) -> None:
         try:
             with open(self.path, encoding="utf-8") as _f:  # type: ignore
 
-                LOGGER.debug("Procecssing Reads")
+                LOGGER.debug("Processing Reads")
                 try:
                     for record in FastqGeneralIterator(source=_f):
                         read = record[1]

diff --git a/htsinfer/htsinfer.py b/htsinfer/htsinfer.py
@@ -85,15 +85,15 @@ def evaluate(self):
                 self.get_library_stats()
                 LOGGER.info(
                     "Library stats determined: "
-                    f"{self.config.results.library_stats.json()}"
+                    f"{self.config.results.library_stats.model_dump_json()}"
                 )
 
                 # determine library source
                 LOGGER.info("Determining library source...")
                 self.config.results.library_source = self.get_library_source()
                 LOGGER.info(
                     "Library source determined: "
-                    f"{self.config.results.library_source.json()}"
+                    f"{self.config.results.library_source.model_dump_json()}"
                 )
 
                 # determine library type
@@ -106,7 +106,7 @@ def evaluate(self):
                     LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
                 LOGGER.info(
                     "Library type determined: "
-                    f"{self.config.results.library_type.json()}"
+                    f"{self.config.results.library_type.model_dump_json()}"
                 )
 
                 # determine read orientation
@@ -119,7 +119,7 @@ def evaluate(self):
                     LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
                 LOGGER.info(
                     "Read orientation determined: "
-                    f"{self.config.results.read_orientation.json()}"
+                    f"{self.config.results.read_orientation.model_dump_json()}"
                 )
 
                 # determine read layout
@@ -132,7 +132,7 @@ def evaluate(self):
                     LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
                 LOGGER.info(
                     "Read layout determined: "
-                    f"{self.config.results.read_layout.json()}"
+                    f"{self.config.results.read_layout.model_dump_json()}"
                 )
 
             except FileProblem as exc:
@@ -148,7 +148,7 @@ def evaluate(self):
             LOGGER.error(f"{type(exc).__name__}: {str(exc)}")
 
         # log results
-        LOGGER.info(f"Results: {self.config.results.json()}")
+        LOGGER.info(f"Results: {self.config.results.model_dump_json()}")
 
     def prepare_env(self):
         """Set up work environment."""

diff --git a/htsinfer/models.py b/htsinfer/models.py
@@ -356,6 +356,7 @@ class Args(BaseModel):
         records: Number of input file records to process; set to `0` to
             process all records.
         threads: Number of threads to run STAR with.
+        org_id: Organism ID.
         transcripts_file: File path to transcripts FASTA file.
         read_layout_adapter_file: Path to text file containing 3' adapter
             sequences to scan for (one sequence per line).
@@ -429,6 +430,7 @@ class Args(BaseModel):
         CleanupRegimes.DEFAULT
     records: int = 0
     threads: int = 1
+    org_id: Optional[int] = None
     transcripts_file: Path = Path()
     read_layout_adapter_file: Path = Path()
     read_layout_min_match_pct: float = 0.1

diff --git a/pylint.cfg b/pylint.cfg
@@ -1,4 +1,4 @@
 [MESSAGES CONTROL]
-disable=C0330,I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
-extension-pkg-white-list=pysam,ahocorasick
+disable=I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
+extension-pkg-whitelist=pysam,ahocorasick
 ignored-classes=pysam