make fasta attribute into FastaFile instead of simply the path and ad…

…d genome name
aertslab · Dec 2, 2024 · edab6dd · edab6dd
1 parent 30fe270
commit edab6dd
Show file tree

Hide file tree

Showing 6 changed files with 57 additions and 24 deletions.
diff --git a/src/crested/_conf.py b/src/crested/_conf.py
@@ -1,5 +1,3 @@
 """Persistent configuration for the crested package."""
 
-from crested._genome import Genome
-
-genome: Genome = None
+genome = None  # persistent genome object
diff --git a/src/crested/_genome.py b/src/crested/_genome.py
@@ -6,7 +6,10 @@
 import os
 from pathlib import Path
 
-from crested import _conf as conf
+from loguru import logger
+from pysam import FastaFile
+
+import crested._conf as conf
 
 
 class Genome:
@@ -24,21 +27,29 @@ class Genome:
         If not provided, the chromosome sizes will be inferred from the FASTA file.
     annotation
         The path to the annotation file.
+    name
+        Optional name of the genome.
 
     Examples
     --------
     >>> genome = Genome(
     ...     fasta="tests/data/test.fa",
     ...     chrom_sizes="tests/data/test.chrom.sizes",
     ... )
+    >>> print(genome.fasta)
+    <pysam.libcfaidx.FastaFile at 0x7f4d8b4a8f40>
+    >>> print(genome.chrom_sizes)
+    {'chr1': 1000, 'chr2': 2000}
+    >>> print(genome.name)
+    test
     """
 
     def __init__(
         self,
-        *,
         fasta: Path,
         chrom_sizes: dict[str, int] | Path | None = None,
         annotation: Path | None = None,
+        name: str | None = None,
     ):
         """Initialize the Genome object."""
         if isinstance(fasta, (Path, str)):
@@ -71,22 +82,26 @@ def __init__(
         else:
             self._annotation = None
 
+        self._name = name
+
     @property
-    def fasta(self) -> Path:
+    def fasta(self) -> FastaFile:
         """
-        The Path to the FASTA file.
+        The pysam FastaFile object for the FASTA file.
 
         Returns
         -------
-        The path to the FASTA file.
+        The pysam FastaFile object.
         """
-        return self._fasta
+        return FastaFile(self._fasta)
 
     @property
     def annotation(self) -> Path | None:
         """
         The Path to the annotation file.
 
+        Currently not used in the package.
+
         Returns
         -------
         The path to the annotation file.
@@ -103,10 +118,7 @@ def chrom_sizes(self) -> dict[str, int]:
         A dictionary of chromosome sizes.
         """
         if self._chrom_sizes is None:
-            from pysam import FastaFile
-
-            fasta = FastaFile(self.fasta)
-            self._chrom_sizes = dict(zip(fasta.references, fasta.lengths))
+            self._chrom_sizes = dict(zip(self.fasta.references, self.fasta.lengths))
         elif isinstance(self._chrom_sizes, Path):
             from crested._io import _read_chromsizes
 
@@ -115,6 +127,25 @@ def chrom_sizes(self) -> dict[str, int]:
             raise ValueError("chrom_sizes must be a dictionary or a Path.")
         return self._chrom_sizes
 
+    @property
+    def name(self) -> str:
+        """
+        The name of the genome.
+
+        Returns
+        -------
+        The name of the genome.
+        """
+        if self._name is None:
+            filename = self.fasta.filename.decode("utf-8")
+            basename = os.path.basename(filename)
+
+            if basename.endswith(".fa") or basename.endswith(".fasta"):
+                return os.path.splitext(basename)[0]  # Remove the extension and return
+            else:
+                return basename
+        return self._name
+
 
 def register_genome(genome: Genome):
     """
@@ -130,19 +161,21 @@ def register_genome(genome: Genome):
     Examples
     --------
     >>> genome = Genome(
-    ...     fasta="tests/data/test.fasta",
+    ...     fasta="tests/data/hg38.fa",
     ...     chrom_sizes="tests/data/test.chrom.sizes",
     ... )
     >>> register_genome(genome)
+    INFO Genome hg38 registered.
     """
     if not isinstance(genome, Genome):
         raise TypeError("genome must be an instance of crested.Genome")
     conf.genome = genome
+    logger.info(f"Genome {genome.name} registered.")
 
 
 def _resolve_genome(
     genome: os.PathLike | Genome | None,
-    chromsizes_file: os.PathLike | None,
+    chromsizes_file: os.PathLike | None = None,
     annotation: os.PathLike | None = None,
 ) -> Genome:
     """Resolve the input to a Genome object. Required to keep backwards compatibility with fasta and chromsizes paths as inputs."""

diff --git a/src/crested/tl/_crested.py b/src/crested/tl/_crested.py
@@ -798,7 +798,7 @@ def score_gene_locus(
         idx = all_class_names.index(class_name)
 
         if genome is None:
-            genome = FastaFile(self.anndatamodule.genome.fasta)
+            genome = self.anndatamodule.genome.fasta
 
         # Generate all windows and one-hot encode the sequences in parallel
         all_sequences = []

diff --git a/src/crested/tl/data/_anndatamodule.py b/src/crested/tl/data/_anndatamodule.py
@@ -39,7 +39,7 @@ class AnnDataModule:
         Instance of Genome or Path to the fasta file.
         If None, will look for a registered genome object.
     chromsizes_file
-        Path to the chromsizes file.
+        Path to the chromsizes file. Not required if genome is a Genome object.
         If genome is a path and chromsizes is not provided, will deduce the chromsizes from the fasta file.
     in_memory
         If True, the train and val sequences will be loaded into memory. Default is True.

diff --git a/src/crested/tl/data/_dataset.py b/src/crested/tl/data/_dataset.py
@@ -8,7 +8,6 @@
 import numpy as np
 from anndata import AnnData
 from loguru import logger
-from pysam import FastaFile
 from scipy.sparse import spmatrix
 from tqdm import tqdm
 
@@ -84,7 +83,7 @@ def __init__(
         regions: list[str] | None = None,
     ):
         """Initialize the SequenceLoader with the provided genome file and options."""
-        self.genome = FastaFile(genome.fasta)
+        self.genome = genome.fasta
         self.chromsizes = genome.chrom_sizes
         self.in_memory = in_memory
         self.always_reverse_complement = always_reverse_complement

diff --git a/src/crested/utils/_utils.py b/src/crested/utils/_utils.py
@@ -5,9 +5,9 @@
 
 import numpy as np
 import pandas as pd
-import pysam
 from loguru import logger
 
+from crested._genome import Genome, _resolve_genome
 from crested._io import _extract_tracks_from_bigwig
 
 
@@ -317,7 +317,9 @@ def extract_bigwig_values_per_bp(
 
 
 def fetch_sequences(
-    regions: str | list[str], genome: os.PathLike, uppercase: bool = True
+    regions: str | list[str],
+    genome: os.PathLike | Genome | None = None,
+    uppercase: bool = True,
 ) -> list[str]:
     """
     Fetch sequences from a genome file for a list of regions using pysam.
@@ -329,7 +331,8 @@ def fetch_sequences(
     regions
         List of regions to fetch sequences for.
     genome
-        Path to the genome file.
+        Path to the genome fasta or Genome instance or None.
+        If None, will look for a registered genome object.
     uppercase
         If True, return sequences in uppercase.
 
@@ -344,12 +347,12 @@ def fetch_sequences(
     """
     if isinstance(regions, str):
         regions = [regions]
-    fasta = pysam.FastaFile(genome)
+    genome = _resolve_genome(genome)
     seqs = []
     for region in regions:
         chrom, start_end = region.split(":")
         start, end = start_end.split("-")
-        seq = fasta.fetch(chrom, int(start), int(end))
+        seq = genome.fasta.fetch(chrom, int(start), int(end))
         if uppercase:
             seq = seq.upper()
         seqs.append(seq)