Skip to content

Commit

Permalink
make fasta attribute into FastaFile instead of simply the path and ad…
Browse files Browse the repository at this point in the history
…d genome name
  • Loading branch information
LukasMahieu committed Dec 2, 2024
1 parent 30fe270 commit edab6dd
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 24 deletions.
4 changes: 1 addition & 3 deletions src/crested/_conf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
"""Persistent configuration for the crested package."""

from crested._genome import Genome

genome: Genome = None
genome = None # persistent genome object
57 changes: 45 additions & 12 deletions src/crested/_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
import os
from pathlib import Path

from crested import _conf as conf
from loguru import logger
from pysam import FastaFile

import crested._conf as conf


class Genome:
Expand All @@ -24,21 +27,29 @@ class Genome:
If not provided, the chromosome sizes will be inferred from the FASTA file.
annotation
The path to the annotation file.
name
Optional name of the genome.
Examples
--------
>>> genome = Genome(
... fasta="tests/data/test.fa",
... chrom_sizes="tests/data/test.chrom.sizes",
... )
>>> print(genome.fasta)
<pysam.libcfaidx.FastaFile at 0x7f4d8b4a8f40>
>>> print(genome.chrom_sizes)
{'chr1': 1000, 'chr2': 2000}
>>> print(genome.name)
test
"""

def __init__(
self,
*,
fasta: Path,
chrom_sizes: dict[str, int] | Path | None = None,
annotation: Path | None = None,
name: str | None = None,
):
"""Initialize the Genome object."""
if isinstance(fasta, (Path, str)):
Expand Down Expand Up @@ -71,22 +82,26 @@ def __init__(
else:
self._annotation = None

self._name = name

@property
def fasta(self) -> Path:
def fasta(self) -> FastaFile:
"""
The Path to the FASTA file.
The pysam FastaFile object for the FASTA file.
Returns
-------
The path to the FASTA file.
The pysam FastaFile object.
"""
return self._fasta
return FastaFile(self._fasta)

@property
def annotation(self) -> Path | None:
"""
The Path to the annotation file.
Currently not used in the package.
Returns
-------
The path to the annotation file.
Expand All @@ -103,10 +118,7 @@ def chrom_sizes(self) -> dict[str, int]:
A dictionary of chromosome sizes.
"""
if self._chrom_sizes is None:
from pysam import FastaFile

fasta = FastaFile(self.fasta)
self._chrom_sizes = dict(zip(fasta.references, fasta.lengths))
self._chrom_sizes = dict(zip(self.fasta.references, self.fasta.lengths))
elif isinstance(self._chrom_sizes, Path):
from crested._io import _read_chromsizes

Expand All @@ -115,6 +127,25 @@ def chrom_sizes(self) -> dict[str, int]:
raise ValueError("chrom_sizes must be a dictionary or a Path.")
return self._chrom_sizes

@property
def name(self) -> str:
"""
The name of the genome.
Returns
-------
The name of the genome.
"""
if self._name is None:
filename = self.fasta.filename.decode("utf-8")
basename = os.path.basename(filename)

if basename.endswith(".fa") or basename.endswith(".fasta"):
return os.path.splitext(basename)[0] # Remove the extension and return
else:
return basename
return self._name


def register_genome(genome: Genome):
"""
Expand All @@ -130,19 +161,21 @@ def register_genome(genome: Genome):
Examples
--------
>>> genome = Genome(
... fasta="tests/data/test.fasta",
... fasta="tests/data/hg38.fa",
... chrom_sizes="tests/data/test.chrom.sizes",
... )
>>> register_genome(genome)
INFO Genome hg38 registered.
"""
if not isinstance(genome, Genome):
raise TypeError("genome must be an instance of crested.Genome")
conf.genome = genome
logger.info(f"Genome {genome.name} registered.")


def _resolve_genome(
genome: os.PathLike | Genome | None,
chromsizes_file: os.PathLike | None,
chromsizes_file: os.PathLike | None = None,
annotation: os.PathLike | None = None,
) -> Genome:
"""Resolve the input to a Genome object. Required to keep backwards compatibility with fasta and chromsizes paths as inputs."""
Expand Down
2 changes: 1 addition & 1 deletion src/crested/tl/_crested.py
Original file line number Diff line number Diff line change
Expand Up @@ -798,7 +798,7 @@ def score_gene_locus(
idx = all_class_names.index(class_name)

if genome is None:
genome = FastaFile(self.anndatamodule.genome.fasta)
genome = self.anndatamodule.genome.fasta

# Generate all windows and one-hot encode the sequences in parallel
all_sequences = []
Expand Down
2 changes: 1 addition & 1 deletion src/crested/tl/data/_anndatamodule.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class AnnDataModule:
Instance of Genome or Path to the fasta file.
If None, will look for a registered genome object.
chromsizes_file
Path to the chromsizes file.
Path to the chromsizes file. Not required if genome is a Genome object.
If genome is a path and chromsizes is not provided, will deduce the chromsizes from the fasta file.
in_memory
If True, the train and val sequences will be loaded into memory. Default is True.
Expand Down
3 changes: 1 addition & 2 deletions src/crested/tl/data/_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import numpy as np
from anndata import AnnData
from loguru import logger
from pysam import FastaFile
from scipy.sparse import spmatrix
from tqdm import tqdm

Expand Down Expand Up @@ -84,7 +83,7 @@ def __init__(
regions: list[str] | None = None,
):
"""Initialize the SequenceLoader with the provided genome file and options."""
self.genome = FastaFile(genome.fasta)
self.genome = genome.fasta
self.chromsizes = genome.chrom_sizes
self.in_memory = in_memory
self.always_reverse_complement = always_reverse_complement
Expand Down
13 changes: 8 additions & 5 deletions src/crested/utils/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@

import numpy as np
import pandas as pd
import pysam
from loguru import logger

from crested._genome import Genome, _resolve_genome
from crested._io import _extract_tracks_from_bigwig


Expand Down Expand Up @@ -317,7 +317,9 @@ def extract_bigwig_values_per_bp(


def fetch_sequences(
regions: str | list[str], genome: os.PathLike, uppercase: bool = True
regions: str | list[str],
genome: os.PathLike | Genome | None = None,
uppercase: bool = True,
) -> list[str]:
"""
Fetch sequences from a genome file for a list of regions using pysam.
Expand All @@ -329,7 +331,8 @@ def fetch_sequences(
regions
List of regions to fetch sequences for.
genome
Path to the genome file.
Path to the genome fasta or Genome instance or None.
If None, will look for a registered genome object.
uppercase
If True, return sequences in uppercase.
Expand All @@ -344,12 +347,12 @@ def fetch_sequences(
"""
if isinstance(regions, str):
regions = [regions]
fasta = pysam.FastaFile(genome)
genome = _resolve_genome(genome)
seqs = []
for region in regions:
chrom, start_end = region.split(":")
start, end = start_end.split("-")
seq = fasta.fetch(chrom, int(start), int(end))
seq = genome.fasta.fetch(chrom, int(start), int(end))
if uppercase:
seq = seq.upper()
seqs.append(seq)
Expand Down

0 comments on commit edab6dd

Please sign in to comment.