Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add tax_id parameter #147

Merged
merged 20 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,4 @@ tests/.DS_Store
results_htsinfer
.snakemake
tests/cluster_tests/results_sra_downloads
*.out
uniqueg marked this conversation as resolved.
Show resolved Hide resolved
1 change: 1 addition & 0 deletions README.md
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ htsinfer [--output-directory PATH]
[--library-type-mates-cutoff FLOAT]
[--read-orientation-min-mapped-reads INT]
[--read-orientation-min-fraction FLOAT]
[--org-id INT]
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
[--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}]
[-h] [--version]
PATH [PATH]
Expand Down
11 changes: 11 additions & 0 deletions htsinfer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,17 @@ def __call__(
"be reported. Must be above 0.5"
)
)
parser.add_argument(
'--org-id',
dest="org_id",
metavar="INT",
type=int,
default=None,
help=(
"source organism of the sequencing library; if provided, will not "
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
"not be inferred by the application"
)
)
parser.add_argument(
"--verbosity",
choices=[e.name for e in LogLevels],
Expand Down
105 changes: 97 additions & 8 deletions htsinfer/get_library_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import subprocess as sp
import tempfile

from typing import Optional
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
from Bio import SeqIO # type: ignore
import pandas as pd # type: ignore
from pandas import DataFrame # type: ignore

Expand Down Expand Up @@ -50,6 +52,7 @@ class GetLibSource:
min_freq_ratio: Minimum frequency ratio between the first and second
most frequent source in order for the former to be considered the
library's source.
org_id: Taxonomy ID of the organism.
"""
def __init__( # pylint: disable=E1101
self,
Expand All @@ -63,6 +66,7 @@ def __init__( # pylint: disable=E1101
self.tmp_dir = config.args.tmp_dir
self.min_match_pct = config.args.lib_source_min_match_pct
self.min_freq_ratio = config.args.lib_source_min_freq_ratio
self.org_id = config.args.org_id

def evaluate(self) -> ResultsSource:
"""Infer read source.
Expand All @@ -71,16 +75,59 @@ def evaluate(self) -> ResultsSource:
Source results object.
"""
source = ResultsSource()
index = self.create_kallisto_index()
source.file_1 = self.get_source(
fastq=self.paths[0],
index=index,
)
if self.paths[1] is not None:
source.file_2 = self.get_source(
fastq=self.paths[1],
# Check if library_source is provided, otherwise infer it
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
if self.org_id is not None:
source.file_1.taxon_id = self.org_id
org_name = self.get_organism_name(
self.org_id,
self.transcripts_file
)

if org_name is not None:
source.file_1.short_name = org_name

if self.paths[1] is not None:
source.file_2.taxon_id = self.org_id
source.file_2.short_name = source.file_1.short_name

else:
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
LOGGER.warning(
f"Taxon ID '{self.org_id}' not found in "
"organism dictionary, inferring source organism..."
)
index = self.create_kallisto_index()
library_source = self.get_source(
fastq=self.paths[0],
index=index,
)
source.file_1.short_name = library_source.short_name
source.file_1.taxon_id = library_source.taxon_id

if self.paths[1] is not None:
library_source = self.get_source(
fastq=self.paths[1],
index=index,
)
source.file_2.short_name = library_source.short_name
source.file_2.taxon_id = library_source.taxon_id

else:
index = self.create_kallisto_index()
library_source = self.get_source(
fastq=self.paths[0],
index=index,
)
source.file_1.short_name = library_source.short_name
source.file_1.taxon_id = library_source.taxon_id

if self.paths[1] is not None:
library_source = self.get_source(
fastq=self.paths[1],
index=index,
)
source.file_2.short_name = library_source.short_name
source.file_2.taxon_id = library_source.taxon_id

return source

def create_kallisto_index(self) -> Path:
Expand Down Expand Up @@ -281,3 +328,45 @@ def get_source_expression(

# return as dictionary
return dat_agg.sort_values(["tpm"], ascending=False)

@staticmethod
def get_organism_name(
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
taxon_id: int,
transcripts_file: Path,
) -> Optional[str]:
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
"""Return name of the organism, based on tax ID.

Args:
taxon_id: Taxonomy ID of a given organism (int).
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
transcripts_file: Path to fasta file containing transcripts.
balajtimate marked this conversation as resolved.
Show resolved Hide resolved

Returns:
Short name of the organism belonging to the given tax ID.

Raises:
Could not process input FASTA file.
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
"""
org_dict = {}
# Construct dictionary of organism ID's and names
try:
for record in SeqIO.parse(
handle=transcripts_file,
format='fasta',
):
org_id = int(record.description.split("|")[4])
org_name = record.description.split("|")[3]

if org_id not in org_dict:
org_dict[org_id] = org_name
else:
org_dict[org_id] = org_name
balajtimate marked this conversation as resolved.
Show resolved Hide resolved

except OSError as exc:
raise FileProblem(
f"Could not process file '{transcripts_file}'"
) from exc

if taxon_id in org_dict:
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
return org_dict[taxon_id]

return None
2 changes: 1 addition & 1 deletion htsinfer/get_read_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def evaluate(self) -> None:
try:
with open(self.path, encoding="utf-8") as _f: # type: ignore

LOGGER.debug("Procecssing Reads")
LOGGER.debug("Processing Reads")
try:
for record in FastqGeneralIterator(source=_f):
read = record[1]
Expand Down
12 changes: 6 additions & 6 deletions htsinfer/htsinfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,15 @@ def evaluate(self):
self.get_library_stats()
LOGGER.info(
"Library stats determined: "
f"{self.config.results.library_stats.json()}"
f"{self.config.results.library_stats.model_dump_json()}"
balajtimate marked this conversation as resolved.
Show resolved Hide resolved
)

# determine library source
LOGGER.info("Determining library source...")
self.config.results.library_source = self.get_library_source()
LOGGER.info(
"Library source determined: "
f"{self.config.results.library_source.json()}"
f"{self.config.results.library_source.model_dump_json()}"
)

# determine library type
Expand All @@ -106,7 +106,7 @@ def evaluate(self):
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Library type determined: "
f"{self.config.results.library_type.json()}"
f"{self.config.results.library_type.model_dump_json()}"
)

# determine read orientation
Expand All @@ -119,7 +119,7 @@ def evaluate(self):
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Read orientation determined: "
f"{self.config.results.read_orientation.json()}"
f"{self.config.results.read_orientation.model_dump_json()}"
)

# determine read layout
Expand All @@ -132,7 +132,7 @@ def evaluate(self):
LOGGER.warning(f"{type(exc).__name__}: {str(exc)}")
LOGGER.info(
"Read layout determined: "
f"{self.config.results.read_layout.json()}"
f"{self.config.results.read_layout.model_dump_json()}"
)

except FileProblem as exc:
Expand All @@ -148,7 +148,7 @@ def evaluate(self):
LOGGER.error(f"{type(exc).__name__}: {str(exc)}")

# log results
LOGGER.info(f"Results: {self.config.results.json()}")
LOGGER.info(f"Results: {self.config.results.model_dump_json()}")

def prepare_env(self):
"""Set up work environment."""
Expand Down
2 changes: 2 additions & 0 deletions htsinfer/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ class Args(BaseModel):
records: Number of input file records to process; set to `0` to
process all records.
threads: Number of threads to run STAR with.
org_id: Organism ID.
transcripts_file: File path to transcripts FASTA file.
read_layout_adapter_file: Path to text file containing 3' adapter
sequences to scan for (one sequence per line).
Expand Down Expand Up @@ -429,6 +430,7 @@ class Args(BaseModel):
CleanupRegimes.DEFAULT
records: int = 0
threads: int = 1
org_id: Optional[int] = None
transcripts_file: Path = Path()
read_layout_adapter_file: Path = Path()
read_layout_min_match_pct: float = 0.1
Expand Down
4 changes: 2 additions & 2 deletions pylint.cfg
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[MESSAGES CONTROL]
disable=C0330,I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
extension-pkg-white-list=pysam,ahocorasick
disable=I1101,R0801,R0902,R0903,R0913,R0914,W1202,W1203,W1510
extension-pkg-whitelist=pysam,ahocorasick
ignored-classes=pysam
Loading
Loading