
Commit

Merge branch 'main' into fix/docker_mount
gordonkoehn authored Nov 22, 2024
2 parents de9d9da + 962de78 commit b40aade
Showing 8 changed files with 93 additions and 36 deletions.
2 changes: 0 additions & 2 deletions .pre-commit-config.yaml
@@ -29,8 +29,6 @@ repos:
hooks:
- id: poetry-check
- id: poetry-lock
- id: poetry-export
args: ["-f", "requirements.txt", "-o", "requirements.txt"]
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: 'v0.0.245'
hooks:
1 change: 1 addition & 0 deletions docker-compose.env
@@ -2,5 +2,6 @@ SAMPLE_DIR=./tests/data/samples/A1_05_2024_10_08/20241024_2411515907/alignments
SAMPLE_ID=A1_05_2024_10_08
BATCH_ID=20241024_2411515907
TIMELINE_FILE=./tests/data/samples/timeline_A1_05_2024_10_08.tsv
PRIMER_FILE=./tests/data/samples/primers.yaml
NEXTCLADE_REFERENCE=sars-cov2
RESULTS_DIR=./results
2 changes: 2 additions & 0 deletions docker-compose.yml
@@ -4,6 +4,7 @@ services:
volumes:
- ${SAMPLE_DIR}:/app/sample
- ${TIMELINE_FILE}:/app/timeline.tsv
- ${PRIMER_FILE}:/app/primers.yaml
- ${RESULTS_DIR}:/app/results
- ./scripts/vp_config.json:/app/scripts/vp_config.json
environment:
@@ -12,6 +13,7 @@
- SAMPLE_ID=${SAMPLE_ID}
- BATCH_ID=${BATCH_ID}
- TIMELINE_FILE=${TIMELINE_FILE}
- PRIMER_FILE=${PRIMER_FILE}
- RESULTS_DIR=${RESULTS_DIR}

volumes:
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -9,8 +9,7 @@ packages = [{ include = "sr2silo", from = "src" }]
[tool.poetry.dependencies]
python = "^3.10"
pysam = "^0.22.1"
click = "^8.0.0"
schedule = "^1.2.2"
pyyaml = "^6.0.2"

[tool.poetry.group.dev.dependencies]
pytest = "^7.2.1"
28 changes: 0 additions & 28 deletions requirements.txt

This file was deleted.

46 changes: 42 additions & 4 deletions scripts/vp_transformer.py
@@ -10,6 +10,7 @@
from pathlib import Path

import click
import yaml

from sr2silo.convert import bam_to_sam
from sr2silo.process import pair_normalize_reads
@@ -90,7 +91,7 @@ def convert_to_iso_date(date: str) -> str:
return date_obj.date().isoformat()


def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict:
def get_metadata(sample_id: str, batch_id: str, timeline: Path, primers: Path) -> dict:
"""
Get metadata for a given sample and batch directory.
Cross-references the directory with the timeline file to get the metadata.
@@ -99,6 +100,7 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict:
sample_id (str): The sample ID to use for metadata.
batch_id (str): The batch ID to use for metadata.
timeline (Path): The timeline file to cross-reference the metadata.
primers (Path): The primers file to cross-reference the metadata.
Returns:
dict: A dictionary containing the metadata.
@@ -122,7 +124,10 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict:
# timline has headers:
# sample_id batch_id read_length primer_protocol location_code sampling_date location_name
# get read length, primer protocol, location name
# double checl if location code and location code are the same
# double check if location code and location code are the same
if not timeline.is_file():
logging.error(f"Timeline file not found or is not a file: {timeline}")
raise FileNotFoundError(f"Timeline file not found or is not a file: {timeline}")
with timeline.open() as f:
reader = csv.reader(f, delimiter="\t")
for row in reader:
@@ -162,6 +167,28 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict:
raise ValueError(
f"No matching entry found in timeline for sample_id {metadata['sample_id']} and batch_id {metadata['batch_id']}"
)
# Read the primers yaml to get additional metadata
# find the key with matching primer_protocol and get the "name" value
# as the canonical name of the primer protocol
if not primers.is_file():
logging.error(f"Primers file not found or is not a file: {primers}")
raise FileNotFoundError(f"Primers file not found or is not a file: {primers}")
# Load YAML file
with open(primers, "r") as file:
primers = yaml.safe_load(file)
logging.debug(f"Primers: {primers}")
logging.debug(f" Type of primers: {type(primers)}")
for primer in primers.keys():
if primer == metadata["primer_protocol"]:
logging.info(
f"Enriching metadata with primer data e.g. primer_protocol_name"
)
metadata["primer_protocol_name"] = primers[primer]["name"]
break
else:
raise ValueError(
f"No matching entry found in primers for primer_protocol {metadata['primer_protocol']}"
)
return metadata
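
For orientation, the primers lookup added above boils down to the following minimal sketch (not code from this commit; the helper name is illustrative):

from pathlib import Path

import yaml


def lookup_primer_name(primers_path: Path, primer_protocol: str) -> str:
    """Return the canonical primer name for a protocol key, mirroring get_metadata above."""
    with primers_path.open() as fh:
        primers = yaml.safe_load(fh)
    if primer_protocol not in primers:
        raise ValueError(
            f"No matching entry found in primers for primer_protocol {primer_protocol}"
        )
    return primers[primer_protocol]["name"]


# With the test fixture added in this commit:
# lookup_primer_name(Path("tests/data/samples/primers.yaml"), "v532")
# -> "SARS-CoV-2 ARTIC V5.3.2"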


@@ -172,6 +199,7 @@ def process_directory(
result_dir: Path,
nextclade_reference: str,
timeline_file: Path,
primers_file: Path,
file_name: str = "REF_aln_trim.bam",
) -> None:
"""Process all files in a given directory.
@@ -182,6 +210,7 @@
result_dir (Path): The directory to save the results.
nextclade_reference (str): The reference to use for nextclade.
timeline_file (Path): The timeline file to cross-reference the metadata.
primers_file (Path): The primers file to cross-reference the metadata.
file_name (str): The name of the file to process
Returns:
@@ -202,7 +231,7 @@
raise FileNotFoundError(f"Input file not found: {sample_fp}")

# Get Sample and Batch metadata and write to a file
metadata = get_metadata(sample_id, batch_id, timeline_file)
metadata = get_metadata(sample_id, batch_id, timeline_file, primers_file)
# add nextclade reference to metadata
metadata["nextclade_reference"] = nextclade_reference
metadata_file = result_dir / "metadata.json"
@@ -239,19 +268,27 @@ def process_directory(
@click.option(
"--timeline_file", envvar="TIMELINE_FILE", help="Path to the timeline file."
)
@click.option("--primer_file", envvar="PRIMER_FILE", help="Path to the primers file.")
@click.option(
"--nextclade_reference",
envvar="NEXTCLADE_REFERENCE",
default="sars-cov-2",
help="Nextclade reference.",
)
def main(
sample_dir, sample_id, batch_id, result_dir, timeline_file, nextclade_reference
sample_dir,
sample_id,
batch_id,
result_dir,
timeline_file,
primer_file,
nextclade_reference,
):
"""Process a sample directory."""
logging.info(f"Processing sample directory: {sample_dir}")
logging.info(f"Saving results to: {result_dir}")
logging.info(f"Using timeline file: {timeline_file}")
logging.info(f"Using primers file: {primer_file}")
logging.info(f"Using Nextclade reference: {nextclade_reference}")
logging.info(f"Using sample_id: {sample_id}")
logging.info(f"Using batch_id: {batch_id}")
@@ -262,6 +299,7 @@ def main(
batch_id=batch_id,
result_dir=Path("results"),
timeline_file=Path("timeline.tsv"),
primers_file=Path("primers.yaml"),
nextclade_reference=nextclade_reference,
)
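
A quick way to exercise the new option set end to end is click's test runner, driving the command through the same environment variables that docker-compose.yml passes in (sketch only: the import path is assumed, and the options above the visible hunk are assumed to read SAMPLE_DIR, SAMPLE_ID, BATCH_ID and RESULTS_DIR):

from click.testing import CliRunner

from vp_transformer import main  # assumed import path for scripts/vp_transformer.py

runner = CliRunner()
result = runner.invoke(
    main,
    [],
    env={
        # Variable names as in docker-compose.env above.
        "SAMPLE_DIR": "tests/data/samples/A1_05_2024_10_08/20241024_2411515907/alignments",
        "SAMPLE_ID": "A1_05_2024_10_08",
        "BATCH_ID": "20241024_2411515907",
        "TIMELINE_FILE": "tests/data/samples/timeline_A1_05_2024_10_08.tsv",
        "PRIMER_FILE": "tests/data/samples/primers.yaml",
        "NEXTCLADE_REFERENCE": "sars-cov-2",
        "RESULTS_DIR": "results",
    },
)
print(result.exit_code)

Note that, as merged, main() still hands process_directory the container mount paths (timeline.tsv, primers.yaml, results), so a local invocation like this expects those files in the working directory.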

44 changes: 44 additions & 0 deletions tests/data/samples/primers.yaml
@@ -0,0 +1,44 @@
v532:
name: SARS-CoV-2 ARTIC V5.3.2
alias:
- SARS-CoV-2 ARTIC V5.3.2 NEB Ultra II
- SARS-CoV-2 ARTIC V5.3.2 NexteraXT
inserts_bedfile: references/primers/v532/SARS-CoV-2.insert.bed
primers_bedfile: references/primers/v532/SARS-CoV-2.primer.bed
primers_file: references/primers/v532/SARS-CoV-2.tsv
primers_fasta: references/primers/v532/SARS-CoV-2.primer.fasta
v41:
name: SARS-CoV-2 ARTIC V4.1
alias:
- SARS-CoV-2 ARTIC V4.1 NEB Ultra II
- SARS-CoV-2 ARTIC V4.1 NexteraXT
inserts_bedfile: references/primers/v41/SARS-CoV-2.insert.bed
primers_bedfile: references/primers/v41/SARS-CoV-2.primer.bed
primers_file: references/primers/v41/SARS-CoV-2.tsv
primers_fasta: references/primers/v41/SARS-CoV-2.primer.fasta
v4:
name: SARS-CoV-2 ARTIC V4
alias:
- SARS-CoV-2 ARTIC V4 NEB Ultra II
- SARS-CoV-2 ARTIC V4 NexteraXT
inserts_bedfile: references/primers/v4/SARS-CoV-2.insert.bed
primers_bedfile: references/primers/v4/SARS-CoV-2.primer.bed
primers_file: references/primers/v4/SARS-CoV-2.tsv
primers_fasta: references/primers/v4/ARTIC_v4.fasta
v3:
name: SARS-CoV-2 ARTIC V3
alias:
- ARTIC_NEB
- ARTICV3_NEB
- COVID_ARTIC_V3_NEB
# Not 100% exact, because Illumina introduces additional controls
- Illumina_COVIDSeq
- COVIDSeq_RUO_Custom
# TODO check with Viollier what custom is
# Nimagen: not 100% sure.
# TODO check with GFB and Christian
- EASYSEQ_SARS_COV2_WHOLE_GENOME_NGS
inserts_bedfile: references/primers/v3/nCoV-2019.insert.bed
primers_bedfile: references/primers/v3/nCoV-2019.primer.bed
primers_file: references/primers/v3/nCoV-2019.tsv
primers_fasta: references/primers/v3/ARTIC_v3.fasta
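
The fixture maps short protocol keys (v532, v41, v4, v3) to a canonical name, a list of aliases, and reference files. A small sketch of consuming it (note that get_metadata above matches only the top-level keys; resolving aliases as well is shown purely as an illustration):

import yaml

with open("tests/data/samples/primers.yaml") as fh:
    primers = yaml.safe_load(fh)

# Map both the protocol keys and every alias to the canonical name.
name_by_protocol: dict[str, str] = {}
for key, entry in primers.items():
    name_by_protocol[key] = entry["name"]
    for alias in entry.get("alias", []):
        name_by_protocol[alias] = entry["name"]

print(name_by_protocol["v532"])               # SARS-CoV-2 ARTIC V5.3.2
print(name_by_protocol["Illumina_COVIDSeq"])  # SARS-CoV-2 ARTIC V3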
3 changes: 3 additions & 0 deletions tests/test_scripts.py
@@ -20,6 +20,7 @@ def test_get_metadata():
sample_id="A1_05_2024_10_08",
batch_id="20241024_2411515907",
timeline=Path("tests/data/samples/timeline_A1_05_2024_10_08.tsv"),
primers=Path("tests/data/samples/primers.yaml"),
)

print(metadata)
Expand All @@ -35,6 +36,7 @@ def test_get_metadata():
"read_length": "250",
"primer_protocol": "v532",
"location_name": "Lugano (TI)",
"primer_protocol_name": "SARS-CoV-2 ARTIC V5.3.2",
}

assert metadata == expected_metadata
@@ -49,5 +51,6 @@ def test_process_directory():
Path("tests/output"),
"nextstrain/sars-cov-2/wuhan-hu-1/orfs",
Path("tests/data/samples/timeline_A1_05_2024_10_08.tsv"),
Path("tests/data/samples/primers.yaml"),
)
assert True
