Commit 0f59b15: teenager

gordonkoehn committed Nov 18, 2024 · 1 parent 1d2ba0d
Showing 4 changed files with 116 additions and 16 deletions.
5 changes: 4 additions & 1 deletion environment.yml
```diff
@@ -6,4 +6,7 @@ channels:
 dependencies:
   - python=3.10
   - nextclade
-  - poetry=1.8.3
+  - poetry
+  - pydantic
+  - click
+  - schedule
```
55 changes: 55 additions & 0 deletions scripts/README.md
(new file)
# sr2silo Scripts

This directory contains scripts for processing and managing sample data. Below is an explanation of the two main scripts in this directory.

## vp_deamon.py

`vp_deamon.py` is a daemon script that processes new samples from the timeline file and stores the processed samples in the result directory. It performs the following tasks:

1. **Load Configuration**: Loads configuration settings from a JSON file using Pydantic for validation.
2. **Initialize Database**: Initializes a SQLite database to keep track of processed samples.
3. **Process New Samples**: Reads the timeline file to identify new samples and processes them using the `vp_transformer` module.
4. **Backup Database**: Backs up the database daily to a specified backup directory.
5. **Schedule Tasks**: Uses the `schedule` library to run the sample processing and database backup tasks at specified intervals (see the sketch after this list).
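
The overall loop looks roughly like the sketch below. This is illustrative only — the job bodies and the backup cadence details are placeholders, not the exact code in `vp_deamon.py`:

```python
import time

import schedule


def process_new_samples():
    """Placeholder for the real sample-processing job."""
    print("Checking the timeline file for new samples...")


def backup_database():
    """Placeholder for the real daily database backup."""
    print("Backing up the database...")


# Process new samples every minute; back up the database once a day.
schedule.every(1).minutes.do(process_new_samples)
schedule.every().day.do(backup_database)

while True:
    schedule.run_pending()
    time.sleep(10)
```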

### Usage

To run the daemon script, execute the following command:

```sh
python vp_deamon.py --config scripts/vp_config.json
```
Ensure that the configuration file `vp_config.json` is present in the `scripts` directory with the necessary settings.
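
The processed-sample tracking mentioned in step 2 above is a small SQLite table. A minimal sketch of the idea — the table and column names here are assumptions; the real schema lives in `vp_deamon.py`:

```python
import sqlite3
from pathlib import Path


def initialize_database(database_file: Path = Path("processed_files.db")) -> None:
    """Create the tracking table if it does not exist yet (assumed schema)."""
    with sqlite3.connect(database_file) as conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS processed_files ("
            "sample_id TEXT, batch_id TEXT, PRIMARY KEY (sample_id, batch_id))"
        )


def mark_sample_as_processed(database_file: Path, sample_id: str, batch_id: str) -> None:
    """Record a sample/batch pair so it is skipped on later runs."""
    with sqlite3.connect(database_file) as conn:
        conn.execute(
            "INSERT OR IGNORE INTO processed_files VALUES (?, ?)",
            (sample_id, batch_id),
        )
```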

## vp_transformer.py

`vp_transformer.py` contains the core processing logic for transforming sample data. It is used by `vp_deamon.py` to process new samples.

### Usage

This script is typically not run directly. Instead, it is imported and used by `vp_deamon.py`, roughly as in the sketch below.
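
The call shape follows `process_directory` as invoked in `vp_deamon.py`; the concrete paths here are made up for illustration:

```python
from pathlib import Path

from vp_transformer import process_directory

# Hypothetical sample/batch layout — substitute your own locations.
file_path = Path("samples/sample_A/batch_1")
output_dir = Path("results/sample_A/batch_1")
output_dir.mkdir(parents=True, exist_ok=True)

process_directory(file_path, output_dir, "sars-cov-2", Path("timeline.tsv"))
```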

## Legacy Notice

The core processing logic in these scripts is based on the dgivec scripts, which were the foundation of this package. These scripts are retained here for legacy reasons and to ensure compatibility with existing workflows.

## Configuration

The configuration file `vp_config.json` should have the following structure (example values shown):

```json
{
  "sample_dir": "path/to/samples/",
  "timeline_file": "path/to/timeline.tsv",
  "result_dir": "path/to/results/",
  "nextclade_reference": "sars-cov-2",
  "database_file": "processed_files.db",
  "backup_dir": "backups/",
  "deamon_interval_m": 1
}
```
- `sample_dir`: The directory where the samples are stored.
- `timeline_file`: The path to the timeline file.
- `result_dir`: The directory where the results are stored.
- `nextclade_reference`: The reference to use for nextclade.
- `database_file`: The path to the database file.
- `backup_dir`: The directory where the backups are stored.
- `deamon_interval_m`: The interval in minutes to run the daemon.

Ensure that all paths are correctly set in the configuration file before running the scripts.
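
Because `vp_deamon.py` validates this file with Pydantic, a malformed or incomplete config fails fast at startup. A quick manual check, assuming `vp_deamon.py` is importable (e.g. run from the `scripts` directory):

```python
from pathlib import Path

from vp_deamon import load_config

config = load_config(Path("scripts/vp_config.json"))  # raises on an invalid config
print(config.nextclade_reference, config.deamon_interval_m)
```
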
3 changes: 2 additions & 1 deletion scripts/vp_config.json
```diff
@@ -4,5 +4,6 @@
   "timeline_file": "../../../data/sr2silo/deamon_test/timeline.tsv",
   "nextclade_reference": "sars-cov-2",
   "database_file": "processed_files.db",
-  "backup_dir": "backups/"
+  "backup_dir": "backups/",
+  "deamon_interval_m": 1
 }
```
69 changes: 55 additions & 14 deletions scripts/vp_deamon.py
```diff
@@ -15,6 +15,7 @@

 import click
 import schedule
+from pydantic import BaseModel, Field, ValidationError

 from vp_transformer import process_directory  # noqa: F401 # isort:skip

@@ -89,15 +90,47 @@ def process_new_samples(
         output_dir = result_dir / sample_id / batch_id
         output_dir.mkdir(parents=True, exist_ok=True)
         try:
-            process_directory(file_path, output_dir, nextclade_reference, timeline_file)
+            process_directory(
+                file_path, output_dir, nextclade_reference, timeline_file
+            )
             mark_sample_as_processed(database_file, sample_id, batch_id)
         except Exception as e:
-            logging.error(f"Error processing sample {sample_id}, batch {batch_id}: {e}")
+            logging.error(
+                f"Error processing sample {sample_id}, batch {batch_id}: {e}"
+            )


-def load_config(config_file: Path) -> dict:
+class Config(BaseModel):
+    """Configuration for the sr2silo daemon.
+    Args:
+        sample_dir (str): The directory where the samples are stored.
+        timeline_file (str): The path to the timeline file.
+        result_dir (str): The directory where the results are stored.
+        nextclade_reference (str): The reference to use for nextclade.
+        database_file (str): The path to the database file.
+        backup_dir (str): The directory where the backups are stored.
+        deamon_interval_m (int): The interval in minutes to run the daemon.
+    """
+
+    sample_dir: str
+    timeline_file: str
+    result_dir: str
+    nextclade_reference: str
+    database_file: str
+    backup_dir: str
+    deamon_interval_m: int
+
+
+def load_config(config_file: Path) -> Config:
     with config_file.open() as f:
-        return json.load(f)
+        config_data = json.load(f)
+    try:
+        return Config(**config_data)
+    except ValidationError as e:
+        logging.error(f"Configuration validation error: {e}")
+        raise


@@ -108,23 +141,31 @@ def backup_database(database_file: Path, backup_dir: Path):
     logging.info(f"Database backed up to: {backup_file}")


-def main():
+@click.command()
+@click.option(
+    "--config",
+    type=click.Path(exists=True),
+    help="Path to the configuration file.",
+    default="scripts/vp_config.json",
+)
+def main(config):
     # Load the configuration
     logging.info("Loading configuration...")
-    config = load_config(Path("scripts/vp_config.json"))
+    config = load_config(Path(config))

-    sample_dir = Path(config["sample_dir"])
-    timeline_file = Path(config["timeline_file"])
-    result_dir = Path(config["result_dir"])
-    nextclade_reference = config["nextclade_reference"]
-    database_file = Path(config["database_file"])
-    backup_dir = Path(config["backup_dir"])
+    sample_dir = Path(config.sample_dir)
+    timeline_file = Path(config.timeline_file)
+    result_dir = Path(config.result_dir)
+    nextclade_reference = config.nextclade_reference
+    database_file = Path(config.database_file)
+    backup_dir = Path(config.backup_dir)
+    deamon_interval_m = config.deamon_interval_m

     logging.info("Initializing database...")
     initialize_database()

     logging.info("Scheduling the sample processing job...")
-    schedule.every(1).minutes.do(
+    schedule.every(deamon_interval_m).minutes.do(
         process_new_samples,
         database_file,
         sample_dir,
@@ -140,7 +181,7 @@ def main():
     while True:
         schedule.run_pending()
         logging.debug("Waiting for the next scheduled task...")
-        time.sleep(10)
+        time.sleep(deamon_interval_m * 60)


 if __name__ == "__main__":
```
