Commit 0f59b15: teenager

gordonkoehn committed Nov 18, 2024 · 1 parent 1d2ba0d
Showing 4 changed files with 116 additions and 16 deletions.
5 changes: 4 additions & 1 deletion environment.yml
```diff
@@ -6,4 +6,7 @@ channels:
 dependencies:
   - python=3.10
   - nextclade
-  - poetry=1.8.3
+  - poetry
+  - pydantic
+  - click
+  - schedule
```
55 changes: 55 additions & 0 deletions scripts/README.md
(new file)
# sr2silo Scripts

This directory contains scripts for processing and managing sample data. Below is an explanation of the two main scripts in this directory.

## vp_deamon.py

`vp_deamon.py` is a daemon script that processes new samples from the timeline file and stores the processed samples in the result directory. It performs the following tasks:

1. **Load Configuration**: Loads configuration settings from a JSON file using Pydantic for validation.
2. **Initialize Database**: Initializes a SQLite database to keep track of processed samples.
3. **Process New Samples**: Reads the timeline file to identify new samples and processes them using the `vp_transformer` module.
4. **Backup Database**: Backs up the database daily to a specified backup directory.
5. **Schedule Tasks**: Uses the `schedule` library to run the sample processing and database backup tasks at specified intervals (see the sketch after this list).
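
The overall loop looks roughly like the sketch below. This is illustrative only — the job bodies and the backup cadence details are placeholders, not the exact code in `vp_deamon.py`:

```python
import time

import schedule


def process_new_samples():
    """Placeholder for the real sample-processing job."""
    print("Checking the timeline file for new samples...")


def backup_database():
    """Placeholder for the real daily database backup."""
    print("Backing up the database...")


# Process new samples every minute; back up the database once a day.
schedule.every(1).minutes.do(process_new_samples)
schedule.every().day.do(backup_database)

while True:
    schedule.run_pending()
    time.sleep(10)
```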

### Usage

To run the daemon script, execute the following command:

```sh
python vp_deamon.py --config scripts/vp_config.json
```
Ensure that the configuration file `vp_config.json` is present in the `scripts` directory with the necessary settings.
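
The processed-sample tracking mentioned in step 2 above is a small SQLite table. A minimal sketch of the idea — the table and column names here are assumptions; the real schema lives in `vp_deamon.py`:

```python
import sqlite3
from pathlib import Path


def initialize_database(database_file: Path = Path("processed_files.db")) -> None:
    """Create the tracking table if it does not exist yet (assumed schema)."""
    with sqlite3.connect(database_file) as conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS processed_files ("
            "sample_id TEXT, batch_id TEXT, PRIMARY KEY (sample_id, batch_id))"
        )


def mark_sample_as_processed(database_file: Path, sample_id: str, batch_id: str) -> None:
    """Record a sample/batch pair so it is skipped on later runs."""
    with sqlite3.connect(database_file) as conn:
        conn.execute(
            "INSERT OR IGNORE INTO processed_files VALUES (?, ?)",
            (sample_id, batch_id),
        )
```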

## vp_transformer.py

`vp_transformer.py` contains the core processing logic for transforming sample data. It is used by `vp_deamon.py` to process new samples.

### Usage

This script is typically not run directly. Instead, it is imported and used by `vp_deamon.py`, roughly as in the sketch below.
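
The call shape follows `process_directory` as invoked in `vp_deamon.py`; the concrete paths here are made up for illustration:

```python
from pathlib import Path

from vp_transformer import process_directory

# Hypothetical sample/batch layout — substitute your own locations.
file_path = Path("samples/sample_A/batch_1")
output_dir = Path("results/sample_A/batch_1")
output_dir.mkdir(parents=True, exist_ok=True)

process_directory(file_path, output_dir, "sars-cov-2", Path("timeline.tsv"))
```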

## Legacy Notice

The core processing logic in these scripts is based on the dgivec scripts, which were the foundation of this package. These scripts are retained here for legacy reasons and to ensure compatibility with existing workflows.

## Configuration

The configuration file `vp_config.json` should have the following structure (example values shown):

```json
{
  "sample_dir": "path/to/samples/",
  "timeline_file": "path/to/timeline.tsv",
  "result_dir": "path/to/results/",
  "nextclade_reference": "sars-cov-2",
  "database_file": "processed_files.db",
  "backup_dir": "backups/",
  "deamon_interval_m": 1
}
```
- `sample_dir`: The directory where the samples are stored.
- `timeline_file`: The path to the timeline file.
- `result_dir`: The directory where the results are stored.
- `nextclade_reference`: The reference to use for nextclade.
- `database_file`: The path to the database file.
- `backup_dir`: The directory where the backups are stored.
- `deamon_interval_m`: The interval in minutes to run the daemon.

Ensure that all paths are correctly set in the configuration file before running the scripts.
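
Because `vp_deamon.py` validates this file with Pydantic, a malformed or incomplete config fails fast at startup. A quick manual check, assuming `vp_deamon.py` is importable (e.g. run from the `scripts` directory):

```python
from pathlib import Path

from vp_deamon import load_config

config = load_config(Path("scripts/vp_config.json"))  # raises on an invalid config
print(config.nextclade_reference, config.deamon_interval_m)
```
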
3 changes: 2 additions & 1 deletion scripts/vp_config.json
```diff
@@ -4,5 +4,6 @@
   "timeline_file": "../../../data/sr2silo/deamon_test/timeline.tsv",
   "nextclade_reference": "sars-cov-2",
   "database_file": "processed_files.db",
-  "backup_dir": "backups/"
+  "backup_dir": "backups/",
+  "deamon_interval_m": 1
 }
```
69 changes: 55 additions & 14 deletions scripts/vp_deamon.py
```diff
@@ -15,6 +15,7 @@

 import click
 import schedule
+from pydantic import BaseModel, Field, ValidationError

 from vp_transformer import process_directory  # noqa: F401 # isort:skip

@@ -89,15 +90,47 @@ def process_new_samples(
         output_dir = result_dir / sample_id / batch_id
         output_dir.mkdir(parents=True, exist_ok=True)
         try:
-            process_directory(file_path, output_dir, nextclade_reference, timeline_file)
+            process_directory(
+                file_path, output_dir, nextclade_reference, timeline_file
+            )
             mark_sample_as_processed(database_file, sample_id, batch_id)
         except Exception as e:
-            logging.error(f"Error processing sample {sample_id}, batch {batch_id}: {e}")
+            logging.error(
+                f"Error processing sample {sample_id}, batch {batch_id}: {e}"
+            )


-def load_config(config_file: Path) -> dict:
+class Config(BaseModel):
+    """Configuration for the sr2silo daemon.
+    Args:
+        sample_dir (str): The directory where the samples are stored.
+        timeline_file (str): The path to the timeline file.
+        result_dir (str): The directory where the results are stored.
+        nextclade_reference (str): The reference to use for nextclade.
+        database_file (str): The path to the database file.
+        backup_dir (str): The directory where the backups are stored.
+        deamon_interval_m (int): The interval in minutes to run the daemon.
+    """
+
+    sample_dir: str
+    timeline_file: str
+    result_dir: str
+    nextclade_reference: str
+    database_file: str
+    backup_dir: str
+    deamon_interval_m: int
+
+
+def load_config(config_file: Path) -> Config:
     with config_file.open() as f:
-        return json.load(f)
+        config_data = json.load(f)
+    try:
+        return Config(**config_data)
+    except ValidationError as e:
+        logging.error(f"Configuration validation error: {e}")
+        raise


@@ -108,23 +141,31 @@ def backup_database(database_file: Path, backup_dir: Path):
     logging.info(f"Database backed up to: {backup_file}")


-def main():
+@click.command()
+@click.option(
+    "--config",
+    type=click.Path(exists=True),
+    help="Path to the configuration file.",
+    default="scripts/vp_config.json",
+)
+def main(config):
     # Load the configuration
     logging.info("Loading configuration...")
-    config = load_config(Path("scripts/vp_config.json"))
+    config = load_config(Path(config))

-    sample_dir = Path(config["sample_dir"])
-    timeline_file = Path(config["timeline_file"])
-    result_dir = Path(config["result_dir"])
-    nextclade_reference = config["nextclade_reference"]
-    database_file = Path(config["database_file"])
-    backup_dir = Path(config["backup_dir"])
+    sample_dir = Path(config.sample_dir)
+    timeline_file = Path(config.timeline_file)
+    result_dir = Path(config.result_dir)
+    nextclade_reference = config.nextclade_reference
+    database_file = Path(config.database_file)
+    backup_dir = Path(config.backup_dir)
+    deamon_interval_m = config.deamon_interval_m

     logging.info("Initializing database...")
     initialize_database()

     logging.info("Scheduling the sample processing job...")
-    schedule.every(1).minutes.do(
+    schedule.every(deamon_interval_m).minutes.do(
         process_new_samples,
         database_file,
         sample_dir,
@@ -140,7 +181,7 @@ def main():
     while True:
         schedule.run_pending()
         logging.debug("Waiting for the next scheduled task...")
-        time.sleep(10)
+        time.sleep(deamon_interval_m * 60)


 if __name__ == "__main__":
```
