Make via snakemake what all the other classes were doing
lauraporta committed Dec 10, 2024
1 parent 76ff8b7 commit 93cfd20
Showing 7 changed files with 54 additions and 314 deletions.
11 changes: 3 additions & 8 deletions README.md
@@ -13,13 +13,8 @@ To extract dataset names
 snakemake --cores 1 setup_output.txt
 ```
 
-
-To run preprocessing with slurm, use the following command for one dataset:
-```bash
-snakemake --executor slurm --jobs 20 --latency-wait 10 preprocess_output_0.txt
-```
-For an array of datasets:
+Run all jobs in the pipeline:
 ```bash
-snakemake --executor slurm --jobs 20 --latency-wait 10 preprocess_output_{0..N}.txt
+snakemake --executor slurm --jobs 20 --latency-wait 10 all
 ```
-Replace N with the number of datasets you have in the `datasets.csv` file.
+Add `-np --printshellcmds` for a dry run with commands printed to the terminal.
21 changes: 0 additions & 21 deletions _datasets.csv

This file was deleted.

130 changes: 0 additions & 130 deletions calcium_imaging_automation/core/reader.py

This file was deleted.

16 changes: 7 additions & 9 deletions calcium_imaging_automation/core/rules/preprocess.py
@@ -4,19 +4,17 @@
 from derotation.derotate_batch import derotate
 from snakemake.script import snakemake
 
-try:
-    # Input arguments
-    read_dataset_path = Path(snakemake.input[0])
-    write_dataset_path = Path(snakemake.input[1])
-    output = snakemake.output[0]
-
-    output_path_dataset = write_dataset_path / "ses-0/funcimg/"
-
+# Input arguments
+read_dataset_path = Path(snakemake.input[0])
+output_tif = Path(snakemake.output[0])
+
+output_path_dataset = output_tif.parent.parent
+try:
     data = derotate(read_dataset_path, output_path_dataset)
     metric_measured = stability_of_most_detected_blob(data)
-    with open(output, "w") as f:
+    with open(output_path_dataset / "metric.txt", "w") as f:
         f.write(f"dataset: {read_dataset_path.stem} metric: {metric_measured}")
 except Exception as e:
     print(e.args)
-    with open(output, "w") as f:
+    with open(output_path_dataset / "error.txt", "w") as f:
         f.write(str(e.args))
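The rewritten script no longer receives a separate output argument; it derives the session folder from the single tif path Snakemake passes in. A minimal sketch of that path arithmetic, using a hypothetical dataset path that follows the layout declared in the Snakefile:

```python
from pathlib import Path

# Hypothetical output target, following the Snakefile's
# sub-{index}_{dataset}/ses-0/funcimg/derotation/ layout.
output_tif = Path(
    "/ceph/margrie/laura/cimaut/derivatives/"
    "sub-0_20240101rot/ses-0/funcimg/derotation/derotated_full.tif"
)

# Two .parent hops climb from derotation/derotated_full.tif up to funcimg/,
# which is where metric.txt (or error.txt on failure) is written.
output_path_dataset = output_tif.parent.parent
print(output_path_dataset)               # .../sub-0_20240101rot/ses-0/funcimg
print(output_path_dataset / "metric.txt")
```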
56 changes: 0 additions & 56 deletions calcium_imaging_automation/core/rules/setup.py

This file was deleted.

58 changes: 0 additions & 58 deletions calcium_imaging_automation/core/writer.py

This file was deleted.

76 changes: 44 additions & 32 deletions workflow/Snakefile
@@ -1,35 +1,47 @@
-rule setup:
-    input:
-        "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/",
-        "/ceph/margrie/laura/cimaut/",
-    params:
-        folder_read_pattern="2*",
-        file_read_pattern=["rotation_00001.tif", "*.bin"],
-    output: "datasets.csv"
-    run:
-        "calcium_imaging_automation/core/rules/setup.py"
+# Base paths
+raw_data_base = "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/"
+processed_data_base = "/ceph/margrie/laura/cimaut/derivatives"
 
-# import pandas as pd
+# Dynamically discover folders matching the "2*" pattern
+datasets = glob_wildcards(f"{raw_data_base}{{dataset}}").dataset
+datasets = [ds for ds in datasets if ds.startswith("2")]
+datasets = [ds.split("/")[0] for ds in datasets]
+datasets = list(set(datasets))
+datasets.sort()
 
-# paths = pd.read_csv("datasets.csv")
+# for the output paths
+datasets_no_underscore = [ds.replace("_", "") for ds in datasets]
 
-# rule all:
-#     input:
-#         expand("preprocess_output_{index}.txt", index=paths["index"])
+# Final state of the pipeline
+# Are all the output files present?
+rule all:
+    input:
+        expand(
+            [
+                f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif",
+                f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv",
+            ],
+            zip,
+            index=range(len(datasets)),
+            datasets_no_underscore=datasets_no_underscore,
+        )
 
-# rule preprocess:
-#     input:
-#         lambda wildcards: paths.loc[int(wildcards.index), "read_dataset_path"],
-#         lambda wildcards: paths.loc[int(wildcards.index), "write_dataset_path"],
-#     output:
-#         "preprocess_output_{index}.txt"
-#     params:
-#         index=lambda wildcards: wildcards.index
-#     resources:
-#         partition="fast",
-#         mem_mb=16000,
-#         cpu_per_task=1,
-#         tasks=1,
-#         nodes=1,
-#     script:
-#         "calcium_imaging_automation/core/rules/preprocess.py"
+rule preprocess:
+    input:
+        raw=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/",
+        # Dynamically match input files using patterns
+        # bin=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/aux_stim/*rotation_*001.bin",
+        # tif=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/imaging/rotation_*001.tif",
+    output:
+        tiff=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif",
+        csv=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv",
+    params:
+        index=lambda wildcards: wildcards.index
+    resources:
+        partition="fast",
+        mem_mb=16000,
+        cpu_per_task=1,
+        tasks=1,
+        nodes=1,
+    script:
+        "../calcium_imaging_automation/core/rules/preprocess.py"
