From 93cfd206e4d681d90b0a2c65075e0fce12fcc4d3 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 10 Dec 2024 15:50:27 +0000 Subject: [PATCH] Make via snakemake what all the other classes were doing --- README.md | 11 +- _datasets.csv | 21 --- calcium_imaging_automation/core/reader.py | 130 ------------------ .../core/rules/preprocess.py | 16 +-- .../core/rules/setup.py | 56 -------- calcium_imaging_automation/core/writer.py | 58 -------- workflow/Snakefile | 76 +++++----- 7 files changed, 54 insertions(+), 314 deletions(-) delete mode 100644 _datasets.csv delete mode 100644 calcium_imaging_automation/core/reader.py delete mode 100644 calcium_imaging_automation/core/rules/setup.py delete mode 100644 calcium_imaging_automation/core/writer.py diff --git a/README.md b/README.md index 128e7cc..4c81b3d 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,8 @@ To extract dataset names snakemake --cores 1 setup_output.txt ``` - -To run preprocessing with slurm, use the following command for one dataset: -```bash -snakemake --executor slurm --jobs 20 --latency-wait 10 preprocess_output_0.txt -``` -For an array of datasets: +Run all jobs in the pipeline: ```bash -snakemake --executor slurm --jobs 20 --latency-wait 10 preprocess_output_{0..N}.txt +snakemake --executor slurm --jobs 20 --latency-wait 10 all ``` -Replace N with the number of datasets you have in the `datasets.csv` file. +Add `-np --printshellcmds` for a dry run with commands printed to the terminal. diff --git a/_datasets.csv b/_datasets.csv deleted file mode 100644 index c9c0036..0000000 --- a/_datasets.csv +++ /dev/null @@ -1,21 +0,0 @@ -index,read_dataset_path,write_dataset_path -0,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230804_CAA_1119917,/ceph/margrie/laura/cimaut/derivatives/sub-0_230804_CAA_1119917 -1,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230818_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-1_230818_CAA_1120210 -2,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230803_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-2_230803_CAA_1119915 -3,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230801_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-3_230801_CAA_1120181 -4,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230802_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-4_230802_CAA_1120182 -5,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230803_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-5_230803_CAA_1120181 -6,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230822_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-6_230822_CAA_1120509 -7,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230823_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-7_230823_CAA_1120181 -8,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230824_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-8_230824_CAA_1119915 -9,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230825_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-9_230825_CAA_1120182 -10,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230905_CAA_1119917,/ceph/margrie/laura/cimaut/derivatives/sub-10_230905_CAA_1119917 
-11,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230907_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-11_230907_CAA_1120210 -12,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230907_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-12_230907_CAA_1120509 -13,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230912_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-13_230912_CAA_1119915 -14,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230912_CAA_1120051,/ceph/margrie/laura/cimaut/derivatives/sub-14_230912_CAA_1120051 -15,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230913_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-15_230913_CAA_1120182 -16,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230913_CAA_1120395,/ceph/margrie/laura/cimaut/derivatives/sub-16_230913_CAA_1120395 -17,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230914_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-17_230914_CAA_1120181 -18,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230914_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-18_230914_CAA_1120210 -19,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230915_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-19_230915_CAA_1120509 diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py deleted file mode 100644 index f0fcb3c..0000000 --- a/calcium_imaging_automation/core/reader.py +++ /dev/null @@ -1,130 +0,0 @@ -from pathlib import Path -from typing import List - - -class ReadAquiredData: - def __init__( - self, - raw_data_folder: Path, - folder_read_pattern: str, - file_read_pattern: List[str], - ): - """ - Class to handle filepaths and dataset names in the raw data folder. - It can load folders and files based on the provided patterns, allowing - flexibility in the data structure of origin. - It also provides the maximum number of sessions for each dataset based - on the total number of files found in the dataset folders, by default - it searches for tif files. - - Parameters - ---------- - raw_data_folder : Path - The path to the raw data folder. - folder_read_pattern : str - The pattern to search for folders in the raw data folder. It - corresponds to the naming convention of the datasets. - file_read_pattern : List[str] - The patterns to search for files in the dataset folders. It - corresponds to the naming convention of the files in the dataset - folders. - """ - self.folder_read_pattern = folder_read_pattern - self.file_read_pattern = file_read_pattern - - self.datasets_paths = self.get_folders_first_layer(raw_data_folder) - self.dataset_names = [ - dataset_path.name for dataset_path in self.datasets_paths - ] - - def get_folders_first_layer(self, file_path: Path) -> List[Path]: - """ - Get the first layer of folders in the raw data folder. The rest - of the class assumes that the first layer of folders corresponds - to the dataset folders. - - Parameters - ---------- - file_path : Path - The path to the raw data folder. - - Returns - ------- - List[Path] - The list of paths to the dataset folders. 
- """ - return list(file_path.glob(self.folder_read_pattern)) - - def get_files_paths_by_format( - self, folder: Path, filetype="tif" - ) -> List[Path]: - """ - Get the paths to the files in the dataset folders based on the - provided file type. By default, it searches for tif files. - - Parameters - ---------- - folder : Path - The path to the dataset folder. - filetype : str, optional - The file type to search for in the dataset folder, by default - "tif". - - Returns - ------- - List[Path] - The list of paths to the files in the dataset folder. - """ - return list(folder.rglob(filetype)) - - def total_objects_by_extension(self, folder: Path) -> dict: - """ - Get the total number of files in the dataset folder based on the - extensions included in the file_read_pattern. - - Parameters - ---------- - folder : Path - The path to the dataset folder. - - Returns - ------- - dict - The dictionary with the number of files for each extension in the - patterns found in file_read_pattern. - """ - - return { - filetype.split(".")[-1]: len( - self.get_files_paths_by_format(folder, filetype) - ) - for filetype in self.file_read_pattern - } - - def max_session_number(self, filetype="tif", max_allowed=1) -> int: - """ - Get the maximum number of sessions for each dataset based on the total - number of files found in the dataset folders. By default, it searches - for tif files and allows a maximum of 5 sessions. It assumes that every - tif file corresponds to an experimental session. - - Parameters - ---------- - filetype : str, optional - The file type to search for in the dataset folder, by default - "tif". - max_allowed : int, optional - The maximum number of sessions allowed, by default 5. - - Returns - ------- - int - The maximum number of sessions for each dataset. 
- """ - - total_tif_number = [ - self.total_objects_by_extension(dataset_path).get(filetype, 0) - for dataset_path in self.datasets_paths - ] - - return min(max(total_tif_number), max_allowed) diff --git a/calcium_imaging_automation/core/rules/preprocess.py b/calcium_imaging_automation/core/rules/preprocess.py index e9dc3b1..66ddb5c 100644 --- a/calcium_imaging_automation/core/rules/preprocess.py +++ b/calcium_imaging_automation/core/rules/preprocess.py @@ -4,19 +4,17 @@ from derotation.derotate_batch import derotate from snakemake.script import snakemake -try: - # Input arguments - read_dataset_path = Path(snakemake.input[0]) - write_dataset_path = Path(snakemake.input[1]) - output = snakemake.output[0] - - output_path_dataset = write_dataset_path / "ses-0/funcimg/" +# Input arguments +read_dataset_path = Path(snakemake.input[0]) +output_tif = Path(snakemake.output[0]) +output_path_dataset = output_tif.parent.parent +try: data = derotate(read_dataset_path, output_path_dataset) metric_measured = stability_of_most_detected_blob(data) - with open(output, "w") as f: + with open(output_path_dataset / "metric.txt", "w") as f: f.write(f"dataset: {read_dataset_path.stem} metric: {metric_measured}") except Exception as e: print(e.args) - with open(output, "w") as f: + with open(output_path_dataset / "error.txt", "w") as f: f.write(str(e.args)) diff --git a/calcium_imaging_automation/core/rules/setup.py b/calcium_imaging_automation/core/rules/setup.py deleted file mode 100644 index 56f1835..0000000 --- a/calcium_imaging_automation/core/rules/setup.py +++ /dev/null @@ -1,56 +0,0 @@ -import argparse -import shutil -from pathlib import Path - -import pandas as pd - -from calcium_imaging_automation.core.reader import ReadAquiredData -from calcium_imaging_automation.core.writer import DatashuttleWrapper -from snakemake.script import snakemake - - -try: - read_dataset_path = Path(snakemake.input[0]) - write_dataset_path = Path(snakemake.input[1]) - folder_read_pattern = snakemake.params.folder_read_pattern - file_read_pattern = snakemake.params.file_read_pattern - - output = snakemake.output[0] - - try: - shutil.rmtree("/ceph/margrie/laura/cimaut/derivatives/") - shutil.rmtree("/ceph/margrie/laura/cimaut/submitit/") - except FileNotFoundError: - print("No derivatives folder found") - - print(f"Reading data from {read_dataset_path}") - - reader = ReadAquiredData( - read_dataset_path, - folder_read_pattern, - file_read_pattern, - ) - print(f"Found {len(reader.datasets_paths)} datasets.") - - number_of_tiffs = reader.max_session_number(filetype="tif") - print(f"Max of tiffs found: {number_of_tiffs}") - - writer = DatashuttleWrapper(write_dataset_path) - writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) - print("Folders created") - - datasets = pd.DataFrame( - { - "read_dataset_path": reader.datasets_paths, - "write_dataset_path": [ - writer.get_dataset_path(dt.stem) - for dt in reader.datasets_paths - ], - } - ) - datasets.to_csv(output, index=True, index_label="index") - -except Exception as e: - print(e.args) - with open(output, "w") as f: - f.write(str(e.args)) diff --git a/calcium_imaging_automation/core/writer.py b/calcium_imaging_automation/core/writer.py deleted file mode 100644 index 6e713c4..0000000 --- a/calcium_imaging_automation/core/writer.py +++ /dev/null @@ -1,58 +0,0 @@ -from pathlib import Path -from typing import Dict, List - -import numpy as np -from datashuttle.configs.config_class import Configs -from datashuttle.utils import folders -from PIL import Image - - 
-class DatashuttleWrapper: - def __init__(self, output_path: Path) -> None: - # This is supposed to run in the cluster and have direct access - # to the central storages - self.output_path = output_path - self.datashuttle_cfg = Configs( - project_name=output_path.name, - file_path=output_path, - input_dict={ - "local_path": output_path, - "central_path": "", - "connection_method": "local_filesystem", - }, - ) - - def create_folders(self, dataset_names: List[str], session_number) -> None: - # all_paths is a dictionary with keys: sub, ses - self.all_paths: Dict[str, List[Path]] = folders.create_folder_trees( - cfg=self.datashuttle_cfg, - top_level_folder="derivatives", - sub_names=[ - f"sub-{i}_{dataset_name}" - for i, dataset_name in enumerate(dataset_names) - ], - ses_names=[f"ses-{i}" for i in range(session_number)], - datatype="funcimg", - ) - - def get_dataset_path(self, dataset_name: str) -> Path: - return next( - (self.output_path / "derivatives").glob(f"*{dataset_name}*") - ) - - def save_image( - self, - image: np.ndarray, - dataset_name: str, - session_number: int, - filename: str, - ) -> Path: - path = self.get_dataset_path(dataset_name) - image = Image.fromarray(image).convert("L") - image_path = path / f"ses-{session_number}" / "funcimg" / f"{filename}" - image.save( - image_path, - mode="PNG", - ) - - return image_path diff --git a/workflow/Snakefile b/workflow/Snakefile index e817e52..27bc8b3 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,35 +1,47 @@ -rule setup: - input: - "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/", - "/ceph/margrie/laura/cimaut/", - params: - folder_read_pattern="2*", - file_read_pattern=["rotation_00001.tif", "*.bin"], - output: "datasets.csv" - run: - "calcium_imaging_automation/core/rules/setup.py" - -# import pandas as pd +# Base paths +raw_data_base = "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/" +processed_data_base = "/ceph/margrie/laura/cimaut/derivatives" + +# Dynamically discover folders matching the "2*" pattern +datasets = glob_wildcards(f"{raw_data_base}{{dataset}}").dataset +datasets = [ds for ds in datasets if ds.startswith("2")] +datasets = [ds.split("/")[0] for ds in datasets] +datasets = list(set(datasets)) +datasets.sort() -# paths = pd.read_csv("datasets.csv") +# for the output +datasets_no_underscore = [ds.replace("_", "") for ds in datasets] -# rule all: -# input: -# expand("preprocess_output_{index}.txt", index=paths["index"]) +# Final state of the pipeline +# Are all the outputs files present? 
+rule all: + input: + expand( + [ + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv", + ], + zip, + index=range(len(datasets)), + datasets_no_underscore=datasets_no_underscore, + ) -# rule preprocess: -# input: -# lambda wildcards: paths.loc[int(wildcards.index), "read_dataset_path"], -# lambda wildcards: paths.loc[int(wildcards.index), "write_dataset_path"], -# output: -# "preprocess_output_{index}.txt" -# params: -# index=lambda wildcards: wildcards.index -# resources: -# partition="fast", -# mem_mb=16000, -# cpu_per_task=1, -# tasks=1, -# nodes=1, -# script: -# "calcium_imaging_automation/core/rules/preprocess.py" +rule preprocess: + input: + raw=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/", + # Dynamically match input files using patterns + # bin=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/aux_stim/*rotation_*001.bin", + # tif=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/imaging/rotation_*001.tif", + output: + tiff=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", + csv=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv", + params: + index=lambda wildcards: wildcards.index + resources: + partition="fast", + mem_mb=16000, + cpu_per_task=1, + tasks=1, + nodes=1, + script: + "../calcium_imaging_automation/core/rules/preprocess.py"
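
For readers unfamiliar with Snakemake's `zip` combinator, here is a minimal standalone sketch (not part of the patch) of what the `expand(..., zip, ...)` call in the new `rule all` resolves to. It assumes `snakemake` is installed so that `snakemake.io.expand` can be imported in a plain Python session; the two dataset names are taken from the deleted `_datasets.csv`, and the underscore stripping mirrors what the Snakefile does.

```python
# Standalone sketch of the expand(..., zip, ...) call in `rule all`.
# Dataset names come from the deleted _datasets.csv (underscores stripped
# the same way the Snakefile does).
from snakemake.io import expand

datasets = ["230804_CAA_1119917", "230818_CAA_1120210"]
datasets_no_underscore = [ds.replace("_", "") for ds in datasets]

patterns = [
    "/ceph/margrie/laura/cimaut/derivatives/sub-{index}_{datasets_no_underscore}"
    "/ses-0/funcimg/derotation/derotated_full.tif",
    "/ceph/margrie/laura/cimaut/derivatives/sub-{index}_{datasets_no_underscore}"
    "/ses-0/funcimg/derotation/derotated_full.csv",
]

# zip pairs the i-th index with the i-th dataset name (instead of the default
# product combinator), so each subject gets exactly one .tif and one .csv target.
targets = expand(
    patterns,
    zip,
    index=range(len(datasets)),
    datasets_no_underscore=datasets_no_underscore,
)

for target in sorted(targets):
    print(target)
# /ceph/margrie/laura/cimaut/derivatives/sub-0_230804CAA1119917/ses-0/funcimg/derotation/derotated_full.csv
# /ceph/margrie/laura/cimaut/derivatives/sub-0_230804CAA1119917/ses-0/funcimg/derotation/derotated_full.tif
# /ceph/margrie/laura/cimaut/derivatives/sub-1_230818CAA1120210/ses-0/funcimg/derotation/derotated_full.csv
# /ceph/margrie/laura/cimaut/derivatives/sub-1_230818CAA1120210/ses-0/funcimg/derotation/derotated_full.tif
```

These are exactly the files `rule preprocess` declares as outputs, which is why `snakemake --executor slurm --jobs 20 --latency-wait 10 all` (as described in the README) submits one derotation job per discovered dataset.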