From 93cfd206e4d681d90b0a2c65075e0fce12fcc4d3 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 10 Dec 2024 15:50:27 +0000 Subject: [PATCH] Make via snakemake what all the other classes were doing --- README.md | 11 +- _datasets.csv | 21 --- calcium_imaging_automation/core/reader.py | 130 ------------------ .../core/rules/preprocess.py | 16 +-- .../core/rules/setup.py | 56 -------- calcium_imaging_automation/core/writer.py | 58 -------- workflow/Snakefile | 76 +++++----- 7 files changed, 54 insertions(+), 314 deletions(-) delete mode 100644 _datasets.csv delete mode 100644 calcium_imaging_automation/core/reader.py delete mode 100644 calcium_imaging_automation/core/rules/setup.py delete mode 100644 calcium_imaging_automation/core/writer.py diff --git a/README.md b/README.md index 128e7cc..4c81b3d 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,8 @@ To extract dataset names snakemake --cores 1 setup_output.txt ``` - -To run preprocessing with slurm, use the following command for one dataset: -```bash -snakemake --executor slurm --jobs 20 --latency-wait 10 preprocess_output_0.txt -``` -For an array of datasets: +Run all jobs in the pipeline: ```bash -snakemake --executor slurm --jobs 20 --latency-wait 10 preprocess_output_{0..N}.txt +snakemake --executor slurm --jobs 20 --latency-wait 10 all ``` -Replace N with the number of datasets you have in the `datasets.csv` file. +Add `-np --printshellcmds` for a dry run with commands printed to the terminal. diff --git a/_datasets.csv b/_datasets.csv deleted file mode 100644 index c9c0036..0000000 --- a/_datasets.csv +++ /dev/null @@ -1,21 +0,0 @@ -index,read_dataset_path,write_dataset_path -0,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230804_CAA_1119917,/ceph/margrie/laura/cimaut/derivatives/sub-0_230804_CAA_1119917 -1,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230818_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-1_230818_CAA_1120210 -2,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230803_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-2_230803_CAA_1119915 -3,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230801_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-3_230801_CAA_1120181 -4,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230802_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-4_230802_CAA_1120182 -5,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230803_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-5_230803_CAA_1120181 -6,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230822_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-6_230822_CAA_1120509 -7,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230823_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-7_230823_CAA_1120181 -8,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230824_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-8_230824_CAA_1119915 -9,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230825_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-9_230825_CAA_1120182 -10,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230905_CAA_1119917,/ceph/margrie/laura/cimaut/derivatives/sub-10_230905_CAA_1119917 
-11,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230907_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-11_230907_CAA_1120210 -12,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230907_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-12_230907_CAA_1120509 -13,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230912_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-13_230912_CAA_1119915 -14,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230912_CAA_1120051,/ceph/margrie/laura/cimaut/derivatives/sub-14_230912_CAA_1120051 -15,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230913_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-15_230913_CAA_1120182 -16,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230913_CAA_1120395,/ceph/margrie/laura/cimaut/derivatives/sub-16_230913_CAA_1120395 -17,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230914_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-17_230914_CAA_1120181 -18,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230914_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-18_230914_CAA_1120210 -19,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230915_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-19_230915_CAA_1120509 diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py deleted file mode 100644 index f0fcb3c..0000000 --- a/calcium_imaging_automation/core/reader.py +++ /dev/null @@ -1,130 +0,0 @@ -from pathlib import Path -from typing import List - - -class ReadAquiredData: - def __init__( - self, - raw_data_folder: Path, - folder_read_pattern: str, - file_read_pattern: List[str], - ): - """ - Class to handle filepaths and dataset names in the raw data folder. - It can load folders and files based on the provided patterns, allowing - flexibility in the data structure of origin. - It also provides the maximum number of sessions for each dataset based - on the total number of files found in the dataset folders, by default - it searches for tif files. - - Parameters - ---------- - raw_data_folder : Path - The path to the raw data folder. - folder_read_pattern : str - The pattern to search for folders in the raw data folder. It - corresponds to the naming convention of the datasets. - file_read_pattern : List[str] - The patterns to search for files in the dataset folders. It - corresponds to the naming convention of the files in the dataset - folders. - """ - self.folder_read_pattern = folder_read_pattern - self.file_read_pattern = file_read_pattern - - self.datasets_paths = self.get_folders_first_layer(raw_data_folder) - self.dataset_names = [ - dataset_path.name for dataset_path in self.datasets_paths - ] - - def get_folders_first_layer(self, file_path: Path) -> List[Path]: - """ - Get the first layer of folders in the raw data folder. The rest - of the class assumes that the first layer of folders corresponds - to the dataset folders. - - Parameters - ---------- - file_path : Path - The path to the raw data folder. - - Returns - ------- - List[Path] - The list of paths to the dataset folders. 
- """ - return list(file_path.glob(self.folder_read_pattern)) - - def get_files_paths_by_format( - self, folder: Path, filetype="tif" - ) -> List[Path]: - """ - Get the paths to the files in the dataset folders based on the - provided file type. By default, it searches for tif files. - - Parameters - ---------- - folder : Path - The path to the dataset folder. - filetype : str, optional - The file type to search for in the dataset folder, by default - "tif". - - Returns - ------- - List[Path] - The list of paths to the files in the dataset folder. - """ - return list(folder.rglob(filetype)) - - def total_objects_by_extension(self, folder: Path) -> dict: - """ - Get the total number of files in the dataset folder based on the - extensions included in the file_read_pattern. - - Parameters - ---------- - folder : Path - The path to the dataset folder. - - Returns - ------- - dict - The dictionary with the number of files for each extension in the - patterns found in file_read_pattern. - """ - - return { - filetype.split(".")[-1]: len( - self.get_files_paths_by_format(folder, filetype) - ) - for filetype in self.file_read_pattern - } - - def max_session_number(self, filetype="tif", max_allowed=1) -> int: - """ - Get the maximum number of sessions for each dataset based on the total - number of files found in the dataset folders. By default, it searches - for tif files and allows a maximum of 5 sessions. It assumes that every - tif file corresponds to an experimental session. - - Parameters - ---------- - filetype : str, optional - The file type to search for in the dataset folder, by default - "tif". - max_allowed : int, optional - The maximum number of sessions allowed, by default 5. - - Returns - ------- - int - The maximum number of sessions for each dataset. 
- """ - - total_tif_number = [ - self.total_objects_by_extension(dataset_path).get(filetype, 0) - for dataset_path in self.datasets_paths - ] - - return min(max(total_tif_number), max_allowed) diff --git a/calcium_imaging_automation/core/rules/preprocess.py b/calcium_imaging_automation/core/rules/preprocess.py index e9dc3b1..66ddb5c 100644 --- a/calcium_imaging_automation/core/rules/preprocess.py +++ b/calcium_imaging_automation/core/rules/preprocess.py @@ -4,19 +4,17 @@ from derotation.derotate_batch import derotate from snakemake.script import snakemake -try: - # Input arguments - read_dataset_path = Path(snakemake.input[0]) - write_dataset_path = Path(snakemake.input[1]) - output = snakemake.output[0] - - output_path_dataset = write_dataset_path / "ses-0/funcimg/" +# Input arguments +read_dataset_path = Path(snakemake.input[0]) +output_tif = Path(snakemake.output[0]) +output_path_dataset = output_tif.parent.parent +try: data = derotate(read_dataset_path, output_path_dataset) metric_measured = stability_of_most_detected_blob(data) - with open(output, "w") as f: + with open(output_path_dataset / "metric.txt", "w") as f: f.write(f"dataset: {read_dataset_path.stem} metric: {metric_measured}") except Exception as e: print(e.args) - with open(output, "w") as f: + with open(output_path_dataset / "error.txt", "w") as f: f.write(str(e.args)) diff --git a/calcium_imaging_automation/core/rules/setup.py b/calcium_imaging_automation/core/rules/setup.py deleted file mode 100644 index 56f1835..0000000 --- a/calcium_imaging_automation/core/rules/setup.py +++ /dev/null @@ -1,56 +0,0 @@ -import argparse -import shutil -from pathlib import Path - -import pandas as pd - -from calcium_imaging_automation.core.reader import ReadAquiredData -from calcium_imaging_automation.core.writer import DatashuttleWrapper -from snakemake.script import snakemake - - -try: - read_dataset_path = Path(snakemake.input[0]) - write_dataset_path = Path(snakemake.input[1]) - folder_read_pattern = snakemake.params.folder_read_pattern - file_read_pattern = snakemake.params.file_read_pattern - - output = snakemake.output[0] - - try: - shutil.rmtree("/ceph/margrie/laura/cimaut/derivatives/") - shutil.rmtree("/ceph/margrie/laura/cimaut/submitit/") - except FileNotFoundError: - print("No derivatives folder found") - - print(f"Reading data from {read_dataset_path}") - - reader = ReadAquiredData( - read_dataset_path, - folder_read_pattern, - file_read_pattern, - ) - print(f"Found {len(reader.datasets_paths)} datasets.") - - number_of_tiffs = reader.max_session_number(filetype="tif") - print(f"Max of tiffs found: {number_of_tiffs}") - - writer = DatashuttleWrapper(write_dataset_path) - writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) - print("Folders created") - - datasets = pd.DataFrame( - { - "read_dataset_path": reader.datasets_paths, - "write_dataset_path": [ - writer.get_dataset_path(dt.stem) - for dt in reader.datasets_paths - ], - } - ) - datasets.to_csv(output, index=True, index_label="index") - -except Exception as e: - print(e.args) - with open(output, "w") as f: - f.write(str(e.args)) diff --git a/calcium_imaging_automation/core/writer.py b/calcium_imaging_automation/core/writer.py deleted file mode 100644 index 6e713c4..0000000 --- a/calcium_imaging_automation/core/writer.py +++ /dev/null @@ -1,58 +0,0 @@ -from pathlib import Path -from typing import Dict, List - -import numpy as np -from datashuttle.configs.config_class import Configs -from datashuttle.utils import folders -from PIL import Image - - 
-class DatashuttleWrapper: - def __init__(self, output_path: Path) -> None: - # This is supposed to run in the cluster and have direct access - # to the central storages - self.output_path = output_path - self.datashuttle_cfg = Configs( - project_name=output_path.name, - file_path=output_path, - input_dict={ - "local_path": output_path, - "central_path": "", - "connection_method": "local_filesystem", - }, - ) - - def create_folders(self, dataset_names: List[str], session_number) -> None: - # all_paths is a dictionary with keys: sub, ses - self.all_paths: Dict[str, List[Path]] = folders.create_folder_trees( - cfg=self.datashuttle_cfg, - top_level_folder="derivatives", - sub_names=[ - f"sub-{i}_{dataset_name}" - for i, dataset_name in enumerate(dataset_names) - ], - ses_names=[f"ses-{i}" for i in range(session_number)], - datatype="funcimg", - ) - - def get_dataset_path(self, dataset_name: str) -> Path: - return next( - (self.output_path / "derivatives").glob(f"*{dataset_name}*") - ) - - def save_image( - self, - image: np.ndarray, - dataset_name: str, - session_number: int, - filename: str, - ) -> Path: - path = self.get_dataset_path(dataset_name) - image = Image.fromarray(image).convert("L") - image_path = path / f"ses-{session_number}" / "funcimg" / f"{filename}" - image.save( - image_path, - mode="PNG", - ) - - return image_path diff --git a/workflow/Snakefile b/workflow/Snakefile index e817e52..27bc8b3 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,35 +1,47 @@ -rule setup: - input: - "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/", - "/ceph/margrie/laura/cimaut/", - params: - folder_read_pattern="2*", - file_read_pattern=["rotation_00001.tif", "*.bin"], - output: "datasets.csv" - run: - "calcium_imaging_automation/core/rules/setup.py" - -# import pandas as pd +# Base paths +raw_data_base = "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/" +processed_data_base = "/ceph/margrie/laura/cimaut/derivatives" + +# Dynamically discover folders matching the "2*" pattern +datasets = glob_wildcards(f"{raw_data_base}{{dataset}}").dataset +datasets = [ds for ds in datasets if ds.startswith("2")] +datasets = [ds.split("/")[0] for ds in datasets] +datasets = list(set(datasets)) +datasets.sort() -# paths = pd.read_csv("datasets.csv") +# for the output +datasets_no_underscore = [ds.replace("_", "") for ds in datasets] -# rule all: -# input: -# expand("preprocess_output_{index}.txt", index=paths["index"]) +# Final state of the pipeline +# Are all the outputs files present? 
+rule all: + input: + expand( + [ + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv", + ], + zip, + index=range(len(datasets)), + datasets_no_underscore=datasets_no_underscore, + ) -# rule preprocess: -# input: -# lambda wildcards: paths.loc[int(wildcards.index), "read_dataset_path"], -# lambda wildcards: paths.loc[int(wildcards.index), "write_dataset_path"], -# output: -# "preprocess_output_{index}.txt" -# params: -# index=lambda wildcards: wildcards.index -# resources: -# partition="fast", -# mem_mb=16000, -# cpu_per_task=1, -# tasks=1, -# nodes=1, -# script: -# "calcium_imaging_automation/core/rules/preprocess.py" +rule preprocess: + input: + raw=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/", + # Dynamically match input files using patterns + # bin=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/aux_stim/*rotation_*001.bin", + # tif=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/imaging/rotation_*001.tif", + output: + tiff=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", + csv=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv", + params: + index=lambda wildcards: wildcards.index + resources: + partition="fast", + mem_mb=16000, + cpu_per_task=1, + tasks=1, + nodes=1, + script: + "../calcium_imaging_automation/core/rules/preprocess.py"
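
For readers unfamiliar with Snakemake's `zip` combinator, here is a minimal standalone sketch (not part of the patch) of what the `expand(..., zip, ...)` call in the new `rule all` resolves to. It assumes `snakemake` is installed so that `snakemake.io.expand` can be imported in a plain Python session; the two dataset names are taken from the deleted `_datasets.csv`, and the underscore stripping mirrors what the Snakefile does.

```python
# Standalone sketch of the expand(..., zip, ...) call in `rule all`.
# Dataset names come from the deleted _datasets.csv (underscores stripped
# the same way the Snakefile does).
from snakemake.io import expand

datasets = ["230804_CAA_1119917", "230818_CAA_1120210"]
datasets_no_underscore = [ds.replace("_", "") for ds in datasets]

patterns = [
    "/ceph/margrie/laura/cimaut/derivatives/sub-{index}_{datasets_no_underscore}"
    "/ses-0/funcimg/derotation/derotated_full.tif",
    "/ceph/margrie/laura/cimaut/derivatives/sub-{index}_{datasets_no_underscore}"
    "/ses-0/funcimg/derotation/derotated_full.csv",
]

# zip pairs the i-th index with the i-th dataset name (instead of the default
# product combinator), so each subject gets exactly one .tif and one .csv target.
targets = expand(
    patterns,
    zip,
    index=range(len(datasets)),
    datasets_no_underscore=datasets_no_underscore,
)

for target in sorted(targets):
    print(target)
# /ceph/margrie/laura/cimaut/derivatives/sub-0_230804CAA1119917/ses-0/funcimg/derotation/derotated_full.csv
# /ceph/margrie/laura/cimaut/derivatives/sub-0_230804CAA1119917/ses-0/funcimg/derotation/derotated_full.tif
# /ceph/margrie/laura/cimaut/derivatives/sub-1_230818CAA1120210/ses-0/funcimg/derotation/derotated_full.csv
# /ceph/margrie/laura/cimaut/derivatives/sub-1_230818CAA1120210/ses-0/funcimg/derotation/derotated_full.tif
```

These are exactly the files `rule preprocess` declares as outputs, which is why `snakemake --executor slurm --jobs 20 --latency-wait 10 all` (as described in the README) submits one derotation job per discovered dataset.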