diff --git a/.gitignore b/.gitignore index aedc8d7c..ac4340c1 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,4 @@ venv/ # written by setuptools_scm **/_version.py +benchmarks/results/* diff --git a/MANIFEST.in b/MANIFEST.in index 34cf45e6..27538bc8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -9,3 +9,8 @@ recursive-exclude * __pycache__ recursive-exclude * *.py[co] recursive-exclude docs * recursive-exclude tests * + +include *.json +recursive-include benchmarks *.json +recursive-include benchmarks *.py +recursive-exclude benchmarks/results * diff --git a/asv.conf.json b/asv.conf.json new file mode 100644 index 00000000..a053875b --- /dev/null +++ b/asv.conf.json @@ -0,0 +1,194 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "brainglobe_workflows", + + // The project's homepage + "project_url": "https://github.com/brainglobe/brainglobe-workflows", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": ".", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building the project. + // See asv.conf.json documentation. 
+ // To build the package using pyproject.toml (PEP518), uncomment the following lines + "build_command": [ + "python -m pip install build", + "python -m build", + "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + ], + // To build the package using setuptools and a setup.py file, uncomment the following lines + // "build_command": [ + // "python setup.py build", + // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + // ], + + // Customizable commands for installing and uninstalling the project. + // See asv.conf.json documentation. + "install_command": ["in-dir={env_dir} python -mpip install --force-reinstall {wheel_file}"], + "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + "branches": ["smg/cellfinder-cli-benchmark"], // for git + // "branches": ["default"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + // "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv", "mamba" (above 3.8) + // or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "conda", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + //"install_timeout": 600, + + // the base URL to show a commit for the project. + "show_commit_url": "https://github.com/brainglobe/brainglobe-workflows/commit/", + + // The Pythons you'd like to test against. 
If not provided, defaults + // to the current version of Python used to run `asv`. + "pythons": ["3.10"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + "conda_channels": ["conda-forge", "defaults"], + + // A conda environment file that is used for environment creation. + // "conda_environment_file": "environment.yml", + + // The matrix of dependencies to test. Each key of the "req" + // requirements dictionary is the name of a package (in PyPI) and + // the values are version numbers. An empty list or empty string + // indicates to just test against the default (latest) + // version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed + // via pip (with all the conda available packages installed first, + // followed by the pip installed packages). + // + // The ``@env`` and ``@env_nobuild`` keys contain the matrix of + // environment variables to pass to build and benchmark commands. + // An environment will be created for every combination of the + // cartesian product of the "@env" variables in this matrix. + // Variables in "@env_nobuild" will be passed to every environment + // during the benchmark phase, but will not trigger creation of + // new environments. A value of ``null`` means that the variable + // will not be set for the current combination. + // + // "matrix": { + // "req": { + // "numpy": ["1.6", "1.7"], + // "six": ["", null], // test with and without six installed + // "pip+emcee": [""] // emcee is only available for install with pip. + // }, + // "env": {"ENV_VAR_1": ["val1", "val2"]}, + // "env_nobuild": {"ENV_VAR_2": ["val3", null]}, + // }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. 
Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. + // - req + // Required packages + // - env + // Environment variables + // - env_nobuild + // Non-build environment variables + // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "req": {"six": null}}, // don't run without six on conda + // {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1 + // ], + // + // "include": [ + // // additional env for python2.7 + // {"python": "2.7", "req": {"numpy": "1.8"}, "env_nobuild": {"FOO": "123"}}, + // // additional env if run on windows+conda + // {"platform": "win32", "environment_type": "conda", "python": "2.7", "req": {"libpython": ""}}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". 
+ "results_dir": "benchmarks/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": "benchmarks/html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. 
import json
import shutil
from pathlib import Path

import pooch
from brainglobe_utils.IO.cells import save_cells
from cellfinder_core.main import main as cellfinder_run
from cellfinder_core.tools.IO import read_with_dask

from brainglobe_workflows.cellfinder.cellfinder_main import (
    DEFAULT_JSON_CONFIG_PATH,
    CellfinderConfig,
    run_workflow_from_cellfinder_run,
)
from brainglobe_workflows.cellfinder.cellfinder_main import (
    setup as setup_cellfinder_workflow,
)


class TimeBenchmarkPrepGIN:
    """
    A base class for timing benchmarks for the cellfinder workflow.

    It includes:
     - a setup_cache function that downloads the GIN data specified in the
       default_config.json to a local directory (created by asv). This
       function runs only once before all repeats of the benchmark.
     - a setup function, that runs the setup steps for the workflow.
     - a teardown function, that removes the output directory.

    Notes
    -----
    The class includes some predefined attributes for timing benchmarks. For
    the full list see
    https://asv.readthedocs.io/en/stable/benchmarks.html#benchmark-attributes

    Some asv benchmarking nomenclature:
    - repeat: a benchmark repeat is made up of the following steps:
      1- the `setup` is run,
      2- then the timed benchmark routine is called for `n` iterations, and
      3- finally the `teardown` function is run.
      Each repeat generates a sample, which is the average time that the
      routine took across all iterations. A new process is started for each
      repeat of each benchmark. A calibration phase before running the repeat
      computes the number of iterations that will be executed. Each benchmark
      is run for a number of repeats. The setup_cache function is run only
      once for all repeats of a benchmark (but it is discarded before the
      next benchmark). By default `repeat` is set to 0, which means:
        - if rounds==1 the default is
            (min_repeat, max_repeat, max_time) = (1, 10, 20.0),
        - if rounds != 1 the default is
            (min_repeat, max_repeat, max_time) = (1, 5, 10.0)

    - iterations (`number`): the number of iterations in each sample. Note
      that `setup` and `teardown` are not run between iterations. asv will
      automatically select the number of iterations so that each sample takes
      approximately `sample_time` seconds.

    - round: at each round, each benchmark is run for the specified number of
      repeats. The idea is that we sample each benchmark over longer periods
      of background performance variations.

    - warmup time: asv will spend this time (in seconds) in calling the
      benchmarked function repeatedly, before starting to run the actual
      benchmark. If not specified, warmup_time defaults to 0.1 seconds
    """

    # Timing attributes
    timeout = 600  # default: 60 s
    version = (
        None  # benchmark version. Default:None (i.e. hash of source code)
    )
    warmup_time = 0.1  # seconds
    rounds = 2
    repeat = 0
    sample_time = 0.01  # default: 10 ms = 0.01 s;
    min_run_count = 2  # default:2

    # Custom attributes
    input_config_path = str(DEFAULT_JSON_CONFIG_PATH)

    def setup_cache(self):
        """
        Download the input data from the GIN repository to the local
        directory specified in the default_config.json

        Nothing is returned on purpose: the method is only used for its
        side effect of fetching the data to disk.

        Raises
        ------
        FileNotFoundError
            If the input config file does not exist, or if the signal or
            background directories named in the config are missing after
            the download.

        Notes
        -----
        asv runs `setup_cache` once and caches the result to disk [1]_.
        It cannot be parametrised [2]_.

        [1] https://asv.readthedocs.io/en/latest/writing_benchmarks.html#setup-and-teardown-functions
        [2] https://asv.readthedocs.io/en/latest/writing_benchmarks.html#parameterized-benchmarks
        """
        # Check config file exists. An explicit exception is raised rather
        # than using `assert`, which is stripped when Python runs with -O.
        config_path = Path(self.input_config_path)
        if not config_path.exists():
            raise FileNotFoundError(
                f"Input config file not found: {config_path}"
            )

        # Instantiate a CellfinderConfig from the input json file
        # (assumes config is json serializable)
        with open(config_path, encoding="utf-8") as cfg_file:
            config = CellfinderConfig(**json.load(cfg_file))

        # Download data with pooch (the returned list of files is not needed)
        pooch.retrieve(
            url=config.data_url,
            known_hash=config.data_hash,
            path=config.install_path,
            progressbar=True,
            processor=pooch.Unzip(extract_dir=config.extract_dir_relative),
        )

        # Paths to the input data should now exist.
        # NOTE(review): these two attributes are read as loaded from the
        # json file; confirm the default config populates them.
        input_dirs = (config.signal_dir_path, config.background_dir_path)
        for input_dir in input_dirs:
            if not Path(input_dir).exists():
                raise FileNotFoundError(
                    f"Input data directory not found: {input_dir}"
                )

    def setup(self):
        """
        Run the cellfinder workflow setup steps.

        The command line input arguments are injected as dependencies,
        and the resulting configuration is stored in `self.cfg`.
        """
        self.cfg = setup_cellfinder_workflow(
            [
                "--config",
                self.input_config_path,
            ]
        )

    def teardown(self):
        """
        Remove the cellfinder output directory.

        The input data is kept for all repeats of the same benchmark,
        to avoid repeated downloads from GIN.
        """
        shutil.rmtree(Path(self.cfg.output_path).resolve())


class TimeFullWorkflow(TimeBenchmarkPrepGIN):
    """
    Time the full cellfinder workflow.

    It includes reading the signal and background arrays with dask,
    detecting the cells and saving the results to an XML file.

    Setup and teardown are inherited from TimeBenchmarkPrepGIN.
    """

    def time_workflow_from_cellfinder_run(self):
        run_workflow_from_cellfinder_run(self.cfg)


class TimeReadInputDask(TimeBenchmarkPrepGIN):
    """
    Time the reading input data operations with dask.

    Setup and teardown are inherited from TimeBenchmarkPrepGIN.
    """

    def time_read_signal_with_dask(self):
        read_with_dask(self.cfg.signal_dir_path)

    def time_read_background_with_dask(self):
        read_with_dask(self.cfg.background_dir_path)


class TimeDetectCells(TimeBenchmarkPrepGIN):
    """
    Time the cell detection main pipeline (`cellfinder_run`).

    The base setup is extended to read the input arrays, so that only the
    detection step is timed.
    """

    def setup(self):
        # basic setup
        super().setup()

        # read the input data as arrays ahead of the timed call
        self.signal_array = read_with_dask(self.cfg.signal_dir_path)
        self.background_array = read_with_dask(self.cfg.background_dir_path)

    def time_cellfinder_run(self):
        cellfinder_run(
            self.signal_array, self.background_array, self.cfg.voxel_sizes
        )


class TimeSaveCells(TimeBenchmarkPrepGIN):
    """
    Time saving the detected cells with `save_cells`.

    The base setup is extended to read the input arrays and run the cell
    detection, so that only the saving step is timed.
    """

    def setup(self):
        # basic setup
        super().setup()

        # read the input data as arrays
        self.signal_array = read_with_dask(self.cfg.signal_dir_path)
        self.background_array = read_with_dask(self.cfg.background_dir_path)

        # detect cells
        self.detected_cells = cellfinder_run(
            self.signal_array, self.background_array, self.cfg.voxel_sizes
        )

    def time_save_cells(self):
        save_cells(self.detected_cells, self.cfg.detected_cells_path)
def setup(argv=None) -> CellfinderConfig:
    """Run all the setup steps required before executing the workflow.

    The logger is configured first, then the command line arguments are
    parsed, and finally the workflow configuration is built from the json
    file passed via ``--config``.

    Parameters
    ----------
    argv : list of str, optional
        command line arguments, without the program name. If None
        (default), ``sys.argv[1:]`` is used. Passing an explicit list is
        mainly useful for testing and benchmarking.

    Returns
    -------
    cfg : CellfinderConfig
        a dataclass whose attributes are the parameters
        for running cellfinder.
    """

    def parse_cli_arguments(argv_) -> argparse.Namespace:
        """Define argument parser for cellfinder
        workflow script.

        It expects a path to a json file with the
        parameters required to run the workflow.
        If none is provided, the default config file
        (DEFAULT_JSON_CONFIG_PATH) is used.

        Parameters
        ----------
        argv_ : list of str
            command line arguments to parse

        Returns
        -------
        args : argparse.Namespace
            command line input arguments parsed
        """
        # initialise argument parser
        parser = argparse.ArgumentParser(
            description=(
                "To launch the workflow with "
                "a specific set of input parameters, run: "
                "`python cellfinder_main.py --config "
                "path/to/input/config.json` "
                "where path/to/input/config.json is the json file "
                "containing the workflow parameters."
            )
        )
        # add arguments
        parser.add_argument(
            "-c",
            "--config",
            default=str(DEFAULT_JSON_CONFIG_PATH),
            type=str,
            metavar="CONFIG",  # a name for usage messages
            help="path to the json file with the workflow parameters",
        )

        # build parser object
        args = parser.parse_args(argv_)

        # print error if required arguments not provided
        # (only reachable if an empty string is passed to --config)
        if not args.config:
            logger.error("Paths to input config not provided.")
            parser.print_help()

        return args

    def setup_logger() -> logging.Logger:
        """Setup a logger for this script

        The logger's level is set to DEBUG, and it
        is linked to a handler that writes to the
        console (stdout).

        A handler is only added if the logger does not already have one
        writing to the current ``sys.stdout``; otherwise every call to
        ``setup`` would attach another handler and duplicate the log lines.

        Returns
        -------
        logging.Logger
            a logger object
        """
        # define logger;
        # if imported as a module, the logger is named after the module
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.DEBUG)

        # check for an existing handler that writes to the current stdout
        has_stdout_handler = any(
            isinstance(handler, logging.StreamHandler)
            and handler.stream is sys.stdout
            for handler in logger.handlers
        )

        if not has_stdout_handler:
            # define handler that writes to stdout and link it to the logger
            console_handler = logging.StreamHandler(sys.stdout)
            console_format = logging.Formatter(
                "%(name)s %(levelname)s: %(message)s"
            )
            console_handler.setFormatter(console_format)
            logger.addHandler(console_handler)

        return logger

    def setup_workflow(input_config_path: Path) -> CellfinderConfig:
        """Run setup steps prior to executing the workflow

        These setup steps include:
        - instantiating a CellfinderConfig object with the required
          parameters,
        - checking if the input data exists locally, and fetching from
          GIN repository otherwise,
        - adding the path to the input data files to the config, and
        - creating a timestamped directory for the output of the workflow if
          it doesn't exist and adding its path to the config

        Parameters
        ----------
        input_config_path : Path
            path to the input config file

        Returns
        -------
        config : CellfinderConfig
            a dataclass whose attributes are the parameters
            for running cellfinder.
        """

        # Check config file exists
        # NOTE(review): `assert` is stripped under `python -O`; it is kept
        # here so that the exception type seen by callers is unchanged.
        assert input_config_path.exists()

        # Instantiate a CellfinderConfig from the input json file
        # (assumes config is json serializable)
        with open(input_config_path) as cfg:
            config_dict = json.load(cfg)
        config = CellfinderConfig(**config_dict)

        # Print info logs for status
        logger.info(f"Input config read from {input_config_path}")
        if input_config_path == DEFAULT_JSON_CONFIG_PATH:
            logger.info("Using default config file")

        # Retrieve and add lists of input data to the config,
        # if these are not defined yet (both lists are required)
        if not (config.list_signal_files and config.list_background_files):
            # build fullpaths to inputs
            config.signal_dir_path = str(
                Path(config.install_path)
                / config.extract_dir_relative
                / config.signal_subdir
            )
            config.background_dir_path = str(
                Path(config.install_path)
                / config.extract_dir_relative
                / config.background_subdir
            )
            # retrieve data
            config = retrieve_input_data(config)

        # Create timestamped output directory if it doesn't exist
        timestamp = datetime.datetime.now()
        timestamp_formatted = timestamp.strftime("%Y%m%d_%H%M%S")
        output_path_timestamped = Path(config.install_path) / (
            str(config.output_path_basename_relative) + timestamp_formatted
        )
        output_path_timestamped.mkdir(parents=True, exist_ok=True)

        # Add output path and output file path to config
        config.output_path = output_path_timestamped
        config.detected_cells_path = (
            config.output_path / config.detected_cells_filename
        )

        return config

    def retrieve_input_data(config: CellfinderConfig) -> CellfinderConfig:
        """
        Adds the lists of input data files (signal and background)
        to the config.

        It first checks if the input data exists locally.
        - If both directories (signal and background) exist, the lists of
          signal and background files are added to the config.
        - If exactly one of the input data directories is missing, an error
          message is logged.
        - If neither of them exist, the data is retrieved from the provided
          GIN repository. If no URL or hash to GIN is provided, an error is
          logged.

        In the error cases the config is returned without the lists of
        files being set.

        Parameters
        ----------
        config : CellfinderConfig
            a dataclass whose attributes are the parameters
            for running cellfinder.

        Returns
        -------
        config : CellfinderConfig
            a dataclass whose attributes are the parameters
            for running cellfinder.
        """
        signal_dir = Path(config.signal_dir_path).resolve()
        background_dir = Path(config.background_dir_path).resolve()
        signal_exists = signal_dir.exists()
        background_exists = background_dir.exists()

        # Check if input data (signal and background) exist locally.
        # If both directories exist, get list of signal and background files
        if signal_exists and background_exists:
            logger.info("Fetching input data from the local directories")

            # NOTE(review): these lists hold Path objects, whereas the
            # lists built from the GIN archive below hold strings.
            config.list_signal_files = [
                f for f in signal_dir.iterdir() if f.is_file()
            ]
            config.list_background_files = [
                f for f in background_dir.iterdir() if f.is_file()
            ]

        # If exactly one of the input data directories is missing, log error
        elif signal_exists or background_exists:
            if not signal_exists:
                logger.error(
                    f"The directory {config.signal_dir_path} does not exist"
                )
            else:
                logger.error(
                    f"The directory {config.background_dir_path} "
                    "does not exist"
                )

        # If neither of them exist, retrieve data from GIN repository
        else:
            # check if GIN URL and hash are defined (log error otherwise)
            if (not config.data_url) or (not config.data_hash):
                logger.error(
                    "Input data not found locally, and URL/hash to "
                    "GIN repository not provided"
                )

            else:
                # get list of files in GIN archive with pooch.retrieve
                list_files_archive = pooch.retrieve(
                    url=config.data_url,
                    known_hash=config.data_hash,
                    path=config.install_path,  # zip will be downloaded here
                    progressbar=True,
                    processor=pooch.Unzip(
                        extract_dir=config.extract_dir_relative
                        # path to unzipped dir,
                        # *relative* to the path set in 'path'
                    ),
                )
                logger.info(
                    "Fetching input data from the provided GIN repository"
                )

                # Check signal and background parent directories exist now
                assert signal_dir.exists()
                assert background_dir.exists()

                # Add signal files to config
                config.list_signal_files = [
                    f
                    for f in list_files_archive
                    if f.startswith(str(signal_dir))
                ]

                # Add background files to config
                config.list_background_files = [
                    f
                    for f in list_files_archive
                    if f.startswith(str(background_dir))
                ]

        return config

    # setup logger first: the nested helpers above log through it
    logger = setup_logger()

    # parse command line input arguments:
    # sys.argv in most cases except for testing
    # see https://paiml.com/docs/home/books/testing-in-python/chapter08-monkeypatching/#the-simplest-monkeypatching
    # (`is None` so that an explicit empty list is respected)
    if argv is None:
        argv = sys.argv[1:]
    args = parse_cli_arguments(argv)

    # run setup steps and return config
    cfg = setup_workflow(Path(args.config))

    return cfg


def run_workflow_from_cellfinder_run(cfg: CellfinderConfig):
    """
    Run workflow based on the cellfinder_core.main.main()
    function.

    The steps are:
    1. Read the input signal and background data as two separate
       Dask arrays.
    2. Run the main cellfinder pipeline on the input Dask arrays,
       with the parameters defined in the input configuration (cfg).
    3. Save the detected cells as an xml file to the location specified in
       the input configuration (cfg).

    Parameters
    ----------
    cfg : CellfinderConfig
        a class with the required setup methods and parameters for
        the cellfinder workflow
    """
    # Read input data as Dask arrays
    signal_array = read_with_dask(cfg.signal_dir_path)
    background_array = read_with_dask(cfg.background_dir_path)

    # Run main analysis using `cellfinder_run`
    detected_cells = cellfinder_run(
        signal_array, background_array, cfg.voxel_sizes
    )

    # Save results to xml file
    save_cells(
        detected_cells,
        cfg.detected_cells_path,
    )


if __name__ == "__main__":
    # run setup
    cfg = setup()

    # run workflow
    run_workflow_from_cellfinder_run(cfg)  # only this will be benchmarked