From ab670b70cdb54c217c3a80f42ac793ee2ad20ffe Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 12 Nov 2024 13:32:53 +0100 Subject: [PATCH 1/9] init for asv --- asv.conf.json | 203 +++++++++++++++++++++++++++++++++++++++ benchmarks/__init__.py | 0 benchmarks/benchmark.in | 0 benchmarks/benchmarks.py | 61 ++++++++++++ pyproject.toml | 3 + 5 files changed, 267 insertions(+) create mode 100644 asv.conf.json create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/benchmark.in create mode 100644 benchmarks/benchmarks.py diff --git a/asv.conf.json b/asv.conf.json new file mode 100644 index 00000000..6ef13a93 --- /dev/null +++ b/asv.conf.json @@ -0,0 +1,203 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "spatialdata", + + // The project's homepage + "project_url": "https://spatialdata.scverse.org/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": ".", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building the project. + // See asv.conf.json documentation. + // To build the package using pyproject.toml (PEP518), uncomment the following lines + // "build_command": [ + // "python -m pip install build", + // "python -m build", + // "python -mpip wheel -w {build_cache_dir} {build_dir}" + // ], + // To build the package using setuptools and a setup.py file, uncomment the following lines + // "build_command": [ + // "python setup.py build", + // "python -mpip wheel -w {build_cache_dir} {build_dir}" + // ], + "build_command": ["python -V"], // skip build stage + + // Customizable commands for installing and uninstalling the project. + // See asv.conf.json documentation. + // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], + // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + + // Install using default install + "install_command": [ + "in-dir={env_dir} python -m pip install {build_dir}[test]" + ], + "uninstall_command": [ + "in-dir={env_dir} python -m pip uninstall -y {project}" + ], + + // List of branches to benchmark. If not provided, defaults to "main" + // (for git) or "default" (for mercurial). + "branches": ["main"], // for git + // "branches": ["default"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + // "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv", "mamba" (above 3.8) + // or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "virtualenv", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + // "install_timeout": 600, + + // the base URL to show a commit for the project. + // "show_commit_url": "http://github.com/owner/project/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. 
+ "pythons": ["3.12"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + // "conda_channels": ["conda-forge", "defaults"], + + // A conda environment file that is used for environment creation. + // "conda_environment_file": "environment.yml", + + // The matrix of dependencies to test. Each key of the "req" + // requirements dictionary is the name of a package (in PyPI) and + // the values are version numbers. An empty list or empty string + // indicates to just test against the default (latest) + // version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed + // via pip (with all the conda available packages installed first, + // followed by the pip installed packages). + // + // The ``@env`` and ``@env_nobuild`` keys contain the matrix of + // environment variables to pass to build and benchmark commands. + // An environment will be created for every combination of the + // cartesian product of the "@env" variables in this matrix. + // Variables in "@env_nobuild" will be passed to every environment + // during the benchmark phase, but will not trigger creation of + // new environments. A value of ``null`` means that the variable + // will not be set for the current combination. + // + // "matrix": { + // "req": { + // "numpy": ["1.6", "1.7"], + // "six": ["", null], // test with and without six installed + // "pip+emcee": [""] // emcee is only available for install with pip. + // }, + // "env": {"ENV_VAR_1": ["val1", "val2"]}, + // "env_nobuild": {"ENV_VAR_2": ["val3", null]}, + // }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. + // - req + // Required packages + // - env + // Environment variables + // - env_nobuild + // Non-build environment variables + // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "req": {"six": null}}, // don't run without six on conda + // {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1 + // ], + // + // "include": [ + // // additional env for python3.12 + // {"python": "3.12", "req": {"numpy": "1.26"}, "env_nobuild": {"FOO": "123"}}, + // // additional env if run on windows+conda + // {"platform": "win32", "environment_type": "conda", "python": "3.12", "req": {"libpython": ""}}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. 
If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": ".asv/html", + + // The number of characters to retain in the commit hashes. + "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. + // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/benchmark.in b/benchmarks/benchmark.in new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py new file mode 100644 index 00000000..4e6ef5cb --- /dev/null +++ b/benchmarks/benchmarks.py @@ -0,0 +1,61 @@ +# Write the benchmarking functions here. +# See "Writing benchmarks" in the asv docs for more information. +import spatialdata as sd + +# class TimeSuite: +# """ +# An example benchmark that times the performance of various kinds +# of iterating over dictionaries in Python. 
+# """ +# def setup(self): +# self.d = {} +# for x in range(500): +# self.d[x] = None + +# def time_keys(self): +# for key in self.d.keys(): +# pass + +# def time_values(self): +# for value in self.d.values(): +# pass + +# def time_range(self): +# d = self.d +# for key in range(500): +# d[key] + +# class MemSuite: +# def mem_list(self): +# sdata: sd.SpatialData = sd.datasets.blobs() +# return sdata + +class SpatialBlobsSuite: + def peakmem_list(self): + sdata: sd.SpatialData = sd.datasets.blobs(n_channels=1) + return sdata + + def peakmem_list2(self): + sdata: sd.SpatialData = sd.datasets.blobs(n_channels=2) + return sdata + + +def timeraw_import_inspect(): + return """ + import spatialdata + """ + +class SpatialDataLoading: + + params = [100, 200, 300] + param_names = ["length"] + + def setup(self, length): + self.sdata = sd.datasets.blobs(length=length) + + def teardown(self, _): + del self.sdata + + def time_map_blocks(self, _): + sd.map_raster(self.sdata["blobs_image"], lambda x: x+1) + diff --git a/pyproject.toml b/pyproject.toml index 04dc3998..d95e74eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,6 +72,9 @@ test = [ "pytest-mock", "torch", ] +benchmark = [ + "asv", +] torch = [ "torch" ] From 64e6d80df2b8c45e6db61529cd2c4f6ab5b7cb5f Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 12 Nov 2024 14:47:25 +0100 Subject: [PATCH 2/9] ignore asv folder --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 328be6b2..439f8ae3 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,5 @@ _version.py # other node_modules/ + +.asv/ \ No newline at end of file From 44cb698fe1a0e029a7a2a746f434f39b1eadb081 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 12 Nov 2024 14:50:18 +0100 Subject: [PATCH 3/9] add basic benchmarks --- benchmarks/benchmarks.py | 67 +++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index 4e6ef5cb..f6b07bfe 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -2,35 +2,11 @@ # See "Writing benchmarks" in the asv docs for more information. import spatialdata as sd -# class TimeSuite: -# """ -# An example benchmark that times the performance of various kinds -# of iterating over dictionaries in Python. -# """ -# def setup(self): -# self.d = {} -# for x in range(500): -# self.d[x] = None - -# def time_keys(self): -# for key in self.d.keys(): -# pass - -# def time_values(self): -# for value in self.d.values(): -# pass - -# def time_range(self): -# d = self.d -# for key in range(500): -# d[key] - -# class MemSuite: -# def mem_list(self): -# sdata: sd.SpatialData = sd.datasets.blobs() -# return sdata - -class SpatialBlobsSuite: + +class MemorySpatialData: + # TODO: see what the memory overhead is e.g. Python interpreter... 
+ """Calculate the peak memory usage is for artificial datasets with increasing channels.""" + def peakmem_list(self): sdata: sd.SpatialData = sd.datasets.blobs(n_channels=1) return sdata @@ -41,13 +17,15 @@ def peakmem_list2(self): def timeraw_import_inspect(): + """Time the import of the spatialdata module.""" return """ import spatialdata """ -class SpatialDataLoading: +class TimeMapRaster: + """Time the.""" - params = [100, 200, 300] + params = [100, 1000] param_names = ["length"] def setup(self, length): @@ -59,3 +37,30 @@ def teardown(self, _): def time_map_blocks(self, _): sd.map_raster(self.sdata["blobs_image"], lambda x: x+1) +class TimeQueries: + + params = ([100, 1000], [True, False]) + param_names = ["length", "filter_table"] + + def setup(self, length, filter_table): + import shapely + self.sdata = sd.datasets.blobs(length=length) + self.polygon = shapely.box(0, 0, 100, 100) + + + def teardown(self, length, filter_table): + del self.sdata + + def time_query_bounding_box(self, length, filter_table): + self.sdata.query.bounding_box( + axes=["x", "y"], + min_coordinate=[0, 0], + max_coordinate=[100, 100], + target_coordinate_system="global", + filter_table=filter_table, + ) + + def time_query_polygon_box(self, length, filter_table): + sd.polygon_query(self.sdata, self.polygon, target_coordinate_system="global", + filter_table=filter_table, + ) From cdee78909e4ac460e4eb63704e2e7ca19bc80be7 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Wed, 13 Nov 2024 14:01:23 +0100 Subject: [PATCH 4/9] improve cluster_blobs creation time --- benchmarks/README.md | 30 ++++ benchmarks/benchmarks.py | 33 +++- benchmarks/utils.py | 347 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 404 insertions(+), 6 deletions(-) create mode 100644 benchmarks/README.md create mode 100644 benchmarks/utils.py diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 00000000..827e8cf3 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,30 @@ +# Benchmarking + +setup +``` +pip install -e '.[docs,benchmark]' +``` + +See napari [docs](https://napari.org/stable/developers/contributing/performance/benchmarks.html) on profiling and benchmarking for more information. + +run a specific benchmark +``` +PYTHONWARNINGS="ignore" asv run --python=same --show-stderr --quick -b time_query_bounding_box +``` +output: +``` +[100.00%] ··· ======== ========== ============ + -- filter_table + -------- ----------------------- + length True False + ======== ========== ============ + 100 89.1±5ms 85.6±0.8ms + 1000 99.0±8ms 87.7±1ms + 10000 427±10ms 92.4±2ms + ======== ========== ============ +``` + +run everything in new env +``` +asv run +``` diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index f6b07bfe..86b88054 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -2,6 +2,12 @@ # See "Writing benchmarks" in the asv docs for more information. import spatialdata as sd +try: + from .utils import cluster_blobs +except ImportError: + # TODO: remove ugly hack used for local testing + from utils import cluster_blobs + class MemorySpatialData: # TODO: see what the memory overhead is e.g. Python interpreter... 
@@ -25,11 +31,11 @@ def timeraw_import_inspect(): class TimeMapRaster: """Time the.""" - params = [100, 1000] + params = [100, 1000, 10_000] param_names = ["length"] def setup(self, length): - self.sdata = sd.datasets.blobs(length=length) + self.sdata = cluster_blobs(length=length) def teardown(self, _): del self.sdata @@ -39,13 +45,14 @@ def time_map_blocks(self, _): class TimeQueries: - params = ([100, 1000], [True, False]) + params = ([100, 1000, 10_000], [True, False]) param_names = ["length", "filter_table"] def setup(self, length, filter_table): import shapely - self.sdata = sd.datasets.blobs(length=length) - self.polygon = shapely.box(0, 0, 100, 100) + + self.sdata = cluster_blobs(length=length) + self.polygon = shapely.box(0, 0, length//2, length//2) def teardown(self, length, filter_table): @@ -55,7 +62,7 @@ def time_query_bounding_box(self, length, filter_table): self.sdata.query.bounding_box( axes=["x", "y"], min_coordinate=[0, 0], - max_coordinate=[100, 100], + max_coordinate=[length//2, length//2], target_coordinate_system="global", filter_table=filter_table, ) @@ -64,3 +71,17 @@ def time_query_polygon_box(self, length, filter_table): sd.polygon_query(self.sdata, self.polygon, target_coordinate_system="global", filter_table=filter_table, ) + + +if __name__ == "__main__": + length = 10_000 + sdata = cluster_blobs(length) + # sdata.write("tmp_test") + sdata.query.bounding_box( + axes=["x", "y"], + min_coordinate=[0, 0], + max_coordinate=[length//2, length//2], + target_coordinate_system="global", + filter_table=True, + ) + print(sdata) diff --git a/benchmarks/utils.py b/benchmarks/utils.py new file mode 100644 index 00000000..c2bba0b5 --- /dev/null +++ b/benchmarks/utils.py @@ -0,0 +1,347 @@ +import itertools +import os +from collections.abc import Sequence +from functools import lru_cache +from types import ModuleType +from typing import Callable, Literal, Optional, Union, overload + +import anndata as ad +import numpy as np +import pandas as pd +from skimage import morphology + +import spatialdata as sd +from spatialdata import SpatialData +from spatialdata.models import Image2DModel, TableModel +from spatialdata.transformations import Identity + + +def always_false(*_): + return False + + +class Skip: + def __init__( + self, + if_in_pr: Callable[..., bool] = always_false, + if_on_ci: Callable[..., bool] = always_false, + always: Callable[..., bool] = always_false, + ): + self.func_pr = if_in_pr if "PR" in os.environ else always_false + self.func_ci = if_on_ci if "CI" in os.environ else always_false + self.func_always = always + + def __contains__(self, item): + return ( + self.func_pr(*item) + or self.func_ci(*item) + or self.func_always(*item) + ) + + +def _generate_ball(radius: int, ndim: int) -> np.ndarray: + """Generate a ball of given radius and dimension. + + Parameters + ---------- + radius : int + Radius of the ball. + ndim : int + Dimension of the ball. + + Returns + ------- + ball : ndarray of uint8 + Binary array of the hyper ball. 
+ """ + if ndim == 2: + return morphology.disk(radius) + if ndim == 3: + return morphology.ball(radius) + shape = (2 * radius + 1,) * ndim + radius_sq = radius**2 + coords = np.indices(shape) - radius + return (np.sum(coords**2, axis=0) <= radius_sq).astype(np.uint8) + + +def _generate_density(radius: int, ndim: int) -> np.ndarray: + """Generate gaussian density of given radius and dimension.""" + shape = (2 * radius + 1,) * ndim + coords = np.indices(shape) - radius + dist = np.sqrt(np.sum(coords**2 / ((radius / 4) ** 2), axis=0)) + res = np.exp(-dist) + res[res < 0.02] = 0 + return res + + +def _structure_at_coordinates( + shape: tuple[int], + coordinates: np.ndarray, + structure: np.ndarray, + *, + multipliers: Sequence = itertools.repeat(1), + dtype=None, + reduce_fn: Callable[ + [np.ndarray, np.ndarray, Optional[np.ndarray]], np.ndarray + ], +): + """Update data with structure at given coordinates. + + Parameters + ---------- + data : ndarray + Array to update. + coordinates : ndarray + Coordinates of the points. The structures will be added at these + points (center). + structure : ndarray + Array with encoded structure. For example, ball (boolean) or density + (0,1) float. + multipliers : ndarray + These values are multiplied by the values in the structure before + updating the array. Can be used to generate different labels, or to + vary the intensity of floating point gaussian densities. + reduce_fn : function + Function with which to update the array at a particular position. It + should take two arrays as input and an optional output array. + """ + radius = (structure.shape[0] - 1) // 2 + data = np.zeros(shape, dtype=dtype) + + for point, value in zip(coordinates, multipliers): + slice_im, slice_ball = _get_slices_at(shape, point, radius) + reduce_fn( + data[slice_im], value * structure[slice_ball], out=data[slice_im] + ) + return data + + +def _get_slices_at(shape, point, radius): + slice_im = [] + slice_ball = [] + for i, p in enumerate(point): + slice_im.append( + slice(max(0, p - radius), min(shape[i], p + radius + 1)) + ) + ball_start = max(0, radius - p) + ball_stop = slice_im[-1].stop - slice_im[-1].start + ball_start + slice_ball.append(slice(ball_start, ball_stop)) + return tuple(slice_im), tuple(slice_ball) + + +def _update_data_with_mask(data, struct, out=None): + """Update ``data`` with ``struct`` where ``struct`` is nonzero.""" + # these branches are needed because np.where does not support + # an out= keyword argument + if out is None: + return np.where(struct, struct, data) + else: # noqa: RET505 + nz = struct != 0 + out[nz] = struct[nz] + return out + + +def _smallest_dtype(n: int) -> np.dtype: + """Find the smallest dtype that can hold n values.""" + for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]: + if np.iinfo(dtype).max >= n: + return dtype + break + else: + raise ValueError(f"{n=} is too large for any dtype.") + + +@overload +def labeled_particles( + shape: Sequence[int], + dtype: Optional[np.dtype] = None, + n: int = 144, + seed: Optional[int] = None, + return_density: Literal[False] = False, +) -> np.ndarray: ... + + +@overload +def labeled_particles( + shape: Sequence[int], + dtype: Optional[np.dtype] = None, + n: int = 144, + seed: Optional[int] = None, + return_density: Literal[True] = True, +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... 
+ + +@lru_cache +def labeled_particles( + shape: Sequence[int], + dtype: Optional[np.dtype] = None, + n: int = 144, + seed: Optional[int] = None, + return_density: bool = False, +) -> Union[np.ndarray, tuple[np.ndarray, np.ndarray, np.ndarray]]: + """Generate labeled blobs of given shape and dtype. + + Parameters + ---------- + shape : Sequence[int] + Shape of the resulting array. + dtype : Optional[np.dtype] + Dtype of the resulting array. + n : int + Number of blobs to generate. + seed : Optional[int] + Seed for the random number generator. + return_density : bool + Whether to return the density array and center coordinates. + """ + if dtype is None: + dtype = _smallest_dtype(n) + rng = np.random.default_rng(seed) + ndim = len(shape) + points = rng.integers(shape, size=(n, ndim)) + # create values from 1 to max of number of points + values = np.linspace(1, n, n, dtype=dtype) + rng.shuffle(values) + # values = rng.integers( + # np.iinfo(dtype).min + 1, np.iinfo(dtype).max, size=n, dtype=dtype + # ) + sigma = int(max(shape) / (4.0 * n ** (1 / ndim))) + ball = _generate_ball(sigma, ndim) + + labels = _structure_at_coordinates( + shape, + points, + ball, + multipliers=values, + reduce_fn=_update_data_with_mask, + dtype=dtype, + ) + + if return_density: + dens = _generate_density(sigma * 2, ndim) + densities = _structure_at_coordinates( + shape, points, dens, reduce_fn=np.maximum, dtype=np.float32 + ) + + return labels, densities, points, values + else: # noqa: RET505 + return labels + + +def run_benchmark_from_module( + module: ModuleType, klass_name: str, method_name: str +): + klass = getattr(module, klass_name) + if getattr(klass, "params", None): + skip_if = getattr(klass, "skip_params", {}) + if isinstance(klass.params[0], Sequence): + params = itertools.product(*klass.params) + else: + params = ((i,) for i in klass.params) + for param in params: + if param in skip_if: + continue + obj = klass() + try: + obj.setup(*param) + except NotImplementedError: + continue + getattr(obj, method_name)(*param) + getattr(obj, "teardown", lambda: None)() + else: + obj = klass() + try: + obj.setup() + except NotImplementedError: + return + getattr(obj, method_name)() + getattr(obj, "teardown", lambda: None)() + + +def run_benchmark(): + import argparse + import inspect + + parser = argparse.ArgumentParser(description="Run benchmark") + parser.add_argument( + "benchmark", type=str, help="Name of the benchmark to run", default="" + ) + + args = parser.parse_args() + + benchmark_selection = args.benchmark.split(".") + + # get module of parent frame + call_module = inspect.getmodule(inspect.currentframe().f_back) + run_benchmark_from_module(call_module, *benchmark_selection) + +def cluster_blobs( + length=512, + n=None, + region_key="region_key", + instance_key="instance_key", + image_name="blobs_image", + labels_name="blobs_labels", + points_name="blobs_points", + table_name="table", + coordinate_system="global", +): + """Faster `spatialdata.datasets.make_blobs` using napari.datasets code.""" + if n is None: + n = length + labels, density, points , values = labeled_particles( + (length, length), return_density=True, n=n + ) + + im_el = Image2DModel.parse( + data=density[None, ...], + dims="cyx", + transformations={coordinate_system: Identity()}, + ) + label_el = sd.models.Labels2DModel.parse( + labels, + dims="yx", + transformations={coordinate_system: Identity()} + ) + points_el = sd.models.PointsModel.parse( + points, + transformations={coordinate_system: Identity()} + ) + # TODO: generate actual values 
table in a scalable fashion + # adata = aggregate(values=points_el, by=label_el, region_key=region_key, instance_key=instance_key, target_coordinate_system=coordinate_system).tables["table"] + # make X dense as markers are limited + # generate dummy table + adata = ad.AnnData(X=np.ones((length, 10))) + adata.obs[region_key] = pd.Categorical([labels_name] * len(adata)) + # adata.obs_names = values.astype(np.uint64) + adata.obs[instance_key] = adata.obs_names.values + adata.obs.index = adata.obs.index.astype(str) + adata.obs.index.name = instance_key + # del adata.uns[TableModel.ATTRS_KEY] + table = TableModel.parse( + adata, + region=labels_name, + region_key=region_key, + instance_key=instance_key, + ) + + sdata = SpatialData( + images={ + image_name: im_el, + }, + labels={ + labels_name: label_el, + # "blobs_markers": Labels2DModel.parse(data=markers), + }, + points={points_name: points_el}, + tables={table_name: table}, + ) + # if shapes_name: + # sdata[shapes_name] = sd.to_circles(sdata[labels_name]) + # add_regionprop_features(sdata, labels_layer=labels_name, table_layer=table_name) + return sdata + + +if __name__ == "__main__": + sdata = cluster_blobs(1_000) + print(sdata) From d38e1d1942b10ec149421e87a19fdbd730aa4283 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Wed, 13 Nov 2024 15:15:26 +0100 Subject: [PATCH 5/9] update readme --- benchmarks/README.md | 18 ++++++++++++------ ...{benchmarks.py => spatialdata_benchmark.py} | 2 +- benchmarks/utils.py | 18 ++++++++++++++++-- 3 files changed, 29 insertions(+), 9 deletions(-) rename benchmarks/{benchmarks.py => spatialdata_benchmark.py} (99%) diff --git a/benchmarks/README.md b/benchmarks/README.md index 827e8cf3..17cfc90a 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -5,22 +5,28 @@ setup pip install -e '.[docs,benchmark]' ``` +In PyCharm, configure your Configuration to include the benchmark module. In Python, you can run using +``` +python -m benchmarks.spatialdata_benchmark +``` + + See napari [docs](https://napari.org/stable/developers/contributing/performance/benchmarks.html) on profiling and benchmarking for more information. 
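A possible follow-up workflow — sketched here as a suggestion rather than something this PR wires up; the subcommands are standard asv, but the revisions and the `-f` factor are illustrative and worth checking against the asv CLI docs. It compares the current commit (`HEAD`) against `main` and renders the HTML report into the `.asv/html` directory configured in `asv.conf.json`:

```
# flag regressions relative to main (factor 1.1 is an arbitrary threshold)
asv continuous -f 1.1 main HEAD -b time_query_bounding_box

# build the static report from .asv/results and serve it locally
asv publish
asv preview
```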
run a specific benchmark ``` -PYTHONWARNINGS="ignore" asv run --python=same --show-stderr --quick -b time_query_bounding_box +PYTHONWARNINGS="ignore" asv run --python=same --show-stderr -b time_query_bounding_box ``` output: ``` [100.00%] ··· ======== ========== ============ - -- filter_table + -- filter_table -------- ----------------------- - length True False + length True False ======== ========== ============ - 100 89.1±5ms 85.6±0.8ms - 1000 99.0±8ms 87.7±1ms - 10000 427±10ms 92.4±2ms + 100 89.1±5ms 85.6±0.8ms + 1000 99.0±8ms 87.7±1ms + 10000 427±10ms 92.4±2ms ======== ========== ============ ``` diff --git a/benchmarks/benchmarks.py b/benchmarks/spatialdata_benchmark.py similarity index 99% rename from benchmarks/benchmarks.py rename to benchmarks/spatialdata_benchmark.py index 86b88054..ff50f8a0 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/spatialdata_benchmark.py @@ -74,7 +74,7 @@ def time_query_polygon_box(self, length, filter_table): if __name__ == "__main__": - length = 10_000 + length = 1_000 sdata = cluster_blobs(length) # sdata.write("tmp_test") sdata.query.bounding_box( diff --git a/benchmarks/utils.py b/benchmarks/utils.py index c2bba0b5..fbd53f64 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -275,6 +275,7 @@ def run_benchmark(): call_module = inspect.getmodule(inspect.currentframe().f_back) run_benchmark_from_module(call_module, *benchmark_selection) +@lru_cache def cluster_blobs( length=512, n=None, @@ -289,9 +290,14 @@ def cluster_blobs( """Faster `spatialdata.datasets.make_blobs` using napari.datasets code.""" if n is None: n = length + # cells labels, density, points , values = labeled_particles( (length, length), return_density=True, n=n ) + # transcript points + # generate 100 transcripts per cell + rng = np.random.default_rng(None) + points_transcripts = rng.integers(length, size=(n*1000, 2)) im_el = Image2DModel.parse( data=density[None, ...], @@ -303,10 +309,15 @@ def cluster_blobs( dims="yx", transformations={coordinate_system: Identity()} ) - points_el = sd.models.PointsModel.parse( + points_cells_el = sd.models.PointsModel.parse( points, transformations={coordinate_system: Identity()} ) + points_transcripts_el = sd.models.PointsModel.parse( + points_transcripts, + transformations={coordinate_system: Identity()} + ) + # TODO: generate actual values table in a scalable fashion # adata = aggregate(values=points_el, by=label_el, region_key=region_key, instance_key=instance_key, target_coordinate_system=coordinate_system).tables["table"] # make X dense as markers are limited @@ -333,7 +344,10 @@ def cluster_blobs( labels_name: label_el, # "blobs_markers": Labels2DModel.parse(data=markers), }, - points={points_name: points_el}, + points={ + points_name: points_cells_el, + "transcripts_" + points_name: points_transcripts_el, + }, tables={table_name: table}, ) # if shapes_name: From 2de7afb8628132ef8a615457f39ebd2eafcd4bf4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Nov 2024 15:34:38 +0000 Subject: [PATCH 6/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- asv.conf.json | 4 +-- benchmarks/README.md | 7 +++- benchmarks/spatialdata_benchmark.py | 18 ++++++---- benchmarks/utils.py | 51 ++++++++--------------------- 4 files changed, 32 insertions(+), 48 deletions(-) diff --git a/asv.conf.json b/asv.conf.json index 6ef13a93..8a108478 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -37,7 +37,7 @@ // See 
asv.conf.json documentation. // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], - + // Install using default install "install_command": [ "in-dir={env_dir} python -m pip install {build_dir}[test]" @@ -175,7 +175,7 @@ // `asv` will cache results of the recent builds in each // environment, making them faster to install next time. This is // the number of builds to keep, per environment. - "build_cache_size": 2, + "build_cache_size": 2 // The commits after which the regression search in `asv publish` // should start looking for regressions. Dictionary whose keys are diff --git a/benchmarks/README.md b/benchmarks/README.md index 17cfc90a..a181feaf 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,23 +1,27 @@ # Benchmarking setup + ``` pip install -e '.[docs,benchmark]' ``` In PyCharm, configure your Configuration to include the benchmark module. In Python, you can run using + ``` python -m benchmarks.spatialdata_benchmark ``` - See napari [docs](https://napari.org/stable/developers/contributing/performance/benchmarks.html) on profiling and benchmarking for more information. run a specific benchmark + ``` PYTHONWARNINGS="ignore" asv run --python=same --show-stderr -b time_query_bounding_box ``` + output: + ``` [100.00%] ··· ======== ========== ============ -- filter_table @@ -31,6 +35,7 @@ output: ``` run everything in new env + ``` asv run ``` diff --git a/benchmarks/spatialdata_benchmark.py b/benchmarks/spatialdata_benchmark.py index ff50f8a0..8ecf51d1 100644 --- a/benchmarks/spatialdata_benchmark.py +++ b/benchmarks/spatialdata_benchmark.py @@ -28,6 +28,7 @@ def timeraw_import_inspect(): import spatialdata """ + class TimeMapRaster: """Time the.""" @@ -41,7 +42,8 @@ def teardown(self, _): del self.sdata def time_map_blocks(self, _): - sd.map_raster(self.sdata["blobs_image"], lambda x: x+1) + sd.map_raster(self.sdata["blobs_image"], lambda x: x + 1) + class TimeQueries: @@ -52,8 +54,7 @@ def setup(self, length, filter_table): import shapely self.sdata = cluster_blobs(length=length) - self.polygon = shapely.box(0, 0, length//2, length//2) - + self.polygon = shapely.box(0, 0, length // 2, length // 2) def teardown(self, length, filter_table): del self.sdata @@ -62,14 +63,17 @@ def time_query_bounding_box(self, length, filter_table): self.sdata.query.bounding_box( axes=["x", "y"], min_coordinate=[0, 0], - max_coordinate=[length//2, length//2], + max_coordinate=[length // 2, length // 2], target_coordinate_system="global", filter_table=filter_table, ) def time_query_polygon_box(self, length, filter_table): - sd.polygon_query(self.sdata, self.polygon, target_coordinate_system="global", - filter_table=filter_table, + sd.polygon_query( + self.sdata, + self.polygon, + target_coordinate_system="global", + filter_table=filter_table, ) @@ -80,7 +84,7 @@ def time_query_polygon_box(self, length, filter_table): sdata.query.bounding_box( axes=["x", "y"], min_coordinate=[0, 0], - max_coordinate=[length//2, length//2], + max_coordinate=[length // 2, length // 2], target_coordinate_system="global", filter_table=True, ) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index fbd53f64..5efd2cc4 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -32,11 +32,7 @@ def __init__( self.func_always = always def __contains__(self, item): - return ( - self.func_pr(*item) - or self.func_ci(*item) - or self.func_always(*item) - ) + return self.func_pr(*item) or self.func_ci(*item) or 
self.func_always(*item) def _generate_ball(radius: int, ndim: int) -> np.ndarray: @@ -81,9 +77,7 @@ def _structure_at_coordinates( *, multipliers: Sequence = itertools.repeat(1), dtype=None, - reduce_fn: Callable[ - [np.ndarray, np.ndarray, Optional[np.ndarray]], np.ndarray - ], + reduce_fn: Callable[[np.ndarray, np.ndarray, Optional[np.ndarray]], np.ndarray], ): """Update data with structure at given coordinates. @@ -110,9 +104,7 @@ def _structure_at_coordinates( for point, value in zip(coordinates, multipliers): slice_im, slice_ball = _get_slices_at(shape, point, radius) - reduce_fn( - data[slice_im], value * structure[slice_ball], out=data[slice_im] - ) + reduce_fn(data[slice_im], value * structure[slice_ball], out=data[slice_im]) return data @@ -120,9 +112,7 @@ def _get_slices_at(shape, point, radius): slice_im = [] slice_ball = [] for i, p in enumerate(point): - slice_im.append( - slice(max(0, p - radius), min(shape[i], p + radius + 1)) - ) + slice_im.append(slice(max(0, p - radius), min(shape[i], p + radius + 1))) ball_start = max(0, radius - p) ball_stop = slice_im[-1].stop - slice_im[-1].start + ball_start slice_ball.append(slice(ball_start, ball_stop)) @@ -219,18 +209,14 @@ def labeled_particles( if return_density: dens = _generate_density(sigma * 2, ndim) - densities = _structure_at_coordinates( - shape, points, dens, reduce_fn=np.maximum, dtype=np.float32 - ) + densities = _structure_at_coordinates(shape, points, dens, reduce_fn=np.maximum, dtype=np.float32) return labels, densities, points, values else: # noqa: RET505 return labels -def run_benchmark_from_module( - module: ModuleType, klass_name: str, method_name: str -): +def run_benchmark_from_module(module: ModuleType, klass_name: str, method_name: str): klass = getattr(module, klass_name) if getattr(klass, "params", None): skip_if = getattr(klass, "skip_params", {}) @@ -263,9 +249,7 @@ def run_benchmark(): import inspect parser = argparse.ArgumentParser(description="Run benchmark") - parser.add_argument( - "benchmark", type=str, help="Name of the benchmark to run", default="" - ) + parser.add_argument("benchmark", type=str, help="Name of the benchmark to run", default="") args = parser.parse_args() @@ -275,6 +259,7 @@ def run_benchmark(): call_module = inspect.getmodule(inspect.currentframe().f_back) run_benchmark_from_module(call_module, *benchmark_selection) + @lru_cache def cluster_blobs( length=512, @@ -291,31 +276,21 @@ def cluster_blobs( if n is None: n = length # cells - labels, density, points , values = labeled_particles( - (length, length), return_density=True, n=n - ) + labels, density, points, values = labeled_particles((length, length), return_density=True, n=n) # transcript points # generate 100 transcripts per cell rng = np.random.default_rng(None) - points_transcripts = rng.integers(length, size=(n*1000, 2)) + points_transcripts = rng.integers(length, size=(n * 1000, 2)) im_el = Image2DModel.parse( data=density[None, ...], dims="cyx", transformations={coordinate_system: Identity()}, ) - label_el = sd.models.Labels2DModel.parse( - labels, - dims="yx", - transformations={coordinate_system: Identity()} - ) - points_cells_el = sd.models.PointsModel.parse( - points, - transformations={coordinate_system: Identity()} - ) + label_el = sd.models.Labels2DModel.parse(labels, dims="yx", transformations={coordinate_system: Identity()}) + points_cells_el = sd.models.PointsModel.parse(points, transformations={coordinate_system: Identity()}) points_transcripts_el = sd.models.PointsModel.parse( - points_transcripts, - 
transformations={coordinate_system: Identity()} + points_transcripts, transformations={coordinate_system: Identity()} ) # TODO: generate actual values table in a scalable fashion From 214eee1bc4125ccf854d919461fc04ff56bfdb57 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Wed, 13 Nov 2024 16:41:11 +0100 Subject: [PATCH 7/9] lint code --- benchmarks/README.md | 25 ++++++++------ benchmarks/utils.py | 81 +++++++++++++++----------------------------- 2 files changed, 42 insertions(+), 64 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 17cfc90a..03989701 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,36 +1,41 @@ # Benchmarking setup + ``` pip install -e '.[docs,benchmark]' ``` In PyCharm, configure your Configuration to include the benchmark module. In Python, you can run using + ``` python -m benchmarks.spatialdata_benchmark ``` - See napari [docs](https://napari.org/stable/developers/contributing/performance/benchmarks.html) on profiling and benchmarking for more information. run a specific benchmark + ``` PYTHONWARNINGS="ignore" asv run --python=same --show-stderr -b time_query_bounding_box ``` + output: + ``` -[100.00%] ··· ======== ========== ============ - -- filter_table - -------- ----------------------- - length True False - ======== ========== ============ - 100 89.1±5ms 85.6±0.8ms - 1000 99.0±8ms 87.7±1ms - 10000 427±10ms 92.4±2ms - ======== ========== ============ +[100.00%] ··· ======== ============ ============ + -- filter_table + -------- ------------------------- + length True False + ======== ============ ============ + 100 191±2ms 185±2ms + 1000 399±4ms 382±7ms + 10000 2.67±0.02s 2.18±0.01s + ======== ============ ============ ``` run everything in new env + ``` asv run ``` diff --git a/benchmarks/utils.py b/benchmarks/utils.py index fbd53f64..8d018a4e 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -32,11 +32,7 @@ def __init__( self.func_always = always def __contains__(self, item): - return ( - self.func_pr(*item) - or self.func_ci(*item) - or self.func_always(*item) - ) + return self.func_pr(*item) or self.func_ci(*item) or self.func_always(*item) def _generate_ball(radius: int, ndim: int) -> np.ndarray: @@ -81,9 +77,7 @@ def _structure_at_coordinates( *, multipliers: Sequence = itertools.repeat(1), dtype=None, - reduce_fn: Callable[ - [np.ndarray, np.ndarray, Optional[np.ndarray]], np.ndarray - ], + reduce_fn: Callable[[np.ndarray, np.ndarray, Optional[np.ndarray]], np.ndarray], ): """Update data with structure at given coordinates. 
@@ -110,9 +104,7 @@ def _structure_at_coordinates( for point, value in zip(coordinates, multipliers): slice_im, slice_ball = _get_slices_at(shape, point, radius) - reduce_fn( - data[slice_im], value * structure[slice_ball], out=data[slice_im] - ) + reduce_fn(data[slice_im], value * structure[slice_ball], out=data[slice_im]) return data @@ -120,9 +112,7 @@ def _get_slices_at(shape, point, radius): slice_im = [] slice_ball = [] for i, p in enumerate(point): - slice_im.append( - slice(max(0, p - radius), min(shape[i], p + radius + 1)) - ) + slice_im.append(slice(max(0, p - radius), min(shape[i], p + radius + 1))) ball_start = max(0, radius - p) ball_stop = slice_im[-1].stop - slice_im[-1].start + ball_start slice_ball.append(slice(ball_start, ball_stop)) @@ -219,18 +209,14 @@ def labeled_particles( if return_density: dens = _generate_density(sigma * 2, ndim) - densities = _structure_at_coordinates( - shape, points, dens, reduce_fn=np.maximum, dtype=np.float32 - ) + densities = _structure_at_coordinates(shape, points, dens, reduce_fn=np.maximum, dtype=np.float32) return labels, densities, points, values else: # noqa: RET505 return labels -def run_benchmark_from_module( - module: ModuleType, klass_name: str, method_name: str -): +def run_benchmark_from_module(module: ModuleType, klass_name: str, method_name: str): klass = getattr(module, klass_name) if getattr(klass, "params", None): skip_if = getattr(klass, "skip_params", {}) @@ -263,9 +249,7 @@ def run_benchmark(): import inspect parser = argparse.ArgumentParser(description="Run benchmark") - parser.add_argument( - "benchmark", type=str, help="Name of the benchmark to run", default="" - ) + parser.add_argument("benchmark", type=str, help="Name of the benchmark to run", default="") args = parser.parse_args() @@ -275,52 +259,34 @@ def run_benchmark(): call_module = inspect.getmodule(inspect.currentframe().f_back) run_benchmark_from_module(call_module, *benchmark_selection) + @lru_cache def cluster_blobs( length=512, - n=None, + n_cells=None, region_key="region_key", instance_key="instance_key", image_name="blobs_image", labels_name="blobs_labels", points_name="blobs_points", + n_transcripts_per_cell=None, table_name="table", coordinate_system="global", ): """Faster `spatialdata.datasets.make_blobs` using napari.datasets code.""" - if n is None: - n = length + if n_cells is None: + n_cells = length # cells - labels, density, points , values = labeled_particles( - (length, length), return_density=True, n=n - ) - # transcript points - # generate 100 transcripts per cell - rng = np.random.default_rng(None) - points_transcripts = rng.integers(length, size=(n*1000, 2)) + labels, density, points, values = labeled_particles((length, length), return_density=True, n=n_cells) im_el = Image2DModel.parse( data=density[None, ...], dims="cyx", transformations={coordinate_system: Identity()}, ) - label_el = sd.models.Labels2DModel.parse( - labels, - dims="yx", - transformations={coordinate_system: Identity()} - ) - points_cells_el = sd.models.PointsModel.parse( - points, - transformations={coordinate_system: Identity()} - ) - points_transcripts_el = sd.models.PointsModel.parse( - points_transcripts, - transformations={coordinate_system: Identity()} - ) + label_el = sd.models.Labels2DModel.parse(labels, dims="yx", transformations={coordinate_system: Identity()}) + points_cells_el = sd.models.PointsModel.parse(points, transformations={coordinate_system: Identity()}) - # TODO: generate actual values table in a scalable fashion - # adata = 
aggregate(values=points_el, by=label_el, region_key=region_key, instance_key=instance_key, target_coordinate_system=coordinate_system).tables["table"] - # make X dense as markers are limited # generate dummy table adata = ad.AnnData(X=np.ones((length, 10))) adata.obs[region_key] = pd.Categorical([labels_name] * len(adata)) @@ -342,14 +308,21 @@ def cluster_blobs( }, labels={ labels_name: label_el, - # "blobs_markers": Labels2DModel.parse(data=markers), - }, - points={ - points_name: points_cells_el, - "transcripts_" + points_name: points_transcripts_el, }, + points={points_name: points_cells_el}, tables={table_name: table}, ) + + if n_transcripts_per_cell: + # transcript points + # generate 100 transcripts per cell + rng = np.random.default_rng(None) + points_transcripts = rng.integers(length, size=(n_cells * n_transcripts_per_cell, 2)) + points_transcripts_el = sd.models.PointsModel.parse( + points_transcripts, transformations={coordinate_system: Identity()} + ) + sdata["transcripts_" + points_name] = points_transcripts_el + # if shapes_name: # sdata[shapes_name] = sd.to_circles(sdata[labels_name]) # add_regionprop_features(sdata, labels_layer=labels_name, table_layer=table_name) From ac8091396838e28bcea05a28554184b607ae6ef8 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Wed, 13 Nov 2024 16:59:46 +0100 Subject: [PATCH 8/9] pass pre-commit by ignore benchmark files. --- benchmarks/spatialdata_benchmark.py | 8 +++----- benchmarks/utils.py | 2 ++ pyproject.toml | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/spatialdata_benchmark.py b/benchmarks/spatialdata_benchmark.py index 8ecf51d1..4b9c3a99 100644 --- a/benchmarks/spatialdata_benchmark.py +++ b/benchmarks/spatialdata_benchmark.py @@ -1,12 +1,10 @@ +# type: ignore + # Write the benchmarking functions here. # See "Writing benchmarks" in the asv docs for more information. 
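For context, summarised from the asv docs rather than from this diff: asv discovers benchmarks by name prefix — `time_*` methods are timed, `peakmem_*` methods report the peak memory of the process, `timeraw_*` functions return a code string executed in a fresh interpreter, and `params`/`param_names` define a grid whose values are passed to `setup`, `teardown` and every benchmark method. A minimal sketch of the pattern used throughout this file (the class and method names below are invented for illustration):

```
class TimeExampleSuite:
    # one benchmark run per value in params; the value is passed to setup
    # and to every benchmark method
    params = [10, 1000]
    param_names = ["n"]

    def setup(self, n):
        self.data = list(range(n))

    def time_sum(self, n):
        # asv times only this method body; setup cost is excluded
        sum(self.data)
```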
import spatialdata as sd -try: - from .utils import cluster_blobs -except ImportError: - # TODO: remove ugly hack used for local testing - from utils import cluster_blobs +from .utils import cluster_blobs class MemorySpatialData: diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 8d018a4e..5f62ff0d 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -1,3 +1,5 @@ +# type: ignore + import itertools import os from collections.abc import Sequence diff --git a/pyproject.toml b/pyproject.toml index d95e74eb..2f725e04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -207,6 +207,7 @@ convention = "numpy" "src/spatialdata/dataloader/datasets.py" = ["D101"] "tests/test_models/test_models.py" = ["NPY002"] "tests/conftest.py"= ["E402"] + "benchmarks/*" = ["ALL"] # pyupgrade typing rewrite TODO: remove at some point from per-file ignore From 81b3bfb552058e442cfffa37e271abdd4141ff0c Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Wed, 13 Nov 2024 17:18:21 +0100 Subject: [PATCH 9/9] add n_transcripts_per_cell to benchmark --- benchmarks/README.md | 18 +++++++++--------- benchmarks/spatialdata_benchmark.py | 14 +++++++------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 03989701..704050f4 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -23,15 +23,15 @@ PYTHONWARNINGS="ignore" asv run --python=same --show-stderr -b time_query_boundi output: ``` -[100.00%] ··· ======== ============ ============ - -- filter_table - -------- ------------------------- - length True False - ======== ============ ============ - 100 191±2ms 185±2ms - 1000 399±4ms 382±7ms - 10000 2.67±0.02s 2.18±0.01s - ======== ============ ============ +[50.00%] ··· ======== ============ ============== ============= =============== + -- filter_table / n_transcripts_per_cell + -------- --------------------------------------------------------- + length True / 100 True / 10000 False / 100 False / 10000 + ======== ============ ============== ============= =============== + 100 813±0ms 1.09±0s 803±0ms 980±0ms + 1000 799±0ms 2.96±0s 789±0ms 2.81±0s + 10000 1.32±0s 24.4±0s 962±0ms 21.5±0s + ======== ============ ============== ============= =============== ``` run everything in new env diff --git a/benchmarks/spatialdata_benchmark.py b/benchmarks/spatialdata_benchmark.py index 4b9c3a99..62d95cf1 100644 --- a/benchmarks/spatialdata_benchmark.py +++ b/benchmarks/spatialdata_benchmark.py @@ -45,19 +45,19 @@ def time_map_blocks(self, _): class TimeQueries: - params = ([100, 1000, 10_000], [True, False]) - param_names = ["length", "filter_table"] + params = ([100, 1000, 10_000], [True, False], [100, 10_000]) + param_names = ["length", "filter_table", "n_transcripts_per_cell"] - def setup(self, length, filter_table): + def setup(self, length, filter_table, n_transcripts_per_cell): import shapely - self.sdata = cluster_blobs(length=length) + self.sdata = cluster_blobs(length=length, n_transcripts_per_cell=n_transcripts_per_cell) self.polygon = shapely.box(0, 0, length // 2, length // 2) - def teardown(self, length, filter_table): + def teardown(self, length, filter_table, n_transcripts_per_cell): del self.sdata - def time_query_bounding_box(self, length, filter_table): + def time_query_bounding_box(self, length, filter_table, n_transcripts_per_cell): self.sdata.query.bounding_box( axes=["x", "y"], min_coordinate=[0, 0], @@ -66,7 +66,7 @@ def time_query_bounding_box(self, length, filter_table): filter_table=filter_table, ) - def 
time_query_polygon_box(self, length, filter_table): + def time_query_polygon_box(self, length, filter_table, n_transcripts_per_cell): sd.polygon_query( self.sdata, self.polygon,