From ab670b70cdb54c217c3a80f42ac793ee2ad20ffe Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 12 Nov 2024 13:32:53 +0100 Subject: [PATCH 1/9] init for asv --- asv.conf.json | 203 +++++++++++++++++++++++++++++++++++++++ benchmarks/__init__.py | 0 benchmarks/benchmark.in | 0 benchmarks/benchmarks.py | 61 ++++++++++++ pyproject.toml | 3 + 5 files changed, 267 insertions(+) create mode 100644 asv.conf.json create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/benchmark.in create mode 100644 benchmarks/benchmarks.py diff --git a/asv.conf.json b/asv.conf.json new file mode 100644 index 00000000..6ef13a93 --- /dev/null +++ b/asv.conf.json @@ -0,0 +1,203 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "spatialdata", + + // The project's homepage + "project_url": "https://spatialdata.scverse.org/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": ".", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building the project. + // See asv.conf.json documentation. + // To build the package using pyproject.toml (PEP518), uncomment the following lines + // "build_command": [ + // "python -m pip install build", + // "python -m build", + // "python -mpip wheel -w {build_cache_dir} {build_dir}" + // ], + // To build the package using setuptools and a setup.py file, uncomment the following lines + // "build_command": [ + // "python setup.py build", + // "python -mpip wheel -w {build_cache_dir} {build_dir}" + // ], + "build_command": ["python -V"], // skip build stage + + // Customizable commands for installing and uninstalling the project. + // See asv.conf.json documentation. + // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], + // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + + // Install using default install + "install_command": [ + "in-dir={env_dir} python -m pip install {build_dir}[test]" + ], + "uninstall_command": [ + "in-dir={env_dir} python -m pip uninstall -y {project}" + ], + + // List of branches to benchmark. If not provided, defaults to "main" + // (for git) or "default" (for mercurial). + "branches": ["main"], // for git + // "branches": ["default"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + // "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv", "mamba" (above 3.8) + // or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "virtualenv", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + // "install_timeout": 600, + + // the base URL to show a commit for the project. + // "show_commit_url": "http://github.com/owner/project/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. 
+ "pythons": ["3.12"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + // "conda_channels": ["conda-forge", "defaults"], + + // A conda environment file that is used for environment creation. + // "conda_environment_file": "environment.yml", + + // The matrix of dependencies to test. Each key of the "req" + // requirements dictionary is the name of a package (in PyPI) and + // the values are version numbers. An empty list or empty string + // indicates to just test against the default (latest) + // version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed + // via pip (with all the conda available packages installed first, + // followed by the pip installed packages). + // + // The ``@env`` and ``@env_nobuild`` keys contain the matrix of + // environment variables to pass to build and benchmark commands. + // An environment will be created for every combination of the + // cartesian product of the "@env" variables in this matrix. + // Variables in "@env_nobuild" will be passed to every environment + // during the benchmark phase, but will not trigger creation of + // new environments. A value of ``null`` means that the variable + // will not be set for the current combination. + // + // "matrix": { + // "req": { + // "numpy": ["1.6", "1.7"], + // "six": ["", null], // test with and without six installed + // "pip+emcee": [""] // emcee is only available for install with pip. + // }, + // "env": {"ENV_VAR_1": ["val1", "val2"]}, + // "env_nobuild": {"ENV_VAR_2": ["val3", null]}, + // }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. + // - req + // Required packages + // - env + // Environment variables + // - env_nobuild + // Non-build environment variables + // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "req": {"six": null}}, // don't run without six on conda + // {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1 + // ], + // + // "include": [ + // // additional env for python3.12 + // {"python": "3.12", "req": {"numpy": "1.26"}, "env_nobuild": {"FOO": "123"}}, + // // additional env if run on windows+conda + // {"platform": "win32", "environment_type": "conda", "python": "3.12", "req": {"libpython": ""}}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. 
If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": ".asv/html", + + // The number of characters to retain in the commit hashes. + "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. + // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/benchmark.in b/benchmarks/benchmark.in new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py new file mode 100644 index 00000000..4e6ef5cb --- /dev/null +++ b/benchmarks/benchmarks.py @@ -0,0 +1,61 @@ +# Write the benchmarking functions here. +# See "Writing benchmarks" in the asv docs for more information. +import spatialdata as sd + +# class TimeSuite: +# """ +# An example benchmark that times the performance of various kinds +# of iterating over dictionaries in Python. 
+# """ +# def setup(self): +# self.d = {} +# for x in range(500): +# self.d[x] = None + +# def time_keys(self): +# for key in self.d.keys(): +# pass + +# def time_values(self): +# for value in self.d.values(): +# pass + +# def time_range(self): +# d = self.d +# for key in range(500): +# d[key] + +# class MemSuite: +# def mem_list(self): +# sdata: sd.SpatialData = sd.datasets.blobs() +# return sdata + +class SpatialBlobsSuite: + def peakmem_list(self): + sdata: sd.SpatialData = sd.datasets.blobs(n_channels=1) + return sdata + + def peakmem_list2(self): + sdata: sd.SpatialData = sd.datasets.blobs(n_channels=2) + return sdata + + +def timeraw_import_inspect(): + return """ + import spatialdata + """ + +class SpatialDataLoading: + + params = [100, 200, 300] + param_names = ["length"] + + def setup(self, length): + self.sdata = sd.datasets.blobs(length=length) + + def teardown(self, _): + del self.sdata + + def time_map_blocks(self, _): + sd.map_raster(self.sdata["blobs_image"], lambda x: x+1) + diff --git a/pyproject.toml b/pyproject.toml index 04dc3998..d95e74eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,6 +72,9 @@ test = [ "pytest-mock", "torch", ] +benchmark = [ + "asv", +] torch = [ "torch" ] From 64e6d80df2b8c45e6db61529cd2c4f6ab5b7cb5f Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 12 Nov 2024 14:47:25 +0100 Subject: [PATCH 2/9] ignore asv folder --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 328be6b2..439f8ae3 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,5 @@ _version.py # other node_modules/ + +.asv/ \ No newline at end of file From 44cb698fe1a0e029a7a2a746f434f39b1eadb081 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Tue, 12 Nov 2024 14:50:18 +0100 Subject: [PATCH 3/9] add basic benchmarks --- benchmarks/benchmarks.py | 67 +++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index 4e6ef5cb..f6b07bfe 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -2,35 +2,11 @@ # See "Writing benchmarks" in the asv docs for more information. import spatialdata as sd -# class TimeSuite: -# """ -# An example benchmark that times the performance of various kinds -# of iterating over dictionaries in Python. -# """ -# def setup(self): -# self.d = {} -# for x in range(500): -# self.d[x] = None - -# def time_keys(self): -# for key in self.d.keys(): -# pass - -# def time_values(self): -# for value in self.d.values(): -# pass - -# def time_range(self): -# d = self.d -# for key in range(500): -# d[key] - -# class MemSuite: -# def mem_list(self): -# sdata: sd.SpatialData = sd.datasets.blobs() -# return sdata - -class SpatialBlobsSuite: + +class MemorySpatialData: + # TODO: see what the memory overhead is e.g. Python interpreter... 
+ """Calculate the peak memory usage is for artificial datasets with increasing channels.""" + def peakmem_list(self): sdata: sd.SpatialData = sd.datasets.blobs(n_channels=1) return sdata @@ -41,13 +17,15 @@ def peakmem_list2(self): def timeraw_import_inspect(): + """Time the import of the spatialdata module.""" return """ import spatialdata """ -class SpatialDataLoading: +class TimeMapRaster: + """Time the.""" - params = [100, 200, 300] + params = [100, 1000] param_names = ["length"] def setup(self, length): @@ -59,3 +37,30 @@ def teardown(self, _): def time_map_blocks(self, _): sd.map_raster(self.sdata["blobs_image"], lambda x: x+1) +class TimeQueries: + + params = ([100, 1000], [True, False]) + param_names = ["length", "filter_table"] + + def setup(self, length, filter_table): + import shapely + self.sdata = sd.datasets.blobs(length=length) + self.polygon = shapely.box(0, 0, 100, 100) + + + def teardown(self, length, filter_table): + del self.sdata + + def time_query_bounding_box(self, length, filter_table): + self.sdata.query.bounding_box( + axes=["x", "y"], + min_coordinate=[0, 0], + max_coordinate=[100, 100], + target_coordinate_system="global", + filter_table=filter_table, + ) + + def time_query_polygon_box(self, length, filter_table): + sd.polygon_query(self.sdata, self.polygon, target_coordinate_system="global", + filter_table=filter_table, + ) From cdee78909e4ac460e4eb63704e2e7ca19bc80be7 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Wed, 13 Nov 2024 14:01:23 +0100 Subject: [PATCH 4/9] improve cluster_blobs creation time --- benchmarks/README.md | 30 ++++ benchmarks/benchmarks.py | 33 +++- benchmarks/utils.py | 347 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 404 insertions(+), 6 deletions(-) create mode 100644 benchmarks/README.md create mode 100644 benchmarks/utils.py diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 00000000..827e8cf3 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,30 @@ +# Benchmarking + +setup +``` +pip install -e '.[docs,benchmark]' +``` + +See napari [docs](https://napari.org/stable/developers/contributing/performance/benchmarks.html) on profiling and benchmarking for more information. + +run a specific benchmark +``` +PYTHONWARNINGS="ignore" asv run --python=same --show-stderr --quick -b time_query_bounding_box +``` +output: +``` +[100.00%] ··· ======== ========== ============ + -- filter_table + -------- ----------------------- + length True False + ======== ========== ============ + 100 89.1±5ms 85.6±0.8ms + 1000 99.0±8ms 87.7±1ms + 10000 427±10ms 92.4±2ms + ======== ========== ============ +``` + +run everything in new env +``` +asv run +``` diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index f6b07bfe..86b88054 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -2,6 +2,12 @@ # See "Writing benchmarks" in the asv docs for more information. import spatialdata as sd +try: + from .utils import cluster_blobs +except ImportError: + # TODO: remove ugly hack used for local testing + from utils import cluster_blobs + class MemorySpatialData: # TODO: see what the memory overhead is e.g. Python interpreter... 
@@ -25,11 +31,11 @@ def timeraw_import_inspect(): class TimeMapRaster: """Time the.""" - params = [100, 1000] + params = [100, 1000, 10_000] param_names = ["length"] def setup(self, length): - self.sdata = sd.datasets.blobs(length=length) + self.sdata = cluster_blobs(length=length) def teardown(self, _): del self.sdata @@ -39,13 +45,14 @@ def time_map_blocks(self, _): class TimeQueries: - params = ([100, 1000], [True, False]) + params = ([100, 1000, 10_000], [True, False]) param_names = ["length", "filter_table"] def setup(self, length, filter_table): import shapely - self.sdata = sd.datasets.blobs(length=length) - self.polygon = shapely.box(0, 0, 100, 100) + + self.sdata = cluster_blobs(length=length) + self.polygon = shapely.box(0, 0, length//2, length//2) def teardown(self, length, filter_table): @@ -55,7 +62,7 @@ def time_query_bounding_box(self, length, filter_table): self.sdata.query.bounding_box( axes=["x", "y"], min_coordinate=[0, 0], - max_coordinate=[100, 100], + max_coordinate=[length//2, length//2], target_coordinate_system="global", filter_table=filter_table, ) @@ -64,3 +71,17 @@ def time_query_polygon_box(self, length, filter_table): sd.polygon_query(self.sdata, self.polygon, target_coordinate_system="global", filter_table=filter_table, ) + + +if __name__ == "__main__": + length = 10_000 + sdata = cluster_blobs(length) + # sdata.write("tmp_test") + sdata.query.bounding_box( + axes=["x", "y"], + min_coordinate=[0, 0], + max_coordinate=[length//2, length//2], + target_coordinate_system="global", + filter_table=True, + ) + print(sdata) diff --git a/benchmarks/utils.py b/benchmarks/utils.py new file mode 100644 index 00000000..c2bba0b5 --- /dev/null +++ b/benchmarks/utils.py @@ -0,0 +1,347 @@ +import itertools +import os +from collections.abc import Sequence +from functools import lru_cache +from types import ModuleType +from typing import Callable, Literal, Optional, Union, overload + +import anndata as ad +import numpy as np +import pandas as pd +from skimage import morphology + +import spatialdata as sd +from spatialdata import SpatialData +from spatialdata.models import Image2DModel, TableModel +from spatialdata.transformations import Identity + + +def always_false(*_): + return False + + +class Skip: + def __init__( + self, + if_in_pr: Callable[..., bool] = always_false, + if_on_ci: Callable[..., bool] = always_false, + always: Callable[..., bool] = always_false, + ): + self.func_pr = if_in_pr if "PR" in os.environ else always_false + self.func_ci = if_on_ci if "CI" in os.environ else always_false + self.func_always = always + + def __contains__(self, item): + return ( + self.func_pr(*item) + or self.func_ci(*item) + or self.func_always(*item) + ) + + +def _generate_ball(radius: int, ndim: int) -> np.ndarray: + """Generate a ball of given radius and dimension. + + Parameters + ---------- + radius : int + Radius of the ball. + ndim : int + Dimension of the ball. + + Returns + ------- + ball : ndarray of uint8 + Binary array of the hyper ball. 
+ """ + if ndim == 2: + return morphology.disk(radius) + if ndim == 3: + return morphology.ball(radius) + shape = (2 * radius + 1,) * ndim + radius_sq = radius**2 + coords = np.indices(shape) - radius + return (np.sum(coords**2, axis=0) <= radius_sq).astype(np.uint8) + + +def _generate_density(radius: int, ndim: int) -> np.ndarray: + """Generate gaussian density of given radius and dimension.""" + shape = (2 * radius + 1,) * ndim + coords = np.indices(shape) - radius + dist = np.sqrt(np.sum(coords**2 / ((radius / 4) ** 2), axis=0)) + res = np.exp(-dist) + res[res < 0.02] = 0 + return res + + +def _structure_at_coordinates( + shape: tuple[int], + coordinates: np.ndarray, + structure: np.ndarray, + *, + multipliers: Sequence = itertools.repeat(1), + dtype=None, + reduce_fn: Callable[ + [np.ndarray, np.ndarray, Optional[np.ndarray]], np.ndarray + ], +): + """Update data with structure at given coordinates. + + Parameters + ---------- + data : ndarray + Array to update. + coordinates : ndarray + Coordinates of the points. The structures will be added at these + points (center). + structure : ndarray + Array with encoded structure. For example, ball (boolean) or density + (0,1) float. + multipliers : ndarray + These values are multiplied by the values in the structure before + updating the array. Can be used to generate different labels, or to + vary the intensity of floating point gaussian densities. + reduce_fn : function + Function with which to update the array at a particular position. It + should take two arrays as input and an optional output array. + """ + radius = (structure.shape[0] - 1) // 2 + data = np.zeros(shape, dtype=dtype) + + for point, value in zip(coordinates, multipliers): + slice_im, slice_ball = _get_slices_at(shape, point, radius) + reduce_fn( + data[slice_im], value * structure[slice_ball], out=data[slice_im] + ) + return data + + +def _get_slices_at(shape, point, radius): + slice_im = [] + slice_ball = [] + for i, p in enumerate(point): + slice_im.append( + slice(max(0, p - radius), min(shape[i], p + radius + 1)) + ) + ball_start = max(0, radius - p) + ball_stop = slice_im[-1].stop - slice_im[-1].start + ball_start + slice_ball.append(slice(ball_start, ball_stop)) + return tuple(slice_im), tuple(slice_ball) + + +def _update_data_with_mask(data, struct, out=None): + """Update ``data`` with ``struct`` where ``struct`` is nonzero.""" + # these branches are needed because np.where does not support + # an out= keyword argument + if out is None: + return np.where(struct, struct, data) + else: # noqa: RET505 + nz = struct != 0 + out[nz] = struct[nz] + return out + + +def _smallest_dtype(n: int) -> np.dtype: + """Find the smallest dtype that can hold n values.""" + for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]: + if np.iinfo(dtype).max >= n: + return dtype + break + else: + raise ValueError(f"{n=} is too large for any dtype.") + + +@overload +def labeled_particles( + shape: Sequence[int], + dtype: Optional[np.dtype] = None, + n: int = 144, + seed: Optional[int] = None, + return_density: Literal[False] = False, +) -> np.ndarray: ... + + +@overload +def labeled_particles( + shape: Sequence[int], + dtype: Optional[np.dtype] = None, + n: int = 144, + seed: Optional[int] = None, + return_density: Literal[True] = True, +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... 
+ + +@lru_cache +def labeled_particles( + shape: Sequence[int], + dtype: Optional[np.dtype] = None, + n: int = 144, + seed: Optional[int] = None, + return_density: bool = False, +) -> Union[np.ndarray, tuple[np.ndarray, np.ndarray, np.ndarray]]: + """Generate labeled blobs of given shape and dtype. + + Parameters + ---------- + shape : Sequence[int] + Shape of the resulting array. + dtype : Optional[np.dtype] + Dtype of the resulting array. + n : int + Number of blobs to generate. + seed : Optional[int] + Seed for the random number generator. + return_density : bool + Whether to return the density array and center coordinates. + """ + if dtype is None: + dtype = _smallest_dtype(n) + rng = np.random.default_rng(seed) + ndim = len(shape) + points = rng.integers(shape, size=(n, ndim)) + # create values from 1 to max of number of points + values = np.linspace(1, n, n, dtype=dtype) + rng.shuffle(values) + # values = rng.integers( + # np.iinfo(dtype).min + 1, np.iinfo(dtype).max, size=n, dtype=dtype + # ) + sigma = int(max(shape) / (4.0 * n ** (1 / ndim))) + ball = _generate_ball(sigma, ndim) + + labels = _structure_at_coordinates( + shape, + points, + ball, + multipliers=values, + reduce_fn=_update_data_with_mask, + dtype=dtype, + ) + + if return_density: + dens = _generate_density(sigma * 2, ndim) + densities = _structure_at_coordinates( + shape, points, dens, reduce_fn=np.maximum, dtype=np.float32 + ) + + return labels, densities, points, values + else: # noqa: RET505 + return labels + + +def run_benchmark_from_module( + module: ModuleType, klass_name: str, method_name: str +): + klass = getattr(module, klass_name) + if getattr(klass, "params", None): + skip_if = getattr(klass, "skip_params", {}) + if isinstance(klass.params[0], Sequence): + params = itertools.product(*klass.params) + else: + params = ((i,) for i in klass.params) + for param in params: + if param in skip_if: + continue + obj = klass() + try: + obj.setup(*param) + except NotImplementedError: + continue + getattr(obj, method_name)(*param) + getattr(obj, "teardown", lambda: None)() + else: + obj = klass() + try: + obj.setup() + except NotImplementedError: + return + getattr(obj, method_name)() + getattr(obj, "teardown", lambda: None)() + + +def run_benchmark(): + import argparse + import inspect + + parser = argparse.ArgumentParser(description="Run benchmark") + parser.add_argument( + "benchmark", type=str, help="Name of the benchmark to run", default="" + ) + + args = parser.parse_args() + + benchmark_selection = args.benchmark.split(".") + + # get module of parent frame + call_module = inspect.getmodule(inspect.currentframe().f_back) + run_benchmark_from_module(call_module, *benchmark_selection) + +def cluster_blobs( + length=512, + n=None, + region_key="region_key", + instance_key="instance_key", + image_name="blobs_image", + labels_name="blobs_labels", + points_name="blobs_points", + table_name="table", + coordinate_system="global", +): + """Faster `spatialdata.datasets.make_blobs` using napari.datasets code.""" + if n is None: + n = length + labels, density, points , values = labeled_particles( + (length, length), return_density=True, n=n + ) + + im_el = Image2DModel.parse( + data=density[None, ...], + dims="cyx", + transformations={coordinate_system: Identity()}, + ) + label_el = sd.models.Labels2DModel.parse( + labels, + dims="yx", + transformations={coordinate_system: Identity()} + ) + points_el = sd.models.PointsModel.parse( + points, + transformations={coordinate_system: Identity()} + ) + # TODO: generate actual values 
table in a scalable fashion + # adata = aggregate(values=points_el, by=label_el, region_key=region_key, instance_key=instance_key, target_coordinate_system=coordinate_system).tables["table"] + # make X dense as markers are limited + # generate dummy table + adata = ad.AnnData(X=np.ones((length, 10))) + adata.obs[region_key] = pd.Categorical([labels_name] * len(adata)) + # adata.obs_names = values.astype(np.uint64) + adata.obs[instance_key] = adata.obs_names.values + adata.obs.index = adata.obs.index.astype(str) + adata.obs.index.name = instance_key + # del adata.uns[TableModel.ATTRS_KEY] + table = TableModel.parse( + adata, + region=labels_name, + region_key=region_key, + instance_key=instance_key, + ) + + sdata = SpatialData( + images={ + image_name: im_el, + }, + labels={ + labels_name: label_el, + # "blobs_markers": Labels2DModel.parse(data=markers), + }, + points={points_name: points_el}, + tables={table_name: table}, + ) + # if shapes_name: + # sdata[shapes_name] = sd.to_circles(sdata[labels_name]) + # add_regionprop_features(sdata, labels_layer=labels_name, table_layer=table_name) + return sdata + + +if __name__ == "__main__": + sdata = cluster_blobs(1_000) + print(sdata) From d38e1d1942b10ec149421e87a19fdbd730aa4283 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Wed, 13 Nov 2024 15:15:26 +0100 Subject: [PATCH 5/9] update readme --- benchmarks/README.md | 18 ++++++++++++------ ...{benchmarks.py => spatialdata_benchmark.py} | 2 +- benchmarks/utils.py | 18 ++++++++++++++++-- 3 files changed, 29 insertions(+), 9 deletions(-) rename benchmarks/{benchmarks.py => spatialdata_benchmark.py} (99%) diff --git a/benchmarks/README.md b/benchmarks/README.md index 827e8cf3..17cfc90a 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -5,22 +5,28 @@ setup pip install -e '.[docs,benchmark]' ``` +In PyCharm, configure your Configuration to include the benchmark module. In Python, you can run using +``` +python -m benchmarks.spatialdata_benchmark +``` + + See napari [docs](https://napari.org/stable/developers/contributing/performance/benchmarks.html) on profiling and benchmarking for more information. 
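A possible follow-up workflow — sketched here as a suggestion rather than something this PR wires up; the subcommands are standard asv, but the revisions and the `-f` factor are illustrative and worth checking against the asv CLI docs. It compares the current commit (`HEAD`) against `main` and renders the HTML report into the `.asv/html` directory configured in `asv.conf.json`:

```
# flag regressions relative to main (factor 1.1 is an arbitrary threshold)
asv continuous -f 1.1 main HEAD -b time_query_bounding_box

# build the static report from .asv/results and serve it locally
asv publish
asv preview
```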
run a specific benchmark ``` -PYTHONWARNINGS="ignore" asv run --python=same --show-stderr --quick -b time_query_bounding_box +PYTHONWARNINGS="ignore" asv run --python=same --show-stderr -b time_query_bounding_box ``` output: ``` [100.00%] ··· ======== ========== ============ - -- filter_table + -- filter_table -------- ----------------------- - length True False + length True False ======== ========== ============ - 100 89.1±5ms 85.6±0.8ms - 1000 99.0±8ms 87.7±1ms - 10000 427±10ms 92.4±2ms + 100 89.1±5ms 85.6±0.8ms + 1000 99.0±8ms 87.7±1ms + 10000 427±10ms 92.4±2ms ======== ========== ============ ``` diff --git a/benchmarks/benchmarks.py b/benchmarks/spatialdata_benchmark.py similarity index 99% rename from benchmarks/benchmarks.py rename to benchmarks/spatialdata_benchmark.py index 86b88054..ff50f8a0 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/spatialdata_benchmark.py @@ -74,7 +74,7 @@ def time_query_polygon_box(self, length, filter_table): if __name__ == "__main__": - length = 10_000 + length = 1_000 sdata = cluster_blobs(length) # sdata.write("tmp_test") sdata.query.bounding_box( diff --git a/benchmarks/utils.py b/benchmarks/utils.py index c2bba0b5..fbd53f64 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -275,6 +275,7 @@ def run_benchmark(): call_module = inspect.getmodule(inspect.currentframe().f_back) run_benchmark_from_module(call_module, *benchmark_selection) +@lru_cache def cluster_blobs( length=512, n=None, @@ -289,9 +290,14 @@ def cluster_blobs( """Faster `spatialdata.datasets.make_blobs` using napari.datasets code.""" if n is None: n = length + # cells labels, density, points , values = labeled_particles( (length, length), return_density=True, n=n ) + # transcript points + # generate 100 transcripts per cell + rng = np.random.default_rng(None) + points_transcripts = rng.integers(length, size=(n*1000, 2)) im_el = Image2DModel.parse( data=density[None, ...], @@ -303,10 +309,15 @@ def cluster_blobs( dims="yx", transformations={coordinate_system: Identity()} ) - points_el = sd.models.PointsModel.parse( + points_cells_el = sd.models.PointsModel.parse( points, transformations={coordinate_system: Identity()} ) + points_transcripts_el = sd.models.PointsModel.parse( + points_transcripts, + transformations={coordinate_system: Identity()} + ) + # TODO: generate actual values table in a scalable fashion # adata = aggregate(values=points_el, by=label_el, region_key=region_key, instance_key=instance_key, target_coordinate_system=coordinate_system).tables["table"] # make X dense as markers are limited @@ -333,7 +344,10 @@ def cluster_blobs( labels_name: label_el, # "blobs_markers": Labels2DModel.parse(data=markers), }, - points={points_name: points_el}, + points={ + points_name: points_cells_el, + "transcripts_" + points_name: points_transcripts_el, + }, tables={table_name: table}, ) # if shapes_name: From 2de7afb8628132ef8a615457f39ebd2eafcd4bf4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Nov 2024 15:34:38 +0000 Subject: [PATCH 6/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- asv.conf.json | 4 +-- benchmarks/README.md | 7 +++- benchmarks/spatialdata_benchmark.py | 18 ++++++---- benchmarks/utils.py | 51 ++++++++--------------------- 4 files changed, 32 insertions(+), 48 deletions(-) diff --git a/asv.conf.json b/asv.conf.json index 6ef13a93..8a108478 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -37,7 +37,7 @@ // See 
asv.conf.json documentation. // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], - + // Install using default install "install_command": [ "in-dir={env_dir} python -m pip install {build_dir}[test]" @@ -175,7 +175,7 @@ // `asv` will cache results of the recent builds in each // environment, making them faster to install next time. This is // the number of builds to keep, per environment. - "build_cache_size": 2, + "build_cache_size": 2 // The commits after which the regression search in `asv publish` // should start looking for regressions. Dictionary whose keys are diff --git a/benchmarks/README.md b/benchmarks/README.md index 17cfc90a..a181feaf 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,23 +1,27 @@ # Benchmarking setup + ``` pip install -e '.[docs,benchmark]' ``` In PyCharm, configure your Configuration to include the benchmark module. In Python, you can run using + ``` python -m benchmarks.spatialdata_benchmark ``` - See napari [docs](https://napari.org/stable/developers/contributing/performance/benchmarks.html) on profiling and benchmarking for more information. run a specific benchmark + ``` PYTHONWARNINGS="ignore" asv run --python=same --show-stderr -b time_query_bounding_box ``` + output: + ``` [100.00%] ··· ======== ========== ============ -- filter_table @@ -31,6 +35,7 @@ output: ``` run everything in new env + ``` asv run ``` diff --git a/benchmarks/spatialdata_benchmark.py b/benchmarks/spatialdata_benchmark.py index ff50f8a0..8ecf51d1 100644 --- a/benchmarks/spatialdata_benchmark.py +++ b/benchmarks/spatialdata_benchmark.py @@ -28,6 +28,7 @@ def timeraw_import_inspect(): import spatialdata """ + class TimeMapRaster: """Time the.""" @@ -41,7 +42,8 @@ def teardown(self, _): del self.sdata def time_map_blocks(self, _): - sd.map_raster(self.sdata["blobs_image"], lambda x: x+1) + sd.map_raster(self.sdata["blobs_image"], lambda x: x + 1) + class TimeQueries: @@ -52,8 +54,7 @@ def setup(self, length, filter_table): import shapely self.sdata = cluster_blobs(length=length) - self.polygon = shapely.box(0, 0, length//2, length//2) - + self.polygon = shapely.box(0, 0, length // 2, length // 2) def teardown(self, length, filter_table): del self.sdata @@ -62,14 +63,17 @@ def time_query_bounding_box(self, length, filter_table): self.sdata.query.bounding_box( axes=["x", "y"], min_coordinate=[0, 0], - max_coordinate=[length//2, length//2], + max_coordinate=[length // 2, length // 2], target_coordinate_system="global", filter_table=filter_table, ) def time_query_polygon_box(self, length, filter_table): - sd.polygon_query(self.sdata, self.polygon, target_coordinate_system="global", - filter_table=filter_table, + sd.polygon_query( + self.sdata, + self.polygon, + target_coordinate_system="global", + filter_table=filter_table, ) @@ -80,7 +84,7 @@ def time_query_polygon_box(self, length, filter_table): sdata.query.bounding_box( axes=["x", "y"], min_coordinate=[0, 0], - max_coordinate=[length//2, length//2], + max_coordinate=[length // 2, length // 2], target_coordinate_system="global", filter_table=True, ) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index fbd53f64..5efd2cc4 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -32,11 +32,7 @@ def __init__( self.func_always = always def __contains__(self, item): - return ( - self.func_pr(*item) - or self.func_ci(*item) - or self.func_always(*item) - ) + return self.func_pr(*item) or self.func_ci(*item) or 
self.func_always(*item) def _generate_ball(radius: int, ndim: int) -> np.ndarray: @@ -81,9 +77,7 @@ def _structure_at_coordinates( *, multipliers: Sequence = itertools.repeat(1), dtype=None, - reduce_fn: Callable[ - [np.ndarray, np.ndarray, Optional[np.ndarray]], np.ndarray - ], + reduce_fn: Callable[[np.ndarray, np.ndarray, Optional[np.ndarray]], np.ndarray], ): """Update data with structure at given coordinates. @@ -110,9 +104,7 @@ def _structure_at_coordinates( for point, value in zip(coordinates, multipliers): slice_im, slice_ball = _get_slices_at(shape, point, radius) - reduce_fn( - data[slice_im], value * structure[slice_ball], out=data[slice_im] - ) + reduce_fn(data[slice_im], value * structure[slice_ball], out=data[slice_im]) return data @@ -120,9 +112,7 @@ def _get_slices_at(shape, point, radius): slice_im = [] slice_ball = [] for i, p in enumerate(point): - slice_im.append( - slice(max(0, p - radius), min(shape[i], p + radius + 1)) - ) + slice_im.append(slice(max(0, p - radius), min(shape[i], p + radius + 1))) ball_start = max(0, radius - p) ball_stop = slice_im[-1].stop - slice_im[-1].start + ball_start slice_ball.append(slice(ball_start, ball_stop)) @@ -219,18 +209,14 @@ def labeled_particles( if return_density: dens = _generate_density(sigma * 2, ndim) - densities = _structure_at_coordinates( - shape, points, dens, reduce_fn=np.maximum, dtype=np.float32 - ) + densities = _structure_at_coordinates(shape, points, dens, reduce_fn=np.maximum, dtype=np.float32) return labels, densities, points, values else: # noqa: RET505 return labels -def run_benchmark_from_module( - module: ModuleType, klass_name: str, method_name: str -): +def run_benchmark_from_module(module: ModuleType, klass_name: str, method_name: str): klass = getattr(module, klass_name) if getattr(klass, "params", None): skip_if = getattr(klass, "skip_params", {}) @@ -263,9 +249,7 @@ def run_benchmark(): import inspect parser = argparse.ArgumentParser(description="Run benchmark") - parser.add_argument( - "benchmark", type=str, help="Name of the benchmark to run", default="" - ) + parser.add_argument("benchmark", type=str, help="Name of the benchmark to run", default="") args = parser.parse_args() @@ -275,6 +259,7 @@ def run_benchmark(): call_module = inspect.getmodule(inspect.currentframe().f_back) run_benchmark_from_module(call_module, *benchmark_selection) + @lru_cache def cluster_blobs( length=512, @@ -291,31 +276,21 @@ def cluster_blobs( if n is None: n = length # cells - labels, density, points , values = labeled_particles( - (length, length), return_density=True, n=n - ) + labels, density, points, values = labeled_particles((length, length), return_density=True, n=n) # transcript points # generate 100 transcripts per cell rng = np.random.default_rng(None) - points_transcripts = rng.integers(length, size=(n*1000, 2)) + points_transcripts = rng.integers(length, size=(n * 1000, 2)) im_el = Image2DModel.parse( data=density[None, ...], dims="cyx", transformations={coordinate_system: Identity()}, ) - label_el = sd.models.Labels2DModel.parse( - labels, - dims="yx", - transformations={coordinate_system: Identity()} - ) - points_cells_el = sd.models.PointsModel.parse( - points, - transformations={coordinate_system: Identity()} - ) + label_el = sd.models.Labels2DModel.parse(labels, dims="yx", transformations={coordinate_system: Identity()}) + points_cells_el = sd.models.PointsModel.parse(points, transformations={coordinate_system: Identity()}) points_transcripts_el = sd.models.PointsModel.parse( - points_transcripts, - 
transformations={coordinate_system: Identity()} + points_transcripts, transformations={coordinate_system: Identity()} ) # TODO: generate actual values table in a scalable fashion From 214eee1bc4125ccf854d919461fc04ff56bfdb57 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Wed, 13 Nov 2024 16:41:11 +0100 Subject: [PATCH 7/9] lint code --- benchmarks/README.md | 25 ++++++++------ benchmarks/utils.py | 81 +++++++++++++++----------------------------- 2 files changed, 42 insertions(+), 64 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 17cfc90a..03989701 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,36 +1,41 @@ # Benchmarking setup + ``` pip install -e '.[docs,benchmark]' ``` In PyCharm, configure your Configuration to include the benchmark module. In Python, you can run using + ``` python -m benchmarks.spatialdata_benchmark ``` - See napari [docs](https://napari.org/stable/developers/contributing/performance/benchmarks.html) on profiling and benchmarking for more information. run a specific benchmark + ``` PYTHONWARNINGS="ignore" asv run --python=same --show-stderr -b time_query_bounding_box ``` + output: + ``` -[100.00%] ··· ======== ========== ============ - -- filter_table - -------- ----------------------- - length True False - ======== ========== ============ - 100 89.1±5ms 85.6±0.8ms - 1000 99.0±8ms 87.7±1ms - 10000 427±10ms 92.4±2ms - ======== ========== ============ +[100.00%] ··· ======== ============ ============ + -- filter_table + -------- ------------------------- + length True False + ======== ============ ============ + 100 191±2ms 185±2ms + 1000 399±4ms 382±7ms + 10000 2.67±0.02s 2.18±0.01s + ======== ============ ============ ``` run everything in new env + ``` asv run ``` diff --git a/benchmarks/utils.py b/benchmarks/utils.py index fbd53f64..8d018a4e 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -32,11 +32,7 @@ def __init__( self.func_always = always def __contains__(self, item): - return ( - self.func_pr(*item) - or self.func_ci(*item) - or self.func_always(*item) - ) + return self.func_pr(*item) or self.func_ci(*item) or self.func_always(*item) def _generate_ball(radius: int, ndim: int) -> np.ndarray: @@ -81,9 +77,7 @@ def _structure_at_coordinates( *, multipliers: Sequence = itertools.repeat(1), dtype=None, - reduce_fn: Callable[ - [np.ndarray, np.ndarray, Optional[np.ndarray]], np.ndarray - ], + reduce_fn: Callable[[np.ndarray, np.ndarray, Optional[np.ndarray]], np.ndarray], ): """Update data with structure at given coordinates. 
@@ -110,9 +104,7 @@ def _structure_at_coordinates( for point, value in zip(coordinates, multipliers): slice_im, slice_ball = _get_slices_at(shape, point, radius) - reduce_fn( - data[slice_im], value * structure[slice_ball], out=data[slice_im] - ) + reduce_fn(data[slice_im], value * structure[slice_ball], out=data[slice_im]) return data @@ -120,9 +112,7 @@ def _get_slices_at(shape, point, radius): slice_im = [] slice_ball = [] for i, p in enumerate(point): - slice_im.append( - slice(max(0, p - radius), min(shape[i], p + radius + 1)) - ) + slice_im.append(slice(max(0, p - radius), min(shape[i], p + radius + 1))) ball_start = max(0, radius - p) ball_stop = slice_im[-1].stop - slice_im[-1].start + ball_start slice_ball.append(slice(ball_start, ball_stop)) @@ -219,18 +209,14 @@ def labeled_particles( if return_density: dens = _generate_density(sigma * 2, ndim) - densities = _structure_at_coordinates( - shape, points, dens, reduce_fn=np.maximum, dtype=np.float32 - ) + densities = _structure_at_coordinates(shape, points, dens, reduce_fn=np.maximum, dtype=np.float32) return labels, densities, points, values else: # noqa: RET505 return labels -def run_benchmark_from_module( - module: ModuleType, klass_name: str, method_name: str -): +def run_benchmark_from_module(module: ModuleType, klass_name: str, method_name: str): klass = getattr(module, klass_name) if getattr(klass, "params", None): skip_if = getattr(klass, "skip_params", {}) @@ -263,9 +249,7 @@ def run_benchmark(): import inspect parser = argparse.ArgumentParser(description="Run benchmark") - parser.add_argument( - "benchmark", type=str, help="Name of the benchmark to run", default="" - ) + parser.add_argument("benchmark", type=str, help="Name of the benchmark to run", default="") args = parser.parse_args() @@ -275,52 +259,34 @@ def run_benchmark(): call_module = inspect.getmodule(inspect.currentframe().f_back) run_benchmark_from_module(call_module, *benchmark_selection) + @lru_cache def cluster_blobs( length=512, - n=None, + n_cells=None, region_key="region_key", instance_key="instance_key", image_name="blobs_image", labels_name="blobs_labels", points_name="blobs_points", + n_transcripts_per_cell=None, table_name="table", coordinate_system="global", ): """Faster `spatialdata.datasets.make_blobs` using napari.datasets code.""" - if n is None: - n = length + if n_cells is None: + n_cells = length # cells - labels, density, points , values = labeled_particles( - (length, length), return_density=True, n=n - ) - # transcript points - # generate 100 transcripts per cell - rng = np.random.default_rng(None) - points_transcripts = rng.integers(length, size=(n*1000, 2)) + labels, density, points, values = labeled_particles((length, length), return_density=True, n=n_cells) im_el = Image2DModel.parse( data=density[None, ...], dims="cyx", transformations={coordinate_system: Identity()}, ) - label_el = sd.models.Labels2DModel.parse( - labels, - dims="yx", - transformations={coordinate_system: Identity()} - ) - points_cells_el = sd.models.PointsModel.parse( - points, - transformations={coordinate_system: Identity()} - ) - points_transcripts_el = sd.models.PointsModel.parse( - points_transcripts, - transformations={coordinate_system: Identity()} - ) + label_el = sd.models.Labels2DModel.parse(labels, dims="yx", transformations={coordinate_system: Identity()}) + points_cells_el = sd.models.PointsModel.parse(points, transformations={coordinate_system: Identity()}) - # TODO: generate actual values table in a scalable fashion - # adata = 
aggregate(values=points_el, by=label_el, region_key=region_key, instance_key=instance_key, target_coordinate_system=coordinate_system).tables["table"] - # make X dense as markers are limited # generate dummy table adata = ad.AnnData(X=np.ones((length, 10))) adata.obs[region_key] = pd.Categorical([labels_name] * len(adata)) @@ -342,14 +308,21 @@ def cluster_blobs( }, labels={ labels_name: label_el, - # "blobs_markers": Labels2DModel.parse(data=markers), - }, - points={ - points_name: points_cells_el, - "transcripts_" + points_name: points_transcripts_el, }, + points={points_name: points_cells_el}, tables={table_name: table}, ) + + if n_transcripts_per_cell: + # transcript points + # generate 100 transcripts per cell + rng = np.random.default_rng(None) + points_transcripts = rng.integers(length, size=(n_cells * n_transcripts_per_cell, 2)) + points_transcripts_el = sd.models.PointsModel.parse( + points_transcripts, transformations={coordinate_system: Identity()} + ) + sdata["transcripts_" + points_name] = points_transcripts_el + # if shapes_name: # sdata[shapes_name] = sd.to_circles(sdata[labels_name]) # add_regionprop_features(sdata, labels_layer=labels_name, table_layer=table_name) From ac8091396838e28bcea05a28554184b607ae6ef8 Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Wed, 13 Nov 2024 16:59:46 +0100 Subject: [PATCH 8/9] pass pre-commit by ignore benchmark files. --- benchmarks/spatialdata_benchmark.py | 8 +++----- benchmarks/utils.py | 2 ++ pyproject.toml | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/spatialdata_benchmark.py b/benchmarks/spatialdata_benchmark.py index 8ecf51d1..4b9c3a99 100644 --- a/benchmarks/spatialdata_benchmark.py +++ b/benchmarks/spatialdata_benchmark.py @@ -1,12 +1,10 @@ +# type: ignore + # Write the benchmarking functions here. # See "Writing benchmarks" in the asv docs for more information. 
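For context, summarised from the asv docs rather than from this diff: asv discovers benchmarks by name prefix — `time_*` methods are timed, `peakmem_*` methods report the peak memory of the process, `timeraw_*` functions return a code string executed in a fresh interpreter, and `params`/`param_names` define a grid whose values are passed to `setup`, `teardown` and every benchmark method. A minimal sketch of the pattern used throughout this file (the class and method names below are invented for illustration):

```
class TimeExampleSuite:
    # one benchmark run per value in params; the value is passed to setup
    # and to every benchmark method
    params = [10, 1000]
    param_names = ["n"]

    def setup(self, n):
        self.data = list(range(n))

    def time_sum(self, n):
        # asv times only this method body; setup cost is excluded
        sum(self.data)
```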
import spatialdata as sd -try: - from .utils import cluster_blobs -except ImportError: - # TODO: remove ugly hack used for local testing - from utils import cluster_blobs +from .utils import cluster_blobs class MemorySpatialData: diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 8d018a4e..5f62ff0d 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -1,3 +1,5 @@ +# type: ignore + import itertools import os from collections.abc import Sequence diff --git a/pyproject.toml b/pyproject.toml index d95e74eb..2f725e04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -207,6 +207,7 @@ convention = "numpy" "src/spatialdata/dataloader/datasets.py" = ["D101"] "tests/test_models/test_models.py" = ["NPY002"] "tests/conftest.py"= ["E402"] + "benchmarks/*" = ["ALL"] # pyupgrade typing rewrite TODO: remove at some point from per-file ignore From 81b3bfb552058e442cfffa37e271abdd4141ff0c Mon Sep 17 00:00:00 2001 From: Benjamin Rombaut Date: Wed, 13 Nov 2024 17:18:21 +0100 Subject: [PATCH 9/9] add n_transcripts_per_cell to benchmark --- benchmarks/README.md | 18 +++++++++--------- benchmarks/spatialdata_benchmark.py | 14 +++++++------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 03989701..704050f4 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -23,15 +23,15 @@ PYTHONWARNINGS="ignore" asv run --python=same --show-stderr -b time_query_boundi output: ``` -[100.00%] ··· ======== ============ ============ - -- filter_table - -------- ------------------------- - length True False - ======== ============ ============ - 100 191±2ms 185±2ms - 1000 399±4ms 382±7ms - 10000 2.67±0.02s 2.18±0.01s - ======== ============ ============ +[50.00%] ··· ======== ============ ============== ============= =============== + -- filter_table / n_transcripts_per_cell + -------- --------------------------------------------------------- + length True / 100 True / 10000 False / 100 False / 10000 + ======== ============ ============== ============= =============== + 100 813±0ms 1.09±0s 803±0ms 980±0ms + 1000 799±0ms 2.96±0s 789±0ms 2.81±0s + 10000 1.32±0s 24.4±0s 962±0ms 21.5±0s + ======== ============ ============== ============= =============== ``` run everything in new env diff --git a/benchmarks/spatialdata_benchmark.py b/benchmarks/spatialdata_benchmark.py index 4b9c3a99..62d95cf1 100644 --- a/benchmarks/spatialdata_benchmark.py +++ b/benchmarks/spatialdata_benchmark.py @@ -45,19 +45,19 @@ def time_map_blocks(self, _): class TimeQueries: - params = ([100, 1000, 10_000], [True, False]) - param_names = ["length", "filter_table"] + params = ([100, 1000, 10_000], [True, False], [100, 10_000]) + param_names = ["length", "filter_table", "n_transcripts_per_cell"] - def setup(self, length, filter_table): + def setup(self, length, filter_table, n_transcripts_per_cell): import shapely - self.sdata = cluster_blobs(length=length) + self.sdata = cluster_blobs(length=length, n_transcripts_per_cell=n_transcripts_per_cell) self.polygon = shapely.box(0, 0, length // 2, length // 2) - def teardown(self, length, filter_table): + def teardown(self, length, filter_table, n_transcripts_per_cell): del self.sdata - def time_query_bounding_box(self, length, filter_table): + def time_query_bounding_box(self, length, filter_table, n_transcripts_per_cell): self.sdata.query.bounding_box( axes=["x", "y"], min_coordinate=[0, 0], @@ -66,7 +66,7 @@ def time_query_bounding_box(self, length, filter_table): filter_table=filter_table, ) - def 
time_query_polygon_box(self, length, filter_table): + def time_query_polygon_box(self, length, filter_table, n_transcripts_per_cell): sd.polygon_query( self.sdata, self.polygon,