diff --git a/CHANGELOG.md b/CHANGELOG.md index fba2d92d..b8888528 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning][]. ## [0.1.2] - xxxx-xx-xx +### Added + +- (Visium HD) added reader, coauthored by @LLehner + ### Fixed - (Xenium) reader for 1.0.1 (paper data) and unknown versions diff --git a/README.md b/README.md index 00a56a3f..35349c62 100644 --- a/README.md +++ b/README.md @@ -12,15 +12,16 @@ This package contains reader functions to load common spatial omics formats into SpatialData. Currently, we provide support for: -- NanoString CosMx -- MCMICRO (output data) -- Steinbock (output data) - 10x Genomics Visium +- 10x Genomics Visium HD - 10x Genomics Xenium -- Curio Seeker - Akoya PhenoCycler (formerly CODEX) -- Vizgen MERSCOPE (MERFISH) +- Curio Seeker - DBiT-seq +- MCMICRO (output data) +- NanoString CosMx +- Steinbock (output data) +- Vizgen MERSCOPE (MERFISH) ## Getting started diff --git a/docs/api.md b/docs/api.md index 280df0cb..a00ef427 100644 --- a/docs/api.md +++ b/docs/api.md @@ -15,14 +15,15 @@ I/O for the `spatialdata` project. :toctree: generated codex - curio cosmx + curio + dbit + mcmicro + merscope + steinbock visium + visium_hd xenium - steinbock - merscope - mcmicro - dbit ``` ### Conversion functions diff --git a/src/spatialdata_io/__init__.py b/src/spatialdata_io/__init__.py index e4eb41ae..f2d3cfcb 100644 --- a/src/spatialdata_io/__init__.py +++ b/src/spatialdata_io/__init__.py @@ -8,6 +8,7 @@ from spatialdata_io.readers.merscope import merscope from spatialdata_io.readers.steinbock import steinbock from spatialdata_io.readers.visium import visium +from spatialdata_io.readers.visium_hd import visium_hd from spatialdata_io.readers.xenium import ( xenium, xenium_aligned_image, @@ -26,6 +27,7 @@ "xenium_aligned_image", "xenium_explorer_selection", "dbit", + "visium_hd", ] __version__ = version("spatialdata-io") diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index c485214d..08928215 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -227,3 +227,60 @@ class DbitKeys(ModeEnum): BARCODE_POSITION = "barcode_list" # image IMAGE_LOWRES_FILE = "tissue_lowres_image.png" + + +@unique +class VisiumHDKeys(ModeEnum): + """Keys for *10X Genomics Visium hd* formatted dataset.""" + + # directories + SPATIAL = "spatial" + DEFAULT_BIN = "square_008um" + BIN_PREFIX = "square_" + MICROSCOPE_IMAGE = "microscope_image" + + # counts and locations files + FILTERED_COUNTS_FILE = "filtered_feature_bc_matrix.h5" + RAW_COUNTS_FILE = "raw_feature_bc_matrix.h5" + TISSUE_POSITIONS_FILE = "tissue_positions.parquet" + + # images + IMAGE_HIRES_FILE = "spatial/tissue_hires_image.png" + IMAGE_LOWRES_FILE = "spatial/tissue_lowres_image.png" + IMAGE_CYTASSIST = "spatial/cytassist_image.tiff" + + # scalefactors + SCALEFACTORS_FILE = "scalefactors_json.json" + + # scalefactors keys + SCALEFACTORS_HIRES = "tissue_hires_scalef" + SCALEFACTORS_LOWRES = "tissue_lowres_scalef" + SCALEFACTORS_SPOT_DIAMETER_FULLRES = "spot_diameter_fullres" + SCALEFACTORS_BIN_SIZE_UM = "bin_size_um" + SCALEFACTORS_MICRONS_PER_PIXEL = "microns_per_pixel" + + # locations keys + LOCATIONS_X = "pxl_col_in_fullres" + LOCATIONS_Y = "pxl_row_in_fullres" + BARCODE = "barcode" + IN_TISSUE = "in_tissue" + ARRAY_ROW = "array_row" + ARRAY_COL = "array_col" + + # table keys + REGION_KEY = "region" + INSTANCE_KEY = "location_id" + + # feature slice file (it contains transformation matrices in the metadata) + FEATURE_SLICE_FILE = "feature_slice.h5" + + # METADATA_KEYS + METADATA_JSON = "metadata_json" + HD_LAYOUT_JSON = "hd_layout_json" + TRANSFORM = "transform" + TRANSFORM_MATRICES = "transform_matrices" + CYTASSIST_COLROW_TO_SPOT_COLROW = ("cytassist_colrow_to_spot_colrow",) + SPOT_COLROW_TO_CYTASSIST_COLROW = ("spot_colrow_to_cytassist_colrow",) + MICROSCOPE_COLROW_TO_SPOT_COLROW = ("microscope_colrow_to_spot_colrow",) + SPOT_COLROW_TO_MICROSCOPE_COLROW = ("spot_colrow_to_microscope_colrow",) + FILE_FORMAT = "file_format" diff --git a/src/spatialdata_io/converters/legacy_anndata.py b/src/spatialdata_io/converters/legacy_anndata.py index 17406a9a..c4fa5aa6 100644 --- a/src/spatialdata_io/converters/legacy_anndata.py +++ b/src/spatialdata_io/converters/legacy_anndata.py @@ -8,7 +8,7 @@ SpatialData, get_centroids, get_extent, - join_sdata_spatialelement_table, + join_spatialelement_table, to_circles, ) from spatialdata._core.operations._utils import transform_to_data_extent @@ -149,8 +149,8 @@ def to_legacy_anndata( shapes = to_circles(element) circles_sdata = SpatialData(tables={table_name: table}, shapes={region_name: shapes}) - joined_elements, new_table = join_sdata_spatialelement_table( - sdata=circles_sdata, spatial_element_name=region_name, table_name=table_name, how="inner", match_rows="left" + joined_elements, new_table = join_spatialelement_table( + sdata=circles_sdata, spatial_element_names=region_name, table_name=table_name, how="inner", match_rows="left" ) # the table after the filtering must not be empty diff --git a/src/spatialdata_io/readers/visium_hd.py b/src/spatialdata_io/readers/visium_hd.py new file mode 100644 index 00000000..e54e8759 --- /dev/null +++ b/src/spatialdata_io/readers/visium_hd.py @@ -0,0 +1,371 @@ +from __future__ import annotations + +import json +import os +import re +import warnings +from collections.abc import Mapping +from pathlib import Path +from types import MappingProxyType +from typing import Any + +import h5py +import numpy as np +import pandas as pd +import scanpy as sc +from dask_image.imread import imread +from imageio import imread as imread2 +from multiscale_spatial_image import MultiscaleSpatialImage +from spatial_image import SpatialImage +from spatialdata import SpatialData +from spatialdata.models import Image2DModel, ShapesModel, TableModel +from spatialdata.transformations import ( + Affine, + Identity, + Scale, + Sequence, + set_transformation, +) +from xarray import DataArray + +from spatialdata_io._constants._constants import VisiumHDKeys +from spatialdata_io._docs import inject_docs + + +@inject_docs(vx=VisiumHDKeys) +def visium_hd( + path: str | Path, + dataset_id: str | None = None, + filtered_counts_file: bool = True, + bin_size: int | list[int] | None = None, + fullres_image_file: str | Path | None = None, + load_all_images: bool = False, + imread_kwargs: Mapping[str, Any] = MappingProxyType({}), + image_models_kwargs: Mapping[str, Any] = MappingProxyType({}), + anndata_kwargs: Mapping[str, Any] = MappingProxyType({}), +) -> SpatialData: + """ + Read *10x Genomics* Visium HD formatted dataset. + + .. seealso:: + + - `Space Ranger output + `_. + + Parameters + ---------- + path + Path to directory containing the *10x Genomics* Visium HD output. + dataset_id + Unique identifier of the dataset. If `None`, it tries to infer it from the file name of the feature slice file. + filtered_counts_file + It sets the value of `counts_file` to ``{vx.FILTERED_COUNTS_FILE!r}`` (when `True`) or to + ``{vx.RAW_COUNTS_FILE!r}`` (when `False`). + bin_size + When specified, load the data of a specific bin size, or a list of bin sizes. By default, it loads all the + available bin sizes. + fullres_image_file + Path to the full-resolution image. By default the image is searched in the ``{vx.MICROSCOPE_IMAGE!r}`` + directory. + load_all_images + If `False`, load only the full resolution, high resolution and low resolution images. If `True`, also the + following images: ``{vx.IMAGE_CYTASSIST!r}``. + imread_kwargs + Keyword arguments for :func:`imageio.imread`. + image_models_kwargs + Keyword arguments for :class:`spatialdata.models.Image2DModel`. + anndata_kwargs + Keyword arguments for :func:`anndata.read_h5ad`. + + Returns + ------- + SpatialData object for the Visium HD data. + """ + path = Path(path) + tables = {} + shapes = {} + images: dict[str, Any] = {} + + if dataset_id is None: + dataset_id = _infer_dataset_id(path) + filename_prefix = f"{dataset_id}_" + else: + filename_prefix = "" + + def load_image(path: Path, suffix: str, scale_factors: list[int] | None = None) -> None: + _load_image( + path=path, + images=images, + suffix=suffix, + dataset_id=dataset_id, + imread_kwargs=imread_kwargs, + image_models_kwargs=image_models_kwargs, + scale_factors=scale_factors, + ) + + metadata, hd_layout = _parse_metadata(path, filename_prefix) + transform_matrices = _get_transform_matrices(metadata, hd_layout) + file_format = hd_layout[VisiumHDKeys.FILE_FORMAT] + if file_format != "1.0": + warnings.warn( + f"File format {file_format} is not supported. A more recent file format may be supported in a newer version" + f"of the spatialdata-io package.", + UserWarning, + stacklevel=2, + ) + + path_bins = path + all_bin_sizes = sorted( + [ + bin_size + for bin_size in os.listdir(path_bins) + if os.path.isdir(os.path.join(path_bins, bin_size)) and bin_size.startswith(VisiumHDKeys.BIN_PREFIX) + ] + ) + if bin_size is None: + bin_sizes = all_bin_sizes + elif isinstance(bin_size, int) or isinstance(bin_size, list) and len(bin_size) == 0: + if f"square_{bin_size:03}um" not in all_bin_sizes: + warnings.warn( + f"Requested bin size {bin_size} not found (available {all_bin_sizes}). Using all available bins.", + UserWarning, + stacklevel=2, + ) + bin_sizes = all_bin_sizes + else: + bin_sizes = [f"square_{bin_size:03}um"] + + # iterate over the given bins and load the data + for bin_size_str in bin_sizes: + path_bin = path_bins / bin_size_str + counts_file = VisiumHDKeys.FILTERED_COUNTS_FILE if filtered_counts_file else VisiumHDKeys.RAW_COUNTS_FILE + adata = sc.read_10x_h5( + path_bin / counts_file, + gex_only=False, + **anndata_kwargs, + ) + + path_bin_spatial = path_bin / VisiumHDKeys.SPATIAL + + with open(path_bin_spatial / VisiumHDKeys.SCALEFACTORS_FILE) as file: + scalefactors = json.load(file) + + # consistency check + found_bin_size = re.search(r"\d{3}", bin_size_str) + assert found_bin_size is not None + assert float(found_bin_size.group()) == scalefactors[VisiumHDKeys.SCALEFACTORS_BIN_SIZE_UM] + assert np.isclose( + scalefactors[VisiumHDKeys.SCALEFACTORS_BIN_SIZE_UM] + / scalefactors[VisiumHDKeys.SCALEFACTORS_SPOT_DIAMETER_FULLRES], + scalefactors[VisiumHDKeys.SCALEFACTORS_MICRONS_PER_PIXEL], + ) + + tissue_positions_file = path_bin_spatial / VisiumHDKeys.TISSUE_POSITIONS_FILE + + # read coordinates and set up adata.obs and adata.obsm + coords = pd.read_parquet(tissue_positions_file) + assert all( + coords.columns.values + == [ + VisiumHDKeys.BARCODE, + VisiumHDKeys.IN_TISSUE, + VisiumHDKeys.ARRAY_ROW, + VisiumHDKeys.ARRAY_COL, + VisiumHDKeys.LOCATIONS_Y, + VisiumHDKeys.LOCATIONS_X, + ] + ) + coords.set_index(VisiumHDKeys.BARCODE, inplace=True, drop=True) + coords_filtered = coords.loc[adata.obs.index] + adata.obs = pd.merge(adata.obs, coords_filtered, how="left", left_index=True, right_index=True) + # compatibility to legacy squidpy + adata.obsm["spatial"] = adata.obs[[VisiumHDKeys.LOCATIONS_X, VisiumHDKeys.LOCATIONS_Y]].values + # dropping the spatial coordinates (will be stored in shapes) + adata.obs.drop( + columns=[ + VisiumHDKeys.LOCATIONS_X, + VisiumHDKeys.LOCATIONS_Y, + ], + inplace=True, + ) + adata.obs[VisiumHDKeys.INSTANCE_KEY] = np.arange(len(adata)) + + # scaling + transform_original = Identity() + transform_lowres = Scale( + np.array( + [ + scalefactors[VisiumHDKeys.SCALEFACTORS_LOWRES], + scalefactors[VisiumHDKeys.SCALEFACTORS_LOWRES], + ] + ), + axes=("x", "y"), + ) + transform_hires = Scale( + np.array( + [ + scalefactors[VisiumHDKeys.SCALEFACTORS_HIRES], + scalefactors[VisiumHDKeys.SCALEFACTORS_HIRES], + ] + ), + axes=("x", "y"), + ) + # parse shapes + circles = ShapesModel.parse( + adata.obsm["spatial"], + geometry=0, + radius=scalefactors[VisiumHDKeys.SCALEFACTORS_SPOT_DIAMETER_FULLRES] / 2.0, + index=adata.obs[VisiumHDKeys.INSTANCE_KEY].copy(), + transformations={ + "global": transform_original, + "downscaled_hires": transform_hires, + "downscaled_lowres": transform_lowres, + }, + ) + shapes_name = dataset_id + "_" + bin_size_str + shapes[shapes_name] = circles + + # parse table + adata.obs[VisiumHDKeys.REGION_KEY] = shapes_name + adata.obs[VisiumHDKeys.REGION_KEY] = adata.obs[VisiumHDKeys.REGION_KEY].astype("category") + tables[bin_size_str] = TableModel.parse( + adata, + region=shapes_name, + region_key=str(VisiumHDKeys.REGION_KEY), + instance_key=str(VisiumHDKeys.INSTANCE_KEY), + ) + + # read full resolution image + if fullres_image_file is not None: + fullres_image_file = Path(fullres_image_file) + else: + path_fullres = path / VisiumHDKeys.MICROSCOPE_IMAGE + if path_fullres.exists(): + fullres_image_filenames = [ + f for f in os.listdir(path_fullres) if os.path.isfile(os.path.join(path_fullres, f)) + ] + if len(fullres_image_filenames) > 1: + warnings.warn( + f"Multiple files found in {path_fullres}, using the first one: {fullres_image_filenames[0]}. Please" + " specify the path to the full resolution image manually using the `fullres_image_file` argument.", + UserWarning, + stacklevel=2, + ) + fullres_image_filename = fullres_image_filenames[0] + fullres_image_file = path_fullres / fullres_image_filename + + if fullres_image_file is not None: + load_image( + path=path / fullres_image_file, + suffix="_full_image", + scale_factors=[2, 2, 2, 2], + ) + + # hires image + load_image( + path=path / VisiumHDKeys.IMAGE_HIRES_FILE, + suffix="_hires_image", + ) + set_transformation( + images[dataset_id + "_hires_image"], + {"downscaled_hires": Identity()}, + set_all=True, + ) + + # lowres image + load_image( + path=path / VisiumHDKeys.IMAGE_LOWRES_FILE, + suffix="_lowres_image", + ) + set_transformation( + images[dataset_id + "_lowres_image"], + {"downscaled_lowres": Identity()}, + set_all=True, + ) + + # cytassist image + if load_all_images: + load_image( + path=path / VisiumHDKeys.IMAGE_CYTASSIST, + suffix="_cytassist_image", + ) + image = images[dataset_id + "_cytassist_image"] + affine0 = transform_matrices["cytassist_colrow_to_spot_colrow"] + affine1 = transform_matrices["spot_colrow_to_microscope_colrow"] + set_transformation(image, Sequence([affine0, affine1]), "global") + + return SpatialData(tables=tables, images=images, shapes=shapes) + + +def _infer_dataset_id(path: Path) -> str: + suffix = f"_{VisiumHDKeys.FEATURE_SLICE_FILE.value}" + files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith(suffix)] + if len(files) == 0 or len(files) > 1: + raise ValueError( + f"Cannot infer `dataset_id` from the feature slice file in {path}, please pass `dataset_id` as an argument." + ) + return files[0].replace(suffix, "") + + +def _load_image( + path: Path, + images: dict[str, SpatialImage | MultiscaleSpatialImage], + suffix: str, + dataset_id: str, + imread_kwargs: Mapping[str, Any], + image_models_kwargs: Mapping[str, Any], + scale_factors: list[int] | None, +) -> None: + if path.exists(): + if path.suffix != ".btf": + data = imread(path, **imread_kwargs) + if len(data.shape) == 4: + # this happens for the cytassist, hires and lowres images; the umi image doesn't need processing + data = data.squeeze().transpose(2, 0, 1) + else: + if "MAX_IMAGE_PIXELS" in imread_kwargs: + from PIL import Image as ImagePIL + + ImagePIL.MAX_IMAGE_PIXELS = dict(imread_kwargs).pop("MAX_IMAGE_PIXELS") + # dask_image doesn't recognize .btf automatically and imageio v3 throws error due to pixel limit -> use imageio v2 + data = imread2(path, **imread_kwargs).squeeze().transpose(2, 0, 1) + image = DataArray(data, dims=("c", "y", "x")) + parsed = Image2DModel.parse(image, scale_factors=scale_factors, **image_models_kwargs) + images[dataset_id + suffix] = parsed + else: + warnings.warn(f"File {path} does not exist, skipping it.", UserWarning, stacklevel=2) + return None + + +def _get_affine(coefficients: list[int]) -> Affine: + matrix = np.array(coefficients).reshape(3, 3) + # the last row doesn't match with machine precision, let's check the matrix it's still close to a homogeneous + # matrix, and fix this + assert np.allclose(matrix[2], [0, 0, 1], atol=1e-2), matrix + matrix[2] = [0, 0, 1] + return Affine(matrix, input_axes=("x", "y"), output_axes=("x", "y")) + + +def _parse_metadata(path: Path, filename_prefix: str) -> tuple[dict[str, Any], dict[str, Any]]: + with h5py.File(path / f"{filename_prefix}{VisiumHDKeys.FEATURE_SLICE_FILE.value}", "r") as f5: + metadata = json.loads(dict(f5.attrs)[VisiumHDKeys.METADATA_JSON]) + hd_layout = json.loads(metadata[VisiumHDKeys.HD_LAYOUT_JSON]) + return metadata, hd_layout + + +def _get_transform_matrices(metadata: dict[str, Any], hd_layout: dict[str, Any]) -> dict[str, Affine]: + transform_matrices = {} + + # not used + transform_matrices["hd_layout_transform"] = _get_affine(hd_layout[VisiumHDKeys.TRANSFORM]) + + for key in [ + VisiumHDKeys.CYTASSIST_COLROW_TO_SPOT_COLROW, + VisiumHDKeys.SPOT_COLROW_TO_CYTASSIST_COLROW, + VisiumHDKeys.MICROSCOPE_COLROW_TO_SPOT_COLROW, + VisiumHDKeys.SPOT_COLROW_TO_MICROSCOPE_COLROW, + ]: + data = metadata[VisiumHDKeys.TRANSFORM_MATRICES][key] + transform_matrices[key.value] = _get_affine(data) + + return transform_matrices