From 53a609f75d4fee3b6b111bdfcf0df01f824480db Mon Sep 17 00:00:00 2001 From: Timothy Hodson <34148978+thodson-usgs@users.noreply.github.com> Date: Thu, 5 Sep 2024 11:39:28 -0500 Subject: [PATCH] Add example to create a virtual dataset using lithops (#203) * Set ZArray fill_value back to nan * Set NaT as datetime64 default fill value * Add example to create a virtual dataset using lithops * Rename file * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update examples/virtualizarr-with-lithops/README.md Co-authored-by: Tom Nicholas * Update examples/virtualizarr-with-lithops/README.md Co-authored-by: Tom Nicholas * Update examples/virtualizarr-with-lithops/README.md Co-authored-by: Tom Nicholas --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tom Nicholas --- .../Dockerfile_virtualizarr | 59 +++++++++++++++++++ examples/virtualizarr-with-lithops/README.md | 41 +++++++++++++ .../virtualizarr-with-lithops/lithops.yaml | 14 +++++ .../requirements.txt | 8 +++ .../virtualizarr-with-lithops.py | 59 +++++++++++++++++++ 5 files changed, 181 insertions(+) create mode 100644 examples/virtualizarr-with-lithops/Dockerfile_virtualizarr create mode 100644 examples/virtualizarr-with-lithops/README.md create mode 100644 examples/virtualizarr-with-lithops/lithops.yaml create mode 100644 examples/virtualizarr-with-lithops/requirements.txt create mode 100644 examples/virtualizarr-with-lithops/virtualizarr-with-lithops.py diff --git a/examples/virtualizarr-with-lithops/Dockerfile_virtualizarr b/examples/virtualizarr-with-lithops/Dockerfile_virtualizarr new file mode 100644 index 00000000..d1793c6a --- /dev/null +++ b/examples/virtualizarr-with-lithops/Dockerfile_virtualizarr @@ -0,0 +1,59 @@ +# Python 3.11 +FROM python:3.11-slim-buster + + +RUN apt-get update \ + # Install aws-lambda-cpp build dependencies + && apt-get install -y \ + g++ \ + make \ + cmake \ + unzip 
\ + # cleanup package lists, they are not used anymore in this image + && rm -rf /var/lib/apt/lists/* \ + && apt-cache search linux-headers-generic + +ARG FUNCTION_DIR="/function" + +# Copy function code +RUN mkdir -p ${FUNCTION_DIR} + +# Update pip +# NB botocore/boto3 are pinned due to https://github.com/boto/boto3/issues/3648 +# using versions from https://github.com/aio-libs/aiobotocore/blob/72b8dd5d7d4ef2f1a49a0ae0c37b47e5280e2070/setup.py +# due to s3fs dependency +RUN pip install --upgrade --ignore-installed pip wheel six setuptools \ + && pip install --upgrade --no-cache-dir --ignore-installed \ + awslambdaric \ + botocore==1.29.76 \ + boto3==1.26.76 \ + redis \ + httplib2 \ + requests \ + numpy \ + scipy \ + pandas \ + pika \ + kafka-python \ + cloudpickle \ + ps-mem \ + tblib + +# Set working directory to function root directory +WORKDIR ${FUNCTION_DIR} + +# Add Lithops +COPY lithops_lambda.zip ${FUNCTION_DIR} +RUN unzip lithops_lambda.zip \ + && rm lithops_lambda.zip \ + && mkdir handler \ + && touch handler/__init__.py \ + && mv entry_point.py handler/ + +# Put your dependencies here, using RUN pip install... or RUN apt install... + +COPY requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] +CMD [ "handler.entry_point.lambda_handler" ] diff --git a/examples/virtualizarr-with-lithops/README.md b/examples/virtualizarr-with-lithops/README.md new file mode 100644 index 00000000..d3c02037 --- /dev/null +++ b/examples/virtualizarr-with-lithops/README.md @@ -0,0 +1,41 @@ +# Generate a virtual zarr dataset using lithops + +This example walks through how to create a virtual dataset from a collection of +netCDF files on s3 using lithops to open each file in parallel then concatenate +them into a single virtual dataset. + +## Credits +Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook +by norlandrhagen. + +Please, contribute improvements. 
+ + + +1. Set up a Python environment +```bash +conda create --name virtualizarr-lithops -y python=3.11 +conda activate virtualizarr-lithops +pip install -r requirements.txt +``` + +2. Configure compute and storage backends for [lithops](https://lithops-cloud.github.io/docs/source/configuration.html). +The configuration in `lithops.yaml` uses AWS Lambda for [compute](https://lithops-cloud.github.io/docs/source/compute_config/aws_lambda.html) and AWS S3 for [storage](https://lithops-cloud.github.io/docs/source/storage_config/aws_s3.html). +To use those backends, simply edit `lithops.yaml` with your `bucket` and `execution_role`. + +3. Build a runtime image for VirtualiZarr +```bash +export LITHOPS_CONFIG_FILE=$(pwd)/lithops.yaml +lithops runtime build -b aws_lambda -f Dockerfile_virtualizarr virtualizarr-runtime +``` + +4. Run the script +```bash +python virtualizarr-with-lithops.py +``` + +## Cleaning up +To rebuild the Lithops image, delete the existing one by running +```bash +lithops runtime delete -b aws_lambda -d virtualizarr-runtime +``` diff --git a/examples/virtualizarr-with-lithops/lithops.yaml b/examples/virtualizarr-with-lithops/lithops.yaml new file mode 100644 index 00000000..b142b480 --- /dev/null +++ b/examples/virtualizarr-with-lithops/lithops.yaml @@ -0,0 +1,14 @@ +lithops: + backend: aws_lambda + storage: aws_s3 + +aws: + region: us-west-2 + +aws_lambda: + execution_role: arn:aws:iam::807615458658:role/lambdaLithopsExecutionRole + runtime: virtualizarr-runtime + runtime_memory: 2000 + +aws_s3: + bucket: arn:aws:s3:::cubed-thodson-temp diff --git a/examples/virtualizarr-with-lithops/requirements.txt b/examples/virtualizarr-with-lithops/requirements.txt new file mode 100644 index 00000000..ba6938f8 --- /dev/null +++ b/examples/virtualizarr-with-lithops/requirements.txt @@ -0,0 +1,8 @@ +boto +cftime +h5py +kerchunk +lithops +s3fs +virtualizarr +xarray diff --git a/examples/virtualizarr-with-lithops/virtualizarr-with-lithops.py
b/examples/virtualizarr-with-lithops/virtualizarr-with-lithops.py new file mode 100644 index 00000000..5d16bb9c --- /dev/null +++ b/examples/virtualizarr-with-lithops/virtualizarr-with-lithops.py @@ -0,0 +1,59 @@ +# Use lithops to create a virtual dataset from a collection of netcdf files on s3. +# +# Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook +# by norlandrhagen. +# +# Please, contribute improvements. + +import fsspec +import lithops +import xarray as xr + +from virtualizarr import open_virtual_dataset + +# to demonstrate this workflow, we will use a collection of netcdf files from the WRF-SE-AK-AR5 project. +fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True) +files_paths = fs_read.glob("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/*") +file_pattern = sorted(["s3://" + f for f in files_paths]) + +# optionally, truncate file_pattern while debugging +# file_pattern = file_pattern[:4] + +print(f"{len(file_pattern)} file paths were retrieved.") + + +def map_references(fil): + """Map function to open virtual datasets.""" + vds = open_virtual_dataset( + fil, + indexes={}, + loadable_variables=["Time"], + cftime_variables=["Time"], + ) + return vds + + +def reduce_references(results): + """Reduce to concat virtual datasets.""" + combined_vds = xr.combine_nested( + results, + concat_dim=["Time"], + coords="minimal", + compat="override", + ) + return combined_vds + + +fexec = lithops.FunctionExecutor(config_file="lithops.yaml") + +futures = fexec.map_reduce( + map_references, + file_pattern, + reduce_references, + spawn_reducer=100, +) + +ds = futures.get_result() + +# write out the virtual dataset to a kerchunk json +ds.virtualize.to_kerchunk("combined.json", format="json")