diff --git a/examples/virtualizarr-with-lithops/Dockerfile_virtualizarr b/examples/virtualizarr-with-lithops/Dockerfile_virtualizarr
new file mode 100644
index 00000000..d1793c6a
--- /dev/null
+++ b/examples/virtualizarr-with-lithops/Dockerfile_virtualizarr
@@ -0,0 +1,59 @@
+# Python 3.11
+FROM python:3.11-slim-buster
+
+
+RUN apt-get update \
+    # Install aws-lambda-cpp build dependencies
+    && apt-get install -y \
+        g++ \
+        make \
+        cmake \
+        unzip \
+    # cleanup package lists, they are not used anymore in this image
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-cache search linux-headers-generic
+
+ARG FUNCTION_DIR="/function"
+
+# Copy function code
+RUN mkdir -p ${FUNCTION_DIR}
+
+# Update pip
+# NB botocore/boto3 are pinned due to https://github.com/boto/boto3/issues/3648
+# using versions from https://github.com/aio-libs/aiobotocore/blob/72b8dd5d7d4ef2f1a49a0ae0c37b47e5280e2070/setup.py
+# due to s3fs dependency
+RUN pip install --upgrade --ignore-installed pip wheel six setuptools \
+    && pip install --upgrade --no-cache-dir --ignore-installed \
+        awslambdaric \
+        botocore==1.29.76 \
+        boto3==1.26.76 \
+        redis \
+        httplib2 \
+        requests \
+        numpy \
+        scipy \
+        pandas \
+        pika \
+        kafka-python \
+        cloudpickle \
+        ps-mem \
+        tblib
+
+# Set working directory to function root directory
+WORKDIR ${FUNCTION_DIR}
+
+# Add Lithops
+COPY lithops_lambda.zip ${FUNCTION_DIR}
+RUN unzip lithops_lambda.zip \
+    && rm lithops_lambda.zip \
+    && mkdir handler \
+    && touch handler/__init__.py \
+    && mv entry_point.py handler/
+
+# Put your dependencies here, using RUN pip install... or RUN apt install...
+
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ]
+CMD [ "handler.entry_point.lambda_handler" ]
diff --git a/examples/virtualizarr-with-lithops/README.md b/examples/virtualizarr-with-lithops/README.md
new file mode 100644
index 00000000..10cba141
--- /dev/null
+++ b/examples/virtualizarr-with-lithops/README.md
@@ -0,0 +1,41 @@
+# Generate a virtual zarr dataset using lithops
+
+This example walks through how to create a virtual dataset from a collection of
+netcdf files on S3, using lithops to open each file in parallel and then
+concatenate them into a single virtual dataset.
+
+## Credits
+Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook
+by norlandrhagen.
+
+Please contribute improvements.
+
+1. Set up a Python environment
+```bash
+conda create --name virtualizarr-lithops -y python=3.11
+conda activate virtualizarr-lithops
+pip install -r requirements.txt
+```
+
+1. Configure compute and storage backends for [lithops](https://lithops-cloud.github.io/docs/source/configuration.html).
+The configuration in `lithops.yaml` uses AWS Lambda for [compute](https://lithops-cloud.github.io/docs/source/compute_config/aws_lambda.html) and AWS S3 for [storage](https://lithops-cloud.github.io/docs/source/storage_config/aws_s3.html).
+To use those backends, simply edit `lithops.yaml` with your `bucket` and `execution_role`, as sketched below.
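+For reference, these are the entries in `lithops.yaml` (shown in full further
+down) to replace with your own values; the angle-bracket values here are
+placeholders:
+```yaml
+aws_lambda:
+    execution_role: <your-lithops-execution-role-arn>
+
+aws_s3:
+    bucket: <your-bucket>
+```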
+
+1. Build a runtime image for Lithops
+```bash
+export LITHOPS_CONFIG_FILE=$(pwd)/lithops.yaml
+lithops runtime build -b aws_lambda -f Dockerfile_virtualizarr virtualizarr-runtime
+```
+
+1. Run the script
+```bash
+python virtual-rechunk.py
+```
+
+## Cleaning up
+To rebuild the Lithops image, delete the existing one by running
+```bash
+lithops runtime delete -b aws_lambda -d virtualizarr-runtime
+```
diff --git a/examples/virtualizarr-with-lithops/lithops.yaml b/examples/virtualizarr-with-lithops/lithops.yaml
new file mode 100644
index 00000000..b142b480
--- /dev/null
+++ b/examples/virtualizarr-with-lithops/lithops.yaml
@@ -0,0 +1,14 @@
+lithops:
+    backend: aws_lambda
+    storage: aws_s3
+
+aws:
+    region: us-west-2
+
+aws_lambda:
+    execution_role: arn:aws:iam::807615458658:role/lambdaLithopsExecutionRole
+    runtime: virtualizarr-runtime
+    runtime_memory: 2000
+
+aws_s3:
+    bucket: arn:aws:s3:::cubed-thodson-temp
diff --git a/examples/virtualizarr-with-lithops/requirements.txt b/examples/virtualizarr-with-lithops/requirements.txt
new file mode 100644
index 00000000..ba6938f8
--- /dev/null
+++ b/examples/virtualizarr-with-lithops/requirements.txt
@@ -0,0 +1,8 @@
+boto
+cftime
+h5py
+kerchunk
+lithops
+s3fs
+virtualizarr
+xarray
diff --git a/examples/virtualizarr-with-lithops/virtual-rechunk.py b/examples/virtualizarr-with-lithops/virtual-rechunk.py
new file mode 100644
index 00000000..b5e2884d
--- /dev/null
+++ b/examples/virtualizarr-with-lithops/virtual-rechunk.py
@@ -0,0 +1,61 @@
+# Use lithops to create a virtual dataset from a collection of netcdf files on S3.
+#
+# Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook
+# by norlandrhagen.
+#
+# Please contribute improvements.
+
+import fsspec
+import lithops
+import xarray as xr
+
+from virtualizarr import open_virtual_dataset
+
+# To demonstrate this workflow, we will use a collection of netcdf files from
+# the WRF-SE-AK-AR5 project.
+fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True)
+files_paths = fs_read.glob("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/*")
+file_pattern = sorted(["s3://" + f for f in files_paths])
+
+# Optionally, truncate file_pattern while debugging.
+# file_pattern = file_pattern[:4]
+
+print(f"{len(file_pattern)} file paths were retrieved.")
+
+
+def map_references(fil):
+    """Map function to open virtual datasets."""
+    vds = open_virtual_dataset(
+        fil,
+        indexes={},
+        loadable_variables=["Time"],
+        cftime_variables=["Time"],
+    )
+    return vds
+
+
+def reduce_references(results):
+    """Reduce function to concatenate virtual datasets."""
+    combined_vds = xr.combine_nested(
+        results,
+        concat_dim=["Time"],
+        coords="minimal",
+        compat="override",
+    )
+    return combined_vds
+
+
+fexec = lithops.FunctionExecutor(config_file="lithops.yaml")
+
+futures = fexec.map_reduce(
+    map_references,
+    file_pattern,
+    reduce_references,
+    spawn_reducer=100,
+)
+
+ds = futures.get_result()
+
+# Write out the virtual dataset to a kerchunk json.
+ds.virtualize.to_kerchunk("combined.json", format="json")
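+
+# Optionally, verify the output by loading the combined references back as a
+# lazy xarray dataset via fsspec's "reference" filesystem. This is a sketch;
+# the remote_options assume the same anonymous S3 access used for the source
+# files above.
+# fs = fsspec.filesystem(
+#     "reference",
+#     fo="combined.json",
+#     remote_protocol="s3",
+#     remote_options={"anon": True},
+# )
+# ds = xr.open_dataset(fs.get_mapper(), engine="zarr", consolidated=False)
+# print(ds)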