diff --git a/k8s/examples/fluid/Dockerfile b/k8s/examples/fluid/Dockerfile new file mode 100644 index 00000000..97ac29d3 --- /dev/null +++ b/k8s/examples/fluid/Dockerfile @@ -0,0 +1,3 @@ +FROM python:3.10 + +COPY ./configure-vineyard-socket.py /configure-vineyard-socket.py \ No newline at end of file diff --git a/k8s/examples/fluid/Makefile b/k8s/examples/fluid/Makefile new file mode 100644 index 00000000..c1f775c8 --- /dev/null +++ b/k8s/examples/fluid/Makefile @@ -0,0 +1,2 @@ +build-image: + docker build -t configure-vineyard-socket:latest -f Dockerfile . \ No newline at end of file diff --git a/k8s/examples/fluid/README.md b/k8s/examples/fluid/README.md new file mode 100644 index 00000000..0695cd50 --- /dev/null +++ b/k8s/examples/fluid/README.md @@ -0,0 +1,101 @@ +## Fluid integration + +If you are using [Fluid](https://github.com/fluid-cloudnative/fluid) in your application, you can now cache your **Python objects** with **Vineyard** on top of **Fluid**. + +### Prerequisites + +- A Kubernetes cluster with version >= 1.25.10. If you don't have one on hand, you can refer to the guide [Initialize Kubernetes Cluster](https://v6d.io/tutorials/kubernetes/using-vineyard-operator.html#step-0-optional-initialize-kubernetes-cluster) to create one. +- Install [vineyardctl](https://v6d.io/notes/developers/build-from-source.html#install-vineyardctl) by following the official guide. + +### Install the argo server on Kubernetes + +1. Install the argo server on Kubernetes: + +```bash +$ kubectl create namespace argo +$ kubectl apply -n argo -f https://github.com/argoproj/argo-workflows/releases/download/v3.4.8/install.yaml +``` + +2. Check the status of the argo server: + +```bash +$ kubectl get pod -n argo +NAME READY STATUS RESTARTS AGE +argo-server-7698c96655-xsd5k 1/1 Running 0 10m +workflow-controller-b888f4458-ts58f 1/1 Running 0 10m +``` + +### Submit the job without Vineyard + +1. 
Submit the workflow: + +```bash +$ cd k8s/examples/fluid +$ argo submit --watch argo_workflow.yaml +``` + +### Submit the job with Vineyard + + +1. Install the vineyard deployment: + +```bash +$ vineyardctl deploy vineyard-deployment +``` + +2. Install the Fluid: + +```bash +$ kubectl create ns fluid-system +$ helm repo add fluid https://fluid-cloudnative.github.io/charts +$ helm repo update +$ helm install fluid fluid/fluid +``` + +3. Build the `configure-vineyard-socket` image and load to the cluster. + +```bash +$ cd k8s/examples/fluid && make build-image && kind load docker-image configure-vineyard-socket +``` + +4. Install the Vineyard profile: + +```bash +$ kubectl apply -f vineyard_profile.yaml +``` + +5. Install the Vineyard Dataset: + +```bash +$ kubectl apply -f dataset.yaml +``` + +6. Check the dataset status and make sure the dataset is in `Bound` status as follows. + +```bash +$ kubectl get dataset -A +NAMESPACE NAME UFS TOTAL SIZE CACHED CACHE CAPACITY CACHED PERCENTAGE PHASE AGE +default vineyard [Calculating] N/A N/A N/A Bound 105s +``` + +After that, the vineyard pv and pvc will be created automatically as follows. + +```bash +$ kubectl get pvc +NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE +vineyard Bound default-vineyard 100Pi RWX fluid 87s +$ kubectl get pv +NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE +default-vineyard 100Pi RWX Retain Bound default/vineyard fluid 2m43s +``` + +7. Submit the workflow with Vineyard: + +```bash +$ argo submit --watch argo_workflow_with_vineyard.yaml +``` + +### result + +The execution time of the workflow without Vineyard is 41s. 
+
diff --git a/k8s/examples/fluid/argo_workflow.yaml b/k8s/examples/fluid/argo_workflow.yaml
new file mode 100644
index 00000000..4c20ca68
--- /dev/null
+++ b/k8s/examples/fluid/argo_workflow.yaml
@@ -0,0 +1,104 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Workflow
+metadata:
+  generateName: mlops-
+spec:
+  entrypoint: dag
+  templates:
+  - name: producer
+    volumes:
+    - name: data
+      hostPath:
+        path: /data
+        type: DirectoryOrCreate
+    container:
+      image: python:3.10
+      volumeMounts:
+      - name: data
+        mountPath: /data
+      command: [bash, -c]
+      args:
+      - |
+        pip install numpy pandas;
+        cat << EOF >> producer.py
+        import time
+
+        import numpy as np
+        import pandas as pd
+
+        def generate_random_dataframe(num_rows):
+            return pd.DataFrame({
+                'Id': np.random.randint(1, 100000, num_rows),
+                'TotalRooms': np.random.randint(2, 11, num_rows),
+                "GarageAge": np.random.randint(1, 31, num_rows),
+                "RemodAge": np.random.randint(1, 31, num_rows),
+                "HouseAge": np.random.randint(1, 31, num_rows),
+                "TotalBath": np.random.randint(1, 5, num_rows),
+                "TotalPorchSF": np.random.randint(1, 1001, num_rows),
+                "TotalSF": np.random.randint(1000, 6001, num_rows),
+                "TotalArea": np.random.randint(1000, 6001, num_rows),
+                'MoSold': np.random.randint(1, 13, num_rows),
+                'YrSold': np.random.randint(2006, 2022, num_rows),
+                'SalePrice': np.random.randint(50000, 800001, num_rows),
+            })
+
+        def producer():
+            st = time.time()
+            print('Generating data....', flush=True)
+            # NOTE: 1e9 rows x 12 int64 columns is ~96 GB in memory; lower it on small nodes.
+            df = generate_random_dataframe(1000000000)
+            ed = time.time()
+            print('##################################')
+            print('Generating data time: ', ed - st, flush=True)
+            st = time.time()
+            print('Serializing data....', flush=True)
+            df.to_pickle('/data/df.pkl')
+            ed = time.time()
+            print('##################################')
+            print('Serializing data time: ', ed - st, flush=True)
+
+        # Invoke the benchmark; without this call the script only defines functions.
+        producer()
+        EOF
+        python producer.py;
+  - name: consumer
+    volumes:
+    - name: data
+      hostPath:
+        path: /data
+        type: DirectoryOrCreate
+    container:
+      image: python:3.10
+      volumeMounts:
+      - name: data
+        mountPath: /data
+      command: [bash, -c]
+      args:
+      - |
+        pip install pandas;
+        cat << EOF >> consumer.py
+        import time
+
+        import pandas as pd
+
+        def consumer():
+            st = time.time()
+            print('Deserializing data....', flush=True)
+            df = pd.read_pickle('/data/df.pkl')
+            ed = time.time()
+            print('##################################')
+            print('Deserializing data time: ', ed - st, flush=True)
+
+        # Invoke the benchmark; without this call the script only defines functions.
+        consumer()
+        EOF
+        python consumer.py;
+  - name: dag
+    dag:
+      tasks:
+      - name: producer
+        template: producer
+      - name: consumer
+        template: consumer
+        dependencies:
+        - producer
\ No newline at end of file
diff --git a/k8s/examples/fluid/argo_workflow_with_vineyard.yaml b/k8s/examples/fluid/argo_workflow_with_vineyard.yaml
new file mode 100644
index 00000000..4e6fb353
--- /dev/null
+++ b/k8s/examples/fluid/argo_workflow_with_vineyard.yaml
@@ -0,0 +1,103 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Workflow
+metadata:
+  generateName: mlops-
+spec:
+  entrypoint: dag
+  templates:
+  - name: producer
+    volumes:
+    - name: data
+      persistentVolumeClaim:
+        claimName: vineyard
+    container:
+      image: python:3.10
+      volumeMounts:
+      - name: data
+        mountPath: /data
+      command: [bash, -c]
+      args:
+      - |
+        pip install vineyard numpy pandas;
+        cat << EOF >> producer.py
+        import time
+
+        import numpy as np
+        import pandas as pd
+        import vineyard
+
+        def generate_random_dataframe(num_rows):
+            return pd.DataFrame({
+                'Id': np.random.randint(1, 100000, num_rows),
+                'TotalRooms': np.random.randint(2, 11, num_rows),
+                "GarageAge": np.random.randint(1, 31, num_rows),
+                "RemodAge": np.random.randint(1, 31, num_rows),
+                "HouseAge": np.random.randint(1, 31, num_rows),
+                "TotalBath": np.random.randint(1, 5, num_rows),
+                "TotalPorchSF": np.random.randint(1, 1001, num_rows),
+                "TotalSF": np.random.randint(1000, 6001, num_rows),
+                "TotalArea": np.random.randint(1000, 6001, num_rows),
+                'MoSold': np.random.randint(1, 13, num_rows),
+                'YrSold': np.random.randint(2006, 2022, num_rows),
+                'SalePrice': np.random.randint(50000, 800001, num_rows),
+            })
+
+        def producer():
+            st = time.time()
+            print('Generating data....', flush=True)
+            # NOTE: 1e9 rows x 12 int64 columns is ~96 GB in memory; lower it on small nodes.
+            df = generate_random_dataframe(1000000000)
+            ed = time.time()
+            print('##################################')
+            print('Generating data time: ', ed - st, flush=True)
+            st = time.time()
+            print('Serializing data....', flush=True)
+            vineyard.csi.write(df, '/data/df.pkl')
+            ed = time.time()
+            print('##################################')
+            print('Serializing data time: ', ed - st, flush=True)
+
+        # Invoke the benchmark; without this call the script only defines functions.
+        producer()
+        EOF
+        python producer.py;
+  - name: consumer
+    volumes:
+    - name: data
+      persistentVolumeClaim:
+        claimName: vineyard
+    container:
+      image: python:3.10
+      volumeMounts:
+      - name: data
+        mountPath: /data
+      command: [bash, -c]
+      args:
+      - |
+        pip install vineyard pandas;
+        cat << EOF >> consumer.py
+        import time
+
+        import vineyard
+
+        def consumer():
+            st = time.time()
+            print('Deserializing data....', flush=True)
+            df = vineyard.csi.read('/data/df.pkl')
+            ed = time.time()
+            print('##################################')
+            print('Deserializing data time: ', ed - st, flush=True)
+
+        # Invoke the benchmark; without this call the script only defines functions.
+        consumer()
+        EOF
+        python consumer.py;
+  - name: dag
+    dag:
+      tasks:
+      - name: producer
+        template: producer
+      - name: consumer
+        template: consumer
+        dependencies:
+        - producer
\ No newline at end of file
diff --git a/k8s/examples/fluid/configure-vineyard-socket.py b/k8s/examples/fluid/configure-vineyard-socket.py
new file mode 100644
index 00000000..1ccd7e00
--- /dev/null
+++ b/k8s/examples/fluid/configure-vineyard-socket.py
@@ -0,0 +1,48 @@
+"""Generate the shell script that bind-mounts the vineyard socket directory.
+
+Reads the JSON configuration at /etc/fluid/config.json and writes
+mount-vineyard-socket.sh into the current working directory.
+"""
+import json
+
+CONFIG_PATH = "/etc/fluid/config.json"
+SCRIPT_PATH = "mount-vineyard-socket.sh"
+LOCAL_SCHEME = "local://"
+
+# Re-creates the bind mount every 10 s whenever the socket is not visible.
+MOUNT_LOOP = """
+set -ex
+
+mkdir -p "$targetPath"
+while true; do
+  if [ ! -S "$targetPath/vineyard.sock" ]; then
+    mount --bind "$socketPath" "$targetPath"
+  fi
+  sleep 10
+done
+"""
+
+
+def main():
+    # Parse the whole file: the JSON document may span several lines.
+    with open(CONFIG_PATH, "r") as f:
+        raw = f.read()
+    print(raw)
+    config = json.loads(raw)
+
+    target_path = config["targetPath"]
+    socket_path = config["mounts"][0]["mountPoint"]
+    # The mount point may carry a "local://" prefix; strip it to get the plain path.
+    if socket_path.startswith(LOCAL_SCHEME):
+        socket_path = socket_path[len(LOCAL_SCHEME):]
+
+    with open(SCRIPT_PATH, "w") as f:
+        # The shebang only takes effect on the very first line of the script.
+        f.write("#!/bin/sh\n")
+        f.write('targetPath="%s"\n' % target_path)
+        f.write('socketPath="%s"\n' % socket_path)
+        f.write(MOUNT_LOOP)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/k8s/examples/fluid/dataset.yaml b/k8s/examples/fluid/dataset.yaml
new file mode 100644
index 00000000..74afdd53
--- /dev/null
+++ b/k8s/examples/fluid/dataset.yaml
@@ -0,0 +1,18 @@
+apiVersion: data.fluid.io/v1alpha1
+kind: Dataset
+metadata:
+  name: vineyard
+spec:
+  mounts:
+  # This directory should be the same as the vineyard socket directory in the vineyard deployment
+  - mountPoint: local:///var/run/vineyard-kubernetes/vineyard-system/vineyardd-sample
+    name: vineyard
+  accessModes:
+    - ReadWriteMany
+---
+apiVersion: data.fluid.io/v1alpha1
+kind: ThinRuntime
+metadata:
+  name: vineyard
+spec:
+  profileName: vineyard-profile
diff --git a/k8s/examples/fluid/vineyard_profile.yaml b/k8s/examples/fluid/vineyard_profile.yaml
new file mode 100644
index 00000000..c7527835
--- /dev/null
+++ b/k8s/examples/fluid/vineyard_profile.yaml
@@ -0,0 +1,24 @@
+apiVersion: data.fluid.io/v1alpha1
+kind: ThinRuntimeProfile
+metadata:
+  name: vineyard-profile
+spec:
+  fileSystemType: fuse
+  volumes:
+  - name: vineyard
+    hostPath:
+      # This path should be the same as the vineyard socket path in the vineyard deployment
+      path: /var/run/vineyard-kubernetes/vineyard-system/vineyardd-sample
+      type: DirectoryOrCreate
+  fuse:
+    image: configure-vineyard-socket
+    imageTag: latest
+    imagePullPolicy: IfNotPresent
+    volumeMounts:
+    # Must match the name of the volume declared under spec.volumes.
+    - name: vineyard
+      mountPath: /var/run/vineyard-kubernetes/vineyard-system/vineyardd-sample
+    command:
+    - sh
+    - -c
+    - "python3 /configure-vineyard-socket.py && chmod u+x ./mount-vineyard-socket.sh && ./mount-vineyard-socket.sh"