Merge pull request #58 from judemars/test-cache

Add basic experimental cache support
GoogleCloudPlatform · Sep 6, 2023 · 0d985b6 · 0d985b6
2 parents 050c095 + 0debcb5
commit 0d985b6
Show file tree

Hide file tree

Showing 2 changed files with 123 additions and 2 deletions.
diff --git a/examples/README.md b/examples/README.md
@@ -1,4 +1,4 @@
-<!-- 
+<!--
 Copyright 2018 The Kubernetes Authors.
 Copyright 2022 Google LLC
 
@@ -170,7 +170,7 @@ kubectl apply -f ./examples/tensorflow/train-job-tensorflow-dlc.yaml
 kubectl delete -f ./examples/tensorflow/train-job-tensorflow-dlc.yaml
 ```
 
-## Jupyter Notebook Example
+## Jupyter Notebook Example (no experimental read cache)
 
 ```bash
 # replace <bucket-name> with your pre-provisioned GCS bucket name
@@ -186,3 +186,38 @@ kubectl port-forward jupyter-notebook-server 8888:8888
 # clean up
 kubectl delete -f ./examples/jupyter/jupyter-notebook-server.yaml
 ```
+
+## Jupyter Notebook Example (with experimental read cache)
+
+#### Prerequisites
+
+1. Your node pool must be created with ephemeral local SSDs, as described in
+https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/local-ssd#node-pool
+
+2. Your node pool must have a GPU accelerator. This example uses nvidia-tesla-t4,
+but you can use another one (just make sure to update the nodeSelector in the
+yaml below if so). See
+[an example here](https://github.com/GoogleCloudPlatform/gcs-fuse-csi-driver/tree/main/examples#prerequisites)
+
+#### Steps
+
+```bash
+# 1. replace <bucket-name> with your pre-provisioned GCS bucket name
+GCS_BUCKET_NAME=your-bucket-name
+sed -i "s/<bucket-name>/$GCS_BUCKET_NAME/g" ./examples/jupyter/jupyter-experimental-readcache.yaml
+
+# 2. install a Jupyter Notebook server using experimental gcsfuse read cache
+kubectl apply -f ./examples/jupyter/jupyter-experimental-readcache.yaml
+
+# 3. get service IPs
+kubectl get services -n example
+
+# 4. Open jupyter
+#  a. copy the EXTERNAL-IP of the tensorflow-jupyter service
+#  b. open IP Address in a browser
+#  c. input token "jupyter" (from yaml)
+
+# 5. (optional) clean up
+kubectl delete -f ./examples/jupyter/jupyter-experimental-readcache.yaml
+```
+
diff --git a/examples/jupyter/jupyter-experimental-readcache.yaml b/examples/jupyter/jupyter-experimental-readcache.yaml
@@ -0,0 +1,86 @@
+# TensorFlow/Jupyter StatefulSet: one replica with a GCS bucket CSI volume
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: tensorflow
+  namespace: example
+spec:
+  selector:
+    matchLabels:
+      pod: tensorflow-pod
+  serviceName: tensorflow  # name of the headless Service defined below
+  replicas: 1
+  template:
+    metadata:
+      annotations:
+        gke-gcsfuse/volumes: "true"
+        gke-gcsfuse/cpu-limit: 500m
+        gke-gcsfuse/memory-limit: 10Gi
+        gke-gcsfuse/ephemeral-storage-limit: 30Gi
+      labels:
+        pod: tensorflow-pod
+    spec:
+      serviceAccountName: gcsfuse-ksa
+      nodeSelector:
+        cloud.google.com/gke-accelerator: nvidia-tesla-t4
+        cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
+      terminationGracePeriodSeconds: 30
+      containers:
+      - name: tensorflow-container
+        securityContext:
+          privileged: true  # NOTE(review): confirm privileged mode is required
+        image: tensorflow/tensorflow:2.13.0-gpu-jupyter
+        volumeMounts:
+        - name: tensorflow-pvc
+          mountPath: /tf/saved
+        resources:
+          limits:
+            nvidia.com/gpu: "1"
+            ephemeral-storage: 30Gi
+            memory: 10Gi
+          requests:
+            nvidia.com/gpu: "1"
+            ephemeral-storage: 30Gi
+            memory: 10Gi
+        env:
+        - name: JUPYTER_TOKEN
+          value: "jupyter"
+      volumes:
+      - name: tensorflow-pvc
+        csi:
+          driver: gcsfuse.csi.storage.gke.io
+          volumeAttributes:
+            bucketName: <bucket-name>  # unique bucket name
+            # experimental file-cache mount options; adjust them according to
+            # the gcsfuse flag definitions at
+            # https://github.com/GoogleCloudPlatform/gcsfuse/blob/19ed094b6612789b09ad4a1df3a2314099c65129/flags.go#L233C1-L236
+            mountOptions: "experimental-local-file-cache,stat-cache-ttl=240m0s,type-cache-ttl=240m0s,stat-cache-capacity=5000000000"
+
+---
+# Headless Service (clusterIP: None) named by the StatefulSet's serviceName
+apiVersion: v1
+kind: Service
+metadata:
+  name: tensorflow
+  namespace: example
+spec:
+  clusterIP: None
+  selector:
+    pod: tensorflow-pod
+  ports:
+  - port: 8888
+---
+# External LoadBalancer Service: forwards port 80 to pod port 8888
+apiVersion: v1
+kind: Service
+metadata:
+  name: tensorflow-jupyter
+  namespace: example
+spec:
+  type: LoadBalancer
+  selector:
+    pod: tensorflow-pod
+  ports:
+  - protocol: TCP
+    port: 80
+    targetPort: 8888