Skip to content

Commit

Permalink
[patch] reinstate zenmetastore and couchdb workarounds for cpd 4.8 (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
IanBoden authored Dec 9, 2024
1 parent aa05d8a commit 91bd67b
Show file tree
Hide file tree
Showing 4 changed files with 239 additions and 1 deletion.
9 changes: 9 additions & 0 deletions ibm/mas_devops/roles/cp4d/tasks/install-cp4d.yml
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,15 @@
retries: 20 # Approximately 20 minutes before we give up
delay: 60 # 1 minute

# 3-pre. Wait for Zen Metastore Cluster to be ready
# -----------------------------------------------------------------------------
# There have been issues with Zen Metastore Cluster not starting due to
# Persistent Storage. The included task file waits for the zen-metastore-edb
# Cluster to be created and for all of its instances to become ready; on
# CPD 4.8 it also recreates the postgres license job when the cluster reports
# no license expiration.
- name: "install-cp4d : Wait for zen-metadata to be ready again (60s delay)"
  include_tasks: "tasks/wait/wait-zenmetastore-edb.yml"
  when: cpd_48_or_higher

# 3. Wait for zenStatus
# -----------------------------------------------------------------------------
# oc get ZenService lite-cr -o jsonpath="{.status.zenStatus}{'\n'}"
Expand Down
112 changes: 112 additions & 0 deletions ibm/mas_devops/roles/cp4d/tasks/wait/wait-zenmetastore-edb.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
---
# 1. Wait for zen metastore cluster to start
# -----------------------------------------------------------------------------
# Poll the zen-metastore-edb Cluster resource until the lookup returns it and
# it carries a status section.
- name: "wait-zenmetastore-edb : Wait for Zen Metastore EDB Cluster to be created"
  kubernetes.core.k8s_info:
    kind: Cluster
    namespace: "{{ cpd_instance_namespace }}"
    name: "zen-metastore-edb"
  register: zenmetastoreCluster
  retries: 120  # Give 60 minutes for the zenService to start Zen Metastore Pods (Logs show this taking ~20 minutes in a good run)
  delay: 30
  until:
    - zenmetastoreCluster.resources is defined
    - zenmetastoreCluster.resources | length > 0
    - zenmetastoreCluster.resources[0].status is defined

# 2. For V4.8, we need to patch the postgres licensing job according to https://www.ibm.com/support/pages/node/7158524
# -----------------------------------------------------------------------------
# This first block only reads and prints the current license state of the
# zen-metastore-edb Cluster; the block that follows decides whether the
# license job has to be recreated.
- name: "wait-zenmetastore-edb : Check and display the create-postgres-license-config license expiry date"
  when: cpd_48
  block:
    # Wait until the Cluster reports a licenseStatus; licenseExpiration may
    # legitimately be absent, which is what the next block tests for.
    - name: "wait-zenmetastore-edb : Fetch the license expiry date"
      kubernetes.core.k8s_info:
        kind: Cluster
        namespace: "{{ cpd_instance_namespace }}"
        name: "zen-metastore-edb"
      register: zenmetastoreCluster
      retries: 40  # Give 20 minutes
      delay: 30
      until: >-
        zenmetastoreCluster.resources[0].spec.instances is defined
        and zenmetastoreCluster.resources[0].status.licenseStatus.licenseStatus is defined

    - name: "wait-zenmetastore-edb : Display the license expiry date and status"
      debug:
        msg:
          - "License Expiration .. {{ zenmetastoreCluster.resources[0].status.licenseStatus.licenseExpiration | default('') }}"
          - "License Status ...... {{ zenmetastoreCluster.resources[0].status.licenseStatus.licenseStatus | default('') }}"

# Recreate the create-postgres-license-config Job with a newer license
# provider image when the Cluster reports no license expiration date.
# The block condition is inherited by every task inside it, so the final
# lookup registers a different variable (newzenmetastoreCluster) and leaves
# zenmetastoreCluster, which the condition reads, untouched.
- name: "wait-zenmetastore-edb : Update create-postgres-license-config job"
  when:
    - cpd_48
    - zenmetastoreCluster.resources[0].status.licenseStatus.licenseExpiration is not defined
  block:
    # The existing Job must have reached a terminal condition (Complete or
    # Failed) before it is replaced.
    - name: "wait-zenmetastore-edb : Wait for create-postgres-license-config Job to be Completed or Failed (10s delay)"
      kubernetes.core.k8s_info:
        api_version: batch/v1
        kind: Job
        name: "create-postgres-license-config"
        namespace: "{{ cpd_operators_namespace }}"
      register: _job_info
      retries: 30
      delay: 10
      until:
        - _job_info.resources is defined
        - _job_info.resources | length > 0
        - >-
          (_job_info.resources | json_query('[*].status.conditions[?type==`Complete`][].status') | select ('match','True') | list | length == 1) or
          (_job_info.resources | json_query('[*].status.conditions[?type==`Failed`][].status') | select ('match','True') | list | length == 1)

    # Export the Job, edit it with sed and force-replace it:
    #   - flip the operator.ibm.com/opreq-control value from "true" to "false"
    #   - point every image: line at the pinned license provider digest
    #   - drop the controller-uid lines
    # A literal block scalar is used so the shell receives the line breaks
    # exactly as written; a line ending in "|" continues onto the next line.
    - name: "wait-zenmetastore-edb : Recreate the job with up to date license image"
      shell: |
        oc get job create-postgres-license-config -n {{ cpd_operators_namespace }} -o yaml |
          sed -e 's/operator.ibm.com\/opreq-control: "true"/operator.ibm.com\/opreq-control: "false"/' \
              -e 's|\(image: \).*|\1"cp.icr.io/cp/cpd/edb-postgres-license-provider@sha256:c1670e7dd93c1e65a6659ece644e44aa5c2150809ac1089e2fd6be37dceae4ce"|' \
              -e '/controller-uid:/d' |
          oc replace --force -f -
      register: _job_recreate_output

    - name: "wait-zenmetastore-edb : Display the job recreate output"
      debug:
        msg: "Recreate the job: {{ _job_recreate_output }}"

    # Same terminal-condition wait as above, this time for the recreated Job.
    - name: "wait-zenmetastore-edb : Wait for recreated create-postgres-license-config Job to be Completed or Failed (10s delay)"
      kubernetes.core.k8s_info:
        api_version: batch/v1
        kind: Job
        name: "create-postgres-license-config"
        namespace: "{{ cpd_operators_namespace }}"
      register: _job_info
      retries: 30
      delay: 10
      until:
        - _job_info.resources is defined
        - _job_info.resources | length > 0
        - >-
          (_job_info.resources | json_query('[*].status.conditions[?type==`Complete`][].status') | select ('match','True') | list | length == 1) or
          (_job_info.resources | json_query('[*].status.conditions[?type==`Failed`][].status') | select ('match','True') | list | length == 1)

    # Wait until the Cluster reports a license status containing
    # "Valid license".
    - name: "wait-zenmetastore-edb : Check and display the license expiry date"
      kubernetes.core.k8s_info:
        kind: Cluster
        namespace: "{{ cpd_instance_namespace }}"
        name: "zen-metastore-edb"
      register: newzenmetastoreCluster
      retries: 40  # Give 20 minutes
      delay: 30
      until: >-
        newzenmetastoreCluster.resources[0].spec.instances is defined
        and newzenmetastoreCluster.resources[0].status.licenseStatus.licenseStatus is defined
        and "Valid license" in newzenmetastoreCluster.resources[0].status.licenseStatus.licenseStatus

    - name: "wait-zenmetastore-edb : Display the updated license expiry date and status"
      debug:
        msg:
          - "License Expiration .. {{ newzenmetastoreCluster.resources[0].status.licenseStatus.licenseExpiration | default('') }}"
          - "License Status ...... {{ newzenmetastoreCluster.resources[0].status.licenseStatus.licenseStatus | default('') }}"

# 3. Wait for zen metastore replica pods to become ready
# -----------------------------------------------------------------------------
# The cluster counts as ready once status.readyInstances equals the number of
# instances requested in spec.instances.
- name: "wait-zenmetastore-edb : Wait for ZenMetastore pods to be become ready"
  kubernetes.core.k8s_info:
    kind: Cluster
    namespace: "{{ cpd_instance_namespace }}"
    name: "zen-metastore-edb"
  register: zenmetastoreCluster
  retries: 40  # Give 20 minutes for the pods to become ready
  delay: 30
  until:
    - zenmetastoreCluster.resources[0].spec.instances is defined
    - zenmetastoreCluster.resources[0].status.readyInstances is defined
    - zenmetastoreCluster.resources[0].spec.instances == zenmetastoreCluster.resources[0].status.readyInstances
11 changes: 10 additions & 1 deletion ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-ccs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,16 @@
- cpd_48_or_higher # elastic search operator was just introduced with cpd 4.8
- not skip_ibm_entitlement_injection # eventually we hope to be able to skip patching the elastic search cr with image pull secret, but not for now

# 5. Wait for CCS CR to be ready
# 5. Wait for CouchDB Stateful Set to be ready
# -----------------------------------------------------------------------------
# There have been issues with CouchDB not starting due to Persistent Storage.
# The included task file waits for the wdp-couchdb StatefulSet and deletes any
# CouchDB pods stuck in Pending so that they are recreated.
# NOTE(review): the "wait-ccs : " name prefix is assumed from the naming used
# by the other task files; adjust if this file follows a different convention.
- name: "wait-ccs : Wait for CouchDB Stateful Set to be ready"
  include_tasks: "tasks/wait/wait-couchdb.yml"
  when: cpd_48


# 6. Wait for CCS CR to be ready
# -----------------------------------------------------------------------------
# Note: We can't fail early when we see Failed status, as the operator will
# report failed multiple times during initial reconcile.
Expand Down
108 changes: 108 additions & 0 deletions ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-couchdb.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
---
# 1. Wait for couch-db stateful set to start all the replica pods
# -----------------------------------------------------------------------------
# Done when the StatefulSet status reports either zero replicas, or as many
# updated replicas as replicas.
- name: "wait-couchdb: Wait for CouchDB pods to be created"
  kubernetes.core.k8s_info:
    kind: StatefulSet
    namespace: "{{ cpd_instance_namespace }}"
    name: "wdp-couchdb"
  register: couchdbStatefulSet
  retries: 40  # Give 20 minutes for the ccs Operator to start CouchDB Pods (Logs show this taking ~7 minutes in a good run)
  delay: 30
  until: >-
    couchdbStatefulSet.resources[0].status.replicas is defined
    and (
    couchdbStatefulSet.resources[0].status.replicas == 0
    or (
    couchdbStatefulSet.resources[0].status.updatedReplicas is defined
    and couchdbStatefulSet.resources[0].status.replicas == couchdbStatefulSet.resources[0].status.updatedReplicas
    )
    )
# 2. Wait for couchdb replica pods to become ready
# -----------------------------------------------------------------------------
# This wait is best-effort: if the pods are still not ready when the retries
# run out, the tasks that follow restart any Pending pods and wait again.
- name: "wait-couchdb: Wait for CouchDB pods to be become ready"
  kubernetes.core.k8s_info:
    kind: StatefulSet
    namespace: "{{ cpd_instance_namespace }}"
    name: "wdp-couchdb"
  register: couchdbStatefulSet
  retries: 10  # Give 5 minutes for the pods to become ready
  delay: 30
  until: >-
    couchdbStatefulSet.resources[0].status.readyReplicas is defined
    and couchdbStatefulSet.resources[0].status.replicas == couchdbStatefulSet.resources[0].status.readyReplicas
  failed_when: false
  # An until loop that exhausts its retries is marked failed regardless of
  # failed_when, so the failure must also be ignored for the play to carry on
  # to the restart logic.
  ignore_errors: true

# 3. Restart any couchDB pods that are still Pending
# -----------------------------------------------------------------------------
# is_couchdb_ready is only ever set to true; leaving it undefined is what
# triggers the recovery block below.
- name: "wait-couchdb: Record that CouchDB is already ready"
  set_fact:
    is_couchdb_ready: true
  when: >-
    couchdbStatefulSet.resources[0].status.readyReplicas is defined
    and couchdbStatefulSet.resources[0].status.replicas == couchdbStatefulSet.resources[0].status.readyReplicas

# Recovery path: delete Pending CouchDB pods, wait once more, and fail with
# diagnostics if the StatefulSet is still not fully ready.
- name: "wait-couchdb: Detecting and restarting pending CouchDB Pods"
  when: is_couchdb_ready is not defined
  block:
    - name: "wait-couchdb: Get pending CouchDB Pods"
      kubernetes.core.k8s_info:
        api_version: v1
        kind: Pod
        label_selectors:
          - "app=couchdb"
        field_selectors:
          - "status.phase=Pending"
        namespace: "{{ cpd_instance_namespace }}"
      register: pending_pod_lookup

    # "| list" materialises the map() result so it can be printed and looped
    # over as a plain list of pod names.
    - name: "wait-couchdb: Collect the names of the pending CouchDB Pods"
      set_fact:
        pending_pod_names: "{{ pending_pod_lookup.resources | map(attribute='metadata.name') | list }}"

    - name: "wait-couchdb: Display the pending CouchDB Pods"
      debug:
        msg: "Restarting pending CouchDB Pods: {{ pending_pod_names }}"

    # Deleting the pods is the restart; presumably the StatefulSet controller
    # recreates them - the wait below checks readyReplicas again.
    - name: "wait-couchdb: Restarting pending CouchDB Pods"
      kubernetes.core.k8s:
        state: absent
        api_version: v1
        kind: Pod
        namespace: "{{ cpd_instance_namespace }}"
        name: "{{ item }}"
      loop: "{{ pending_pod_names }}"

    # 4. Wait again couchdb replica pods to become ready
    # -------------------------------------------------------------------------
    - name: "wait-couchdb: Wait for CouchDB pods to be become ready"
      kubernetes.core.k8s_info:
        kind: StatefulSet
        namespace: "{{ cpd_instance_namespace }}"
        name: "wdp-couchdb"
      register: couchdbStatefulSet
      retries: 10  # Give another 5 minutes for the pods to become ready
      delay: 30
      until: >-
        couchdbStatefulSet.resources[0].status.readyReplicas is defined
        and couchdbStatefulSet.resources[0].status.replicas == couchdbStatefulSet.resources[0].status.readyReplicas
      failed_when: false  # We handle and log the failure below.
      # An until loop that exhausts its retries is marked failed regardless of
      # failed_when, so the failure must also be ignored to reach the
      # diagnostic failure below.
      ignore_errors: true

    # Final verdict: report the StatefulSet status and the pods still Pending.
    - name: "wait-couchdb: Fail if CouchDB pods are not ready"
      when:
        couchdbStatefulSet.resources[0].status.replicas != couchdbStatefulSet.resources[0].status.readyReplicas
      block:
        - name: "wait-couchdb: Get Pending CouchDB Pods"
          kubernetes.core.k8s_info:
            api_version: v1
            kind: Pod
            label_selectors:
              - "app=couchdb"
            field_selectors:
              - "status.phase=Pending"
            namespace: "{{ cpd_instance_namespace }}"
          register: pending_pod_lookup

        - name: "wait-couchdb: Report CouchDB pods that are not ready"
          fail:
            msg:
              - "CouchDB pods are not ready {{ couchdbStatefulSet.resources[0].status }}"
              - "Pending CouchDB Pods: {{ pending_pod_lookup.resources | map(attribute='metadata.name') | list }}"

0 comments on commit 91bd67b

Please sign in to comment.