diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 31d3585cff2..e72e6f6ef9b 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -33,18 +33,6 @@ jobs:
         with:
           tune-api: true
           training-operator: true
-
-      - name: Check the status of Experiment and Trials
-        shell: bash
-        run: |
-          kubectl get pods -n default
-
-          # describe pod
-          pod_name=$(kubectl get pods -n default -o jsonpath='{.items[?(@.metadata.labels.trial-name)].metadata.name}')
-          kubectl describe pod $pod_name -n default
-
-          # check the logs of pod
-          kubectl logs $pod_name -n default
 
     strategy:
       fail-fast: false
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
index 135f40c6ef8..48496b864c6 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -3,13 +3,13 @@
 
 import kubeflow.katib as katib
 import transformers
-from kubeflow.katib import KatibClient, search, types
+from kubeflow.katib import KatibClient, search
 from kubeflow.storage_initializer.hugging_face import (
     HuggingFaceDatasetParams,
     HuggingFaceModelParams,
     HuggingFaceTrainerParams,
 )
-from kubernetes import client
+from kubernetes import client, config
 from peft import LoraConfig
 from verify import verify_experiment_results
 
@@ -19,6 +19,56 @@
 # The default logging config.
 logging.basicConfig(level=logging.INFO)
 
+# Function to describe the pods related to the experiment and get their logs.
+def get_experiment_pod_logs(namespace: str, exp_name: str):
+    """Describe every pod of the Experiment and log its container logs.
+
+    This is a best-effort debugging aid: Kubernetes API errors are logged
+    instead of raised, so calling it from a ``finally`` block can neither
+    mask the E2E result nor skip the Experiment cleanup that follows.
+
+    Args:
+        namespace: Namespace in which the pods are listed.
+        exp_name: Experiment name; every pod whose name contains it is reported.
+    """
+    v1 = client.CoreV1Api()
+    try:
+        pods = v1.list_namespaced_pod(namespace).items
+    except client.rest.ApiException as e:
+        logging.warning(f"Unable to list pods in namespace {namespace}: {e}")
+        return
+
+    # Select the pods associated with the experiment by name.
+    exp_pods = [pod for pod in pods if exp_name in pod.metadata.name]
+    if not exp_pods:
+        logging.warning(f"No pod found for experiment: {exp_name}")
+        return
+
+    for pod in exp_pods:
+        logging.info(f"Describing pod: {pod.metadata.name}")
+        # The list call already returned the full pod object.
+        logging.info(pod)
+
+        # Request logs per container: the log API rejects a request without a
+        # container name when the pod runs more than one container.
+        for container in pod.spec.containers:
+            logging.info(
+                f"Fetching logs for pod: {pod.metadata.name}, container: {container.name}"
+            )
+            try:
+                pod_logs = v1.read_namespaced_pod_log(
+                    name=pod.metadata.name,
+                    namespace=namespace,
+                    container=container.name,
+                )
+            except client.rest.ApiException as e:
+                # E.g. the container has not started yet.
+                logging.warning(
+                    f"Unable to fetch logs of {pod.metadata.name}/{container.name}: {e}"
+                )
+            else:
+                logging.info(pod_logs)
+
 
 # Test for Experiment created with custom objective.
 def run_e2e_experiment_create_by_tune_with_custom_objective(
@@ -144,6 +194,8 @@ def run_e2e_experiment_create_by_tune_with_external_model(
     if args.verbose:
         logging.getLogger().setLevel(logging.DEBUG)
 
+    config.load_kube_config()  # Load Kubernetes config from the environment
+
     katib_client = KatibClient()
 
     namespace_labels = client.CoreV1Api().read_namespace(args.namespace).metadata.labels
@@ -163,6 +215,9 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}")
         raise e
     finally:
+        # Describe and get logs of the experiment pods
+        get_experiment_pod_logs(exp_namespace, exp_name)
+
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
@@ -178,7 +233,10 @@ def run_e2e_experiment_create_by_tune_with_external_model(
         logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name_2}")
         raise e
     finally:
+        # Describe and get logs of the experiment pods
+        get_experiment_pod_logs(exp_namespace, exp_name_2)
+
         # Delete the Experiment.
         logging.info("---------------------------------------------------------------")
         logging.info("---------------------------------------------------------------")
         katib_client.delete_experiment(exp_name_2, exp_namespace)