integration: add test for deploying Nvidia gpu-operator through Helm chart (#929)

---------

Signed-off-by: Nashwan Azhari <[email protected]>
bschimke95 authored Jan 9, 2025
1 parent cf3bb9b commit eb070bd
Showing 4 changed files with 262 additions and 0 deletions.
@@ -0,0 +1,14 @@
# Lifted 1:1 from:
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html#cuda-vectoradd
apiVersion: v1
kind: Pod
metadata:
  name: {}
spec:
  restartPolicy: OnFailure
  containers:
    - name: cuda-vectoradd
      image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04"
      resources:
        limits:
          nvidia.com/gpu: 1
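The empty `name: {}` field above is intentional: the test fills it in with Python's `str.format` before piping the rendered manifest to `kubectl apply`. A minimal sketch of that rendering step, assuming a local copy of the manifest file (the real test reads it from `config.MANIFESTS_DIR`):

from pathlib import Path

# Illustrative only: render the manifest's `{}` name placeholder; the test
# then applies the result on the harness instance via `k8s kubectl apply -f -`.
pod_name = "cuda-vectoradd"
manifest = Path("cuda-vectoradd-nvidia-gpu-test-pod.yaml")  # assumed local path
pod_spec = manifest.read_text().format(pod_name)
print(pod_spec)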
174 changes: 174 additions & 0 deletions tests/integration/tests/test_nvidia_gpu_operator.py
@@ -0,0 +1,174 @@
#
# Copyright 2025 Canonical, Ltd.
#

import logging
from typing import List, Mapping

import pytest
from test_util import config, harness, tags, util

LOG = logging.getLogger(__name__)

NVIDIA_GPU_OPERATOR_HELM_CHART_REPO = "https://helm.ngc.nvidia.com/nvidia"

# Mapping between the versions of the Nvidia `gpu-operator` and
# the host versions of Ubuntu they support.
# Because the `nvidia-driver-daemonset` pod included in the `gpu-operator`
# includes kernel drivers, its container image's release lifecycle is
# strictly tied to the version of Ubuntu on the host.
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html
NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS = {"v24.9.1": ["20.04", "22.04"]}

NVIDIA_KERNEL_MODULE_NAMES = ["nvidia", "nvidia_uvm", "nvidia_modeset"]

# Lifted 1:1 from:
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html#cuda-vectoradd
NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME = "cuda-vectoradd"


def _check_nvidia_gpu_present(instance: harness.Instance) -> bool:
    """Checks whether at least one Nvidia GPU is available
    by exec-ing `lspci` on the target instance."""
    proc = instance.exec(["lspci", "-k"], capture_output=True, text=True)

    for line in proc.stdout.split("\n"):
        if "NVIDIA Corporation" in line:
            LOG.info(f"Found NVIDIA GPU in lspci output: {line}")
            return True

    LOG.info(f"Failed to find NVIDIA GPU in lspci output: {proc.stdout}")
    return False


def _check_nvidia_drivers_loaded(instance: harness.Instance) -> Mapping[str, bool]:
    """Returns a mapping indicating which of the Nvidia kernel modules
    (`nvidia`, `nvidia_uvm`, `nvidia_modeset`) are currently loaded on
    the given harness instance."""

    proc = instance.exec(["lsmod"], capture_output=True, text=True)
    modules_present = {m: False for m in NVIDIA_KERNEL_MODULE_NAMES}
    for line in proc.stdout.split("\n"):
        for mod in modules_present:
            if line.startswith(mod):
                modules_present[mod] = True

    LOG.info(f"Located the following Nvidia kernel modules: {modules_present}")
    return modules_present


@pytest.mark.node_count(1)
@pytest.mark.tags(tags.WEEKLY)
@pytest.mark.tags(tags.GPU)
@pytest.mark.parametrize(
    "gpu_operator_version", NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS.keys()
)
def test_deploy_nvidia_gpu_operator(
    instances: List[harness.Instance], gpu_operator_version: str
):
    """Tests that the Nvidia `gpu-operator` can be deployed successfully
    using the upstream Helm chart and that a sample application running a
    small CUDA workload gets scheduled and runs to completion.
    """
    instance = instances[0]
    test_namespace = "gpu-operator"

    # Prechecks to ensure the test instance is valid.
    if not _check_nvidia_gpu_present(instance):
        msg = (
            f"No Nvidia GPU present on harness instance '{instance.id}'. "
            "Skipping GPU-operator test."
        )
        LOG.warning(msg)
        pytest.skip(msg)

    # NOTE(aznashwan): considering the Nvidia gpu-operator's main purpose
    # is to set up the drivers on the nodes, and that running the `gpu-operator`
    # with pre-installed drivers can lead to incompatibilities between the
    # version of the drivers and the rest of the toolchain, we skip the test
    # if any of the drivers happen to be pre-loaded on the harness instance:
    modules_loaded = _check_nvidia_drivers_loaded(instance)
    if any(modules_loaded.values()):
        msg = (
            f"Cannot have any pre-loaded Nvidia GPU drivers before running "
            f"the Nvidia 'gpu-operator' test on instance {instance.id}. "
            f"Current Nvidia driver statuses: {modules_loaded}"
        )
        LOG.warning(msg)
        pytest.skip(msg)

    instance_release = util.get_os_version_id_for_instance(instance)
    if (
        instance_release
        not in NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS[gpu_operator_version]
    ):
        msg = (
            f"Unsupported Ubuntu release '{instance_release}' for `gpu-operator` "
            f"version '{gpu_operator_version}'. Skipping gpu-operator test."
        )
        LOG.warning(msg)
        pytest.skip(msg)

    # Add the upstream Nvidia GPU-operator Helm repo:
    instance.exec(
        ["k8s", "helm", "repo", "add", "nvidia", NVIDIA_GPU_OPERATOR_HELM_CHART_REPO]
    )
    instance.exec(["k8s", "helm", "repo", "update"])

    # Install the `gpu-operator` chart:
    instance.exec(
        [
            "k8s",
            "helm",
            "install",
            "--generate-name",
            "--wait",
            "-n",
            test_namespace,
            "--create-namespace",
            "nvidia/gpu-operator",
            f"--version={gpu_operator_version}",
        ]
    )

    # Wait for the core daemonsets of the gpu-operator to be ready:
    daemonsets = [
        "nvidia-driver-daemonset",
        "nvidia-device-plugin-daemonset",
        "nvidia-container-toolkit-daemonset",
    ]
    # NOTE(aznashwan): it takes on average a little under 10 minutes for all
    # of the core daemonsets of the Nvidia GPU-operator to do their thing
    # on an AWS `g4dn.xlarge` instance (4 vCPUs/16GiB RAM), so we offer a
    # generous timeout of 15 minutes:
    for daemonset in daemonsets:
        util.wait_for_daemonset(
            instance,
            daemonset,
            namespace=test_namespace,
            retry_times=15,
            retry_delay_s=60,
        )

    # Deploy a sample CUDA app and let it run to completion:
    pod_spec_file = config.MANIFESTS_DIR / "cuda-vectoradd-nvidia-gpu-test-pod.yaml"
    pod_spec = pod_spec_file.read_text().format(
        NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME
    )
    instance.exec(
        ["k8s", "kubectl", "-n", test_namespace, "apply", "-f", "-"],
        input=pod_spec.encode(),
    )
    util.stubbornly(retries=3, delay_s=1).on(instance).exec(
        [
            "k8s",
            "kubectl",
            "-n",
            test_namespace,
            "wait",
            "--for=condition=ready",
            "pod",
            NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME,
            "--timeout",
            "180s",
        ]
    )
1 change: 1 addition & 0 deletions tests/integration/tests/test_util/tags.py
@@ -6,6 +6,7 @@
PULL_REQUEST = "pull_request"
NIGHTLY = "nightly"
WEEKLY = "weekly"
GPU = "gpu"

TEST_LEVELS = [PULL_REQUEST, NIGHTLY, WEEKLY]
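A test opts into the new GPU tag by marking itself with it, exactly as the gpu-operator test above does; a minimal illustration (the test name here is hypothetical):

import pytest
from test_util import tags

@pytest.mark.tags(tags.GPU)
def test_needs_a_gpu():  # hypothetical test name for illustration
    ...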

73 changes: 73 additions & 0 deletions tests/integration/tests/test_util/util.py
@@ -7,6 +7,7 @@
import re
import shlex
import subprocess
import time
import urllib.request
from datetime import datetime
from functools import partial
@@ -554,3 +555,75 @@ def check_file_paths_exist(
        p: not (f"cannot access '{p}': No such file or directory" in process.stderr)
        for p in paths
    }


def get_os_version_id_for_instance(instance: harness.Instance) -> str:
    """Returns the version of the OS on the given harness Instance
    by reading the `VERSION_ID` from `/etc/os-release`.
    """
    proc = instance.exec(["cat", "/etc/os-release"], capture_output=True)

    release = None
    var = "VERSION_ID"
    for line in proc.stdout.split(b"\n"):
        line = line.decode()
        if line.startswith(f"{var}="):
            # The value is usually quoted (e.g. VERSION_ID="22.04"), so split
            # on '=' and strip any surrounding quotes to get the bare version.
            release = line.split("=", 1)[1].strip().strip('"')
            break

    if release is None:
        raise ValueError(
            f"Failed to parse OS release var '{var}' from OS release "
            f"info: {proc.stdout}"
        )

    return release
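As an illustration of the parsing above, an Ubuntu 22.04 host exposes a quoted `VERSION_ID` entry in `/etc/os-release`, which the helper reduces to the bare release string:

# Representative Ubuntu 22.04 line from /etc/os-release:
sample_line = 'VERSION_ID="22.04"'
release = sample_line.split("=", 1)[1].strip().strip('"')
assert release == "22.04"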


def wait_for_daemonset(
    instance: harness.Instance,
    name: str,
    namespace: str = "default",
    retry_times: int = 5,
    retry_delay_s: int = 60,
    expected_pods_ready: int = 1,
):
    """Waits for the daemonset with the given name to have at least
    `expected_pods_ready` pods ready."""
    proc = None
    for i in range(retry_times):
        # NOTE: we can't reliably use `rollout status` on Daemonsets unless
        # they have the `RollingUpdate` strategy, so we must go by the number
        # of pods which are Ready.
        proc = instance.exec(
            [
                "k8s",
                "kubectl",
                "-n",
                namespace,
                "get",
                "daemonset",
                name,
                "-o",
                "jsonpath={.status.numberReady}",
            ],
            check=True,
            capture_output=True,
        )
        if int(proc.stdout.decode()) >= expected_pods_ready:
            LOG.info(
                f"Successfully waited for daemonset '{name}' after "
                f"{(i+1)*retry_delay_s} seconds"
            )
            return

        LOG.info(
            f"Waiting {retry_delay_s} seconds for daemonset '{name}'.\n"
            f"code: {proc.returncode}\nstdout: {proc.stdout}\nstderr: {proc.stderr}"
        )
        time.sleep(retry_delay_s)

    raise AssertionError(
        f"Daemonset '{name}' failed to have at least {expected_pods_ready} "
        f"pod(s) ready after {retry_times} x {retry_delay_s} seconds."
    )
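For reference, the gpu-operator test above calls this helper once per core daemonset, roughly as follows (a usage sketch; `instance` is the `harness.Instance` the test runs against):

# Usage sketch: wait up to 15 x 60s for the driver daemonset in the
# gpu-operator namespace to report at least one ready pod.
wait_for_daemonset(
    instance,
    "nvidia-driver-daemonset",
    namespace="gpu-operator",
    retry_times=15,
    retry_delay_s=60,
)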
