integration: add test for deploying Nvidia gpu-operator through Helm chart (#929)

---------

Signed-off-by: Nashwan Azhari <[email protected]>
bschimke95 authored Jan 9, 2025
1 parent cf3bb9b commit eb070bd
Showing 4 changed files with 262 additions and 0 deletions.
@@ -0,0 +1,14 @@
# Lifted 1:1 from:
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html#cuda-vectoradd
apiVersion: v1
kind: Pod
metadata:
  name: {}
spec:
  restartPolicy: OnFailure
  containers:
    - name: cuda-vectoradd
      image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04"
      resources:
        limits:
          nvidia.com/gpu: 1
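The empty `name: {}` field above is intentional: the test fills it in with Python's `str.format` before piping the rendered manifest to `kubectl apply`. A minimal sketch of that rendering step, assuming a local copy of the manifest file (the real test reads it from `config.MANIFESTS_DIR`):

from pathlib import Path

# Illustrative only: render the manifest's `{}` name placeholder; the test
# then applies the result on the harness instance via `k8s kubectl apply -f -`.
pod_name = "cuda-vectoradd"
manifest = Path("cuda-vectoradd-nvidia-gpu-test-pod.yaml")  # assumed local path
pod_spec = manifest.read_text().format(pod_name)
print(pod_spec)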
174 changes: 174 additions & 0 deletions tests/integration/tests/test_nvidia_gpu_operator.py
@@ -0,0 +1,174 @@
#
# Copyright 2025 Canonical, Ltd.
#

import logging
from typing import List, Mapping

import pytest
from test_util import config, harness, tags, util

LOG = logging.getLogger(__name__)

NVIDIA_GPU_OPERATOR_HELM_CHART_REPO = "https://helm.ngc.nvidia.com/nvidia"

# Mapping between the versions of the Nvidia `gpu-operator` and
# the host versions of Ubuntu they support.
# Because the `nvidia-driver-daemonset` pod included in the `gpu-operator`
# includes kernel drivers, its container image's release lifecycle is
# strictly tied to the version of Ubuntu on the host.
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html
NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS = {"v24.9.1": ["20.04", "22.04"]}

NVIDIA_KERNEL_MODULE_NAMES = ["nvidia", "nvidia_uvm", "nvidia_modeset"]

# Lifted 1:1 from:
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html#cuda-vectoradd
NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME = "cuda-vectoradd"


def _check_nvidia_gpu_present(instance: harness.Instance) -> bool:
    """Checks whether at least one Nvidia GPU is available
    by exec-ing `lspci` on the target instance."""
    proc = instance.exec(["lspci", "-k"], capture_output=True, text=True)

    for line in proc.stdout.split("\n"):
        if "NVIDIA Corporation" in line:
            LOG.info(f"Found NVIDIA GPU in lspci output: {line}")
            return True

    LOG.info(f"Failed to find NVIDIA GPU in lspci output: {proc.stdout}")
    return False


def _check_nvidia_drivers_loaded(instance: harness.Instance) -> Mapping[str, bool]:
    """Returns a mapping indicating which of the Nvidia kernel modules
    (`nvidia`, `nvidia_uvm`, `nvidia_modeset`) are currently loaded on
    the given harness instance."""

    proc = instance.exec(["lsmod"], capture_output=True, text=True)
    modules_present = {m: False for m in NVIDIA_KERNEL_MODULE_NAMES}
    for line in proc.stdout.split("\n"):
        for mod in modules_present:
            if line.startswith(mod):
                modules_present[mod] = True

    LOG.info(f"Located the following Nvidia kernel modules: {modules_present}")
    return modules_present


@pytest.mark.node_count(1)
@pytest.mark.tags(tags.WEEKLY)
@pytest.mark.tags(tags.GPU)
@pytest.mark.parametrize(
    "gpu_operator_version", NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS.keys()
)
def test_deploy_nvidia_gpu_operator(
    instances: List[harness.Instance], gpu_operator_version: str
):
    """Tests that the Nvidia `gpu-operator` can be deployed successfully
    using the upstream Helm chart and that a sample application running a
    small CUDA workload gets scheduled and runs to completion.
    """
    instance = instances[0]
    test_namespace = "gpu-operator"

    # Prechecks to ensure the test instance is valid.
    if not _check_nvidia_gpu_present(instance):
        msg = (
            f"No Nvidia GPU present on harness instance '{instance.id}'. "
            "Skipping GPU-operator test."
        )
        LOG.warning(msg)
        pytest.skip(msg)

    # NOTE(aznashwan): considering the Nvidia gpu-operator's main purpose
    # is to set up the drivers on the nodes, and that running the `gpu-operator`
    # with pre-installed drivers can lead to incompatibilities between the
    # version of the drivers and the rest of the toolchain, we skip the test
    # if any of the drivers happen to be pre-loaded on the harness instance:
    modules_loaded = _check_nvidia_drivers_loaded(instance)
    if any(modules_loaded.values()):
        msg = (
            f"Cannot have any pre-loaded Nvidia GPU drivers before running "
            f"the Nvidia 'gpu-operator' test on instance {instance.id}. "
            f"Current Nvidia driver statuses: {modules_loaded}"
        )
        LOG.warning(msg)
        pytest.skip(msg)

    instance_release = util.get_os_version_id_for_instance(instance)
    if (
        instance_release
        not in NVIDIA_GPU_OPERATOR_SUPPORTED_UBUNTU_VERSIONS[gpu_operator_version]
    ):
        msg = (
            f"Unsupported Ubuntu release '{instance_release}' for `gpu-operator` "
            f"version '{gpu_operator_version}'. Skipping gpu-operator test."
        )
        LOG.warning(msg)
        pytest.skip(msg)

    # Add the upstream Nvidia GPU-operator Helm repo:
    instance.exec(
        ["k8s", "helm", "repo", "add", "nvidia", NVIDIA_GPU_OPERATOR_HELM_CHART_REPO]
    )
    instance.exec(["k8s", "helm", "repo", "update"])

    # Install the `gpu-operator` chart:
    instance.exec(
        [
            "k8s",
            "helm",
            "install",
            "--generate-name",
            "--wait",
            "-n",
            test_namespace,
            "--create-namespace",
            "nvidia/gpu-operator",
            f"--version={gpu_operator_version}",
        ]
    )

    # Wait for the core daemonsets of the gpu-operator to be ready:
    daemonsets = [
        "nvidia-driver-daemonset",
        "nvidia-device-plugin-daemonset",
        "nvidia-container-toolkit-daemonset",
    ]
    # NOTE(aznashwan): it takes on average a little under 10 minutes for all
    # of the core daemonsets of the Nvidia GPU-operator to do their thing
    # on an AWS `g4dn.xlarge` instance (4 vCPUs/16GiB RAM), so we offer a
    # generous timeout of 15 minutes:
    for daemonset in daemonsets:
        util.wait_for_daemonset(
            instance,
            daemonset,
            namespace=test_namespace,
            retry_times=15,
            retry_delay_s=60,
        )

    # Deploy a sample CUDA app and let it run to completion:
    pod_spec_file = config.MANIFESTS_DIR / "cuda-vectoradd-nvidia-gpu-test-pod.yaml"
    pod_spec = pod_spec_file.read_text().format(
        NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME
    )
    instance.exec(
        ["k8s", "kubectl", "-n", test_namespace, "apply", "-f", "-"],
        input=pod_spec.encode(),
    )
    util.stubbornly(retries=3, delay_s=1).on(instance).exec(
        [
            "k8s",
            "kubectl",
            "-n",
            test_namespace,
            "wait",
            "--for=condition=ready",
            "pod",
            NVIDIA_CUDA_VECTOR_ADDITION_TEST_POD_NAME,
            "--timeout",
            "180s",
        ]
    )
1 change: 1 addition & 0 deletions tests/integration/tests/test_util/tags.py
@@ -6,6 +6,7 @@
PULL_REQUEST = "pull_request"
NIGHTLY = "nightly"
WEEKLY = "weekly"
GPU = "gpu"

TEST_LEVELS = [PULL_REQUEST, NIGHTLY, WEEKLY]
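A test opts into the new GPU tag by marking itself with it, exactly as the gpu-operator test above does; a minimal illustration (the test name here is hypothetical):

import pytest
from test_util import tags

@pytest.mark.tags(tags.GPU)
def test_needs_a_gpu():  # hypothetical test name for illustration
    ...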

73 changes: 73 additions & 0 deletions tests/integration/tests/test_util/util.py
@@ -7,6 +7,7 @@
import re
import shlex
import subprocess
import time
import urllib.request
from datetime import datetime
from functools import partial
@@ -554,3 +555,75 @@ def check_file_paths_exist(
        p: not (f"cannot access '{p}': No such file or directory" in process.stderr)
        for p in paths
    }


def get_os_version_id_for_instance(instance: harness.Instance) -> str:
    """Returns the version of the OS on the given harness Instance
    by reading the `VERSION_ID` from `/etc/os-release`.
    """
    proc = instance.exec(["cat", "/etc/os-release"], capture_output=True)

    release = None
    var = "VERSION_ID"
    for line in proc.stdout.split(b"\n"):
        line = line.decode()
        if line.startswith(f"{var}="):
            # The value is usually quoted (e.g. VERSION_ID="22.04"), so split
            # on '=' and strip any surrounding quotes to get the bare version.
            release = line.split("=", 1)[1].strip().strip('"')
            break

    if release is None:
        raise ValueError(
            f"Failed to parse OS release var '{var}' from OS release "
            f"info: {proc.stdout}"
        )

    return release
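As an illustration of the parsing above, an Ubuntu 22.04 host exposes a quoted `VERSION_ID` entry in `/etc/os-release`, which the helper reduces to the bare release string:

# Representative Ubuntu 22.04 line from /etc/os-release:
sample_line = 'VERSION_ID="22.04"'
release = sample_line.split("=", 1)[1].strip().strip('"')
assert release == "22.04"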


def wait_for_daemonset(
    instance: harness.Instance,
    name: str,
    namespace: str = "default",
    retry_times: int = 5,
    retry_delay_s: int = 60,
    expected_pods_ready: int = 1,
):
    """Waits for the daemonset with the given name to have at least
    `expected_pods_ready` pods ready."""
    proc = None
    for i in range(retry_times):
        # NOTE: we can't reliably use `rollout status` on Daemonsets unless
        # they have the `RollingUpdate` strategy, so we must go by the number
        # of pods which are Ready.
        proc = instance.exec(
            [
                "k8s",
                "kubectl",
                "-n",
                namespace,
                "get",
                "daemonset",
                name,
                "-o",
                "jsonpath={.status.numberReady}",
            ],
            check=True,
            capture_output=True,
        )
        if int(proc.stdout.decode()) >= expected_pods_ready:
            LOG.info(
                f"Successfully waited for daemonset '{name}' after "
                f"{(i+1)*retry_delay_s} seconds"
            )
            return

        LOG.info(
            f"Waiting {retry_delay_s} seconds for daemonset '{name}'.\n"
            f"code: {proc.returncode}\nstdout: {proc.stdout}\nstderr: {proc.stderr}"
        )
        time.sleep(retry_delay_s)

    raise AssertionError(
        f"Daemonset '{name}' failed to have at least {expected_pods_ready} "
        f"pod(s) ready after {retry_times} x {retry_delay_s} seconds."
    )
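For reference, the gpu-operator test above calls this helper once per core daemonset, roughly as follows (a usage sketch; `instance` is the `harness.Instance` the test runs against):

# Usage sketch: wait up to 15 x 60s for the driver daemonset in the
# gpu-operator namespace to report at least one ready pod.
wait_for_daemonset(
    instance,
    "nvidia-driver-daemonset",
    namespace="gpu-operator",
    retry_times=15,
    retry_delay_s=60,
)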
