Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GPU E2E Test #1136

Open
wants to merge 1 commit into
base: msanft/gpu-runtime-class
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/e2e_manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ on:
options:
- genpolicy
- getdents
- gpu
- openssl
- policy
- regression
Expand All @@ -24,6 +25,7 @@ on:
options:
- AKS-CLH-SNP
- K3s-QEMU-SNP
- K3s-QEMU-SNP-GPU
- K3s-QEMU-TDX
skip-undeploy:
description: "Skip undeploy"
Expand Down
91 changes: 91 additions & 0 deletions e2e/gpu/gpu_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Copyright 2024 Edgeless Systems GmbH
// SPDX-License-Identifier: AGPL-3.0-only

//go:build e2e

package gpu

import (
"bytes"
"context"
"flag"
"os"
"testing"
"time"

"github.com/edgelesssys/contrast/e2e/internal/contrasttest"
"github.com/edgelesssys/contrast/internal/kuberesource"
"github.com/edgelesssys/contrast/internal/manifest"
"github.com/edgelesssys/contrast/internal/platforms"
"github.com/stretchr/testify/require"
)

const (
gpuPodName = "gpu-pod"
gpuName = "NVIDIA H100 PCIe"
)

// TestGPU runs e2e tests on a GPU-enabled Contrast deployment.
//
// It deploys the OpenSSL resources together with the Coordinator bundle,
// runs the generate/apply/set/verify steps, applies the GPU pod from the
// testdata directory and finally checks that the output of nvidia-smi inside
// that pod contains gpuName.
func TestGPU(t *testing.T) {
	platform, err := platforms.FromString(contrasttest.Flags.PlatformStr)
	require.NoError(t, err)
	ct := contrasttest.New(t)

	handler, err := manifest.RuntimeHandler(platform)
	require.NoError(t, err)

	deployment := append(kuberesource.OpenSSL(), kuberesource.CoordinatorBundle()...)
	deployment = kuberesource.PatchRuntimeHandlers(deployment, handler)
	deployment = kuberesource.AddPortForwarders(deployment)

	ct.Init(t, deployment)

	// Every setup step is a prerequisite for the steps after it, so the
	// whole test is aborted as soon as one of them fails.
	setupSteps := []struct {
		name    string
		run     func(*testing.T)
		failMsg string
	}{
		{"generate", ct.Generate, "contrast generate needs to succeed for subsequent tests"},
		{"apply", ct.Apply, "Kubernetes resources need to be applied for subsequent tests"},
		{"set", ct.Set, "contrast set needs to succeed for subsequent tests"},
		{"contrast verify", ct.Verify, "contrast verify needs to succeed for subsequent tests"},
	}
	for _, step := range setupSteps {
		require.True(t, t.Run(step.name, step.run), step.failMsg)
	}

	deployGPUPod := func(t *testing.T) {
		podYAML, err := os.ReadFile("./e2e/gpu/testdata/gpu-pod.yaml")
		require.NoError(t, err)

		// Fill in the placeholders of the pod template: namespace first,
		// then the runtime class.
		podYAML = bytes.ReplaceAll(podYAML, []byte("@@REPLACE_NAMESPACE@@"), []byte(ct.Namespace))
		podYAML = bytes.ReplaceAll(podYAML, []byte("@@REPLACE_RUNTIME@@"), []byte(ct.RuntimeClassName))

		ct.ApplyFromYAML(t, podYAML)
	}
	require.True(t, t.Run("apply GPU pod", deployGPUPod), "GPU pod needs to deploy successfully for subsequent tests")

	t.Run("check GPU availability", func(t *testing.T) {
		timeout := ct.FactorPlatformTimeout(5 * time.Minute)
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		defer cancel()

		req := require.New(t)

		req.NoError(
			ct.Kubeclient.WaitForPod(ctx, ct.Namespace, gpuPodName),
			"GPU pod %s did not start", gpuPodName,
		)

		// Run nvidia-smi inside the pod and look for the expected GPU
		// name in its standard output.
		cmd := []string{"/bin/sh", "-c", "nvidia-smi"}
		stdout, stderr, err := ct.Kubeclient.Exec(ctx, ct.Namespace, gpuPodName, cmd)
		req.NoError(err, "stderr: %q", stderr)

		req.Contains(stdout, gpuName, "nvidia-smi output should contain %s", gpuName)
	})
}

// TestMain registers the contrasttest command-line flags and parses them
// before running the tests of this package. The process exits with the code
// returned by m.Run.
func TestMain(m *testing.M) {
	contrasttest.RegisterFlags()
	flag.Parse()

	exitCode := m.Run()
	os.Exit(exitCode)
}
25 changes: 25 additions & 0 deletions e2e/gpu/testdata/gpu-pod.yaml
msanft marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# TODO(msanft): Move this to internal/kuberesource/sets.go as soon as genpolicy
# support for GPU pods is added.
apiVersion: v1
kind: Pod
metadata:
name: gpu-pod
namespace: "@@REPLACE_NAMESPACE@@"
annotations:
# Allow-all policy
# TODO(msanft): Generate a policy dynamically once we support policy generation for GPU pods.
io.katacontainers.config.agent.policy: IyBDb3B5cmlnaHQgKGMpIDIwMjMgTWljcm9zb2Z0IENvcnBvcmF0aW9uCiMKIyBTUERYLUxpY2Vuc2UtSWRlbnRpZmllcjogQXBhY2hlLTIuMAojCgpwYWNrYWdlIGFnZW50X3BvbGljeQoKZGVmYXVsdCBBZGRBUlBOZWlnaGJvcnNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBBZGRTd2FwUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ2xvc2VTdGRpblJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IENvcHlGaWxlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlU2FuZGJveFJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IERlc3Ryb3lTYW5kYm94UmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgRXhlY1Byb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHZXRNZXRyaWNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgR2V0T09NRXZlbnRSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHdWVzdERldGFpbHNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBMaXN0SW50ZXJmYWNlc1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IExpc3RSb3V0ZXNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBNZW1Ib3RwbHVnQnlQcm9iZVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IE9ubGluZUNQVU1lbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFBhdXNlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUHVsbEltYWdlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUmVhZFN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZVN0YWxlVmlydGlvZnNTaGFyZU1vdW50c1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc2VlZFJhbmRvbURldlJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc3VtZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFNldEd1ZXN0RGF0ZVRpbWVSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTZXRQb2xpY3lSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTaWduYWxQcm9jZXNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgU3RhcnRDb250YWluZXJSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGFydFRyYWNpbmdSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGF0c0NvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFN0b3BUcmFjaW5nUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVHR5V2luUmVzaXplUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlRXBoZW1lcmFsTW91bnRzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlSW50ZXJmYWNlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlUm91dGVzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgV2FpdFByb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBXcml0ZVN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQo=
io.katacontainers.config.hypervisor.default_memory: "15258"
cdi.k8s.io/gpu: "nvidia.com/pgpu=0"
spec:
runtimeClassName: "@@REPLACE_RUNTIME@@"
restartPolicy: OnFailure
containers:
- name: vllm
image: ghcr.io/edgelesssys/contrast/ubuntu:24.04
env:
- name: NVIDIA_VISIBLE_DEVICES
value: all
resources:
limits:
"nvidia.com/GH100_H100_PCIE": 1
17 changes: 15 additions & 2 deletions e2e/internal/contrasttest/contrasttest.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ type ContrastTest struct {
ImageReplacementsFile string
Platform platforms.Platform
NamespaceFile string
RuntimeClassName string
Kubeclient *kubeclient.Kubeclient

// outputs of contrast subcommands
Expand All @@ -70,15 +71,21 @@ type ContrastTest struct {

// New creates a new ContrastTest bound to the given test.
//
// The platform is parsed from Flags.PlatformStr and the runtime class name is
// taken from the handler of the Contrast runtime class for that platform. The
// test fails immediately if the platform string cannot be parsed, if the
// runtime class cannot be determined, or if the runtime class has no handler.
func New(t *testing.T) *ContrastTest {
	// Shadows the require package for the rest of the function, so all
	// assertions below use the method form without passing t.
	require := require.New(t)

	platform, err := platforms.FromString(Flags.PlatformStr)
	require.NoError(err)

	runtimeClass, err := kuberesource.ContrastRuntimeClass(platform)
	require.NoError(err)
	// Handler is dereferenced below; fail with a readable message instead of
	// panicking on a nil pointer.
	require.NotNil(runtimeClass.Handler, "runtime class for platform %q has no handler", Flags.PlatformStr)

	return &ContrastTest{
		Namespace:             MakeNamespace(t, Flags.NamespaceSuffix),
		WorkDir:               t.TempDir(),
		ImageReplacementsFile: Flags.ImageReplacementsFile,
		Platform:              platform,
		NamespaceFile:         Flags.NamespaceFile,
		RuntimeClassName:      *runtimeClass.Handler,
		Kubeclient:            kubeclient.NewForTest(t),
	}
}
Expand Down Expand Up @@ -283,9 +290,15 @@ func patchReferenceValues(k *kubeclient.Kubeclient, platform platforms.Platform)
// Apply reads resources.yml from the test's working directory and applies its
// contents to the Kubernetes test environment via ApplyFromYAML. The test
// fails if the file cannot be read.
func (ct *ContrastTest) Apply(t *testing.T) {
	resourcesPath := path.Join(ct.WorkDir, "resources.yml")

	resourceYAML, err := os.ReadFile(resourcesPath)
	require.NoError(t, err)

	ct.ApplyFromYAML(t, resourceYAML)
}

// ApplyFromYAML applies the given YAML to the Kubernetes test environment.
func (ct *ContrastTest) ApplyFromYAML(t *testing.T, yaml []byte) {
require := require.New(t)

objects, err := kubeapi.UnmarshalUnstructuredK8SResource(yaml)
require.NoError(err)

Expand Down
1 change: 1 addition & 0 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ node-installer platform=default_platform:
;;
"Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP-GPU")
just push "nydus-snapshotter"
just push "nydus-pull"
just push "node-installer-kata-gpu"
;;
"AKS-PEER-SNP")
Expand Down
65 changes: 38 additions & 27 deletions packages/by-name/contrast/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ let
subPackages = [
"e2e/genpolicy"
"e2e/getdents"
"e2e/gpu"
"e2e/openssl"
"e2e/servicemesh"
"e2e/release"
Expand Down Expand Up @@ -81,35 +82,45 @@ let
];
};

snpRefVals = {
snp =
let
launch-digest =
if kata.contrast-node-installer-image.debugRuntime then
kata.snp-launch-digest.override { debug = true; }
else
kata.snp-launch-digest;
in
[
{
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/milan.hex");
productName = "Milan";
}
{
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/genoa.hex");
productName = "Genoa";
}
];
};
# snpRefValsWith builds the SNP reference values: a list with one trusted
# measurement each for the "Milan" and "Genoa" products.
# The `gpu` argument selects which OS image the launch digest is computed
# for: the GPU OS image when true, the default OS image otherwise.
snpRefValsWith =
{
gpu,
}:
{
snp =
let
os-image =
if gpu then
kata.contrast-node-installer-image.gpu.os-image
else
kata.contrast-node-installer-image.os-image;
# The launch digest is overridden with the selected OS image and with the
# debug setting of the node installer image.
launch-digest = kata.snp-launch-digest.override {
inherit os-image;
debug = kata.contrast-node-installer-image.debugRuntime;
};
in
[
{
# The trailing newline of the hex file is stripped from the measurement.
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/milan.hex");
productName = "Milan";
}
{
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/genoa.hex");
productName = "Genoa";
}
];
};

snpRefVals = snpRefValsWith { gpu = false; };
snpGpuRefVals = snpRefValsWith { gpu = true; };

tdxRefVals = {
tdx = [
(
let
launch-digests =
if kata.contrast-node-installer-image.debugRuntime then
kata.tdx-launch-digests.override { debug = true; }
else
kata.tdx-launch-digests;
launch-digests = kata.tdx-launch-digests.override {
debug = kata.contrast-node-installer-image.debugRuntime;
};
in
{
mrTd = builtins.readFile "${launch-digests}/mrtd.hex";
Expand All @@ -135,9 +146,9 @@ let
"${k3s-qemu-tdx-handler}" = tdxRefVals;
"${rke2-qemu-tdx-handler}" = tdxRefVals;
"${metal-qemu-snp-handler}" = snpRefVals;
"${metal-qemu-snp-gpu-handler}" = snpRefVals;
"${metal-qemu-snp-gpu-handler}" = snpGpuRefVals;
"${k3s-qemu-snp-handler}" = snpRefVals;
"${k3s-qemu-snp-gpu-handler}" = snpRefVals;
"${k3s-qemu-snp-gpu-handler}" = snpGpuRefVals;
}
);

Expand Down