-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This adds an E2E test for GPU use on Contrast. It currently runs on the GPU-enabled bare-metal SNP runner. The test currently only verifies that the GPU is available via nvidia-smi, which also verifies that driver and CUDA work correctly.
- Loading branch information
Showing
6 changed files
with
172 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
// Copyright 2024 Edgeless Systems GmbH | ||
// SPDX-License-Identifier: AGPL-3.0-only | ||
|
||
//go:build e2e | ||
|
||
package gpu | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"flag" | ||
"os" | ||
"testing" | ||
"time" | ||
|
||
"github.com/edgelesssys/contrast/e2e/internal/contrasttest" | ||
"github.com/edgelesssys/contrast/internal/kuberesource" | ||
"github.com/edgelesssys/contrast/internal/manifest" | ||
"github.com/edgelesssys/contrast/internal/platforms" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
const (
	// gpuPodName is the name of the pod that TestGPU waits for and executes
	// nvidia-smi in.
	gpuPodName = "gpu-pod"
	// gpuName is the string that TestGPU expects to find in the nvidia-smi
	// output.
	gpuName = "NVIDIA H100 PCIe"
)
|
||
// TestGPU runs e2e tests on an GPU-enabled Contrast. | ||
func TestGPU(t *testing.T) { | ||
platform, err := platforms.FromString(contrasttest.Flags.PlatformStr) | ||
require.NoError(t, err) | ||
ct := contrasttest.New(t) | ||
|
||
runtimeHandler, err := manifest.RuntimeHandler(platform) | ||
require.NoError(t, err) | ||
|
||
resources := kuberesource.OpenSSL() | ||
coordinator := kuberesource.CoordinatorBundle() | ||
|
||
resources = append(resources, coordinator...) | ||
|
||
resources = kuberesource.PatchRuntimeHandlers(resources, runtimeHandler) | ||
|
||
resources = kuberesource.AddPortForwarders(resources) | ||
|
||
ct.Init(t, resources) | ||
require.True(t, t.Run("generate", ct.Generate), "contrast generate needs to succeed for subsequent tests") | ||
|
||
require.True(t, t.Run("apply", ct.Apply), "Kubernetes resources need to be applied for subsequent tests") | ||
|
||
require.True(t, t.Run("set", ct.Set), "contrast set needs to succeed for subsequent tests") | ||
|
||
require.True(t, t.Run("contrast verify", ct.Verify), "contrast verify needs to succeed for subsequent tests") | ||
|
||
applyGPUPod := func(t *testing.T) { | ||
yaml, err := os.ReadFile("./e2e/gpu/testdata/gpu-pod.yaml") | ||
require.NoError(t, err) | ||
|
||
yaml = bytes.ReplaceAll( | ||
bytes.ReplaceAll(yaml, []byte("@@REPLACE_NAMESPACE@@"), []byte(ct.Namespace)), | ||
[]byte("@@REPLACE_RUNTIME@@"), []byte(ct.RuntimeClassName), | ||
) | ||
|
||
ct.ApplyFromYAML(t, yaml) | ||
} | ||
|
||
require.True(t, t.Run("apply GPU pod", applyGPUPod), "GPU pod needs to deploy successfully for subsequent tests") | ||
|
||
t.Run("check GPU availability", func(t *testing.T) { | ||
ctx, cancel := context.WithTimeout(context.Background(), ct.FactorPlatformTimeout(5*time.Minute)) | ||
defer cancel() | ||
|
||
require := require.New(t) | ||
|
||
err := ct.Kubeclient.WaitForPod(ctx, ct.Namespace, gpuPodName) | ||
require.NoError(err, "GPU pod %s did not start", gpuPodName) | ||
|
||
argv := []string{"/bin/sh", "-c", "nvidia-smi"} | ||
stdout, stderr, err := ct.Kubeclient.Exec(ctx, ct.Namespace, gpuPodName, argv) | ||
require.NoError(err, "stderr: %q", stderr) | ||
|
||
require.Contains(stdout, gpuName, "nvidia-smi output should contain %s", gpuName) | ||
}) | ||
} | ||
|
||
func TestMain(m *testing.M) { | ||
contrasttest.RegisterFlags() | ||
flag.Parse() | ||
|
||
os.Exit(m.Run()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# TODO(msanft): Move this to internal/kuberesource/sets.go as soon as genpolicy | ||
# support for GPU pods is added. | ||
apiVersion: v1 | ||
kind: Pod | ||
metadata: | ||
name: gpu-pod | ||
namespace: "@@REPLACE_NAMESPACE@@" | ||
annotations: | ||
# Allow-all policy: base64-encoded Rego (package agent_policy) that defaults every agent request to true. | ||
# TODO(msanft): Generate a policy dynamically once we support policy generation for GPU pods. | ||
io.katacontainers.config.agent.policy: IyBDb3B5cmlnaHQgKGMpIDIwMjMgTWljcm9zb2Z0IENvcnBvcmF0aW9uCiMKIyBTUERYLUxpY2Vuc2UtSWRlbnRpZmllcjogQXBhY2hlLTIuMAojCgpwYWNrYWdlIGFnZW50X3BvbGljeQoKZGVmYXVsdCBBZGRBUlBOZWlnaGJvcnNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBBZGRTd2FwUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ2xvc2VTdGRpblJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IENvcHlGaWxlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlU2FuZGJveFJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IERlc3Ryb3lTYW5kYm94UmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgRXhlY1Byb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHZXRNZXRyaWNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgR2V0T09NRXZlbnRSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHdWVzdERldGFpbHNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBMaXN0SW50ZXJmYWNlc1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IExpc3RSb3V0ZXNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBNZW1Ib3RwbHVnQnlQcm9iZVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IE9ubGluZUNQVU1lbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFBhdXNlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUHVsbEltYWdlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUmVhZFN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZVN0YWxlVmlydGlvZnNTaGFyZU1vdW50c1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc2VlZFJhbmRvbURldlJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc3VtZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFNldEd1ZXN0RGF0ZVRpbWVSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTZXRQb2xpY3lSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTaWduYWxQcm9jZXNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgU3RhcnRDb250YWluZXJSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGFydFRyYWNpbmdSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGF0c0NvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFN0b3BUcmFjaW5nUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVHR5V2luUmVzaXplUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlRXBoZW1lcmFsTW91bnRzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlSW50ZXJmYWNlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlUm91dGVzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgV2FpdFByb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBXcml0ZVN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQo= | ||
io.katacontainers.config.hypervisor.default_memory: "15258" | ||
cdi.k8s.io/gpu: "nvidia.com/pgpu=0" | ||
spec: | ||
runtimeClassName: "@@REPLACE_RUNTIME@@" | ||
restartPolicy: OnFailure | ||
containers: | ||
- name: vllm | ||
image: ghcr.io/edgelesssys/contrast/ubuntu:24.04 | ||
env: | ||
- name: NVIDIA_VISIBLE_DEVICES | ||
value: all | ||
resources: | ||
limits: | ||
"nvidia.com/GH100_H100_PCIE": 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters