diff --git a/.github/workflows/e2e_manual.yml b/.github/workflows/e2e_manual.yml index 71096e72cb..5cd50e6323 100644 --- a/.github/workflows/e2e_manual.yml +++ b/.github/workflows/e2e_manual.yml @@ -10,6 +10,7 @@ on: options: - genpolicy - getdents + - gpu - openssl - policy - regression @@ -24,6 +25,7 @@ on: options: - AKS-CLH-SNP - K3s-QEMU-SNP + - K3s-QEMU-SNP-GPU - K3s-QEMU-TDX skip-undeploy: description: "Skip undeploy" diff --git a/e2e/gpu/gpu_test.go b/e2e/gpu/gpu_test.go new file mode 100644 index 0000000000..e44593632d --- /dev/null +++ b/e2e/gpu/gpu_test.go @@ -0,0 +1,91 @@ +// Copyright 2024 Edgeless Systems GmbH +// SPDX-License-Identifier: AGPL-3.0-only + +//go:build e2e + +package gpu + +import ( + "bytes" + "context" + "flag" + "os" + "testing" + "time" + + "github.com/edgelesssys/contrast/e2e/internal/contrasttest" + "github.com/edgelesssys/contrast/internal/kuberesource" + "github.com/edgelesssys/contrast/internal/manifest" + "github.com/edgelesssys/contrast/internal/platforms" + "github.com/stretchr/testify/require" +) + +const ( + gpuPodName = "gpu-pod" + gpuName = "NVIDIA H100 PCIe" +) + +// TestGPU deploys the OpenSSL and coordinator resources plus a GPU pod on a GPU-enabled Contrast platform and checks that nvidia-smi in the pod reports the expected GPU. +func TestGPU(t *testing.T) { + platform, err := platforms.FromString(contrasttest.Flags.PlatformStr) + require.NoError(t, err) + ct := contrasttest.New(t) + + runtimeHandler, err := manifest.RuntimeHandler(platform) + require.NoError(t, err) + + resources := kuberesource.OpenSSL() + coordinator := kuberesource.CoordinatorBundle() + + resources = append(resources, coordinator...) 
+ + resources = kuberesource.PatchRuntimeHandlers(resources, runtimeHandler) + + resources = kuberesource.AddPortForwarders(resources) + + ct.Init(t, resources) + require.True(t, t.Run("generate", ct.Generate), "contrast generate needs to succeed for subsequent tests") + + require.True(t, t.Run("apply", ct.Apply), "Kubernetes resources need to be applied for subsequent tests") + + require.True(t, t.Run("set", ct.Set), "contrast set needs to succeed for subsequent tests") + + require.True(t, t.Run("contrast verify", ct.Verify), "contrast verify needs to succeed for subsequent tests") + + applyGPUPod := func(t *testing.T) { + yaml, err := os.ReadFile("./e2e/gpu/testdata/gpu-pod.yaml") + require.NoError(t, err) + + yaml = bytes.ReplaceAll( + bytes.ReplaceAll(yaml, []byte("@@REPLACE_NAMESPACE@@"), []byte(ct.Namespace)), + []byte("@@REPLACE_RUNTIME@@"), []byte(ct.RuntimeClassName), + ) + + ct.ApplyFromYAML(t, yaml) + } + + require.True(t, t.Run("apply GPU pod", applyGPUPod), "GPU pod needs to deploy successfully for subsequent tests") + + t.Run("check GPU availability", func(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), ct.FactorPlatformTimeout(5*time.Minute)) + defer cancel() + + require := require.New(t) + + err := ct.Kubeclient.WaitForPod(ctx, ct.Namespace, gpuPodName) + require.NoError(err, "GPU pod %s did not start", gpuPodName) + + argv := []string{"/bin/sh", "-c", "nvidia-smi"} + stdout, stderr, err := ct.Kubeclient.Exec(ctx, ct.Namespace, gpuPodName, argv) + require.NoError(err, "stderr: %q", stderr) + + require.Contains(stdout, gpuName, "nvidia-smi output should contain %s", gpuName) + }) +} + +func TestMain(m *testing.M) { + contrasttest.RegisterFlags() + flag.Parse() + + os.Exit(m.Run()) +} diff --git a/e2e/gpu/testdata/gpu-pod.yaml b/e2e/gpu/testdata/gpu-pod.yaml new file mode 100644 index 0000000000..5fc9546c55 --- /dev/null +++ b/e2e/gpu/testdata/gpu-pod.yaml @@ -0,0 +1,25 @@ +# TODO(msanft): Move this to 
internal/kuberesource/sets.go as soon as genpolicy +# support for GPU pods is added. +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod + namespace: "@@REPLACE_NAMESPACE@@" + annotations: + # Allow-all policy + # TODO(msanft): Generate a policy dynamically once we support policy generation for GPU pods. + io.katacontainers.config.agent.policy: IyBDb3B5cmlnaHQgKGMpIDIwMjMgTWljcm9zb2Z0IENvcnBvcmF0aW9uCiMKIyBTUERYLUxpY2Vuc2UtSWRlbnRpZmllcjogQXBhY2hlLTIuMAojCgpwYWNrYWdlIGFnZW50X3BvbGljeQoKZGVmYXVsdCBBZGRBUlBOZWlnaGJvcnNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBBZGRTd2FwUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ2xvc2VTdGRpblJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IENvcHlGaWxlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlU2FuZGJveFJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IERlc3Ryb3lTYW5kYm94UmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgRXhlY1Byb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHZXRNZXRyaWNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgR2V0T09NRXZlbnRSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHdWVzdERldGFpbHNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBMaXN0SW50ZXJmYWNlc1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IExpc3RSb3V0ZXNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBNZW1Ib3RwbHVnQnlQcm9iZVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IE9ubGluZUNQVU1lbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFBhdXNlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUHVsbEltYWdlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUmVhZFN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZVN0YWxlVmlydGlvZnNTaGFyZU1vdW50c1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc2VlZFJhbmRvbURldlJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc3VtZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFNldEd1ZXN0RGF0ZVRpbWVSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTZXRQb2xpY3lSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTaWduYWxQcm9jZXNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgU3RhcnRDb250YWluZXJSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGFydFRyYWNpbmdSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGF0c0NvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFN0b3BUcmFjaW5nUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVHR5V2luUmVzaXplUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlQ29udGFpbmV
yUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlRXBoZW1lcmFsTW91bnRzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlSW50ZXJmYWNlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlUm91dGVzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgV2FpdFByb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBXcml0ZVN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQo= + io.katacontainers.config.hypervisor.default_memory: "15258" + cdi.k8s.io/gpu: "nvidia.com/pgpu=0" +spec: + runtimeClassName: "@@REPLACE_RUNTIME@@" + restartPolicy: OnFailure + containers: + - name: vllm + image: ghcr.io/edgelesssys/contrast/ubuntu:24.04 + env: + - name: NVIDIA_VISIBLE_DEVICES + value: all + resources: + limits: + "nvidia.com/GH100_H100_PCIE": 1 diff --git a/e2e/internal/contrasttest/contrasttest.go b/e2e/internal/contrasttest/contrasttest.go index bf2fd53530..4b36dec336 100644 --- a/e2e/internal/contrasttest/contrasttest.go +++ b/e2e/internal/contrasttest/contrasttest.go @@ -61,6 +61,7 @@ type ContrastTest struct { ImageReplacementsFile string Platform platforms.Platform NamespaceFile string + RuntimeClassName string Kubeclient *kubeclient.Kubeclient // outputs of contrast subcommands @@ -70,8 +71,13 @@ type ContrastTest struct { // New creates a new contrasttest.T object bound to the given test. func New(t *testing.T) *ContrastTest { + require := require.New(t) + platform, err := platforms.FromString(Flags.PlatformStr) - require.NoError(t, err) + require.NoError(err) + + runtimeClass, err := kuberesource.ContrastRuntimeClass(platform) + require.NoError(err) return &ContrastTest{ Namespace: MakeNamespace(t, Flags.NamespaceSuffix), @@ -79,6 +85,7 @@ func New(t *testing.T) *ContrastTest { ImageReplacementsFile: Flags.ImageReplacementsFile, Platform: platform, NamespaceFile: Flags.NamespaceFile, + RuntimeClassName: *runtimeClass.Handler, Kubeclient: kubeclient.NewForTest(t), } } @@ -283,9 +290,15 @@ func patchReferenceValues(k *kubeclient.Kubeclient, platform platforms.Platform) // Apply the generated resources to the Kubernetes test environment. 
func (ct *ContrastTest) Apply(t *testing.T) { require := require.New(t) - yaml, err := os.ReadFile(path.Join(ct.WorkDir, "resources.yml")) require.NoError(err) + ct.ApplyFromYAML(t, yaml) +} + +// ApplyFromYAML applies the given YAML to the Kubernetes test environment. +func (ct *ContrastTest) ApplyFromYAML(t *testing.T, yaml []byte) { + require := require.New(t) + objects, err := kubeapi.UnmarshalUnstructuredK8SResource(yaml) require.NoError(err) diff --git a/justfile b/justfile index 142c05858d..ab224b8f5a 100644 --- a/justfile +++ b/justfile @@ -57,6 +57,7 @@ node-installer platform=default_platform: ;; "Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP-GPU") just push "nydus-snapshotter" + just push "nydus-pull" just push "node-installer-kata-gpu" ;; "AKS-PEER-SNP") diff --git a/packages/by-name/contrast/package.nix b/packages/by-name/contrast/package.nix index 2bc395a5a7..a3448a4cfb 100644 --- a/packages/by-name/contrast/package.nix +++ b/packages/by-name/contrast/package.nix @@ -32,6 +32,7 @@ let subPackages = [ "e2e/genpolicy" "e2e/getdents" + "e2e/gpu" "e2e/openssl" "e2e/servicemesh" "e2e/release" @@ -81,35 +82,45 @@ let ]; }; - snpRefVals = { - snp = - let - launch-digest = - if kata.contrast-node-installer-image.debugRuntime then - kata.snp-launch-digest.override { debug = true; } - else - kata.snp-launch-digest; - in - [ - { - trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/milan.hex"); - productName = "Milan"; - } - { - trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/genoa.hex"); - productName = "Genoa"; - } - ]; - }; + snpRefValsWith = + { + gpu, + }: + { + snp = + let + os-image = + if gpu then + kata.contrast-node-installer-image.gpu.os-image + else + kata.contrast-node-installer-image.os-image; + launch-digest = kata.snp-launch-digest.override { + inherit os-image; + debug = kata.contrast-node-installer-image.debugRuntime; + }; + in + [ + { + trustedMeasurement = lib.removeSuffix "\n" 
(builtins.readFile "${launch-digest}/milan.hex"); + productName = "Milan"; + } + { + trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/genoa.hex"); + productName = "Genoa"; + } + ]; + }; + + snpRefVals = snpRefValsWith { gpu = false; }; + snpGpuRefVals = snpRefValsWith { gpu = true; }; + tdxRefVals = { tdx = [ ( let - launch-digests = - if kata.contrast-node-installer-image.debugRuntime then - kata.tdx-launch-digests.override { debug = true; } - else - kata.tdx-launch-digests; + launch-digests = kata.tdx-launch-digests.override { + debug = kata.contrast-node-installer-image.debugRuntime; + }; in { mrTd = builtins.readFile "${launch-digests}/mrtd.hex"; @@ -135,9 +146,9 @@ let "${k3s-qemu-tdx-handler}" = tdxRefVals; "${rke2-qemu-tdx-handler}" = tdxRefVals; "${metal-qemu-snp-handler}" = snpRefVals; - "${metal-qemu-snp-gpu-handler}" = snpRefVals; + "${metal-qemu-snp-gpu-handler}" = snpGpuRefVals; "${k3s-qemu-snp-handler}" = snpRefVals; - "${k3s-qemu-snp-gpu-handler}" = snpRefVals; + "${k3s-qemu-snp-gpu-handler}" = snpGpuRefVals; } );