diff --git a/e2e/gpu/gpu_test.go b/e2e/gpu/gpu_test.go index e44593632d..5278703ed8 100644 --- a/e2e/gpu/gpu_test.go +++ b/e2e/gpu/gpu_test.go @@ -6,7 +6,6 @@ package gpu import ( - "bytes" "context" "flag" "os" @@ -14,6 +13,7 @@ import ( "time" "github.com/edgelesssys/contrast/e2e/internal/contrasttest" + "github.com/edgelesssys/contrast/e2e/internal/kubeclient" "github.com/edgelesssys/contrast/internal/kuberesource" "github.com/edgelesssys/contrast/internal/manifest" "github.com/edgelesssys/contrast/internal/platforms" @@ -21,8 +21,8 @@ import ( ) const ( - gpuPodName = "gpu-pod" - gpuName = "NVIDIA H100 PCIe" + gpuDeploymentName = "gpu-tester" + gpuName = "NVIDIA H100 PCIe" ) // TestGPU runs e2e tests on an GPU-enabled Contrast. @@ -34,7 +34,7 @@ func TestGPU(t *testing.T) { runtimeHandler, err := manifest.RuntimeHandler(platform) require.NoError(t, err) - resources := kuberesource.OpenSSL() + resources := kuberesource.GPU() coordinator := kuberesource.CoordinatorBundle() resources = append(resources, coordinator...) @@ -52,31 +52,20 @@ func TestGPU(t *testing.T) { require.True(t, t.Run("contrast verify", ct.Verify), "contrast verify needs to succeed for subsequent tests") - applyGPUPod := func(t *testing.T) { - yaml, err := os.ReadFile("./e2e/gpu/testdata/gpu-pod.yaml") - require.NoError(t, err) - - yaml = bytes.ReplaceAll( - bytes.ReplaceAll(yaml, []byte("@@REPLACE_NAMESPACE@@"), []byte(ct.Namespace)), - []byte("@@REPLACE_RUNTIME@@"), []byte(ct.RuntimeClassName), - ) - - ct.ApplyFromYAML(t, yaml) - } - - require.True(t, t.Run("apply GPU pod", applyGPUPod), "GPU pod needs to deploy successfully for subsequent tests") - t.Run("check GPU availability", func(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), ct.FactorPlatformTimeout(5*time.Minute)) defer cancel() require := require.New(t) - err := ct.Kubeclient.WaitForPod(ctx, ct.Namespace, gpuPodName) - require.NoError(err, "GPU pod %s did not start", gpuPodName) + require.NoError(ct.Kubeclient.WaitFor(ctx, kubeclient.Ready, kubeclient.Deployment{}, ct.Namespace, gpuDeploymentName)) + + pods, err := ct.Kubeclient.PodsFromDeployment(ctx, ct.Namespace, gpuDeploymentName) + require.NoError(err) + require.Len(pods, 1, "pod not found: %s/%s", ct.Namespace, gpuDeploymentName) argv := []string{"/bin/sh", "-c", "nvidia-smi"} - stdout, stderr, err := ct.Kubeclient.Exec(ctx, ct.Namespace, gpuPodName, argv) + stdout, stderr, err := ct.Kubeclient.Exec(ctx, ct.Namespace, pods[0].Name, argv) require.NoError(err, "stderr: %q", stderr) require.Contains(stdout, gpuName, "nvidia-smi output should contain %s", gpuName) diff --git a/e2e/gpu/testdata/gpu-pod.yaml b/e2e/gpu/testdata/gpu-pod.yaml deleted file mode 100644 index 5fc9546c55..0000000000 --- a/e2e/gpu/testdata/gpu-pod.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# TODO(msanft): Move this to internal/kuberesource/sets.go as soon as genpolicy -# support for GPU pods is added. -apiVersion: v1 -kind: Pod -metadata: - name: gpu-pod - namespace: "@@REPLACE_NAMESPACE@@" - annotations: - # Allow-all policy - # TODO(msanft): Generate a policy dynamically once we support policy generation for GPU pods. - io.katacontainers.config.agent.policy: IyBDb3B5cmlnaHQgKGMpIDIwMjMgTWljcm9zb2Z0IENvcnBvcmF0aW9uCiMKIyBTUERYLUxpY2Vuc2UtSWRlbnRpZmllcjogQXBhY2hlLTIuMAojCgpwYWNrYWdlIGFnZW50X3BvbGljeQoKZGVmYXVsdCBBZGRBUlBOZWlnaGJvcnNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBBZGRTd2FwUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ2xvc2VTdGRpblJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IENvcHlGaWxlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlU2FuZGJveFJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IERlc3Ryb3lTYW5kYm94UmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgRXhlY1Byb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHZXRNZXRyaWNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgR2V0T09NRXZlbnRSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHdWVzdERldGFpbHNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBMaXN0SW50ZXJmYWNlc1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IExpc3RSb3V0ZXNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBNZW1Ib3RwbHVnQnlQcm9iZVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IE9ubGluZUNQVU1lbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFBhdXNlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUHVsbEltYWdlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUmVhZFN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZVN0YWxlVmlydGlvZnNTaGFyZU1vdW50c1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc2VlZFJhbmRvbURldlJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc3VtZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFNldEd1ZXN0RGF0ZVRpbWVSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTZXRQb2xpY3lSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTaWduYWxQcm9jZXNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgU3RhcnRDb250YWluZXJSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGFydFRyYWNpbmdSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGF0c0NvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFN0b3BUcmFjaW5nUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVHR5V2luUmVzaXplUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlRXBoZW1lcmFsTW91bnRzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlSW50ZXJmYWNlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlUm91dGVzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgV2FpdFByb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBXcml0ZVN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQo= - io.katacontainers.config.hypervisor.default_memory: "15258" - cdi.k8s.io/gpu: "nvidia.com/pgpu=0" -spec: - runtimeClassName: "@@REPLACE_RUNTIME@@" - restartPolicy: OnFailure - containers: - - name: vllm - image: ghcr.io/edgelesssys/contrast/ubuntu:24.04 - env: - - name: NVIDIA_VISIBLE_DEVICES - value: all - resources: - limits: - "nvidia.com/GH100_H100_PCIE": 1 diff --git a/internal/kuberesource/sets.go b/internal/kuberesource/sets.go index 904fd9f831..35cfbb6a30 100644 --- a/internal/kuberesource/sets.go +++ b/internal/kuberesource/sets.go @@ -743,3 +743,39 @@ done client, } } + +// GPU returns the resources for deploying a GPU test pod. +func GPU() []any { + tester := Deployment("gpu-tester", ""). + WithSpec(DeploymentSpec(). + WithReplicas(1). + WithSelector(LabelSelector(). + WithMatchLabels(map[string]string{"app.kubernetes.io/name": "gpu-tester"}), + ). + WithTemplate(PodTemplateSpec(). + WithLabels(map[string]string{"app.kubernetes.io/name": "gpu-tester"}). + WithAnnotations(map[string]string{ + "io.katacontainers.config.hypervisor.default_memory": "15258", + "cdi.k8s.io/gpu": "nvidia.com/pgpu=0", + }). + WithSpec(PodSpec(). + WithContainers( + Container(). + WithName("gpu-tester"). + WithImage("ghcr.io/edgelesssys/contrast/ubuntu:24.04"). + WithCommand("/bin/sh", "-c", "sleep inf"). + WithEnv(EnvVar(). + WithName("NVIDIA_VISIBLE_DEVICES").WithValue("all"), + ). + WithResources(ResourceRequirements(). + WithLimits(corev1.ResourceList{ + corev1.ResourceName("nvidia.com/GH100_H100_PCIE"): resource.MustParse("1"), + }), + ), + ), + ), + ), + ) + + return []any{tester} +}