From 93683c805e51eaf8dc64e6a08aa0c897b71a5646 Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:25:12 +0100 Subject: [PATCH] packages/nixos: bare-metal Kata GPU support This adds the necessary bits to facilitate GPU support in bare-metal Kata deployments to our NixOS image build. --- .../by-name/kata/kata-runtime/package.nix | 10 ++-- packages/nixos/gpu.nix | 50 ++++++++++++++++++- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/packages/by-name/kata/kata-runtime/package.nix b/packages/by-name/kata/kata-runtime/package.nix index d9aad658df..690bf5a7f4 100644 --- a/packages/by-name/kata/kata-runtime/package.nix +++ b/packages/by-name/kata/kata-runtime/package.nix @@ -180,13 +180,9 @@ buildGoModule rec { # is used when Kata starts a VM. # For example, this command should do the job: # `journalctl -t kata -l --no-pager | grep launching | tail -1` - passthru = { - inherit src; - - cmdline = { - default = "tsc=reliable no_timer_check rcupdate.rcu_expedited=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1 noreplace-smp reboot=k cryptomgr.notests net.ifnames=0 pci=lastbus=0 root=/dev/vda1 rootflags=ro rootfstype=erofs console=hvc0 console=hvc1 quiet systemd.show_status=false panic=1 nr_cpus=1 selinux=0 systemd.unit=kata-containers.target systemd.mask=systemd-networkd.service systemd.mask=systemd-networkd.socket scsi_mod.scan=none"; - debug = "tsc=reliable no_timer_check rcupdate.rcu_expedited=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1 noreplace-smp reboot=k cryptomgr.notests net.ifnames=0 pci=lastbus=0 root=/dev/vda1 rootflags=ro rootfstype=erofs console=hvc0 console=hvc1 debug systemd.show_status=true systemd.log_level=debug panic=1 nr_cpus=1 selinux=0 systemd.unit=kata-containers.target systemd.mask=systemd-networkd.service systemd.mask=systemd-networkd.socket scsi_mod.scan=none agent.log=debug agent.debug_console agent.debug_console_vport=1026"; - }; + 
passthru.cmdline = { + default = "tsc=reliable no_timer_check rcupdate.rcu_expedited=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1 noreplace-smp reboot=k cryptomgr.notests net.ifnames=0 pci=lastbus=0 root=/dev/vda1 rootflags=ro rootfstype=erofs console=hvc0 console=hvc1 quiet systemd.show_status=false panic=1 nr_cpus=1 selinux=0 systemd.unit=kata-containers.target systemd.mask=systemd-networkd.service systemd.mask=systemd-networkd.socket scsi_mod.scan=none"; + debug = "tsc=reliable no_timer_check rcupdate.rcu_expedited=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1 noreplace-smp reboot=k cryptomgr.notests net.ifnames=0 pci=lastbus=0 root=/dev/vda1 rootflags=ro rootfstype=erofs console=hvc0 console=hvc1 debug systemd.show_status=true systemd.log_level=debug panic=1 nr_cpus=1 selinux=0 systemd.unit=kata-containers.target systemd.mask=systemd-networkd.service systemd.mask=systemd-networkd.socket scsi_mod.scan=none agent.log=debug agent.debug_console agent.debug_console_vport=1026"; }; meta.mainProgram = "containerd-shim-kata-v2"; diff --git a/packages/nixos/gpu.nix b/packages/nixos/gpu.nix index 7bcf738519..79bce53781 100644 --- a/packages/nixos/gpu.nix +++ b/packages/nixos/gpu.nix @@ -71,6 +71,34 @@ let }); }; }); + + # nix-store-mount-hook mounts the VM's nix store into the container. + # TODO(burgerdev): only do that for containers that actually get a GPU device. + nix-store-mount-hook = pkgs.writeShellApplication { + name = "nix-store-mount-hook"; + runtimeInputs = with pkgs; [ + coreutils + util-linux + jq + ]; + text = '' + # Reads from the state JSON supplied on stdin. 
+ bundle="$(jq -r .bundle)" + rootfs="$bundle/rootfs" + id="$(basename "$bundle")" + + lower=/nix/store + target="$rootfs$lower" + mkdir -p "$target" + + overlays="/run/kata-containers/nix-overlays/$id" + upperdir="$overlays/upperdir" + workdir="$overlays/workdir" + mkdir -p "$upperdir" "$workdir" + + mount -t overlay -o "lowerdir=$lower:$target,upperdir=$upperdir,workdir=$workdir" none "$target" + ''; + }; in { @@ -90,6 +118,19 @@ in videoAcceleration = false; }; + # Configure nvidia-persistenced for use with CC GPUs (e.g. H100). + # TODO(msanft): This needs to be adjusted for non-CC-GPUs. + # See: https://docs.nvidia.com/cc-deployment-guide-snp.pdf (pages 23 and 24) + systemd.services."nvidia-persistenced" = { + wantedBy = [ "kata-containers.target" ]; + serviceConfig.ExecStart = lib.mkForce "${lib.getExe config.hardware.nvidia.package.persistenced} --uvm-persistence-mode --verbose"; + }; + + # kata-containers.target needs to pull this in so that we get a valid + # CDI configuration inside the PodVM. This is not strictly necessary yet, as we + # currently use the legacy mode, but it will be once we switch to CDI. 
+ systemd.services."nvidia-container-toolkit-cdi-generator".wantedBy = [ "kata-containers.target" ]; + hardware.nvidia-container-toolkit.enable = true; # Make NVIDIA the "default" graphics driver to replace Mesa, @@ -97,8 +138,13 @@ in hardware.graphics.package = nvidiaPackage; hardware.graphics.package32 = nvidiaPackage; - image.repart.partitions."10-root".contents."/usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh".source = - lib.getExe pkgs.nvidia-ctk-oci-hook; + image.repart.partitions."10-root".contents = { + "/usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh".source = + lib.getExe pkgs.nvidia-ctk-oci-hook; + "/usr/share/oci/hooks/prestart/nix-store-mount-hook.sh".source = lib.getExe nix-store-mount-hook; + }; + + environment.systemPackages = [ pkgs.nvidia-ctk-with-config ]; boot.initrd.kernelModules = [ # Extra kernel modules required to talk to the GPU in CC-Mode.