From 7685fcfc67a33ccf91c97f71cd5300f872ba8a1d Mon Sep 17 00:00:00 2001 From: William Zhao Date: Thu, 3 Aug 2023 18:08:43 -0400 Subject: [PATCH] Kernel params set by the config daemon should be verified In the case of vfio-pci, the config daemon might not notice it has to reboot to add kernel parameters. This could happen if the vfio driver state was updated but the config daemon was interrupted (e.g policy was removed and readded quickly). This could lead to a pending change to the kernel parameters via grubby or os-tree but without a reboot, these changes wouldn't be applied. This change introduces a map to hold the desired kernel parameters that the config daemon wishes to apply. This map is then validated against "/proc/cmdline" to see if the desired kernel parameters are actually applied. If not then there will be a subsequent attempt to apply these parameters. Signed-off-by: William Zhao --- pkg/plugins/generic/generic_plugin.go | 75 +++++++++++++++++++-------- pkg/utils/utils.go | 25 +++++++++ 2 files changed, 78 insertions(+), 22 deletions(-) diff --git a/pkg/plugins/generic/generic_plugin.go b/pkg/plugins/generic/generic_plugin.go index 20c759fc1..3502e422f 100644 --- a/pkg/plugins/generic/generic_plugin.go +++ b/pkg/plugins/generic/generic_plugin.go @@ -26,6 +26,7 @@ type GenericPlugin struct { LastState *sriovnetworkv1.SriovNetworkNodeState LoadVfioDriver uint LoadVirtioVdpaDriver uint + DesiredKernelParams map[string]uint RunningOnHost bool HostManager host.HostManagerInterface } @@ -45,6 +46,7 @@ func NewGenericPlugin(runningOnHost bool) (plugin.VendorPlugin, error) { SpecVersion: "1.0", LoadVfioDriver: unloaded, LoadVirtioVdpaDriver: unloaded, + DesiredKernelParams: make(map[string]uint), RunningOnHost: runningOnHost, HostManager: host.NewHostManager(runningOnHost), }, nil @@ -69,7 +71,7 @@ func (p *GenericPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt p.DesireState = new needDrain = needDrainNode(new.Spec.Interfaces, new.Status.Interfaces) - needReboot = needRebootNode(new, &p.LoadVfioDriver, &p.LoadVirtioVdpaDriver) + needReboot = p.needRebootNode(new) if needReboot { needDrain = true @@ -151,28 +153,28 @@ func needVirtioVdpaDriver(state *sriovnetworkv1.SriovNetworkNodeState) bool { return false } -func tryEnableIommuInKernelArgs() (bool, error) { - glog.Info("generic-plugin tryEnableIommuInKernelArgs()") - args := [2]string{"intel_iommu=on", "iommu=pt"} +// trySetKernelParam Tries to add the kernel param via ostree or grubby. +func trySetKernelParam(kparam string) (bool, error) { + glog.Info("generic-plugin trySetKernelParam()") var stdout, stderr bytes.Buffer - cmd := exec.Command("/bin/sh", scriptsPath, args[0], args[1]) + cmd := exec.Command("/bin/sh", scriptsPath, kparam) cmd.Stdout = &stdout cmd.Stderr = &stderr if err := cmd.Run(); err != nil { // if grubby is not there log and assume kernel args are set correctly. if isCommandNotFound(err) { - glog.Error("generic-plugin tryEnableIommuInKernelArgs(): grubby command not found. Please ensure that kernel args intel_iommu=on iommu=pt are set") + glog.Errorf("generic-plugin trySetKernelParam(): grubby or ostree command not found. Please ensure that kernel param %s are set", kparam) return false, nil } - glog.Errorf("generic-plugin tryEnableIommuInKernelArgs(): fail to enable iommu %s: %v", args, err) + glog.Errorf("generic-plugin trySetKernelParam(): fail to enable kernel param %s: %v", kparam, err) return false, err } i, err := strconv.Atoi(strings.TrimSpace(stdout.String())) if err == nil { if i > 0 { - glog.Infof("generic-plugin tryEnableIommuInKernelArgs(): need to reboot node") + glog.Infof("generic-plugin trySetKernelParam(): need to reboot node for kernel param %s", kparam) return true, nil } } @@ -188,6 +190,41 @@ func isCommandNotFound(err error) bool { return false } +// AddToDesiredKernelParams Should be called to queue a kernel param to be added to the node. +func (p *GenericPlugin) AddToDesiredKernelParams(kparam string) { + if _, ok := p.DesiredKernelParams[kparam]; !ok { + glog.Infof("generic-plugin AddToDesiredKernelParams(): Adding %s to desired kernel params", kparam) + // element "uint" is a counter of number of attempts to set the kernel param + p.DesiredKernelParams[kparam] = 0 + } +} + +// SetAllDesiredKernelParams Should be called to set all the kernel parameters. Returns true if reboot of the node is needed. +func (p *GenericPlugin) SetAllDesiredKernelParams() bool { + needReboot := false + for kparam, attempts := range p.DesiredKernelParams { + if !utils.IsKernelCmdLineParamSet(kparam, false) { + if attempts > 0 && attempts <= 4 { + glog.Errorf("generic-plugin SetAllDesiredKernelParams(): Fail to set kernel param %s after reboot with attempts %d", kparam, attempts) + } else if attempts > 4 { + // If we tried several times and was unsuccessful, we should give up. + continue + } + update, err := trySetKernelParam(kparam) + if err != nil { + glog.Errorf("generic-plugin SetAllDesiredKernelParams(): Fail to set kernel param %s: %v", kparam, err) + } + if update { + glog.V(2).Infof("generic-plugin SetAllDesiredKernelParams(): Need reboot for setting kernel param %s", kparam) + } + // Update the number of attempts we tried to set the kernel parameter. + p.DesiredKernelParams[kparam]++ + needReboot = needReboot || update + } + } + return needReboot +} + func needDrainNode(desired sriovnetworkv1.Interfaces, current sriovnetworkv1.InterfaceExts) (needDrain bool) { glog.V(2).Infof("generic-plugin needDrainNode(): current state '%+v', desired state '%+v'", current, desired) needDrain = false @@ -217,25 +254,19 @@ func needDrainNode(desired sriovnetworkv1.Interfaces, current sriovnetworkv1.Int return } -func needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState, loadVfioDriver *uint, loadVirtioVdpaDriver *uint) (needReboot bool) { +func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState) (needReboot bool) { needReboot = false - if *loadVfioDriver != loaded { + if p.LoadVfioDriver != loaded { if needVfioDriver(state) { - *loadVfioDriver = loading - update, err := tryEnableIommuInKernelArgs() - if err != nil { - glog.Errorf("generic-plugin needRebootNode():fail to enable iommu in kernel args: %v", err) - } - if update { - glog.V(2).Infof("generic-plugin needRebootNode(): need reboot for enabling iommu kernel args") - } - needReboot = needReboot || update + p.LoadVfioDriver = loading + p.AddToDesiredKernelParams(utils.KernelParamIntelIommu) + p.AddToDesiredKernelParams(utils.KernelParamIommuPt) } } - if *loadVirtioVdpaDriver != loaded { + if p.LoadVirtioVdpaDriver != loaded { if needVirtioVdpaDriver(state) { - *loadVirtioVdpaDriver = loading + p.LoadVirtioVdpaDriver = loading } } @@ -246,6 +277,6 @@ func needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState, loadVfioDriver if update { glog.V(2).Infof("generic-plugin needRebootNode(): need reboot for updating switchdev device configuration") } - needReboot = needReboot || update + needReboot = needReboot || update || p.SetAllDesiredKernelParams() return } diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index 257f4c0ca..7899cec3d 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -32,6 +32,7 @@ const ( sysBusPciDrivers = "/sys/bus/pci/drivers" sysBusPciDriversProbe = "/sys/bus/pci/drivers_probe" sysClassNet = "/sys/class/net" + sysKernelCmdLine = "/proc/cmdline" netClass = 0x02 numVfsFile = "sriov_numvfs" @@ -40,6 +41,9 @@ const ( VendorMellanox = "15b3" DeviceBF2 = "a2d6" DeviceBF3 = "a2dc" + + KernelParamIntelIommu = "intel_iommu=on" + KernelParamIommuPt = "iommu=pt" ) var InitialState sriovnetworkv1.SriovNetworkNodeState @@ -51,6 +55,27 @@ func init() { ClusterType = os.Getenv("CLUSTER_TYPE") } +// IsKernelCmdLineParamSet This checks if the kernel cmd line is set properly. +func IsKernelCmdLineParamSet(param string, chroot bool) bool { + path := sysKernelCmdLine + if !chroot { + path = "/host" + path + } + cmdLine, err := os.ReadFile(path) + if err != nil { + glog.Warningf("IsKernelCmdLineParamSet(): Error reading %s, Please check the Kernel Params manually!", sysKernelCmdLine) + // Although intuitively we should report false here. However if we cannot reliably read the Kernel's cmd line params + // then it is unknown whether it was successfully set, and to avoid unnecessary reboots we should just post a warning + // and return true. + return true + } + + if strings.Contains(string(cmdLine), param) { + return true + } + return false +} + func DiscoverSriovDevices(withUnsupported bool) ([]sriovnetworkv1.InterfaceExt, error) { glog.V(2).Info("DiscoverSriovDevices") pfList := []sriovnetworkv1.InterfaceExt{}