Skip to content

Commit

Permalink
Kernel params set by the config daemon should be verified
Browse files Browse the repository at this point in the history
In the case of vfio-pci, the config daemon might not notice it has to
reboot to add kernel parameters. This could happen if the vfio driver
state was updated but the config daemon was interrupted (e.g policy was
removed and readded quickly).

This could lead to a pending change to the kernel parameters via grubby
or os-tree but without a reboot, these changes wouldn't be applied.

This change introduces a map to hold the desired kernel parameters that
the config daemon wishes to apply. This map is then validated against
"/proc/cmdline" to see if the desired kernel parameters are actually
applied. If not then there will be a subsequent attempt to apply these
parameters.

Signed-off-by: William Zhao <[email protected]>
  • Loading branch information
wizhaoredhat committed Aug 3, 2023
1 parent 990648e commit 7685fcf
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 22 deletions.
75 changes: 53 additions & 22 deletions pkg/plugins/generic/generic_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ type GenericPlugin struct {
LastState *sriovnetworkv1.SriovNetworkNodeState
LoadVfioDriver uint
LoadVirtioVdpaDriver uint
DesiredKernelParams map[string]uint
RunningOnHost bool
HostManager host.HostManagerInterface
}
Expand All @@ -45,6 +46,7 @@ func NewGenericPlugin(runningOnHost bool) (plugin.VendorPlugin, error) {
SpecVersion: "1.0",
LoadVfioDriver: unloaded,
LoadVirtioVdpaDriver: unloaded,
DesiredKernelParams: make(map[string]uint),
RunningOnHost: runningOnHost,
HostManager: host.NewHostManager(runningOnHost),
}, nil
Expand All @@ -69,7 +71,7 @@ func (p *GenericPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt
p.DesireState = new

needDrain = needDrainNode(new.Spec.Interfaces, new.Status.Interfaces)
needReboot = needRebootNode(new, &p.LoadVfioDriver, &p.LoadVirtioVdpaDriver)
needReboot = p.needRebootNode(new)

if needReboot {
needDrain = true
Expand Down Expand Up @@ -151,28 +153,28 @@ func needVirtioVdpaDriver(state *sriovnetworkv1.SriovNetworkNodeState) bool {
return false
}

func tryEnableIommuInKernelArgs() (bool, error) {
glog.Info("generic-plugin tryEnableIommuInKernelArgs()")
args := [2]string{"intel_iommu=on", "iommu=pt"}
// trySetKernelParam Tries to add the kernel param via ostree or grubby.
func trySetKernelParam(kparam string) (bool, error) {
glog.Info("generic-plugin trySetKernelParam()")
var stdout, stderr bytes.Buffer
cmd := exec.Command("/bin/sh", scriptsPath, args[0], args[1])
cmd := exec.Command("/bin/sh", scriptsPath, kparam)
cmd.Stdout = &stdout
cmd.Stderr = &stderr

if err := cmd.Run(); err != nil {
// if grubby is not there log and assume kernel args are set correctly.
if isCommandNotFound(err) {
glog.Error("generic-plugin tryEnableIommuInKernelArgs(): grubby command not found. Please ensure that kernel args intel_iommu=on iommu=pt are set")
glog.Errorf("generic-plugin trySetKernelParam(): grubby or ostree command not found. Please ensure that kernel param %s are set", kparam)
return false, nil
}
glog.Errorf("generic-plugin tryEnableIommuInKernelArgs(): fail to enable iommu %s: %v", args, err)
glog.Errorf("generic-plugin trySetKernelParam(): fail to enable kernel param %s: %v", kparam, err)
return false, err
}

i, err := strconv.Atoi(strings.TrimSpace(stdout.String()))
if err == nil {
if i > 0 {
glog.Infof("generic-plugin tryEnableIommuInKernelArgs(): need to reboot node")
glog.Infof("generic-plugin trySetKernelParam(): need to reboot node for kernel param %s", kparam)
return true, nil
}
}
Expand All @@ -188,6 +190,41 @@ func isCommandNotFound(err error) bool {
return false
}

// AddToDesiredKernelParams Should be called to queue a kernel param to be added to the node.
func (p *GenericPlugin) AddToDesiredKernelParams(kparam string) {
if _, ok := p.DesiredKernelParams[kparam]; !ok {
glog.Infof("generic-plugin AddToDesiredKernelParams(): Adding %s to desired kernel params", kparam)
// element "uint" is a counter of number of attempts to set the kernel param
p.DesiredKernelParams[kparam] = 0
}
}

// SetAllDesiredKernelParams Should be called to set all the kernel parameters. Returns true if reboot of the node is needed.
func (p *GenericPlugin) SetAllDesiredKernelParams() bool {
needReboot := false
for kparam, attempts := range p.DesiredKernelParams {
if !utils.IsKernelCmdLineParamSet(kparam, false) {
if attempts > 0 && attempts <= 4 {
glog.Errorf("generic-plugin SetAllDesiredKernelParams(): Fail to set kernel param %s after reboot with attempts %d", kparam, attempts)
} else if attempts > 4 {
// If we tried several times and was unsuccessful, we should give up.
continue
}
update, err := trySetKernelParam(kparam)
if err != nil {
glog.Errorf("generic-plugin SetAllDesiredKernelParams(): Fail to set kernel param %s: %v", kparam, err)
}
if update {
glog.V(2).Infof("generic-plugin SetAllDesiredKernelParams(): Need reboot for setting kernel param %s", kparam)
}
// Update the number of attempts we tried to set the kernel parameter.
p.DesiredKernelParams[kparam]++
needReboot = needReboot || update
}
}
return needReboot
}

func needDrainNode(desired sriovnetworkv1.Interfaces, current sriovnetworkv1.InterfaceExts) (needDrain bool) {
glog.V(2).Infof("generic-plugin needDrainNode(): current state '%+v', desired state '%+v'", current, desired)
needDrain = false
Expand Down Expand Up @@ -217,25 +254,19 @@ func needDrainNode(desired sriovnetworkv1.Interfaces, current sriovnetworkv1.Int
return
}

func needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState, loadVfioDriver *uint, loadVirtioVdpaDriver *uint) (needReboot bool) {
func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState) (needReboot bool) {
needReboot = false
if *loadVfioDriver != loaded {
if p.LoadVfioDriver != loaded {
if needVfioDriver(state) {
*loadVfioDriver = loading
update, err := tryEnableIommuInKernelArgs()
if err != nil {
glog.Errorf("generic-plugin needRebootNode():fail to enable iommu in kernel args: %v", err)
}
if update {
glog.V(2).Infof("generic-plugin needRebootNode(): need reboot for enabling iommu kernel args")
}
needReboot = needReboot || update
p.LoadVfioDriver = loading
p.AddToDesiredKernelParams(utils.KernelParamIntelIommu)
p.AddToDesiredKernelParams(utils.KernelParamIommuPt)
}
}

if *loadVirtioVdpaDriver != loaded {
if p.LoadVirtioVdpaDriver != loaded {
if needVirtioVdpaDriver(state) {
*loadVirtioVdpaDriver = loading
p.LoadVirtioVdpaDriver = loading
}
}

Expand All @@ -246,6 +277,6 @@ func needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState, loadVfioDriver
if update {
glog.V(2).Infof("generic-plugin needRebootNode(): need reboot for updating switchdev device configuration")
}
needReboot = needReboot || update
needReboot = needReboot || update || p.SetAllDesiredKernelParams()
return
}
25 changes: 25 additions & 0 deletions pkg/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ const (
sysBusPciDrivers = "/sys/bus/pci/drivers"
sysBusPciDriversProbe = "/sys/bus/pci/drivers_probe"
sysClassNet = "/sys/class/net"
sysKernelCmdLine = "/proc/cmdline"
netClass = 0x02
numVfsFile = "sriov_numvfs"

Expand All @@ -40,6 +41,9 @@ const (
VendorMellanox = "15b3"
DeviceBF2 = "a2d6"
DeviceBF3 = "a2dc"

KernelParamIntelIommu = "intel_iommu=on"
KernelParamIommuPt = "iommu=pt"
)

var InitialState sriovnetworkv1.SriovNetworkNodeState
Expand All @@ -51,6 +55,27 @@ func init() {
ClusterType = os.Getenv("CLUSTER_TYPE")
}

// IsKernelCmdLineParamSet This checks if the kernel cmd line is set properly.
func IsKernelCmdLineParamSet(param string, chroot bool) bool {
path := sysKernelCmdLine
if !chroot {
path = "/host" + path
}
cmdLine, err := os.ReadFile(path)
if err != nil {
glog.Warningf("IsKernelCmdLineParamSet(): Error reading %s, Please check the Kernel Params manually!", sysKernelCmdLine)
// Although intuitively we should report false here. However if we cannot reliably read the Kernel's cmd line params
// then it is unknown whether it was successfully set, and to avoid unnecessary reboots we should just post a warning
// and return true.
return true
}

if strings.Contains(string(cmdLine), param) {
return true
}
return false
}

func DiscoverSriovDevices(withUnsupported bool) ([]sriovnetworkv1.InterfaceExt, error) {
glog.V(2).Info("DiscoverSriovDevices")
pfList := []sriovnetworkv1.InterfaceExt{}
Expand Down

0 comments on commit 7685fcf

Please sign in to comment.