Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kernel params set by the config daemon should be verified #486

Merged
merged 2 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 96 additions & 38 deletions pkg/plugins/generic/generic_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package generic

import (
"bytes"
"errors"
"os/exec"
"reflect"
"strconv"
Expand Down Expand Up @@ -47,14 +48,15 @@ type DriverState struct {
type DriverStateMapType map[uint]*DriverState

type GenericPlugin struct {
PluginName string
SpecVersion string
DesireState *sriovnetworkv1.SriovNetworkNodeState
LastState *sriovnetworkv1.SriovNetworkNodeState
DriverStateMap DriverStateMapType
RunningOnHost bool
HostManager host.HostManagerInterface
StoreManager utils.StoreManagerInterface
PluginName string
SpecVersion string
DesireState *sriovnetworkv1.SriovNetworkNodeState
LastState *sriovnetworkv1.SriovNetworkNodeState
DriverStateMap DriverStateMapType
DesiredKernelArgs map[string]bool
RunningOnHost bool
HostManager host.HostManagerInterface
StoreManager utils.StoreManagerInterface
}

const scriptsPath = "bindata/scripts/enable-kargs.sh"
Expand Down Expand Up @@ -84,12 +86,13 @@ func NewGenericPlugin(runningOnHost bool, hostManager host.HostManagerInterface,
DriverLoaded: false,
}
return &GenericPlugin{
PluginName: PluginName,
SpecVersion: "1.0",
DriverStateMap: driverStateMap,
RunningOnHost: runningOnHost,
HostManager: hostManager,
StoreManager: storeManager,
PluginName: PluginName,
SpecVersion: "1.0",
DriverStateMap: driverStateMap,
DesiredKernelArgs: make(map[string]bool),
RunningOnHost: runningOnHost,
HostManager: hostManager,
StoreManager: storeManager,
}, nil
}

Expand All @@ -112,7 +115,10 @@ func (p *GenericPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt
p.DesireState = new

needDrain = p.needDrainNode(new.Spec.Interfaces, new.Status.Interfaces)
needReboot = needRebootNode(new, p.DriverStateMap)
needReboot, err = p.needRebootNode(new)
wizhaoredhat marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return needDrain, needReboot, err
}

if needReboot {
needDrain = true
Expand Down Expand Up @@ -168,6 +174,10 @@ func (p *GenericPlugin) Apply() error {
}

if err := utils.SyncNodeState(p.DesireState, pfsToSkip); err != nil {
// Catch the "cannot allocate memory" error and try to use PCI realloc
if errors.Is(err, syscall.ENOMEM) {
p.addToDesiredKernelArgs(utils.KernelArgPciRealloc)
}
bn222 marked this conversation as resolved.
Show resolved Hide resolved
return err
}
p.LastState = &sriovnetworkv1.SriovNetworkNodeState{}
Expand Down Expand Up @@ -197,28 +207,28 @@ func needDriverCheckVdpaType(state *sriovnetworkv1.SriovNetworkNodeState, driver
return false
}

func tryEnableIommuInKernelArgs() (bool, error) {
glog.Info("generic-plugin tryEnableIommuInKernelArgs()")
args := [2]string{"intel_iommu=on", "iommu=pt"}
// setKernelArg Tries to add the kernel args via ostree or grubby.
func setKernelArg(karg string) (bool, error) {
glog.Info("generic-plugin setKernelArg()")
var stdout, stderr bytes.Buffer
cmd := exec.Command("/bin/sh", scriptsPath, args[0], args[1])
cmd := exec.Command("/bin/sh", scriptsPath, karg)
cmd.Stdout = &stdout
cmd.Stderr = &stderr

if err := cmd.Run(); err != nil {
// if grubby is not there log and assume kernel args are set correctly.
if isCommandNotFound(err) {
glog.Error("generic-plugin tryEnableIommuInKernelArgs(): grubby command not found. Please ensure that kernel args intel_iommu=on iommu=pt are set")
glog.Errorf("generic-plugin setKernelArg(): grubby or ostree command not found. Please ensure that kernel arg %s are set", karg)
return false, nil
}
glog.Errorf("generic-plugin tryEnableIommuInKernelArgs(): fail to enable iommu %s: %v", args, err)
glog.Errorf("generic-plugin setKernelArg(): fail to enable kernel arg %s: %v", karg, err)
return false, err
}

i, err := strconv.Atoi(strings.TrimSpace(stdout.String()))
if err == nil {
if i > 0 {
glog.Infof("generic-plugin tryEnableIommuInKernelArgs(): need to reboot node")
glog.Infof("generic-plugin setKernelArg(): need to reboot node for kernel arg %s", karg)
return true, nil
}
}
Expand All @@ -234,6 +244,48 @@ func isCommandNotFound(err error) bool {
return false
}

// addToDesiredKernelArgs Should be called to queue a kernel arg to be added to the node.
func (p *GenericPlugin) addToDesiredKernelArgs(karg string) {
if _, ok := p.DesiredKernelArgs[karg]; !ok {
glog.Infof("generic-plugin addToDesiredKernelArgs(): Adding %s to desired kernel arg", karg)
p.DesiredKernelArgs[karg] = false
}
}

// syncDesiredKernelArgs Should be called to set all the kernel arguments. Returns bool if node update is needed.
func (p *GenericPlugin) syncDesiredKernelArgs() (bool, error) {
needReboot := false
if len(p.DesiredKernelArgs) == 0 {
return false, nil
}
kargs, err := utils.GetCurrentKernelArgs(false)
if err != nil {
return false, err
}
for desiredKarg, attempted := range p.DesiredKernelArgs {
set := utils.IsKernelArgsSet(kargs, desiredKarg)
if !set {
if attempted {
glog.V(2).Infof("generic-plugin syncDesiredKernelArgs(): previously attempted to set kernel arg %s", desiredKarg)
Copy link
Collaborator

@adrianchiris adrianchiris Oct 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wizhaoredhat if you previously attempted to set the arg, it means setKernlelArg succeeded.
then why do we need to set it again ? shouldnt we "continue" here ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This captures the case where we hit this case, but either the kernel arg wasn't set properly or if the reboot did not occur for some reason. This results in IsKernelArgsSet returning false even though it was set before. So we should try again.

}
// There is a case when we try to set the kernel argument here, the daemon could decide to not reboot because
// the daemon encountered a potentially one-time error. However we always want to make sure that the kernel
// argument is set once the daemon goes through node state sync again.
update, err := setKernelArg(desiredKarg)
if err != nil {
glog.Errorf("generic-plugin syncDesiredKernelArgs(): fail to set kernel arg %s: %v", desiredKarg, err)
return false, err
}
if update {
needReboot = true
glog.V(2).Infof("generic-plugin syncDesiredKernelArgs(): need reboot for setting kernel arg %s", desiredKarg)
}
p.DesiredKernelArgs[desiredKarg] = true
}
}
return needReboot, nil
}

func (p *GenericPlugin) needDrainNode(desired sriovnetworkv1.Interfaces, current sriovnetworkv1.InterfaceExts) (needDrain bool) {
glog.V(2).Infof("generic-plugin needDrainNode(): current state '%+v', desired state '%+v'", current, desired)

Expand Down Expand Up @@ -285,33 +337,39 @@ func (p *GenericPlugin) needDrainNode(desired sriovnetworkv1.Interfaces, current
return
}

func needRebootIfVfio(state *sriovnetworkv1.SriovNetworkNodeState, driverMap DriverStateMapType) (needReboot bool) {
driverState := driverMap[Vfio]
func (p *GenericPlugin) addVfioDesiredKernelArg(state *sriovnetworkv1.SriovNetworkNodeState) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: addVfioDesiredKernelArg -> addVfioDesiredKernelArgs to keep it the same as other methods ? WDYT.
Although, we are only adding a single arg here....

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer singular. It's only adding 1 arg.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also prefer it being singular. Let's keep it as is. Thanks for reviewing!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

its adding two, what am i missing ?

L343, L344

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I grouped IOMMU as singular. There is infact Intel IOMMU and the general IOMMU. However this is a small NIT on naming.

driverState := p.DriverStateMap[Vfio]
if !driverState.DriverLoaded && driverState.NeedDriverFunc(state, driverState) {
var err error
needReboot, err = tryEnableIommuInKernelArgs()
if err != nil {
glog.Errorf("generic-plugin needRebootNode():fail to enable iommu in kernel args: %v", err)
}
if needReboot {
glog.V(2).Infof("generic-plugin needRebootNode(): need reboot for enabling iommu kernel args")
}
p.addToDesiredKernelArgs(utils.KernelArgIntelIommu)
p.addToDesiredKernelArgs(utils.KernelArgIommuPt)
}
return needReboot
}

func needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState, driverMap DriverStateMapType) (needReboot bool) {
needReboot = needRebootIfVfio(state, driverMap)
updateNode, err := utils.WriteSwitchdevConfFile(state)
func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState) (needReboot bool, err error) {
needReboot = false
p.addVfioDesiredKernelArg(state)

updateNode, err := p.syncDesiredKernelArgs()
if err != nil {
glog.Errorf("generic-plugin needRebootNode(): failed to set the desired kernel arguments")
return false, err
}
if updateNode {
glog.V(2).Infof("generic-plugin needRebootNode(): need reboot for updating kernel arguments")
needReboot = true
}

updateNode, err = utils.WriteSwitchdevConfFile(state)
if err != nil {
glog.Errorf("generic-plugin needRebootNode(): fail to write switchdev device config file")
return false, err
}
if updateNode {
glog.V(2).Infof("generic-plugin needRebootNode(): need reboot for updating switchdev device configuration")
needReboot = true
}
needReboot = needReboot || updateNode
return

return needReboot, nil
}

// ////////////// for testing purposes only ///////////////////////
Expand Down
30 changes: 30 additions & 0 deletions pkg/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ const (
sysBusPciDrivers = "/sys/bus/pci/drivers"
sysBusPciDriversProbe = "/sys/bus/pci/drivers_probe"
sysClassNet = "/sys/class/net"
procKernelCmdLine = "/proc/cmdline"
netClass = 0x02
numVfsFile = "sriov_numvfs"

Expand All @@ -46,6 +47,10 @@ const (
udevRulesFolder = udevFolder + "/rules.d"
udevDisableNM = "/bindata/scripts/udev-find-sriov-pf.sh"
nmUdevRule = "SUBSYSTEM==\"net\", ACTION==\"add|change|move\", ATTRS{device}==\"%s\", IMPORT{program}=\"/etc/udev/disable-nm-sriov.sh $env{INTERFACE} %s\""

KernelArgPciRealloc = "pci=realloc"
KernelArgIntelIommu = "intel_iommu=on"
KernelArgIommuPt = "iommu=pt"
)

var InitialState sriovnetworkv1.SriovNetworkNodeState
Expand All @@ -62,6 +67,31 @@ func init() {
ClusterType = os.Getenv("CLUSTER_TYPE")
}

// GetCurrentKernelArgs This retrieves the kernel cmd line arguments
func GetCurrentKernelArgs(chroot bool) (string, error) {
path := procKernelCmdLine
if !chroot {
path = "/host" + path
}
cmdLine, err := os.ReadFile(path)
if err != nil {
return "", fmt.Errorf("GetCurrentKernelArgs(): Error reading %s: %v", procKernelCmdLine, err)
}
return string(cmdLine), nil
}

// IsKernelArgsSet This checks if the kernel cmd line is set properly. Please note that the same key could be repeated
// several times in the kernel cmd line. We can only ensure that the kernel cmd line has the key/val kernel arg that we set.
func IsKernelArgsSet(cmdLine string, karg string) bool {
elements := strings.Fields(cmdLine)
for _, element := range elements {
if element == karg {
return true
}
}
return false
}

func DiscoverSriovDevices(withUnsupported bool, storeManager StoreManagerInterface) ([]sriovnetworkv1.InterfaceExt, error) {
glog.V(2).Info("DiscoverSriovDevices")
pfList := []sriovnetworkv1.InterfaceExt{}
Expand Down
Loading