Skip to content

Commit

Permalink
Parallel NICs configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
e0ne committed Oct 18, 2023
1 parent faef84f commit 59acc63
Show file tree
Hide file tree
Showing 10 changed files with 99 additions and 14 deletions.
5 changes: 4 additions & 1 deletion bindata/manifests/daemon/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,12 @@ spec:
privileged: true
args:
- "start"
{{- if .UsedSystemdMode}}
{{- if .UsedSystemdMode }}
- --use-systemd-service
{{- end }}
{{- /* Pass --parallel-nic-config to the daemon only when the ParallelNicConfig template value is true. The trim marker must be written as "{{- " with no space before the minus sign; "{{ - if" fails to parse. */}}
{{- if .ParallelNicConfig }}
- --parallel-nic-config
{{- end }}
env:
- name: NODE_NAME
valueFrom:
Expand Down
4 changes: 2 additions & 2 deletions cmd/sriov-network-config-daemon/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ func runServiceCmd(cmd *cobra.Command, args []string) error {
glog.Errorf("sriov-config-service: failed to discover sriov devices on the host: %v", err)
return fmt.Errorf("sriov-config-service: failed to discover sriov devices on the host: %v", err)
}

// TODO(e0ne): read ParallelNicConfig from SriovOperatorConfig CR
// Create the generic plugin
configPlugin, err = generic.NewGenericPlugin(true, hostManager, storeManager)
configPlugin, err = generic.NewGenericPlugin(true, hostManager, storeManager, false)
if err != nil {
glog.Errorf("sriov-config-service: failed to create generic plugin %v", err)
return fmt.Errorf("sriov-config-service failed to create generic plugin %v", err)
Expand Down
9 changes: 6 additions & 3 deletions cmd/sriov-network-config-daemon/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,10 @@ var (
}

startOpts struct {
kubeconfig string
nodeName string
systemd bool
kubeconfig string
nodeName string
systemd bool
parallelNicConfig bool
}
)

Expand All @@ -63,6 +64,7 @@ func init() {
startCmd.PersistentFlags().StringVar(&startOpts.kubeconfig, "kubeconfig", "", "Kubeconfig file to access a remote cluster (testing only)")
startCmd.PersistentFlags().StringVar(&startOpts.nodeName, "node-name", "", "kubernetes node name daemon is managing")
startCmd.PersistentFlags().BoolVar(&startOpts.systemd, "use-systemd-service", false, "use config daemon in systemd mode")
startCmd.PersistentFlags().BoolVar(&startOpts.parallelNicConfig, "parallel-nic-config", false, "NICs configuration in a parallel on the same node")
}

func runStartCmd(cmd *cobra.Command, args []string) {
Expand Down Expand Up @@ -214,6 +216,7 @@ func runStartCmd(cmd *cobra.Command, args []string) {
startOpts.systemd,
eventRecorder,
devMode,
startOpts.parallelNicConfig,
).Run(stopCh, exitCh)
if err != nil {
glog.Errorf("failed to run daemon: %v", err)
Expand Down
5 changes: 5 additions & 0 deletions controllers/sriovoperatorconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,11 @@ func (r *SriovOperatorConfigReconciler) syncConfigDaemonSet(ctx context.Context,
} else {
data.Data["UsedSystemdMode"] = false
}
if parallelConfig, ok := dc.Spec.FeatureGates["parallelNicConfig"]; ok {
data.Data["ParallelNicConfig"] = parallelConfig
} else {
data.Data["ParallelNicConfig"] = false
}

envCniBinPath := os.Getenv("SRIOV_CNI_BIN_PATH")
if envCniBinPath == "" {
Expand Down
6 changes: 5 additions & 1 deletion pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ type Daemon struct {

useSystemdService bool

parallelNicConfig bool

devMode bool

client snclientset.Interface
Expand Down Expand Up @@ -153,11 +155,13 @@ func New(
useSystemdService bool,
er *EventRecorder,
devMode bool,
parallelNicConfig bool,
) *Daemon {
return &Daemon{
name: nodeName,
platform: platformType,
useSystemdService: useSystemdService,
parallelNicConfig: parallelNicConfig,
devMode: devMode,
client: client,
kubeClient: kubeClient,
Expand Down Expand Up @@ -540,7 +544,7 @@ func (dn *Daemon) nodeStateSyncHandler() error {

// load plugins if it has not loaded
if len(dn.enabledPlugins) == 0 {
dn.enabledPlugins, err = enablePlugins(dn.platform, dn.useSystemdService, latestState, dn.hostManager, dn.storeManager)
dn.enabledPlugins, err = enablePlugins(dn.platform, dn.useSystemdService, latestState, dn.hostManager, dn.storeManager, dn.parallelNicConfig)
if err != nil {
glog.Errorf("nodeStateSyncHandler(): failed to enable vendor plugins error: %v", err)
return err
Expand Down
1 change: 1 addition & 0 deletions pkg/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ var _ = Describe("Config Daemon", func() {
false,
er,
false,
false,
)

sut.enabledPlugins = map[string]plugin.VendorPlugin{generic.PluginName: &fake.FakePlugin{}}
Expand Down
4 changes: 2 additions & 2 deletions pkg/daemon/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ var (
K8sPlugin = k8splugin.NewK8sPlugin
)

func enablePlugins(platform utils.PlatformType, useSystemdService bool, ns *sriovnetworkv1.SriovNetworkNodeState, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface) (map[string]plugin.VendorPlugin, error) {
func enablePlugins(platform utils.PlatformType, useSystemdService bool, ns *sriovnetworkv1.SriovNetworkNodeState, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface, parallelNicConfig bool) (map[string]plugin.VendorPlugin, error) {
glog.Infof("enableVendorPlugins(): enabling plugins")
enabledPlugins := map[string]plugin.VendorPlugin{}

Expand All @@ -55,7 +55,7 @@ func enablePlugins(platform utils.PlatformType, useSystemdService bool, ns *srio
}
enabledPlugins[k8sPlugin.Name()] = k8sPlugin
}
genericPlugin, err := GenericPlugin(false, hostManager, storeManager)
genericPlugin, err := GenericPlugin(false, hostManager, storeManager, parallelNicConfig)
if err != nil {
glog.Errorf("enableVendorPlugins(): failed to load the generic plugin error: %v", err)
return nil, err
Expand Down
6 changes: 4 additions & 2 deletions pkg/plugins/generic/generic_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,15 @@ type GenericPlugin struct {
DriverStateMap DriverStateMapType
DesiredKernelArgs map[string]bool
RunningOnHost bool
ParallelNicConfig bool
HostManager host.HostManagerInterface
StoreManager utils.StoreManagerInterface
}

const scriptsPath = "bindata/scripts/enable-kargs.sh"

// Initialize our plugin and set up initial values
func NewGenericPlugin(runningOnHost bool, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface) (plugin.VendorPlugin, error) {
func NewGenericPlugin(runningOnHost bool, hostManager host.HostManagerInterface, storeManager utils.StoreManagerInterface, parallelNicConfig bool) (plugin.VendorPlugin, error) {
driverStateMap := make(map[uint]*DriverState)
driverStateMap[Vfio] = &DriverState{
DriverName: vfioPciDriver,
Expand Down Expand Up @@ -91,6 +92,7 @@ func NewGenericPlugin(runningOnHost bool, hostManager host.HostManagerInterface,
DriverStateMap: driverStateMap,
DesiredKernelArgs: make(map[string]bool),
RunningOnHost: runningOnHost,
ParallelNicConfig: parallelNicConfig,
HostManager: hostManager,
StoreManager: storeManager,
}, nil
Expand Down Expand Up @@ -173,7 +175,7 @@ func (p *GenericPlugin) Apply() error {
defer exit()
}

if err := utils.SyncNodeState(p.DesireState, pfsToSkip); err != nil {
if err := utils.SyncNodeState(p.DesireState, pfsToSkip, p.ParallelNicConfig); err != nil {

Check failure on line 178 in pkg/plugins/generic/generic_plugin.go

View workflow job for this annotation

GitHub Actions / Golangci-lint

File is not `gofmt`-ed with `-s` (gofmt)
// Catch the "cannot allocate memory" error and try to use PCI realloc
if errors.Is(err, syscall.ENOMEM) {
p.addToDesiredKernelArgs(utils.KernelArgPciRealloc)
Expand Down
2 changes: 1 addition & 1 deletion pkg/plugins/generic/generic_plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ var _ = Describe("Generic plugin", func() {
ctrl = gomock.NewController(t)
mockHost = mock_host.NewMockHostManagerInterface(ctrl)
mockStore = mock_utils.NewMockStoreManagerInterface(ctrl)
genericPlugin, err = NewGenericPlugin(false, mockHost, mockStore)
genericPlugin, err = NewGenericPlugin(false, mockHost, mockStore, false)
Expect(err).ToNot(HaveOccurred())
})

Expand Down
71 changes: 69 additions & 2 deletions pkg/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"regexp"
"strconv"
"strings"
"sync"
"syscall"
"time"

Expand Down Expand Up @@ -197,8 +198,71 @@ func DiscoverSriovDevices(withUnsupported bool, storeManager StoreManagerInterfa
}

// SyncNodeState Attempt to update the node state to match the desired state
func SyncNodeState(newState *sriovnetworkv1.SriovNetworkNodeState, pfsToConfig map[string]bool) error {
return ConfigSriovInterfaces(newState.Spec.Interfaces, newState.Status.Interfaces, pfsToConfig)
func SyncNodeState(newState *sriovnetworkv1.SriovNetworkNodeState, pfsToConfig map[string]bool, parallelNicConfig bool) error {
	specIfaces, statusIfaces := newState.Spec.Interfaces, newState.Status.Interfaces

	// parallelNicConfig selects the routine that configures interfaces
	// concurrently; otherwise the sequential routine is used.
	if parallelNicConfig {
		return ConfigSriovInterfacesInParallel(specIfaces, statusIfaces, pfsToConfig)
	}
	return ConfigSriovInterfaces(specIfaces, statusIfaces, pfsToConfig)
}

// ConfigSriovInterfacesInParallel applies the requested SR-IOV settings in
// interfaces to the devices listed in ifaceStatuses, matching them by PCI
// address. Every device that needs an update is configured in its own
// goroutine; the function returns only after all of them have finished.
//
// Devices whose PCI address maps to true in pfsToConfig are skipped. A device
// that has VFs (NumVfs > 0) but no matching entry in interfaces is reset
// synchronously. If configuring a device fails, that device is reset.
//
// Only a single error is returned even when several devices fail (the last
// one recorded wins); each individual failure is logged.
func ConfigSriovInterfacesInParallel(interfaces []sriovnetworkv1.Interface, ifaceStatuses []sriovnetworkv1.InterfaceExt, pfsToConfig map[string]bool) error {
	glog.V(2).Infof("ConfigSriovInterfacesInParallel(): start sriov configuration")
	if IsKernelLockdownMode(true) && hasMellanoxInterfacesInSpec(ifaceStatuses, interfaces) {
		glog.Warningf("cannot use mellanox devices when in kernel lockdown mode")
		return fmt.Errorf("cannot use mellanox devices when in kernel lockdown mode")
	}

	// TODO(e0ne): store all errors in SriovNetworkNodeState
	var (
		mu     sync.Mutex // guards result, which workers and this loop both write
		result error
		wg     sync.WaitGroup
	)
	recordErr := func(err error) {
		mu.Lock()
		defer mu.Unlock()
		result = err
	}

	for i := range ifaceStatuses {
		// Take a per-iteration copy: a worker goroutine keeps a pointer to it
		// after this iteration ends, so it must not alias a variable that the
		// loop reuses (range variables are shared before Go 1.22).
		ifaceStatus := ifaceStatuses[i]
		configured := false

		for j := range interfaces {
			if interfaces[j].PciAddress != ifaceStatus.PciAddress {
				continue
			}
			configured = true
			iface := interfaces[j] // private copy for the worker, same reason as above

			if skip := pfsToConfig[iface.PciAddress]; skip {
				break
			}

			if !NeedUpdate(&iface, &ifaceStatus) {
				glog.V(2).Infof("ConfigSriovInterfacesInParallel(): no need update interface %s", iface.PciAddress)
				break
			}

			wg.Add(1)
			go func(iface *sriovnetworkv1.Interface, ifaceStatus *sriovnetworkv1.InterfaceExt) {
				// Deferred so wg.Wait() cannot hang if this goroutine exits early.
				defer wg.Done()

				err := configSriovDevice(iface, ifaceStatus)
				if err == nil {
					return
				}
				glog.Errorf("ConfigSriovInterfacesInParallel(): fail to configure sriov interface %s: %v. resetting interface.", iface.PciAddress, err)
				recordErr(err)
				// NOTE(review): the sequential ConfigSriovInterfaces special-cases
				// iface.ExternallyManaged on this failure path; confirm whether
				// the parallel path should do the same before resetting.
				if resetErr := resetSriovDevice(*ifaceStatus); resetErr != nil {
					glog.Errorf("ConfigSriovInterfacesInParallel(): fail to reset on error SR-IOV interface: %s", resetErr)
					recordErr(resetErr)
				}
			}(&iface, &ifaceStatus)

			break
		}

		if !configured && ifaceStatus.NumVfs > 0 {
			if skip := pfsToConfig[ifaceStatus.PciAddress]; skip {
				continue
			}

			// Workers started in earlier iterations may still be running, so
			// the shared error has to be written under the lock here as well.
			if err := resetSriovDevice(ifaceStatus); err != nil {
				glog.V(2).Infof("ConfigSriovInterfacesInParallel(): reset failed %v", ifaceStatus.PciAddress)
				recordErr(err)
			}
		}
	}

	// All writers are done once Wait returns, so result can be read unlocked.
	wg.Wait()
	if result != nil {
		glog.Errorf("ConfigSriovInterfacesInParallel(): fail to configure sriov interface: %v", result)
		return result
	}
	glog.V(2).Infof("ConfigSriovInterfacesInParallel(): sriov configuration finished")
	return nil
}

func ConfigSriovInterfaces(interfaces []sriovnetworkv1.Interface, ifaceStatuses []sriovnetworkv1.InterfaceExt, pfsToConfig map[string]bool) error {
Expand Down Expand Up @@ -235,6 +299,7 @@ func ConfigSriovInterfaces(interfaces []sriovnetworkv1.Interface, ifaceStatuses

break
}

if err = configSriovDevice(&iface, &ifaceStatus); err != nil {
glog.Errorf("SyncNodeState(): fail to configure sriov interface %s: %v. resetting interface.", iface.PciAddress, err)
if iface.ExternallyManaged {
Expand Down Expand Up @@ -540,6 +605,7 @@ func configSriovDevice(iface *sriovnetworkv1.Interface, ifaceStatus *sriovnetwor
return err
}
}
glog.V(2).Infof("configSriovDevice(): config interface %s completed", ifaceStatus.PciAddress)
return nil
}

Expand Down Expand Up @@ -584,6 +650,7 @@ func setNetdevMTU(pciAddr string, mtu int) error {
glog.Warningf("setNetdevMTU(): fail to write mtu file after retrying: %v", err)
return err
}
glog.V(2).Infof("setNetdevMTU(): set MTU for device %s to %d completed", pciAddr, mtu)
return nil
}

Expand Down

0 comments on commit 59acc63

Please sign in to comment.