Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…master into master
  • Loading branch information
SchSeba committed Jan 2, 2025
2 parents 5db3aba + d62b546 commit 5954a33
Show file tree
Hide file tree
Showing 11 changed files with 214 additions and 15 deletions.
41 changes: 41 additions & 0 deletions api/v1/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"sort"
"strconv"
"strings"
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -1005,3 +1006,43 @@ func GenerateBridgeName(iface *InterfaceExt) string {
func NeedToUpdateBridges(bridgeSpec, bridgeStatus *Bridges) bool {
return !reflect.DeepEqual(bridgeSpec, bridgeStatus)
}

// SetKeepUntilTime sets an annotation to hold the "keep until time" for the node’s state.
// The "keep until time" specifies the earliest time at which the state object can be removed
// if the daemon's pod is not found on the node.
func (s *SriovNetworkNodeState) SetKeepUntilTime(t time.Time) {
ts := t.Format(time.RFC3339)
annotations := s.GetAnnotations()
if annotations == nil {
annotations = map[string]string{}
}
annotations[consts.NodeStateKeepUntilAnnotation] = ts
s.SetAnnotations(annotations)
}

// GetKeepUntilTime returns the value that is stored in the "keep until time" annotation.
// The "keep until time" specifies the earliest time at which the state object can be removed
// if the daemon's pod is not found on the node.
// Return zero time instant if annotaion is not found on the object or if it has a wrong format.
func (s *SriovNetworkNodeState) GetKeepUntilTime() time.Time {
t, err := time.Parse(time.RFC3339, s.GetAnnotations()[consts.NodeStateKeepUntilAnnotation])
if err != nil {
return time.Time{}
}
return t
}

// ResetKeepUntilTime removes "keep until time" annotation from the state object.
// The "keep until time" specifies the earliest time at which the state object can be removed
// if the daemon's pod is not found on the node.
// Returns true if the value was removed, false otherwise.
func (s *SriovNetworkNodeState) ResetKeepUntilTime() bool {
annotations := s.GetAnnotations()
_, exist := annotations[consts.NodeStateKeepUntilAnnotation]
if !exist {
return false
}
delete(annotations, consts.NodeStateKeepUntilAnnotation)
s.SetAnnotations(annotations)
return true
}
8 changes: 8 additions & 0 deletions bindata/scripts/kargs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ declare -a kargs=( "$@" )
ret=0
args=$(chroot /host/ cat /proc/cmdline)

IS_OS_UBUNTU=true; [[ "$(chroot /host/ grep -i ubuntu /etc/os-release -c)" == "0" ]] && IS_OS_UBUNTU=false

# Kernel args configuration isn't supported for Ubuntu now, so we shouldn't do anything here
if ${IS_OS_UBUNTU} ; then
echo $ret
exit 0
fi

if chroot /host/ test -f /run/ostree-booted ; then
for t in "${kargs[@]}";do
if [[ $command == "add" ]];then
Expand Down
66 changes: 61 additions & 5 deletions controllers/sriovnetworknodepolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ import (
"context"
"encoding/json"
"fmt"
"os"
"reflect"
"sort"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -338,10 +340,7 @@ func (r *SriovNetworkNodePolicyReconciler) syncAllSriovNetworkNodeStates(ctx con
logger.Error(err, "Fail to remove device plugin label from node", "node", ns.Name)
return err
}
logger.Info("Deleting SriovNetworkNodeState as node with that name doesn't exist", "nodeStateName", ns.Name)
err = r.Delete(ctx, &ns, &client.DeleteOptions{})
if err != nil {
logger.Error(err, "Fail to Delete", "SriovNetworkNodeState CR:", ns.GetName())
if err := r.handleStaleNodeState(ctx, &ns); err != nil {
return err
}
}
Expand All @@ -350,6 +349,56 @@ func (r *SriovNetworkNodePolicyReconciler) syncAllSriovNetworkNodeStates(ctx con
return nil
}

// handleStaleNodeState handles stale SriovNetworkNodeState CR (the CR which no longer have a corresponding node with the daemon).
// If the CR has the "keep until time" annotation, indicating the earliest time the state object can be removed,
// this function will compare it to the current time to determine if deletion is permissible and do deletion if allowed.
// If the annotation is absent, the function will create one with a timestamp in future, using either the default or a configured offset.
// If STALE_NODE_STATE_CLEANUP_DELAY_MINUTES env variable is set to 0, removes the CR immediately
func (r *SriovNetworkNodePolicyReconciler) handleStaleNodeState(ctx context.Context, ns *sriovnetworkv1.SriovNetworkNodeState) error {
logger := log.Log.WithName("handleStaleNodeState")

var delayMinutes int
var err error

envValue, found := os.LookupEnv("STALE_NODE_STATE_CLEANUP_DELAY_MINUTES")
if found {
delayMinutes, err = strconv.Atoi(envValue)
if err != nil || delayMinutes < 0 {
delayMinutes = constants.DefaultNodeStateCleanupDelayMinutes
logger.Error(err, "invalid value in STALE_NODE_STATE_CLEANUP_DELAY_MINUTES env variable, use default delay",
"delay", delayMinutes)
}
} else {
delayMinutes = constants.DefaultNodeStateCleanupDelayMinutes
}

if delayMinutes != 0 {
now := time.Now().UTC()
keepUntilTime := ns.GetKeepUntilTime()
if keepUntilTime.IsZero() {
keepUntilTime = now.Add(time.Minute * time.Duration(delayMinutes))
logger.V(2).Info("SriovNetworkNodeState has no matching node, configure cleanup delay for the state object",
"nodeStateName", ns.Name, "delay", delayMinutes, "keepUntilTime", keepUntilTime.String())
ns.SetKeepUntilTime(keepUntilTime)
if err := r.Update(ctx, ns); err != nil {
logger.Error(err, "Fail to update SriovNetworkNodeState CR", "name", ns.GetName())
return err
}
return nil
}
if now.Before(keepUntilTime) {
return nil
}
}
// remove the object if delayMinutes is 0 or if keepUntilTime is already passed
logger.Info("Deleting SriovNetworkNodeState as node with that name doesn't exist", "nodeStateName", ns.Name)
if err := r.Delete(ctx, ns, &client.DeleteOptions{}); err != nil {
logger.Error(err, "Fail to delete SriovNetworkNodeState CR", "name", ns.GetName())
return err
}
return nil
}

func (r *SriovNetworkNodePolicyReconciler) syncSriovNetworkNodeState(ctx context.Context,
dc *sriovnetworkv1.SriovOperatorConfig,
npl *sriovnetworkv1.SriovNetworkNodePolicyList,
Expand All @@ -375,9 +424,16 @@ func (r *SriovNetworkNodePolicyReconciler) syncSriovNetworkNodeState(ctx context
return fmt.Errorf("failed to get SriovNetworkNodeState: %v", err)
}
} else {
keepUntilAnnotationUpdated := found.ResetKeepUntilTime()

if len(found.Status.Interfaces) == 0 {
logger.Info("SriovNetworkNodeState Status Interfaces are empty. Skip update of policies in spec",
"namespace", ns.Namespace, "name", ns.Name)
if keepUntilAnnotationUpdated {
if err := r.Update(ctx, found); err != nil {
return fmt.Errorf("couldn't update SriovNetworkNodeState: %v", err)
}
}
return nil
}

Expand Down Expand Up @@ -420,7 +476,7 @@ func (r *SriovNetworkNodePolicyReconciler) syncSriovNetworkNodeState(ctx context
// Note(adrianc): we check same ownerReferences since SriovNetworkNodeState
// was owned by a default SriovNetworkNodePolicy. if we encounter a descripancy
// we need to update.
if reflect.DeepEqual(newVersion.OwnerReferences, found.OwnerReferences) &&
if !keepUntilAnnotationUpdated && reflect.DeepEqual(newVersion.OwnerReferences, found.OwnerReferences) &&
equality.Semantic.DeepEqual(newVersion.Spec, found.Spec) {
logger.V(1).Info("SriovNetworkNodeState did not change, not updating")
return nil
Expand Down
78 changes: 72 additions & 6 deletions controllers/sriovnetworknodepolicy_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package controllers
import (
"context"
"encoding/json"
"os"
"sync"
"testing"
"time"
Expand All @@ -11,18 +12,17 @@ import (
. "github.com/onsi/gomega"

"github.com/google/go-cmp/cmp"
dptypes "github.com/k8snetworkplumbingwg/sriov-network-device-plugin/pkg/types"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

dptypes "github.com/k8snetworkplumbingwg/sriov-network-device-plugin/pkg/types"

sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
v1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
Expand All @@ -48,7 +48,7 @@ func TestRenderDevicePluginConfigData(t *testing.T) {
{
tname: "testVirtioVdpaVirtio",
policy: sriovnetworkv1.SriovNetworkNodePolicy{
Spec: v1.SriovNetworkNodePolicySpec{
Spec: sriovnetworkv1.SriovNetworkNodePolicySpec{
ResourceName: "resourceName",
DeviceType: consts.DeviceTypeNetDevice,
VdpaType: consts.VdpaTypeVirtio,
Expand All @@ -67,7 +67,7 @@ func TestRenderDevicePluginConfigData(t *testing.T) {
}, {
tname: "testVhostVdpaVirtio",
policy: sriovnetworkv1.SriovNetworkNodePolicy{
Spec: v1.SriovNetworkNodePolicySpec{
Spec: sriovnetworkv1.SriovNetworkNodePolicySpec{
ResourceName: "resourceName",
DeviceType: consts.DeviceTypeNetDevice,
VdpaType: consts.VdpaTypeVhost,
Expand All @@ -87,7 +87,7 @@ func TestRenderDevicePluginConfigData(t *testing.T) {
{
tname: "testExcludeTopology",
policy: sriovnetworkv1.SriovNetworkNodePolicy{
Spec: v1.SriovNetworkNodePolicySpec{
Spec: sriovnetworkv1.SriovNetworkNodePolicySpec{
ResourceName: "resourceName",
ExcludeTopology: true,
},
Expand Down Expand Up @@ -138,6 +138,10 @@ var _ = Describe("SriovnetworkNodePolicy controller", Ordered, func() {
var ctx context.Context

BeforeAll(func() {
// disable stale state cleanup delay to check that the controller can cleanup state objects
DeferCleanup(os.Setenv, "STALE_NODE_STATE_CLEANUP_DELAY_MINUTES", os.Getenv("STALE_NODE_STATE_CLEANUP_DELAY_MINUTES"))
os.Setenv("STALE_NODE_STATE_CLEANUP_DELAY_MINUTES", "0")

By("Create SriovOperatorConfig controller k8s objs")
config := makeDefaultSriovOpConfig()
Expect(k8sClient.Create(context.Background(), config)).Should(Succeed())
Expand Down Expand Up @@ -321,3 +325,65 @@ var _ = Describe("SriovnetworkNodePolicy controller", Ordered, func() {
})
})
})

var _ = Describe("SriovNetworkNodePolicyReconciler", Ordered, func() {
Context("handleStaleNodeState", func() {
var (
ctx context.Context
r *SriovNetworkNodePolicyReconciler
nodeState *sriovnetworkv1.SriovNetworkNodeState
)

BeforeEach(func() {
ctx = context.Background()
scheme := runtime.NewScheme()
utilruntime.Must(sriovnetworkv1.AddToScheme(scheme))
nodeState = &sriovnetworkv1.SriovNetworkNodeState{ObjectMeta: metav1.ObjectMeta{Name: "node1"}}
r = &SriovNetworkNodePolicyReconciler{Client: fake.NewClientBuilder().WithObjects(nodeState).Build()}
})
It("should set default delay", func() {
nodeState := nodeState.DeepCopy()
Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred())
Expect(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState)).NotTo(HaveOccurred())
Expect(time.Now().UTC().Before(nodeState.GetKeepUntilTime())).To(BeTrue())
})
It("should remove CR if wait time expired", func() {
nodeState := nodeState.DeepCopy()
nodeState.SetKeepUntilTime(time.Now().UTC().Add(-time.Minute))
Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred())
Expect(errors.IsNotFound(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState))).To(BeTrue())
})
It("should keep existing wait time if already set", func() {
nodeState := nodeState.DeepCopy()
nodeState.SetKeepUntilTime(time.Now().UTC().Add(time.Minute))
testTime := nodeState.GetKeepUntilTime()
r.Update(ctx, nodeState)
Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred())
Expect(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState)).NotTo(HaveOccurred())
Expect(nodeState.GetKeepUntilTime()).To(Equal(testTime))
})
It("non default dealy", func() {
DeferCleanup(os.Setenv, "STALE_NODE_STATE_CLEANUP_DELAY_MINUTES", os.Getenv("STALE_NODE_STATE_CLEANUP_DELAY_MINUTES"))
os.Setenv("STALE_NODE_STATE_CLEANUP_DELAY_MINUTES", "60")
nodeState := nodeState.DeepCopy()
Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred())
Expect(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState)).NotTo(HaveOccurred())
Expect(time.Until(nodeState.GetKeepUntilTime()) > 30*time.Minute).To(BeTrue())
})
It("invalid non default delay - should use default", func() {
DeferCleanup(os.Setenv, "STALE_NODE_STATE_CLEANUP_DELAY_MINUTES", os.Getenv("STALE_NODE_STATE_CLEANUP_DELAY_MINUTES"))
os.Setenv("STALE_NODE_STATE_CLEANUP_DELAY_MINUTES", "-20")
nodeState := nodeState.DeepCopy()
Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred())
Expect(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState)).NotTo(HaveOccurred())
Expect(time.Until(nodeState.GetKeepUntilTime()) > 20*time.Minute).To(BeTrue())
})
It("should remove CR if delay is zero", func() {
DeferCleanup(os.Setenv, "STALE_NODE_STATE_CLEANUP_DELAY_MINUTES", os.Getenv("STALE_NODE_STATE_CLEANUP_DELAY_MINUTES"))
os.Setenv("STALE_NODE_STATE_CLEANUP_DELAY_MINUTES", "0")
nodeState := nodeState.DeepCopy()
Expect(r.handleStaleNodeState(ctx, nodeState)).NotTo(HaveOccurred())
Expect(errors.IsNotFound(r.Get(ctx, types.NamespacedName{Name: nodeState.Name}, nodeState))).To(BeTrue())
})
})
})
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ spec:
value: {{ .Values.operator.cniBinPath }}
- name: CLUSTER_TYPE
value: {{ .Values.operator.clusterType }}
- name: STALE_NODE_STATE_CLEANUP_DELAY_MINUTES
value: "{{ .Values.operator.staleNodeStateCleanupDelayMinutes }}"
{{- if .Values.operator.admissionControllers.enabled }}
- name: ADMISSION_CONTROLLERS_CERTIFICATES_OPERATOR_SECRET_NAME
value: {{ .Values.operator.admissionControllers.certificates.secretNames.operator }}
Expand Down
4 changes: 4 additions & 0 deletions deployment/sriov-network-operator-chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ operator:
resourcePrefix: "openshift.io"
cniBinPath: "/opt/cni/bin"
clusterType: "kubernetes"
# minimal amount of time (in minutes) the operator will wait before removing
# stale SriovNetworkNodeState objects (objects that doesn't match node with the daemon)
# "0" means no extra delay, in this case the CR will be removed by the next reconcilation cycle (may take up to 5 minutes)
staleNodeStateCleanupDelayMinutes: "30"
metricsExporter:
port: "9110"
certificates:
Expand Down
2 changes: 1 addition & 1 deletion hack/run-e2e-conformance-virtual-cluster.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
set -xeo pipefail

cluster_version=${CLUSTER_VERSION:-1.29.3}
cluster_version=${CLUSTER_VERSION:-1.32.0}
cluster_name=${CLUSTER_NAME:-virtual}
domain_name=$cluster_name.lab

Expand Down
8 changes: 8 additions & 0 deletions pkg/consts/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,14 @@ const (
MCPPauseAnnotationState = "sriovnetwork.openshift.io/state"
MCPPauseAnnotationTime = "sriovnetwork.openshift.io/time"

// NodeStateKeepUntilAnnotation contains name of the "keep until time" annotation for SriovNetworkNodeState object.
// The "keep until time" specifies the earliest time at which the state object can be removed
// if the daemon's pod is not found on the node.
NodeStateKeepUntilAnnotation = "sriovnetwork.openshift.io/keep-state-until"
// DefaultNodeStateCleanupDelayMinutes contains default delay before removing stale SriovNetworkNodeState CRs
// (the CRs that no longer have a corresponding node with the daemon).
DefaultNodeStateCleanupDelayMinutes = 30

CheckpointFileName = "sno-initial-node-state.json"
Unknown = "Unknown"

Expand Down
5 changes: 5 additions & 0 deletions test/conformance/tests/test_policy_configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,11 @@ var _ = Describe("[sriov] operator", Ordered, func() {
NetworkNamespace: namespaces.Test,
}}

// for real BM env we enable link state
if !cluster.VirtualCluster() {
sriovNetwork.Spec.LinkState = "enable"
}

// We need this to be able to run the connectivity checks on Mellanox cards
if intf.DeviceID == "1015" {
sriovNetwork.Spec.SpoofChk = off
Expand Down
11 changes: 8 additions & 3 deletions test/conformance/tests/test_sriov_operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -501,9 +501,14 @@ var _ = Describe("[sriov] operator", func() {
Expect(err).ToNot(HaveOccurred())

Eventually(func() bool {
stdout, stderr, err := pod.ExecCommand(clients, hostNetPod, "ip", "link", "show")
Expect(err).ToNot(HaveOccurred())
Expect(stderr).To(Equal(""))
var stdout, stderr string
// Adding a retry because some of the time we get `Dump was interrupted and may be inconsistent.`
// output from the ip link command
Eventually(func(g Gomega) {
stdout, stderr, err = pod.ExecCommand(clients, hostNetPod, "ip", "link", "show")
g.Expect(err).ToNot(HaveOccurred())
g.Expect(stderr).To(Equal(""))
}, time.Minute, 2*time.Second).Should(Succeed())

found := false
for _, line := range strings.Split(stdout, "\n") {
Expand Down
Loading

0 comments on commit 5954a33

Please sign in to comment.