Skip to content

Commit

Permalink
Merge pull request #261 from jaxesn/jgw/improve-cordon
Browse files Browse the repository at this point in the history
waits for cordon to complete
  • Loading branch information
jaxesn authored Dec 31, 2024
2 parents 79d8979 + bcd18d2 commit 33e2bb9
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 9 deletions.
50 changes: 43 additions & 7 deletions test/e2e/kubernetes/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ const (
hybridNodeWaitTimeout = 10 * time.Minute
hybridNodeDelayInterval = 5 * time.Second
hybridNodeUpgradeTimeout = 2 * time.Minute
nodeCordonDelayInterval = 1 * time.Second
nodeCordonTimeout = 30 * time.Second
MinimumVersion = "1.25"
)

Expand Down Expand Up @@ -342,12 +344,7 @@ func DrainNode(ctx context.Context, k8s *kubernetes.Clientset, node *corev1.Node
ErrOut: os.Stderr,
}

err := CordonNode(ctx, k8s, node)
if err != nil {
return err
}

err = drain.RunNodeDrain(helper, node.Name)
err := drain.RunNodeDrain(helper, node.Name)
if err != nil {
return fmt.Errorf("draining node %s: %v", node.Name, err)
}
Expand All @@ -369,7 +366,7 @@ func UncordonNode(ctx context.Context, k8s *kubernetes.Clientset, node *corev1.N
return nil
}

func CordonNode(ctx context.Context, k8s *kubernetes.Clientset, node *corev1.Node) error {
func CordonNode(ctx context.Context, k8s *kubernetes.Clientset, node *corev1.Node, logger logr.Logger) error {
helper := &drain.Helper{
Ctx: ctx,
Client: k8s,
Expand All @@ -380,9 +377,48 @@ func CordonNode(ctx context.Context, k8s *kubernetes.Clientset, node *corev1.Nod
return fmt.Errorf("cordoning node %s: %v", node.Name, err)
}

// Cordon returns before the node has been tainted and since we immediately run
// drain, it's possible (and common) during our tests that pods get scheduled on the node after
// drain gets the list of pods to evict and before the taint has been fully applied
// leading to an error during nodeadm upgrade/uninstall due to non-daemonset pods running
nodeName := node.ObjectMeta.Name
consecutiveErrors := 0
err = wait.PollUntilContextTimeout(ctx, nodeCordonDelayInterval, nodeCordonTimeout, true, func(ctx context.Context) (done bool, err error) {
node, err := k8s.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
if err != nil {
consecutiveErrors += 1
logger.Info("consecutiveErrors", "consecutiveErrors", consecutiveErrors)
if consecutiveErrors > 3 {
return false, fmt.Errorf("getting node %s: %w", nodeName, err)
}
logger.Info("Retryable error getting hybrid node. Continuing to poll", "name", nodeName, "error", err)
return false, nil // continue polling
}
consecutiveErrors = 0

if nodeCordon(node) {
logger.Info("Node successfully cordoned")
return true, nil
}

return false, nil // continue polling
})
if err != nil {
return fmt.Errorf("waiting for node %s to be cordoned: %w", nodeName, err)
}

return nil
}

// nodeCordon reports whether the node carries the unschedulable taint that
// cordoning applies. The taint is checked (rather than node.Spec.Unschedulable)
// because the taint is applied asynchronously after cordon returns, and the
// poll loop in CordonNode waits for the taint specifically so that a
// subsequent drain cannot race newly scheduled pods.
func nodeCordon(node *corev1.Node) bool {
	for _, taint := range node.Spec.Taints {
		// corev1.TaintNodeUnschedulable == "node.kubernetes.io/unschedulable";
		// use the exported constant instead of a hard-coded string.
		if taint.Key == corev1.TaintNodeUnschedulable {
			return true
		}
	}
	return false
}

func GetPodLogs(ctx context.Context, k8s *kubernetes.Clientset, name, namespace string) (string, error) {
req := k8s.CoreV1().Pods(namespace).GetLogs(name, &corev1.PodLogOptions{})
podLogs, err := req.Stream(ctx)
Expand Down
4 changes: 2 additions & 2 deletions test/e2e/suite/nodeadm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,7 @@ func (u uninstallNodeTest) Run(ctx context.Context) error {
}

u.logger.Info("Cordoning hybrid node...")
err = kubernetes.CordonNode(ctx, u.k8s, node)
err = kubernetes.CordonNode(ctx, u.k8s, node, u.logger)
if err != nil {
return err
}
Expand Down Expand Up @@ -726,7 +726,7 @@ func (u upgradeNodeTest) Run(ctx context.Context) error {
}
nodeName := node.Name
u.logger.Info("Cordoning hybrid node...")
err = kubernetes.CordonNode(ctx, u.k8s, node)
err = kubernetes.CordonNode(ctx, u.k8s, node, u.logger)
if err != nil {
return err
}
Expand Down

0 comments on commit 33e2bb9

Please sign in to comment.