Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: de-dupe KubeletTooManyPods, add cluster to descriptions #1011

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 50 additions & 16 deletions alerts/apps_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").',
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff")%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'Pod is crash looping.',
},
'for': '15m',
Expand All @@ -47,7 +49,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.',
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'Pod has been in a non-ready state for more than 15 minutes.',
},
'for': '15m',
Expand All @@ -63,7 +67,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.',
description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'Deployment generation mismatch due to possible roll-back',
},
'for': '15m',
Expand All @@ -85,7 +91,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.',
description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'Deployment has not matched the expected number of replicas.',
},
'for': '15m',
Expand All @@ -100,7 +108,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.',
description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'Deployment rollout is not progressing.',
},
'for': '15m',
Expand All @@ -122,7 +132,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'StatefulSet has not matched the expected number of replicas.',
},
'for': '15m',
Expand All @@ -138,7 +150,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.',
description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'StatefulSet generation mismatch due to possible roll-back',
},
'for': '15m',
Expand Down Expand Up @@ -168,7 +182,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.',
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'StatefulSet update has not been rolled out.',
},
'for': '15m',
Expand Down Expand Up @@ -205,7 +221,10 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %(kubeDaemonSetRolloutStuckFor)s.' % $._config,
description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %s%s.' % [
$._config.kubeDaemonSetRolloutStuckFor,
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'DaemonSet rollout is stuck.',
},
'for': $._config.kubeDaemonSetRolloutStuckFor,
Expand All @@ -218,7 +237,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").',
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}")%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'Pod container waiting longer than 1 hour',
},
'for': '1h',
Expand All @@ -235,7 +256,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.',
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'DaemonSet pods are not scheduled.',
},
'for': '10m',
Expand All @@ -249,7 +272,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.',
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'DaemonSet pods are misscheduled.',
},
'for': '15m',
Expand All @@ -265,7 +290,10 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%(kubeJobTimeoutDuration)s" | humanizeDuration }} to complete.' % $._config,
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%s" | humanizeDuration }} to complete%s.' % [
$._config.kubeJobTimeoutDuration,
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'Job did not complete in time',
},
},
Expand All @@ -279,7 +307,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.',
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'Job failed to complete.',
},
},
Expand All @@ -303,7 +333,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.',
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'HPA has not matched desired number of replicas.',
},
'for': '15m',
Expand All @@ -319,7 +351,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.',
description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'HPA is running at max replicas',
},
'for': '15m',
Expand Down
12 changes: 9 additions & 3 deletions alerts/kube_apiserver.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ local utils = import '../lib/utils.libsonnet';
long: '%(long)s' % w,
},
annotations: {
description: 'The API server is burning too much error budget.',
description: 'The API server is burning too much error budget%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'The API server is burning too much error budget.',
},
'for': '%(for)s' % w,
Expand Down Expand Up @@ -111,7 +113,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.',
description: 'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}%% available over the last 10m%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'Kubernetes aggregated API is down.',
},
},
Expand All @@ -128,7 +132,9 @@ local utils = import '../lib/utils.libsonnet';
severity: 'warning',
},
annotations: {
description: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.',
description: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests%s.' % [
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
],
summary: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.',
},
'for': '5m',
Expand Down
Loading
Loading