diff --git a/.prow/Makefile b/.prow/Makefile index a324e23b..dbf5f475 100644 --- a/.prow/Makefile +++ b/.prow/Makefile @@ -11,39 +11,53 @@ ifdef IS_CI NAME_PREFIX=$(shell git symbolic-ref --short HEAD) endif +# ifeq ($(NAME_PREFIX),) # Defaults NAME_PREFIX to "manual" if IS_CI is unset or set to "" NAME_PREFIX=manual endif +# +ifndef FULL_IMAGE_NAME + override FULL_IMAGE_NAME = registry.k8s.dev.circonus.com/circonus-kubernetes-agent:latest +endif + + RUNTIME_DATA_FILE=./.runtime-$(NAME_PREFIX).yaml -.PHONY: all terraform gcp runtime_data helm clean +.PHONY: all terraform runtime_data gcp kubeconfig registry proxy helm clean -all: terraform helm +all: runtime_data terraform kubeconfig registry proxy helm + +runtime_data: +ifeq ($(NAME_PREFIX),manual) + @if [ ! -f "$(RUNTIME_DATA_FILE)" ]; then cp "$(DATA_FILE)" "$(RUNTIME_DATA_FILE)"; fi +else + @cp "$(DATA_FILE)" "$(RUNTIME_DATA_FILE)" +endif terraform: gcp -gcp: runtime_data +gcp: @NAME_PREFIX=$(NAME_PREFIX) $(MULTIFORM) apply # ifeq ($(NAME_PREFIX),manual) # NAME_PREFIX=$(NAME_PREFIX) $(MULTIFORM) watch_apply # else @NAME_PREFIX=$(NAME_PREFIX) $(MULTIFORM) wait_apply # endif + +kubeconfig: @if [ -f "$(KUBECONFIG)" ] && ! [ -f "$(KUBECONFIG).bak" ]; then mv $(KUBECONFIG) $(KUBECONFIG).bak; fi @NAME_PREFIX=$(NAME_PREFIX) $(MULTIFORM) kubeconfig - @NAME_PREFIX=$(NAME_PREFIX) $(MULTIFORM) proxy -runtime_data: -ifeq ($(NAME_PREFIX),manual) - @if [ ! -f "$(RUNTIME_DATA_FILE)" ]; then cp "$(DATA_FILE)" "$(RUNTIME_DATA_FILE)"; fi -else - @cp "$(DATA_FILE)" "$(RUNTIME_DATA_FILE)" -endif +registry: + @NAME_PREFIX=$(NAME_PREFIX) $(MULTIFORM) registry + +proxy: + @NAME_PREFIX=$(NAME_PREFIX) $(MULTIFORM) proxy helm: - @NAME_PREFIX=$(NAME_PREFIX) $(ADMIRAL) sync + @NAME_PREFIX=$(NAME_PREFIX) FULL_IMAGE_NAME=$(FULL_IMAGE_NAME) $(ADMIRAL) sync # ifeq ($(NAME_PREFIX),manual) # NAME_PREFIX=$(NAME_PREFIX) $(ADMIRAL) watch_sync # else diff --git a/.prow/README.md b/.prow/README.md index dfa6806f..847b8d03 100644 --- a/.prow/README.md +++ b/.prow/README.md @@ -7,31 +7,46 @@ The following are required, outside of (but not excluding) utilities normally available on macos -- yq () +- gcloud () - terraform () +- yq () - helm () - helmfile () - git () - bash () - ssh () -- gcloud () -## Manual deploy +## Manual + +### Manual deploy A deployment process as easy as `seq 1 3` -0. Ensure your system has the required dependencies +0. Ensure the host system has the required dependencies installed 1. `cd` to this directory. -2. run `make` +2. run `make` **note: the tf can take up to 20 minutes to deploy** 3. ??? -## Automatic Deploy +### Manual teardown + +1. run `make clean` +2. ??? + +### Troubleshooting manual deploy/teardown + +Just run the step again. +They should all be idempotent. + +## Automatic + +### Automatic Deploy > TODO -## TODO +### TODO -- Docker container +- Artifact registry / push k8s agent docker container +- package all dependencies in a docker image (alpine based?) - S3/GCS bucket for tfstate -- Automatic Deploy +- Automatic prow-triggered Deploy diff --git a/.prow/helm/admiral.sh b/.prow/helm/admiral.sh index de82c444..9034bf6b 100755 --- a/.prow/helm/admiral.sh +++ b/.prow/helm/admiral.sh @@ -129,6 +129,10 @@ multiplex_helmfile() { ctx=$(yq -r '(.workspaces[] | select(.name == "'"${workspace}"'")).kubectl.context' "${RUNTIME_DATA_FILE}") kube_contexts=$(kubectl config view | yq -r '.contexts[].name') cluster_name=$(yq -r '(.workspaces[] | select(.name == "'"${workspace}"'")).kubectl.cluster_name' "${RUNTIME_DATA_FILE}") + registry_name=$(yq -r '(.workspaces[] | select(.name == "'"${workspace}"'")).registry_name' "${RUNTIME_DATA_FILE}") + # if we pass a FULL_IMAGE_NAME, use it. Otherwise, use the default + FULL_IMAGE_NAME="${FULL_IMAGE_NAME:-${registry_name}/circonus-kubernetes-agent:latest}" + # ensure workspace log dir exists mkdir -p "${LOG_DIR}/${workspace}" @@ -136,7 +140,7 @@ multiplex_helmfile() { if [[ "${kube_contexts}" == *"${ctx}"* ]]; then # background a task that starts helmfile and notifies of completion in the workspace command log # shellcheck disable=SC2068 - ( (HTTPS_PROXY="localhost:${port_number}" CLUSTER_NAME="${cluster_name}" helmfile ${@} --kube-context "${ctx}" &> "${LOG_DIR}/${workspace}/helmfile_${command}.log") \ + ( (HTTPS_PROXY="localhost:${port_number}" CLUSTER_NAME="${cluster_name}" FULL_IMAGE_NAME="${FULL_IMAGE_NAME}" helmfile ${@} --kube-context "${ctx}" &> "${LOG_DIR}/${workspace}/helmfile_${command}.log") \ && (echo "Helmfile ${command} complete!" > "${LOG_DIR}/${workspace}/helmfile_${command}.log") ) & else echo "[ERROR] Couldn't find ${ctx} in kubectl config" | tee "${LOG_DIR}/${workspace}/helmfile_${command}.log" diff --git a/.prow/helm/charts/kube-dns-metrics/.helmignore b/.prow/helm/charts/kube-dns-metrics/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/.prow/helm/charts/kube-dns-metrics/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/.prow/helm/charts/kube-dns-metrics/Chart.yaml b/.prow/helm/charts/kube-dns-metrics/Chart.yaml new file mode 100644 index 00000000..a1bbf989 --- /dev/null +++ b/.prow/helm/charts/kube-dns-metrics/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: kube-dns-metrics +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" diff --git a/.prow/helm/charts/kube-dns-metrics/templates/NOTES.txt b/.prow/helm/charts/kube-dns-metrics/templates/NOTES.txt new file mode 100644 index 00000000..5e5ffae2 --- /dev/null +++ b/.prow/helm/charts/kube-dns-metrics/templates/NOTES.txt @@ -0,0 +1,5 @@ +1. Get the application URL by running these commands: + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "kube-dns-metrics.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT diff --git a/.prow/helm/charts/kube-dns-metrics/templates/_helpers.tpl b/.prow/helm/charts/kube-dns-metrics/templates/_helpers.tpl new file mode 100644 index 00000000..d1140e1c --- /dev/null +++ b/.prow/helm/charts/kube-dns-metrics/templates/_helpers.tpl @@ -0,0 +1,52 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "kube-dns-metrics.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "kube-dns-metrics.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "kube-dns-metrics.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "kube-dns-metrics.labels" -}} +helm.sh/chart: {{ include "kube-dns-metrics.chart" . }} +{{ include "kube-dns-metrics.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "kube-dns-metrics.selectorLabels" -}} +app.kubernetes.io/name: {{ include "kube-dns-metrics.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + diff --git a/.prow/helm/charts/kube-dns-metrics/templates/service.yaml b/.prow/helm/charts/kube-dns-metrics/templates/service.yaml new file mode 100644 index 00000000..26a75286 --- /dev/null +++ b/.prow/helm/charts/kube-dns-metrics/templates/service.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ include "kube-dns-metrics.fullname" . }} + labels: + {{- include "kube-dns-metrics.labels" . | nindent 4 }} + annotations: + prometheus.io/port: "{{ .Values.service.port }}" + prometheus.io/scrape: "true" +spec: + type: {{ .Values.service.type }} + ports: + - name: http-metrics-kube-dns + port: {{ .Values.service.port }} + protocol: TCP + targetPort: {{ .Values.service.port }} + selector: + k8s-app: kube-dns diff --git a/.prow/helm/charts/kube-dns-metrics/templates/tests/test-connection.yaml b/.prow/helm/charts/kube-dns-metrics/templates/tests/test-connection.yaml new file mode 100644 index 00000000..5ee4abd6 --- /dev/null +++ b/.prow/helm/charts/kube-dns-metrics/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "kube-dns-metrics.fullname" . }}-test-connection" + labels: + {{- include "kube-dns-metrics.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "kube-dns-metrics.fullname" . }}:{{ .Values.service.port }}/metrics'] + restartPolicy: Never diff --git a/.prow/helm/charts/kube-dns-metrics/values.yaml b/.prow/helm/charts/kube-dns-metrics/values.yaml new file mode 100644 index 00000000..dd6c6a87 --- /dev/null +++ b/.prow/helm/charts/kube-dns-metrics/values.yaml @@ -0,0 +1,4 @@ +service: + type: ClusterIP + port: 10055 + diff --git a/.prow/helm/config/circonus-kubernetes-agent/secrets.yaml b/.prow/helm/config/circonus-kubernetes-agent/secrets.yaml index 32b384b2..ab5e90df 100644 --- a/.prow/helm/config/circonus-kubernetes-agent/secrets.yaml +++ b/.prow/helm/config/circonus-kubernetes-agent/secrets.yaml @@ -1,4 +1,4 @@ -circonus_api_key: ENC[AES256_GCM,data:VeWoAT5uvTcdT+6IsqRAviPj0Ag2CfWN2OUTUPCXLL1a9Jtm,iv:rKlICA/p7Qmt/iPZd7rrudE8vb/Tr5ZtM7gCnZNtOOM=,tag:LiVCG3EqjQZnYQKlihCF8Q==,type:str] +circonus_api_key: ENC[AES256_GCM,data:gEINr7R9hupERtQfG7pXoctawFXxmUiZgPGo615lvvUsErCn,iv:NV5kpe6lwXAjtpoq+d+Cp+x6um+k2HFiwcu6H11VWc4=,tag:89MpyyuwGW+J+uKrYOm0jA==,type:str] contact_email: ENC[AES256_GCM,data:I+s1NW7NGF3PnDex6a6lwUXOj+U6SZLLKjUx8PY=,iv:yb5wF77r/UNRn4uJ4vsmOlm+jgrPMXk5zoM2948dHJ0=,tag:SBpO6IHyXLANncvnk/POxQ==,type:str] sops: kms: [] @@ -6,8 +6,8 @@ sops: azure_kv: [] hc_vault: [] age: [] - lastmodified: "2022-10-04T01:06:08Z" - mac: ENC[AES256_GCM,data:ZXUgBlq+3mVnYCoBWNOcXgac451jM+Oh0xzhhpkao1PnzG00FkHVmL54TBw6ksAllJau25toTNOFe0p1TFt0dUTSYdodPQehxsqTGpYF+8J3mvLv4gJ+Z4t0smmd3L2mJgGJSmW7fPYOSKhARtFFNjM9jk0J1VI7fzZQ859np0w=,iv:ejphILKM0/cFRrO6Fiie8gcvU0tmAypK7TPl5XttXkg=,tag:tueLUtDbIgdY3KcmBjQt8Q==,type:str] + lastmodified: "2022-10-07T23:15:35Z" + mac: ENC[AES256_GCM,data:fHHKZ6QRJz7Yk6Bi/QkYFPA7ZGMdTeiINf+QSm7Zvsll+S3PiaVM85G1iCmj9INWBsC0d9K9VR3la0qtOyTSbXytpiTgZzzEcUgJUBwoiiXYzTSj0vSe1LuKMXflAUgKlOnwcVjwX8LEem2aoO9LIGp/dfDwryWXwD9/3XX+xeQ=,iv:swYu+RL1m/3S/rvETvtwECfWndu6SvGwueR37eP638U=,tag:21TSzISfMuJOUQ50AKvegw==,type:str] pgp: - created_at: "2022-10-03T23:53:23Z" enc: | diff --git a/.prow/helm/config/circonus-kubernetes-agent/values.yaml b/.prow/helm/config/circonus-kubernetes-agent/values.yaml index 35dd7f69..857bd0d2 100644 --- a/.prow/helm/config/circonus-kubernetes-agent/values.yaml +++ b/.prow/helm/config/circonus-kubernetes-agent/values.yaml @@ -1,4 +1,69 @@ +image: "" kubernetes_name: "" broker_cid: "/broker/35" dns: port: "10055" +metric_filters: | + { + "metric_filters": [ + ["allow", "^.+$", "tags", "and(collector:dynamic)", "NO_LOCAL_FILTER dynamically collected metrics"], + ["allow", "^(Disk|Memory|PID)Pressure$", "node status"], + ["allow", "^(container|node|pod)_.*$", "node metrics k8s v1.18+"], + ["allow", "^(kube_)?pod_container_status_(running|terminated|waiting|ready)(_count)?$", "containers"], + ["allow", "^(kube_)?pod_container_status_(terminated|waiting)_reason(_count)?$", "containers health"], + ["allow", "^(kube_)?pod_init_container_status_(terminated|waiting)_reason(_count)?$", "init containers health"], + ["allow", "^(kube_)?pod_status_(ready|scheduled)(_count)?$", "tags", "and(condition:true)", "pods"], + ["allow", "^(kube_)?pod_status_phase(_count)?$", "tags", "and(or(phase:Running,phase:Pending,phase:Failed,phase:Succeeded))", "pods"], + ["allow", "^(node|kubelet_running_pod_count|Ready)$", "nodes"], + ["allow", "^(pod|node)_cpu_usage_seconds_total$", "utilization"], + ["allow", "^(pod|node)_memory_working_set_bytes$", "utilization"], + ["allow", "^(used|capacity)$", "tags", "and(or(units:bytes,units:percent),or(resource:memory,resource:fs,volume_name:*),not(container_name:*),not(sys_container:*))", "utilization"], + ["allow", "^NetworkUnavailable$", "node status"], + ["allow", "^[rt]x$", "tags", "and(resource:network,or(units:bytes,units:errors),not(container_name:*),not(sys_container:*))", "utilization"], + ["allow", "^apiserver_request_total$", "tags", "and(or(code:5*,code:4*))", "api req errors"], + ["allow", "^authenticated_user_requests$", "api auth"], + ["allow", "^authentication_attempts$", "api auth health"], + ["allow", "^cadvisor.*$", "cadvisor"], + ["allow", "^capacity_.*$", "node capacity"], + ["allow", "^collect_.*$", "agent collection stats"], + ["allow", "^coredns*", "dns health"], + ["allow", "^coredns_(dns|forward)_request_(count_total|duration_seconds_avg)$", "dns health"], + ["allow", "^coredns_(dns|forward)_response_rcode_count_total$", "dns health"], + ["allow", "^daemonset_scheduled_delta$", "health"], + ["allow", "^deployment_generation_delta$", "health"], + ["allow", "^events$", "events"], + ["allow", "^kube_(service_labels|deployment_labels|pod_container_info|pod_deleted)$", "ksm inventory"], + ["allow", "^kube_(service|deployment)_labels$", "ksm inventory"], + ["allow", "^kube_daemonset_status_(current|desired)_number_scheduled$", "health"], + ["allow", "^kube_deployment_(created|spec_replicas)$", "deployments"], + ["allow", "^kube_deployment_(metadata|status_observed)_generation$", "health"], + ["allow", "^kube_deployment_status_(replicas|replicas_updated|replicas_available|replicas_unavailable)$", "deployments"], + ["allow", "^kube_deployment_status_replicas_unavailable$", "deployments"], + ["allow", "^kube_hpa_(spec_max|status_current)_replicas$", "scale"], + ["allow", "^kube_job_status_failed$", "health"], + ["allow", "^kube_namespace_status_phase$", "namespaces"], + ["allow", "^kube_namespace_status_phase$", "tags", "and(or(phase:Active,phase:Terminating))", "namespaces"], + ["allow", "^kube_node_spec_unschedulable$", "node status"], + ["allow", "^kube_node_status_allocatable$", "node status"], + ["allow", "^kube_node_status_condition$", "node status health"], + ["allow", "^kube_persistentvolume_status_phase$", "health"], + ["allow", "^kube_pod_info$", "pods"], + ["allow", "^kube_pod_start_time$", "pods"], + ["allow", "^kube_pod_status_condition$", "pods"], + ["allow", "^kube_statefulset_status_(replicas|replicas_ready)$", "health"], + ["allow", "^kubedns*","dns health"], + ["allow", "^kubelet_.*$", "node metrics k8s v1.18+"], + ["allow", "^machine_.*$", "node metrics k8s v1.18+"], + ["allow", "^pod_container_status$", "containers"], + ["allow", "^pod_container_status_(terminated|waiting)_reason(_count)?$", "containers health"], + ["allow", "^pod_init_container_status_(terminated|waiting)_reason(_count)?$", "init containers health"], + ["allow", "^pod_status_(ready|scheduled)$", "pods"], + ["allow", "^pod_status_phase$", "pods"], + ["allow", "^prober_.*$", "node metrics/probes k8s v1.18+"], + ["allow", "^resource_(request|limit)$", "resources"], + ["allow", "^statefulset_replica_delta$", "health"], + ["allow", "^usage(Milli|Nano)Cores$", "tags", "and(not(container_name:*),not(sys_container:*))", "utilization"], + ["allow", "^utilization$", "utilization health"], + ["deny", "^.+$", "all other metrics"] + ] + } diff --git a/.prow/helm/helmfile.d/00-kube-dns-metrics.yaml b/.prow/helm/helmfile.d/00-kube-dns-metrics.yaml new file mode 100644 index 00000000..b20914f1 --- /dev/null +++ b/.prow/helm/helmfile.d/00-kube-dns-metrics.yaml @@ -0,0 +1,4 @@ +releases: + - name: kube-dns-metrics + namespace: kube-system + chart: ../charts/kube-dns-metrics diff --git a/.prow/helm/helmfile.d/99-kubernetes-agent.yaml b/.prow/helm/helmfile.d/99-kubernetes-agent.yaml index 741acb7c..e6cd7aef 100644 --- a/.prow/helm/helmfile.d/99-kubernetes-agent.yaml +++ b/.prow/helm/helmfile.d/99-kubernetes-agent.yaml @@ -5,5 +5,6 @@ releases: values: - ../config/{{`{{ .Release.Name }}`}}/values.yaml - kubernetes_name: {{ requiredEnv "CLUSTER_NAME" }} + - image: {{ requiredEnv "FULL_IMAGE_NAME" }} secrets: - ../config/{{`{{ .Release.Name }}`}}/secrets.yaml diff --git a/.prow/tf/gcp/main.tf b/.prow/tf/gcp/main.tf index 51a9d897..be89a8f9 100644 --- a/.prow/tf/gcp/main.tf +++ b/.prow/tf/gcp/main.tf @@ -10,6 +10,7 @@ locals { master_subnet_name = "${var.name_prefix}-${local.safe_kubernetes_version}-${local.business_unit}-master-subnet" pods_range_name = "${var.name_prefix}-${local.safe_kubernetes_version}-${local.business_unit}-ip-range-pods" svc_range_name = "${var.name_prefix}-${local.safe_kubernetes_version}-${local.business_unit}-ip-range-svc" + registry_name = "${var.name_prefix}-${local.safe_kubernetes_version}-${local.business_unit}" subnet_cidr = "10.10.0.0/16" cluster_master_ip_cidr_range = "10.100.100.0/28" @@ -63,3 +64,10 @@ module "bastion" { subnet_name = module.google_networks.subnet.name } +resource "google_artifact_registry_repository" "k8s_agent_registry" { + location = var.region + repository_id = local.registry_name + description = "container image registry for the kubernetes agent" + format = "DOCKER" +} + diff --git a/.prow/tf/gcp/modules/kubernetes_cluster/main.tf b/.prow/tf/gcp/modules/kubernetes_cluster/main.tf index 45248df9..0b090477 100644 --- a/.prow/tf/gcp/modules/kubernetes_cluster/main.tf +++ b/.prow/tf/gcp/modules/kubernetes_cluster/main.tf @@ -98,7 +98,7 @@ resource "google_container_node_pool" "product_cluster_linux_node_pool" { version = google_container_cluster.product_cluster.min_master_version autoscaling { - max_node_count = 5 + max_node_count = 2 min_node_count = 1 } max_pods_per_node = 100 diff --git a/.prow/tf/gcp/modules/kubernetes_cluster/outputs.tf b/.prow/tf/gcp/modules/kubernetes_cluster/outputs.tf index db2de772..8418508f 100644 --- a/.prow/tf/gcp/modules/kubernetes_cluster/outputs.tf +++ b/.prow/tf/gcp/modules/kubernetes_cluster/outputs.tf @@ -2,3 +2,4 @@ output "name" { value = google_container_cluster.product_cluster.name description = "The Kubernetes cluster name." } + diff --git a/.prow/tf/gcp/multiform.sh b/.prow/tf/gcp/multiform.sh index eb65959b..c32a6335 100755 --- a/.prow/tf/gcp/multiform.sh +++ b/.prow/tf/gcp/multiform.sh @@ -118,6 +118,25 @@ kubeconfig() { echo "[INFO] Kubeconfig contexts generated successfully" } +# zzz +registry() { + + echo "[INFO] Init fetching registries..." + # loop through the workspaces + for workspace in $(yq -r '.workspaces[].name' "${RUNTIME_DATA_FILE}"); do + + # set up variables + project_id=$(TF_WORKSPACE="${workspace}" terraform output --raw project_id) + region=$(TF_WORKSPACE="${workspace}" terraform output --raw region) + registry_name=$(TF_WORKSPACE="${workspace}" terraform output --raw registry_name) + + # update RUNTIME_DATA_FILE with registry name + yq -i '(.workspaces[] | select(.name == "'"${workspace}"'")).registry_name = "'"${region}"'-docker.pkg.dev/'"${project_id}"'/'"${registry_name}"'"' "${RUNTIME_DATA_FILE}" + (cd ../../../ && make build FULL_IMAGE_NAME="${region}-docker.pkg.dev/${project_id}/${registry_name}/circonus-kubernetes-agent:latest") + done + echo "[INFO] Registry images pushed successfully" +} + # set up a reverse ssh tunnel per workspace # ssh tunnel -> bastion -> gke control plane proxy() { diff --git a/.prow/tf/gcp/outputs.tf b/.prow/tf/gcp/outputs.tf index 4f38cf9e..c4e957fa 100644 --- a/.prow/tf/gcp/outputs.tf +++ b/.prow/tf/gcp/outputs.tf @@ -33,6 +33,11 @@ output "bastion_zone" { value = module.bastion.zone } +output "registry_name" { + description = "the k8s agent registry" + value = local.registry_name +} + output "get_credentials" { description = "Gcloud get-credentials command" value = format("gcloud container clusters get-credentials --project %s --region %s --internal-ip %s", var.project_id, var.region, local.cluster_name) diff --git a/.scripts/render.sh b/.scripts/render.sh index 94c5247e..bc95441a 100755 --- a/.scripts/render.sh +++ b/.scripts/render.sh @@ -5,6 +5,11 @@ # requirements: # - render (https://github.com/VirtusLab/render) +# - yq () + +# cd to repo's top-level dir +cd "$(dirname """$0""")/../" || exit + # @default: all # @type: string @@ -20,28 +25,47 @@ TEMPLATE_FILES_DIR="${TEMPLATE_FILES_DIR:-$TEMPLATE_DIR/files}" # @default: $TEMPLATE_DIR/data # @type: string -TEMPLATE_DATA_DIR="${TEMPLATE_DATA_DIR:-$TEMPLATE_DIR/data}" +TEMPLATE_DATA_FILE="${TEMPLATE_DATA_FILE:-$TEMPLATE_DIR/data/data.yaml}" -code() { - FILE_NAME="internal/circonus/metric_filters.go" - render --in "${TEMPLATE_FILES_DIR}/${FILE_NAME}.tmpl" --out "./${FILE_NAME}" --config "${TEMPLATE_DATA_DIR}/metric_filters.yaml" -} +# @default: CHANGELOG.md +# @type: string +CHANGELOG_FILE="${CHANGELOG_FILE:-CHANGELOG.md}" + +render_file() { + + FILE_NAME="${1}" + if [ "${FILE_NAME: -5}" != ".tmpl" ]; then + echo "FILE: ${FILE_NAME} does not end in .tmpl" + return 1 + fi -manifests() { - FILE_NAME="deploy/custom/configuration.yaml" - render --in "${TEMPLATE_FILES_DIR}/${FILE_NAME}.tmpl" --out "./${FILE_NAME}" --config "${TEMPLATE_DATA_DIR}/metric_filters.yaml" + if [ "${FILE_NAME}" = "./templates/files/CHANGELOG.md.tmpl" ]; then + return 0 + fi - FILE_NAME="deploy/custom/deployment.yaml" - render --in "${TEMPLATE_FILES_DIR}/${FILE_NAME}.tmpl" --out "./${FILE_NAME}" --config <( echo "version: $(grep v CHANGELOG.md | head -1 | cut -f2 -d' ')" ) + FILE_NAME="${FILE_NAME#"${TEMPLATE_FILES_DIR}"/}" + FILE_NAME="${FILE_NAME%.tmpl}" + + if [ -z "${FILE_NAME}" ]; then + echo "${0} called with no file name specified" + fi + render --in "${TEMPLATE_FILES_DIR}/${FILE_NAME}.tmpl" --out "${FILE_NAME}" --config "${TEMPLATE_DATA_FILE}" 1>/dev/null } all() { - code - manifests -} -# cd to repo's top-level dir -cd "$(dirname """$0""")/../" || exit + # set changelog version from CHANGELOG file + CHANGELOG_VERSION="$(grep v CHANGELOG.md | head -1 | cut -f2 -d' ')" + + # set agent version in yaml from changelog version + yq -i '.data.agent_version = "'"${CHANGELOG_VERSION}"'"' "${TEMPLATE_DATA_FILE}" + + # hacky, shellcheck made me do + export -f render_file + export TEMPLATE_FILES_DIR + export TEMPLATE_DATA_FILE + find "${TEMPLATE_FILES_DIR}" -type f -exec sh -c 'render_file "$1"' shell {} \; +} # Parse command line arguments if [ "$#" -eq 0 ]; then diff --git a/Makefile b/Makefile index 53393052..186fdd26 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ BRANCH_PRIMARY=$(shell git symbolic-ref refs/remotes/origin/HEAD | sed 's;^refs/ BUILD_FLAGS=-mod=vendor GCI=$(shell which gci) GIT=$(shell which git) +GITCOMM=$(shell which gitcomm) GO=$(shell which go) GOFUMPT=$(shell which gofumpt) GOLANGCI_LINT=$(shell which golangci-lint) @@ -44,7 +45,7 @@ changelog: $(SCRIPTS_DIR)/changelog.sh build: build_deps - DOCKER_REGISTRY=registry.k8s.dev.circonus.com $(GORELEASER) --rm-dist --snapshot + DOCKER_REGISTRY=$(DOCKER_REGISTRY) $(GORELEASER) --rm-dist --snapshot commit: build_deps $(GIT) status diff --git a/contrib/helm/templates/clusterrole.yaml b/contrib/helm/templates/clusterrole.yaml index 1c5947e3..d28cc5f5 100644 --- a/contrib/helm/templates/clusterrole.yaml +++ b/contrib/helm/templates/clusterrole.yaml @@ -1,7 +1,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: cka-readonly + name: {{ .Release.Name }}-cka-readonly namespace: {{ .Release.Namespace }} labels: app.kubernetes.io/name: circonus-kubernetes-agent diff --git a/contrib/helm/templates/clusterrolebinding.yaml b/contrib/helm/templates/clusterrolebinding.yaml index 7ced88bf..99181913 100644 --- a/contrib/helm/templates/clusterrolebinding.yaml +++ b/contrib/helm/templates/clusterrolebinding.yaml @@ -1,15 +1,15 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: cka-readonly + name: {{ .Release.Name }}-cka-readonly namespace: {{ .Release.Namespace }} labels: app.kubernetes.io/name: circonus-kubernetes-agent roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: cka-readonly + name: {{ .Release.Name }}-cka-readonly subjects: - kind: ServiceAccount - name: circonus-kubernetes-agent + name: {{ .Release.Name }}-circonus-kubernetes-agent namespace: {{ .Release.Namespace }} diff --git a/contrib/helm/templates/configmap.yaml b/contrib/helm/templates/configmap.yaml index 9a28973f..efc8027f 100644 --- a/contrib/helm/templates/configmap.yaml +++ b/contrib/helm/templates/configmap.yaml @@ -17,6 +17,13 @@ data: ## tags on rules, contact group, etc. kubernetes-name: {{ .Values.kubernetes_name | quote }} ## see deploy/custom/configuration.yaml for more information + + circonus-api-url: {{ .Values.circonus_api_url | quote }} + + metric-filters.json: | +{{ .Values.metric_filters | indent 6 }} + + ## ## alert configuration ## diff --git a/contrib/helm/templates/deployment.yaml b/contrib/helm/templates/deployment.yaml index 4b063e01..98b07da3 100644 --- a/contrib/helm/templates/deployment.yaml +++ b/contrib/helm/templates/deployment.yaml @@ -18,11 +18,13 @@ spec: labels: app.kubernetes.io/name: circonus-kubernetes-agent app.kubernetes.io/version: latest + annotations: + rollout.circonus.com/hash: {{ randAlphaNum 5 | quote }} spec: - serviceAccountName: circonus-kubernetes-agent + serviceAccountName: {{ .Release.Name }}-circonus-kubernetes-agent containers: - name: circonus-kubernetes-agent - image: circonus/circonus-kubernetes-agent:latest + image: {{ .Values.image | quote }} command: ["/circonus-kubernetes-agentd"] #args: ["--debug"] env: @@ -31,6 +33,11 @@ spec: secretKeyRef: name: cka-secrets-v1 key: circonus-api-key + - name: CKA_CIRCONUS_API_URL + valueFrom: + configMapKeyRef: + name: cka-config-v1 + key: circonus-api-url - name: CKA_K8S_NAME valueFrom: configMapKeyRef: diff --git a/contrib/helm/templates/serviceaccount.yaml b/contrib/helm/templates/serviceaccount.yaml index 23a84615..f742824e 100644 --- a/contrib/helm/templates/serviceaccount.yaml +++ b/contrib/helm/templates/serviceaccount.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: circonus-kubernetes-agent + name: {{ .Release.Name }}-circonus-kubernetes-agent namespace: {{ .Release.Namespace }} labels: - app.kubernetes.io/name: circonus-kubernetes-agent \ No newline at end of file + app.kubernetes.io/name: circonus-kubernetes-agent diff --git a/contrib/helm/values.yaml b/contrib/helm/values.yaml index 665a1f6a..daaf878e 100644 --- a/contrib/helm/values.yaml +++ b/contrib/helm/values.yaml @@ -1,6 +1,74 @@ +## WARN: this is a template rendered from a file in $reporoot/templates/files/ +image: "circonus/circonus-kubernetes-agent:latest" +circonus_api_url: "https://api.circonus.com/v2/" circonus_api_key: "" -kubernetes_name: "" contact_email: "" broker_cid: "/broker/35" +kubernetes_name: "" dns: port: "10055" +metric_filters: | + { + "metric_filters": [ + ["allow", "^.+$", "tags", "and(collector:dynamic)", "NO_LOCAL_FILTER dynamically collected metrics"], + ["allow", "^(Disk|Memory|PID)Pressure$", "node status"], + ["allow", "^(container|node|pod)_.*$", "node metrics k8s v1.18+"], + ["allow", "^(kube_)?pod_container_status_(running|terminated|waiting|ready)(_count)?$", "containers"], + ["allow", "^(kube_)?pod_container_status_(terminated|waiting)_reason(_count)?$", "containers health"], + ["allow", "^(kube_)?pod_init_container_status_(terminated|waiting)_reason(_count)?$", "init containers health"], + ["allow", "^(kube_)?pod_status_(ready|scheduled)(_count)?$", "tags", "and(condition:true)", "pods"], + ["allow", "^(kube_)?pod_status_phase(_count)?$", "tags", "and(or(phase:Running,phase:Pending,phase:Failed,phase:Succeeded))", "pods"], + ["allow", "^(node|kubelet_running_pod_count|Ready)$", "nodes"], + ["allow", "^(pod|node)_cpu_usage_seconds_total$", "utilization"], + ["allow", "^(pod|node)_memory_working_set_bytes$", "utilization"], + ["allow", "^(used|capacity)$", "tags", "and(or(units:bytes,units:percent),or(resource:memory,resource:fs,volume_name:*),not(container_name:*),not(sys_container:*))", "utilization"], + ["allow", "^NetworkUnavailable$", "node status"], + ["allow", "^[rt]x$", "tags", "and(resource:network,or(units:bytes,units:errors),not(container_name:*),not(sys_container:*))", "utilization"], + ["allow", "^apiserver_request_total$", "tags", "and(or(code:5*,code:4*))", "api req errors"], + ["allow", "^authenticated_user_requests$", "api auth"], + ["allow", "^authentication_attempts$", "api auth health"], + ["allow", "^cadvisor.*$", "cadvisor"], + ["allow", "^capacity_.*$", "node capacity"], + ["allow", "^collect_.*$", "agent collection stats"], + ["allow", "^coredns*", "dns health"], + ["allow", "^coredns_(dns|forward)_request_(count_total|duration_seconds_avg)$", "dns health"], + ["allow", "^coredns_(dns|forward)_response_rcode_count_total$", "dns health"], + ["allow", "^daemonset_scheduled_delta$", "health"], + ["allow", "^deployment_generation_delta$", "health"], + ["allow", "^events$", "events"], + ["allow", "^kube_(service_labels|deployment_labels|pod_container_info|pod_deleted)$", "ksm inventory"], + ["allow", "^kube_(service|deployment)_labels$", "ksm inventory"], + ["allow", "^kube_daemonset_status_(current|desired)_number_scheduled$", "health"], + ["allow", "^kube_deployment_(created|spec_replicas)$", "deployments"], + ["allow", "^kube_deployment_(metadata|status_observed)_generation$", "health"], + ["allow", "^kube_deployment_status_(replicas|replicas_updated|replicas_available|replicas_unavailable)$", "deployments"], + ["allow", "^kube_deployment_status_replicas_unavailable$", "deployments"], + ["allow", "^kube_hpa_(spec_max|status_current)_replicas$", "scale"], + ["allow", "^kube_job_status_failed$", "health"], + ["allow", "^kube_namespace_status_phase$", "namespaces"], + ["allow", "^kube_namespace_status_phase$", "tags", "and(or(phase:Active,phase:Terminating))", "namespaces"], + ["allow", "^kube_node_spec_unschedulable$", "node status"], + ["allow", "^kube_node_status_allocatable$", "node status"], + ["allow", "^kube_node_status_condition$", "node status health"], + ["allow", "^kube_persistentvolume_status_phase$", "health"], + ["allow", "^kube_pod_info$", "pods"], + ["allow", "^kube_pod_start_time$", "pods"], + ["allow", "^kube_pod_status_condition$", "pods"], + ["allow", "^kube_statefulset_status_(replicas|replicas_ready)$", "health"], + ["allow", "^kubedns*","dns health"], + ["allow", "^kubelet_.*$", "node metrics k8s v1.18+"], + ["allow", "^machine_.*$", "node metrics k8s v1.18+"], + ["allow", "^pod_container_status$", "containers"], + ["allow", "^pod_container_status_(terminated|waiting)_reason(_count)?$", "containers health"], + ["allow", "^pod_init_container_status_(terminated|waiting)_reason(_count)?$", "init containers health"], + ["allow", "^pod_status_(ready|scheduled)$", "pods"], + ["allow", "^pod_status_phase$", "pods"], + ["allow", "^prober_.*$", "node metrics/probes k8s v1.18+"], + ["allow", "^resource_(request|limit)$", "resources"], + ["allow", "^statefulset_replica_delta$", "health"], + ["allow", "^usage(Milli|Nano)Cores$", "tags", "and(not(container_name:*),not(sys_container:*))", "utilization"], + ["allow", "^utilization$", "utilization health"], + ["deny", "^.+$", "all other metrics"] + ] + } + \ No newline at end of file diff --git a/deploy/custom/configuration.yaml b/deploy/custom/configuration.yaml index 37579ea5..a05cff0f 100644 --- a/deploy/custom/configuration.yaml +++ b/deploy/custom/configuration.yaml @@ -1,3 +1,4 @@ +## WARN: this is a template rendered from a file in $reporoot/templates/files/ ## ## NOTE: Change settings here, in the corresonding ## Secret or ConfigMap entry. Ensure settings @@ -5,300 +6,299 @@ ## in order for it to take effect. ## --- - apiVersion: v1 - kind: Secret - metadata: - # versioned, cadence independent of app version - name: cka-secrets-v1 - labels: - app.kubernetes.io/name: circonus-kubernetes-agent - stringData: - ## Circonus API Key is REQUIRED - circonus-api-key: "" - ## For in-cluster operation, the service account token - ## will be used. Only set this to use a DIFFERENT token - ## than the kubernetes-bearer-token-file setting - ## below. The file will always take precedence, ensure - ## kubernetes-bearer-token-file is set to "" when using - ## this setting. - #kubernetes-bearer-token: "" - +apiVersion: v1 +kind: Secret +metadata: + # versioned, cadence independent of app version + name: cka-secrets-v1 + labels: + app.kubernetes.io/name: circonus-kubernetes-agent +stringData: + ## Circonus API Key is REQUIRED + circonus-api-key: "" + ## For in-cluster operation, the service account token + ## will be used. Only set this to use a DIFFERENT token + ## than the kubernetes-bearer-token-file setting + ## below. The file will always take precedence, ensure + ## kubernetes-bearer-token-file is set to "" when using + ## this setting. + #kubernetes-bearer-token: "" --- - apiVersion: v1 - kind: ConfigMap - metadata: - # versioned, cadence independent of app version - name: cka-config-v1 - labels: - app.kubernetes.io/name: circonus-kubernetes-agent - data: - #circonus-api-key-file: "" - #circonus-api-app: "circonus-kubernetes-agent" - #circonus-api-url: "https://api.circonus.com/v2" - #circonus-api-ca-file: "" - #circonus-api-debug: "false" - ## broker to use when creating a new httptrap check - #circonus-check-broker-cid: "/broker/35" - #circonus-check-broker-ca-file: "" - ## create a check, if one cannot be found using the target - #circonus-check-create: "true" - ## or, turn create off, and specify a check which has already been created - #circonus-check-bundle-cid: "" - ## comman delimited list of k:v tags to add to the check - #circonus-check-tags: "" - ## Use a static target to ensure that the agent can find the check - ## the next time the pod starts. Otherwise, the pod's hostname will - ## be used and a new check would be created each time the pod is - ## created when create is enabled. The kubernetes-name will be - ## used if check-target is not set. - circonus-check-target: "" - ## set a custom display title for the check when it is created - #circonus-check-title: "" - ## comma delimited list of k:v streamtags to add to every metric - #circonus-default-streamtags: "" - ## - ## set a name identifying the cluster, to be used in the check - ## title when it is created - kubernetes-name: "" - #kubernetes-api-url: "https://kubernetes.default.svc" - #kubernetes-api-ca-file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - #kubernetes-bearer-token-file: "/var/run/secrets/kubernetes.io/serviceaccount/token" - ## collect event metrics - default is enabled for dashboard - kubernetes-enable-events: "true" - ## collect metrics from kube-state-metrics if running - default is enabled for dashboard - kubernetes-enable-kube-state-metrics: "true" - ## kube-state-metrics fieldSelector query, default from https://github.com/kubernetes/kube-state-metrics/blob/master/examples/standard/service.yaml - kubernetes-ksm-field-selector-query: "metadata.name=kube-state-metrics" - ## kube-state-metrics metrics port, no default, service endpoint ports will be used if not set - kubernetes-ksm-metrics-port: "" - ## kube-state-metrics metrics port name, default from https://github.com/kubernetes/kube-state-metrics/blob/master/examples/standard/service.yaml - ## if using helm or some other tool, look at the configuration to see if the port is named differently in the service endpoint... - kubernetes-ksm-metrics-port-name: "http-metrics" - ## collect metrics from api-server - default is enabled for dashboard - kubernetes-enable-api-server: "true" - ## collect node metrics - default is enabled for dashboard - kubernetes-enable-nodes: "true" - ## expression to use for node labelSelector - blank = all nodes - kubernetes-node-selector: "" - ## collect kublet /stats/summary performance metrics (e.g. cpu, memory, fs) - default is enabled for dashboard (k8s // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/internal/dns/dns.go b/internal/dns/dns.go index fd7d4632..5142e0db 100644 --- a/internal/dns/dns.go +++ b/internal/dns/dns.go @@ -149,46 +149,105 @@ func (dns *DNS) getMetricURLs(ctx context.Context) (map[string]string, error) { return nil, err } + /* + 1. parse scrape options from config + 2. check known services in order [ kube-dns, coredns ] + 3. if known service exists, check if it has a scrape annotation (and use it if it does) + 4. if known service doesn't have a scrape annotation, check if config options have scrape settings + 5. if config options have scrape settings, check if known service has port from scrape settings + 6. if known service does not have port from scrape settings, check if any other services do + */ + + // parse scrape options from config + port := viper.GetInt(keys.K8SDNSMetricsPort) + scrape := viper.GetBool(keys.K8SEnableDNSMetrics) + + // check for kube-dns first svc, err := clientset.CoreV1().Services("kube-system").Get(ctx, "kube-dns", metav1.GetOptions{}) - dns.service = "kube-dns" - if err != nil { + + // if kube-dns found + if err == nil { + + dns.service = "kube-dns" + + for name, value := range svc.Annotations { + switch name { + case "prometheus.io/port": + p, err := strconv.Atoi(value) + if err != nil { + return nil, errors.Wrap(err, "parsing service port annotation") + } + port = p + case "prometheus.io/scrape": + s, err := strconv.ParseBool(value) + if err != nil { + return nil, errors.Wrap(err, "parsing service scrape annotation") + } + scrape = s + } + } + + // if kube-dns not found + } else { + dns.log.Info().Str("get kube-dns service failed", err.Error()).Msg("service not found, checking coredns") - dns.service = "coredns" + // maybe we're using coredns? svc, err = clientset.CoreV1().Services("kube-system").Get(ctx, "coredns", metav1.GetOptions{}) - if err != nil { - dns.service = "" - dns.log.Warn().Str("get all dns services failed", err.Error()).Msg("service not found, nothing to do") - return nil, err - } - } - scrape := false - port := 0 + // if we're not using coredns + if err == nil { + + dns.service = "coredns" + + for name, value := range svc.Annotations { + switch name { + case "prometheus.io/port": + p, err := strconv.Atoi(value) + if err != nil { + return nil, errors.Wrap(err, "parsing service port annotation") + } + port = p + case "prometheus.io/scrape": + s, err := strconv.ParseBool(value) + if err != nil { + return nil, errors.Wrap(err, "parsing service scrape annotation") + } + scrape = s + } + } - for name, value := range svc.Annotations { - switch name { - case "prometheus.io/port": - p, err := strconv.Atoi(value) + } else if port != 0 && scrape { + // get all services + svcsl, err := clientset.CoreV1().Services("kube-system").List(ctx, metav1.ListOptions{}) if err != nil { - return nil, errors.Wrap(err, "parsing service port annotation") + dns.service = "" + dns.log.Warn().Str("get all kube-system services failed", err.Error()).Msg("service not found, nothing to do") + return nil, err } - port = p - case "prometheus.io/scrape": - s, err := strconv.ParseBool(value) - if err != nil { - return nil, errors.Wrap(err, "parsing service scrape annotation") + + // see if we have any services that match the port from the scrape settings + for i, s := range svcsl.Items { + if val, ok := s.Annotations["prometheus.io/port"]; ok { + vali, err := strconv.Atoi(val) + if err != nil { + return nil, errors.Wrap(err, "parsing service port annotation") + } + if vali == port { + dns.service = "kube-dns" + svc = &svcsl.Items[i] + break + } + } } - scrape = s } } - if port == 0 { - dns.log.Warn().Int("port", port).Msg("service annotations not found, checking supplied service ports") - port = viper.GetInt(keys.K8SDNSMetricsPort) + if !scrape { + return nil, errors.New("dns service not configured for scraping") } - if !scrape { - return nil, errors.New("service not configured for scraping") + if port == 0 { + return nil, errors.New("dns service scrape port not configured") } if len(svc.Spec.Selector) == 0 { diff --git a/templates/data/data.yaml b/templates/data/data.yaml new file mode 100644 index 00000000..5d371036 --- /dev/null +++ b/templates/data/data.yaml @@ -0,0 +1,66 @@ +data: + agent_version: v0.13.0 + metric_filters: | + { + "metric_filters": [ + ["allow", "^.+$", "tags", "and(collector:dynamic)", "NO_LOCAL_FILTER dynamically collected metrics"], + ["allow", "^(Disk|Memory|PID)Pressure$", "node status"], + ["allow", "^(container|node|pod)_.*$", "node metrics k8s v1.18+"], + ["allow", "^(kube_)?pod_container_status_(running|terminated|waiting|ready)(_count)?$", "containers"], + ["allow", "^(kube_)?pod_container_status_(terminated|waiting)_reason(_count)?$", "containers health"], + ["allow", "^(kube_)?pod_init_container_status_(terminated|waiting)_reason(_count)?$", "init containers health"], + ["allow", "^(kube_)?pod_status_(ready|scheduled)(_count)?$", "tags", "and(condition:true)", "pods"], + ["allow", "^(kube_)?pod_status_phase(_count)?$", "tags", "and(or(phase:Running,phase:Pending,phase:Failed,phase:Succeeded))", "pods"], + ["allow", "^(node|kubelet_running_pod_count|Ready)$", "nodes"], + ["allow", "^(pod|node)_cpu_usage_seconds_total$", "utilization"], + ["allow", "^(pod|node)_memory_working_set_bytes$", "utilization"], + ["allow", "^(used|capacity)$", "tags", "and(or(units:bytes,units:percent),or(resource:memory,resource:fs,volume_name:*),not(container_name:*),not(sys_container:*))", "utilization"], + ["allow", "^NetworkUnavailable$", "node status"], + ["allow", "^[rt]x$", "tags", "and(resource:network,or(units:bytes,units:errors),not(container_name:*),not(sys_container:*))", "utilization"], + ["allow", "^apiserver_request_total$", "tags", "and(or(code:5*,code:4*))", "api req errors"], + ["allow", "^authenticated_user_requests$", "api auth"], + ["allow", "^authentication_attempts$", "api auth health"], + ["allow", "^cadvisor.*$", "cadvisor"], + ["allow", "^capacity_.*$", "node capacity"], + ["allow", "^collect_.*$", "agent collection stats"], + ["allow", "^coredns*", "dns health"], + ["allow", "^coredns_(dns|forward)_request_(count_total|duration_seconds_avg)$", "dns health"], + ["allow", "^coredns_(dns|forward)_response_rcode_count_total$", "dns health"], + ["allow", "^daemonset_scheduled_delta$", "health"], + ["allow", "^deployment_generation_delta$", "health"], + ["allow", "^events$", "events"], + ["allow", "^kube_(service_labels|deployment_labels|pod_container_info|pod_deleted)$", "ksm inventory"], + ["allow", "^kube_(service|deployment)_labels$", "ksm inventory"], + ["allow", "^kube_daemonset_status_(current|desired)_number_scheduled$", "health"], + ["allow", "^kube_deployment_(created|spec_replicas)$", "deployments"], + ["allow", "^kube_deployment_(metadata|status_observed)_generation$", "health"], + ["allow", "^kube_deployment_status_(replicas|replicas_updated|replicas_available|replicas_unavailable)$", "deployments"], + ["allow", "^kube_deployment_status_replicas_unavailable$", "deployments"], + ["allow", "^kube_hpa_(spec_max|status_current)_replicas$", "scale"], + ["allow", "^kube_job_status_failed$", "health"], + ["allow", "^kube_namespace_status_phase$", "namespaces"], + ["allow", "^kube_namespace_status_phase$", "tags", "and(or(phase:Active,phase:Terminating))", "namespaces"], + ["allow", "^kube_node_spec_unschedulable$", "node status"], + ["allow", "^kube_node_status_allocatable$", "node status"], + ["allow", "^kube_node_status_condition$", "node status health"], + ["allow", "^kube_persistentvolume_status_phase$", "health"], + ["allow", "^kube_pod_info$", "pods"], + ["allow", "^kube_pod_start_time$", "pods"], + ["allow", "^kube_pod_status_condition$", "pods"], + ["allow", "^kube_statefulset_status_(replicas|replicas_ready)$", "health"], + ["allow", "^kubedns*","dns health"], + ["allow", "^kubelet_.*$", "node metrics k8s v1.18+"], + ["allow", "^machine_.*$", "node metrics k8s v1.18+"], + ["allow", "^pod_container_status$", "containers"], + ["allow", "^pod_container_status_(terminated|waiting)_reason(_count)?$", "containers health"], + ["allow", "^pod_init_container_status_(terminated|waiting)_reason(_count)?$", "init containers health"], + ["allow", "^pod_status_(ready|scheduled)$", "pods"], + ["allow", "^pod_status_phase$", "pods"], + ["allow", "^prober_.*$", "node metrics/probes k8s v1.18+"], + ["allow", "^resource_(request|limit)$", "resources"], + ["allow", "^statefulset_replica_delta$", "health"], + ["allow", "^usage(Milli|Nano)Cores$", "tags", "and(not(container_name:*),not(sys_container:*))", "utilization"], + ["allow", "^utilization$", "utilization health"], + ["deny", "^.+$", "all other metrics"] + ] + } diff --git a/templates/data/metric_filters.yaml b/templates/data/metric_filters.yaml deleted file mode 100644 index 5fad16ff..00000000 --- a/templates/data/metric_filters.yaml +++ /dev/null @@ -1,64 +0,0 @@ -metric_filters: | - { - "metric_filters": [ - ["allow", "^.+$", "tags", "and(collector:dynamic)", "NO_LOCAL_FILTER dynamically collected metrics"], - ["allow", "^(Disk|Memory|PID)Pressure$", "node status"], - ["allow", "^(container|node|pod)_.*$", "node metrics k8s v1.18+"], - ["allow", "^(kube_)?pod_container_status_(running|terminated|waiting|ready)(_count)?$", "containers"], - ["allow", "^(kube_)?pod_container_status_(terminated|waiting)_reason(_count)?$", "containers health"], - ["allow", "^(kube_)?pod_init_container_status_(terminated|waiting)_reason(_count)?$", "init containers health"], - ["allow", "^(kube_)?pod_status_(ready|scheduled)(_count)?$", "tags", "and(condition:true)", "pods"], - ["allow", "^(kube_)?pod_status_phase(_count)?$", "tags", "and(or(phase:Running,phase:Pending,phase:Failed,phase:Succeeded))", "pods"], - ["allow", "^(node|kubelet_running_pod_count|Ready)$", "nodes"], - ["allow", "^(pod|node)_cpu_usage_seconds_total$", "utilization"], - ["allow", "^(pod|node)_memory_working_set_bytes$", "utilization"], - ["allow", "^(used|capacity)$", "tags", "and(or(units:bytes,units:percent),or(resource:memory,resource:fs,volume_name:*),not(container_name:*),not(sys_container:*))", "utilization"], - ["allow", "^NetworkUnavailable$", "node status"], - ["allow", "^[rt]x$", "tags", "and(resource:network,or(units:bytes,units:errors),not(container_name:*),not(sys_container:*))", "utilization"], - ["allow", "^apiserver_request_total$", "tags", "and(or(code:5*,code:4*))", "api req errors"], - ["allow", "^authenticated_user_requests$", "api auth"], - ["allow", "^authentication_attempts$", "api auth health"], - ["allow", "^cadvisor.*$", "cadvisor"], - ["allow", "^capacity_.*$", "node capacity"], - ["allow", "^collect_.*$", "agent collection stats"], - ["allow", "^coredns*", "dns health"], - ["allow", "^coredns_(dns|forward)_request_(count_total|duration_seconds_avg)$", "dns health"], - ["allow", "^coredns_(dns|forward)_response_rcode_count_total$", "dns health"], - ["allow", "^daemonset_scheduled_delta$", "health"], - ["allow", "^deployment_generation_delta$", "health"], - ["allow", "^events$", "events"], - ["allow", "^kube_(service_labels|deployment_labels|pod_container_info|pod_deleted)$", "ksm inventory"], - ["allow", "^kube_(service|deployment)_labels$", "ksm inventory"], - ["allow", "^kube_daemonset_status_(current|desired)_number_scheduled$", "health"], - ["allow", "^kube_deployment_(created|spec_replicas)$", "deployments"], - ["allow", "^kube_deployment_(metadata|status_observed)_generation$", "health"], - ["allow", "^kube_deployment_status_(replicas|replicas_updated|replicas_available|replicas_unavailable)$", "deployments"], - ["allow", "^kube_deployment_status_replicas_unavailable$", "deployments"], - ["allow", "^kube_hpa_(spec_max|status_current)_replicas$", "scale"], - ["allow", "^kube_job_status_failed$", "health"], - ["allow", "^kube_namespace_status_phase$", "namespaces"], - ["allow", "^kube_namespace_status_phase$", "tags", "and(or(phase:Active,phase:Terminating))", "namespaces"], - ["allow", "^kube_node_spec_unschedulable$", "node status"], - ["allow", "^kube_node_status_allocatable$", "node status"], - ["allow", "^kube_node_status_condition$", "node status health"], - ["allow", "^kube_persistentvolume_status_phase$", "health"], - ["allow", "^kube_pod_info$", "pods"], - ["allow", "^kube_pod_start_time$", "pods"], - ["allow", "^kube_pod_status_condition$", "pods"], - ["allow", "^kube_statefulset_status_(replicas|replicas_ready)$", "health"], - ["allow", "^kubedns*","dns health"], - ["allow", "^kubelet_.*$", "node metrics k8s v1.18+"], - ["allow", "^machine_.*$", "node metrics k8s v1.18+"], - ["allow", "^pod_container_status$", "containers"], - ["allow", "^pod_container_status_(terminated|waiting)_reason(_count)?$", "containers health"], - ["allow", "^pod_init_container_status_(terminated|waiting)_reason(_count)?$", "init containers health"], - ["allow", "^pod_status_(ready|scheduled)$", "pods"], - ["allow", "^pod_status_phase$", "pods"], - ["allow", "^prober_.*$", "node metrics/probes k8s v1.18+"], - ["allow", "^resource_(request|limit)$", "resources"], - ["allow", "^statefulset_replica_delta$", "health"], - ["allow", "^usage(Milli|Nano)Cores$", "tags", "and(not(container_name:*),not(sys_container:*))", "utilization"], - ["allow", "^utilization$", "utilization health"], - ["deny", "^.+$", "all other metrics"] - ] - } diff --git a/templates/files/contrib/helm/values.yaml.tmpl b/templates/files/contrib/helm/values.yaml.tmpl new file mode 100644 index 00000000..367b9612 --- /dev/null +++ b/templates/files/contrib/helm/values.yaml.tmpl @@ -0,0 +1,11 @@ +## WARN: this is a template rendered from a file in $reporoot/templates/files/ +image: "circonus/circonus-kubernetes-agent:latest" +circonus_api_url: "https://api.circonus.com/v2/" +circonus_api_key: "" +contact_email: "" +broker_cid: "/broker/35" +kubernetes_name: "" +dns: + port: "10055" +metric_filters: | +{{ .data.metric_filters | indent 2 }} diff --git a/templates/files/deploy/custom/configuration.yaml.tmpl b/templates/files/deploy/custom/configuration.yaml.tmpl index be700ec8..3998b751 100644 --- a/templates/files/deploy/custom/configuration.yaml.tmpl +++ b/templates/files/deploy/custom/configuration.yaml.tmpl @@ -1,3 +1,4 @@ +## WARN: this is a template rendered from a file in $reporoot/templates/files/ ## ## NOTE: Change settings here, in the corresonding ## Secret or ConfigMap entry. Ensure settings @@ -5,237 +6,236 @@ ## in order for it to take effect. ## --- - apiVersion: v1 - kind: Secret - metadata: - # versioned, cadence independent of app version - name: cka-secrets-v1 - labels: - app.kubernetes.io/name: circonus-kubernetes-agent - stringData: - ## Circonus API Key is REQUIRED - circonus-api-key: "" - ## For in-cluster operation, the service account token - ## will be used. Only set this to use a DIFFERENT token - ## than the kubernetes-bearer-token-file setting - ## below. The file will always take precedence, ensure - ## kubernetes-bearer-token-file is set to "" when using - ## this setting. - #kubernetes-bearer-token: "" - +apiVersion: v1 +kind: Secret +metadata: + # versioned, cadence independent of app version + name: cka-secrets-v1 + labels: + app.kubernetes.io/name: circonus-kubernetes-agent +stringData: + ## Circonus API Key is REQUIRED + circonus-api-key: "" + ## For in-cluster operation, the service account token + ## will be used. Only set this to use a DIFFERENT token + ## than the kubernetes-bearer-token-file setting + ## below. The file will always take precedence, ensure + ## kubernetes-bearer-token-file is set to "" when using + ## this setting. + #kubernetes-bearer-token: "" --- - apiVersion: v1 - kind: ConfigMap - metadata: - # versioned, cadence independent of app version - name: cka-config-v1 - labels: - app.kubernetes.io/name: circonus-kubernetes-agent - data: - #circonus-api-key-file: "" - #circonus-api-app: "circonus-kubernetes-agent" - #circonus-api-url: "https://api.circonus.com/v2" - #circonus-api-ca-file: "" - #circonus-api-debug: "false" - ## broker to use when creating a new httptrap check - #circonus-check-broker-cid: "/broker/35" - #circonus-check-broker-ca-file: "" - ## create a check, if one cannot be found using the target - #circonus-check-create: "true" - ## or, turn create off, and specify a check which has already been created - #circonus-check-bundle-cid: "" - ## comman delimited list of k:v tags to add to the check - #circonus-check-tags: "" - ## Use a static target to ensure that the agent can find the check - ## the next time the pod starts. Otherwise, the pod's hostname will - ## be used and a new check would be created each time the pod is - ## created when create is enabled. The kubernetes-name will be - ## used if check-target is not set. - circonus-check-target: "" - ## set a custom display title for the check when it is created - #circonus-check-title: "" - ## comma delimited list of k:v streamtags to add to every metric - #circonus-default-streamtags: "" - ## - ## set a name identifying the cluster, to be used in the check - ## title when it is created - kubernetes-name: "" - #kubernetes-api-url: "https://kubernetes.default.svc" - #kubernetes-api-ca-file: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - #kubernetes-bearer-token-file: "/var/run/secrets/kubernetes.io/serviceaccount/token" - ## collect event metrics - default is enabled for dashboard - kubernetes-enable-events: "true" - ## collect metrics from kube-state-metrics if running - default is enabled for dashboard - kubernetes-enable-kube-state-metrics: "true" - ## kube-state-metrics fieldSelector query, default from https://github.com/kubernetes/kube-state-metrics/blob/master/examples/standard/service.yaml - kubernetes-ksm-field-selector-query: "metadata.name=kube-state-metrics" - ## kube-state-metrics metrics port, no default, service endpoint ports will be used if not set - kubernetes-ksm-metrics-port: "" - ## kube-state-metrics metrics port name, default from https://github.com/kubernetes/kube-state-metrics/blob/master/examples/standard/service.yaml - ## if using helm or some other tool, look at the configuration to see if the port is named differently in the service endpoint... - kubernetes-ksm-metrics-port-name: "http-metrics" - ## collect metrics from api-server - default is enabled for dashboard - kubernetes-enable-api-server: "true" - ## collect node metrics - default is enabled for dashboard - kubernetes-enable-nodes: "true" - ## expression to use for node labelSelector - blank = all nodes - kubernetes-node-selector: "" - ## collect kublet /stats/summary performance metrics (e.g. cpu, memory, fs) - default is enabled for dashboard (k8s // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. @@ -73,7 +75,7 @@ const ( } ` defaultMetricFiltersStr120 = ` -{{ .metric_filters }} +{{ .data.metric_filters }} ` )