Browse Source

docs: add apiserver kubelet node alerts template to k8s (#1508)

* add apiserver kubelet node alerts to k8s

* modify node name to node-exporter
tags/v6.0.0-ga12
青牛踏雪 GitHub 3 years ago
parent
commit
aaf66cb386
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 1642 additions and 0 deletions
  1. +266
    -0
      integrations/kubernetes/alerts/apiserver.json
  2. +366
    -0
      integrations/kubernetes/alerts/kubelet.json
  3. +1010
    -0
      integrations/kubernetes/alerts/node-exporter.json

+ 266
- 0
integrations/kubernetes/alerts/apiserver.json View File

@@ -0,0 +1,266 @@
[
{
"name": "KubeClientCertificateExpiration-S2",
"note": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "KubeClientCertificateExpiration-S1",
"note": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "AggregatedAPIErrors",
"note": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "AggregatedAPIDown",
"note": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "KubeAPIDown",
"note": "KubeAPI has disappeared from Prometheus target discovery.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "absent(up{job=\"apiserver\"} == 1)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "KubeAPIErrorBudgetBurn-S1-120秒",
"note": "The API server is burning too much error budget.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)\nand\nsum(apiserver_request:burnrate5m) > (14.40 * 0.01000)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=1h",
"short=5m"
]
},
{
"name": "KubeAPIErrorBudgetBurn-S1-900秒",
"note": "The API server is burning too much error budget.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)\nand\nsum(apiserver_request:burnrate30m) > (6.00 * 0.01000)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=6h",
"short=30m"
]
},
{
"name": "KubeAPIErrorBudgetBurn-S2-3600秒",
"note": "The API server is burning too much error budget.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)\nand\nsum(apiserver_request:burnrate2h) > (3.00 * 0.01000)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=1d",
"short=2h"
]
},
{
"name": "KubeAPIErrorBudgetBurn-S2-10800秒",
"note": "The API server is burning too much error budget.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 10800,
"prom_ql": "sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)\nand\nsum(apiserver_request:burnrate6h) > (1.00 * 0.01000)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=3d",
"short=6h"
]
}
]

+ 366
- 0
integrations/kubernetes/alerts/kubelet.json View File

@@ -0,0 +1,366 @@
[
{
"name": "Node状态异常",
"note": "{{ $labels.node }} has been unready for more than 15 minutes.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "Node不可达",
"note": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "Node运行太多Pod",
"note": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity_pods{job=\"kube-state-metrics\"} != 1\n) > 0.95\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "Node状态抖动",
"note": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (node) > 2\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "PLEG耗时高",
"note": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "Pod启动耗时高",
"note": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\", metrics_path=\"/metrics\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"} > 60\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "客户端证书过期-S2",
"note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds < 604800\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "客户端证书过期-S1",
"note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds < 86400\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "服务端证书过期-S2",
"note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"severity": 2,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds < 604800\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "服务端证书过期-S1",
"note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds < 86400\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "客户端证书续签错误",
"note": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "服务证书续签错误",
"note": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).",
"severity": 2,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "increase(kubelet_server_expiration_renew_errors[5m]) > 0\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "kubelet故障",
"note": "Kubelet has disappeared from Prometheus target discovery.",
"severity": 1,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "absent(up{job=\"kubelet\"} == 1)\n",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
}
]

+ 1010
- 0
integrations/kubernetes/alerts/node-exporter.json
File diff suppressed because it is too large
View File


Loading…
Cancel
Save