|
|
|
|
|
|
|
[ |
|
|
|
{ |
|
|
|
"name": "Node状态异常", |
|
|
|
"note": "{{ $labels.node }} has been unready for more than 15 minutes.", |
|
|
|
"severity": 2, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 900, |
|
|
|
"prom_ql": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "Node不可达", |
|
|
|
"note": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.", |
|
|
|
"severity": 2, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 900, |
|
|
|
"prom_ql": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "Node运行太多Pod", |
|
|
|
"note": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.", |
|
|
|
"severity": 2, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 900, |
|
|
|
"prom_ql": "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity_pods{job=\"kube-state-metrics\"} != 1\n) > 0.95\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "Node状态抖动", |
|
|
|
"note": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.", |
|
|
|
"severity": 2, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 900, |
|
|
|
"prom_ql": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (node) > 2\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "PLEG耗时高", |
|
|
|
"note": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.", |
|
|
|
"severity": 2, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 300, |
|
|
|
"prom_ql": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "Pod启动耗时高", |
|
|
|
"note": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.", |
|
|
|
"severity": 2, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 900, |
|
|
|
"prom_ql": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\", metrics_path=\"/metrics\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"} > 60\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "客户端证书过期-S2", |
|
|
|
"note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", |
|
|
|
"severity": 2, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 0, |
|
|
|
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds < 604800\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "客户端证书过期-S1", |
|
|
|
"note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", |
|
|
|
"severity": 1, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 0, |
|
|
|
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds < 86400\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "服务端证书过期-S2", |
|
|
|
"note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", |
|
|
|
"severity": 2, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 0, |
|
|
|
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds < 604800\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "服务端证书过期-S1", |
|
|
|
"note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", |
|
|
|
"severity": 1, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 0, |
|
|
|
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds < 86400\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "客户端证书续签错误", |
|
|
|
"note": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).", |
|
|
|
"severity": 2, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 900, |
|
|
|
"prom_ql": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "服务证书续签错误", |
|
|
|
"note": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).", |
|
|
|
"severity": 2, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 900, |
|
|
|
"prom_ql": "increase(kubelet_server_expiration_renew_errors[5m]) > 0\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"name": "kubelet故障", |
|
|
|
"note": "Kubelet has disappeared from Prometheus target discovery.", |
|
|
|
"severity": 1, |
|
|
|
"disabled": 0, |
|
|
|
"prom_for_duration": 900, |
|
|
|
"prom_ql": "absent(up{job=\"kubelet\"} == 1)\n", |
|
|
|
"prom_eval_interval": 15, |
|
|
|
"enable_stime": "00:00", |
|
|
|
"enable_etime": "23:59", |
|
|
|
"enable_days_of_week": [ |
|
|
|
"1", |
|
|
|
"2", |
|
|
|
"3", |
|
|
|
"4", |
|
|
|
"5", |
|
|
|
"6", |
|
|
|
"0" |
|
|
|
], |
|
|
|
"enable_in_bg": 0, |
|
|
|
"notify_recovered": 1, |
|
|
|
"notify_channels": [], |
|
|
|
"notify_repeat_step": 60, |
|
|
|
"recover_duration": 0, |
|
|
|
"callbacks": [], |
|
|
|
"runbook_url": "", |
|
|
|
"append_tags": [] |
|
|
|
} |
|
|
|
] |