From aaf66cb3869fbf7facbb02ef2f13a49d70deffb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E7=89=9B=E8=B8=8F=E9=9B=AA?= <56914892+cyancow@users.noreply.github.com> Date: Wed, 26 Apr 2023 14:18:13 +0800 Subject: [PATCH] docs: add apiserver kubelet node alerts template to k8s (#1508) * add apiserver kubelet node alerts to k8s * modify node name to node-exporter --- integrations/kubernetes/alerts/apiserver.json | 266 +++++ integrations/kubernetes/alerts/kubelet.json | 366 ++++++ .../kubernetes/alerts/node-exporter.json | 1010 +++++++++++++++++ 3 files changed, 1642 insertions(+) create mode 100644 integrations/kubernetes/alerts/apiserver.json create mode 100644 integrations/kubernetes/alerts/kubelet.json create mode 100644 integrations/kubernetes/alerts/node-exporter.json diff --git a/integrations/kubernetes/alerts/apiserver.json b/integrations/kubernetes/alerts/apiserver.json new file mode 100644 index 00000000..46872c26 --- /dev/null +++ b/integrations/kubernetes/alerts/apiserver.json @@ -0,0 +1,266 @@ +[ + { + "name": "KubeClientCertificateExpiration-S2", + "note": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "KubeClientCertificateExpiration-S1", + "note": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 
hours.", + "severity": 1, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "AggregatedAPIErrors", + "note": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "AggregatedAPIDown", + "note": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 300, + "prom_ql": "(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + 
"notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "KubeAPIDown", + "note": "KubeAPI has disappeared from Prometheus target discovery.", + "severity": 1, + "disabled": 0, + "prom_for_duration": 900, + "prom_ql": "absent(up{job=\"apiserver\"} == 1)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "KubeAPIErrorBudgetBurn-S1-120秒", + "note": "The API server is burning too much error budget.", + "severity": 1, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)\nand\nsum(apiserver_request:burnrate5m) > (14.40 * 0.01000)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [ + "long=1h", + "short=5m" + ] + }, + { + "name": "KubeAPIErrorBudgetBurn-S1-900秒", + "note": "The API server is burning too much error budget.", + "severity": 1, + "disabled": 0, + "prom_for_duration": 900, + "prom_ql": "sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)\nand\nsum(apiserver_request:burnrate30m) > (6.00 * 0.01000)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + 
"recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [ + "long=6h", + "short=30m" + ] + }, + { + "name": "KubeAPIErrorBudgetBurn-S2-3600秒", + "note": "The API server is burning too much error budget.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 3600, + "prom_ql": "sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)\nand\nsum(apiserver_request:burnrate2h) > (3.00 * 0.01000)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [ + "long=1d", + "short=2h" + ] + }, + { + "name": "KubeAPIErrorBudgetBurn-S2-10800秒", + "note": "The API server is burning too much error budget.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 10800, + "prom_ql": "sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)\nand\nsum(apiserver_request:burnrate6h) > (1.00 * 0.01000)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [ + "long=3d", + "short=6h" + ] + } +] \ No newline at end of file diff --git a/integrations/kubernetes/alerts/kubelet.json b/integrations/kubernetes/alerts/kubelet.json new file mode 100644 index 00000000..85b7f577 --- /dev/null +++ b/integrations/kubernetes/alerts/kubelet.json @@ -0,0 +1,366 @@ +[ + { + "name": "Node状态异常", + "note": "{{ $labels.node }} has been unready for more than 15 minutes.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 900, + "prom_ql": 
"kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "Node不可达", + "note": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 900, + "prom_ql": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "Node运行太多Pod", + "note": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 900, + "prom_ql": "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity_pods{job=\"kube-state-metrics\"} != 1\n) > 0.95\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", 
+ "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "Node状态抖动", + "note": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 900, + "prom_ql": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (node) > 2\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "PLEG耗时高", + "note": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 300, + "prom_ql": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "Pod启动耗时高", + "note": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 900, + "prom_ql": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\", metrics_path=\"/metrics\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", 
metrics_path=\"/metrics\"} > 60\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "客户端证书过期-S2", + "note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "kubelet_certificate_manager_client_ttl_seconds < 604800\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "客户端证书过期-S1", + "note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", + "severity": 1, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "kubelet_certificate_manager_client_ttl_seconds < 86400\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "服务端证书过期-S2", + "note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "kubelet_certificate_manager_server_ttl_seconds < 604800\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": 
"23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "服务端证书过期-S1", + "note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", + "severity": 1, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "kubelet_certificate_manager_server_ttl_seconds < 86400\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "客户端证书续签错误", + "note": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).", + "severity": 2, + "disabled": 0, + "prom_for_duration": 900, + "prom_ql": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "服务证书续签错误", + "note": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).", + "severity": 2, + "disabled": 0, + "prom_for_duration": 900, + "prom_ql": "increase(kubelet_server_expiration_renew_errors[5m]) > 0\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + 
"enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "kubelet故障", + "note": "Kubelet has disappeared from Prometheus target discovery.", + "severity": 1, + "disabled": 0, + "prom_for_duration": 900, + "prom_ql": "absent(up{job=\"kubelet\"} == 1)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + } +] \ No newline at end of file diff --git a/integrations/kubernetes/alerts/node-exporter.json b/integrations/kubernetes/alerts/node-exporter.json new file mode 100644 index 00000000..d0fcbff6 --- /dev/null +++ b/integrations/kubernetes/alerts/node-exporter.json @@ -0,0 +1,1010 @@ +[ + { + "name": "主机内存不足", + "note": "节点内存不足 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机内存有压力", + "note": "节点内存压力大 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "rate(node_vmstat_pgmajfault[1m]) > 1000", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + 
"1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机入口网络吞吐量异常", + "note": "主机异常网络吞吐量 入 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 300, + "prom_ql": "sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机出口网络吞吐量异常", + "note": "主机异常网络吞吐量 出 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 300, + "prom_ql": "sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机磁盘读取速率异常", + "note": "主机异常磁盘读取率 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 300, + "prom_ql": "sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + 
"runbook_url": "", + "append_tags": [] + }, + { + "name": "主机磁盘写入速率异常", + "note": "主机异常磁盘写入率 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机磁盘空间不足", + "note": "主机磁盘空间不足 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机磁盘将在 24 小时内填满", + "note": "主机磁盘将在 24 小时内填满 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~\"tmpfs\"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + 
"recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "文件系统inode使用率高", + "note": "主机不足 inode (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "node_filesystem_files_free{mountpoint =\"/rootfs\"} / node_filesystem_files{mountpoint=\"/rootfs\"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint=\"/rootfs\"} == 0", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "文件系统inode将在24小时内填满", + "note": "主机 inode 将在 24 小时内填满 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "node_filesystem_files_free{mountpoint =\"/rootfs\"} / node_filesystem_files{mountpoint=\"/rootfs\"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint=\"/rootfs\"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint=\"/rootfs\"} == 0", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机磁盘读取延迟异常", + "note": "主机异常磁盘读取延迟 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0", + "prom_eval_interval": 15, + "enable_stime": "00:00", + 
"enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机磁盘写入延迟异常", + "note": "主机异常磁盘写入延迟 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "节点cpu负载高", + "note": "主机 CPU 负载高 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) > 80", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "有其他云主机窃取cpu", + "note": "Host CPU steal noisy neighbor (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "avg by(instance) (rate(node_cpu_seconds_total{mode=\"steal\"}[5m])) * 100 > 10", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + 
"notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机上下文切换异常", + "note": "主机上下文切换 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "(rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode=\"idle\"})) > 1000", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机交换内存快满了", + "note": "主机交换内存已满 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "(1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机系统服务崩溃", + "note": "主机 systemd 服务崩溃 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "node_systemd_unit_state{state=\"failed\"} == 1", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机内核版本偏差", + "note": "主机内核版本偏差 (instance 
{{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 21600, + "prom_ql": "count(sum(label_replace(node_uname_info, \"kernel\", \"$1\", \"release\", \"([0-9]+.[0-9]+.[0-9]+).*\")) by (kernel)) > 1", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "检测到 HostOomKill", + "note": "检测到主机 OOM 终止 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "increase(node_vmstat_oom_kill[1m]) > 0", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机网络接收错误", + "note": "主机网络接收错误 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机网络出口错误", + "note": "主机网络传输错误 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 
0.01", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机网络接口饱和", + "note": "主机网络接口饱和 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 60, + "prom_ql": "(rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} > 0.8 < 10000", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "conntrack数量接近极限", + "note": "主机连接限制 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 300, + "prom_ql": "node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机时钟偏差", + "note": "主机时钟偏差 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)", + "prom_eval_interval": 15, + 
"enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "主机时钟不同步", + "note": "主机时钟不同步 (instance {{ $labels.instance }})", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "节点文件系统空间填满-S2", + "note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 3600, + "prom_ql": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "节点文件系统空间填满-S1", + "note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% 
available space left and is filling up fast.", + "severity": 1, + "disabled": 0, + "prom_for_duration": 3600, + "prom_ql": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "节点磁盘快满了-S2-space-5", + "note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 3600, + "prom_ql": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "节点磁盘快满了-S1-space-3", + "note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.", + "severity": 1, + "disabled": 0, + "prom_for_duration": 3600, + "prom_ql": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / 
node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "节点磁盘快满了-S2-inodes-40", + "note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 3600, + "prom_ql": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "节点磁盘快满了-S1-inodes-20", + "note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up fast.", + "severity": 1, + "disabled": 0, + "prom_for_duration": 3600, + "prom_ql": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n 
node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "节点磁盘快满了-S2-inodes-5", + "note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 3600, + "prom_ql": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "节点磁盘快满了-S1-inodes-3", + "note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.", + "severity": 1, + "disabled": 0, + "prom_for_duration": 3600, + "prom_ql": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + 
"callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "使用的节点Conntrack条目数量高", + "note": "{{ $value | humanizePercentage }} of conntrack entries are used.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "节点文本文件收集器抓取错误", + "note": "Node Exporter text file collector failed to scrape.", + "severity": 2, + "disabled": 0, + "prom_for_duration": 0, + "prom_ql": "node_textfile_scrape_error{job=\"node-exporter\"} == 1\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + }, + { + "name": "节点网络接口抖动", + "note": "Network interface \"{{ $labels.device }}\" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}\"", + "severity": 2, + "disabled": 0, + "prom_for_duration": 120, + "prom_ql": "changes(node_network_up{job=\"node-exporter\",device!~\"veth.+\"}[2m]) > 2\n", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + } +] \ No newline at end of file