| @@ -1,18 +1,20 @@ | |||
| # net_response | |||
| 网络探测插件,通常用于监控本机某个端口是否在监听,或远端某个端口是否能连通 | |||
| ## code meanings | |||
| 网络探测插件,通常用于监控本机某个端口是否在监听,或远端某个端口是否能连通。因为 Prometheus 生态的时序库只能存储 float64 类型的值,所以网络探测插件探测的结果也是 float64 类型的值,不同的取值代表不同的探测结果,具体含义如下: | |||
| ``` | |||
| - 0: Success | |||
| - 1: Timeout | |||
| - 2: ConnectionFailed | |||
| - 3: ReadFailed | |||
| - 4: StringMismatch | |||
| ``` | |||
| 如果一切正常,这个值是 0,如果有异常,这个值是 1-4 之间的值,具体含义如上。这个值对应的指标名字是 `net_response_result_code`。 | |||
| ## Configuration | |||
| 最核心的配置就是 targets 部分,指定探测的目标,下面的例子: | |||
| 配置文件位于 categraf 的 `conf/input.net_response/net_response.toml`。最核心的配置就是 targets 部分,用于指定探测的目标,示例如下: | |||
| ```toml | |||
| [[instances]] | |||
| @@ -27,7 +29,7 @@ targets = [ | |||
| - `localhost:6379` 表示探测本机的 6379 端口是否可以连通 | |||
| - `:9090` 表示探测本机的 9090 端口是否可以连通 | |||
| 监控数据或告警事件中只是一个 IP 和端口,接收告警的人看到了,可能不清楚只是哪个业务的模块告警了,可以附加一些更有价值的信息放到标签里,比如例子中: | |||
| 监控数据或告警事件中只是一个 IP 和端口,接收告警的人看到了,可能不清楚这是哪个业务的模块告警了,可以附加一些更有价值的信息放到标签里,比如: | |||
| ```toml | |||
| labels = { region="cloud", product="n9e" } | |||
| @@ -35,6 +37,46 @@ labels = { region="cloud", product="n9e" } | |||
| 标识了这是 cloud 这个 region,n9e 这个产品,这俩标签会附到时序数据上,告警的时候自然也会报出来。 | |||
| 完整配置样例如下: | |||
| ```toml | |||
| [mappings] | |||
| # "127.0.0.1:22"= {region="local",ssh="test"} | |||
| # "127.0.0.1:22"= {region="local",ssh="redis"} | |||
| [[instances]] | |||
| targets = [ | |||
| # "127.0.0.1:22", | |||
| # "localhost:6379", | |||
| # ":9090" | |||
| ] | |||
| # # append some labels for series | |||
| # labels = { region="cloud", product="n9e" } | |||
| # # interval = global.interval * interval_times | |||
| # interval_times = 1 | |||
| ## Protocol, must be "tcp" or "udp" | |||
| ## NOTE: because the "udp" protocol does not respond to requests, it requires | |||
| ## a send/expect string pair (see below). | |||
| # protocol = "tcp" | |||
| ## Set timeout | |||
| # timeout = "1s" | |||
| ## Set read timeout (only used if expecting a response) | |||
| # read_timeout = "1s" | |||
| ## The following options are required for UDP checks. For TCP, they are | |||
| ## optional. The plugin will send the given string to the server and then | |||
| ## expect to receive the given 'expect' string back. | |||
| ## string sent to the server | |||
| # send = "ssh" | |||
| ## expected string in answer | |||
| # expect = "ssh" | |||
| ``` | |||
| ## 监控大盘和告警规则 | |||
| 该 README 的同级目录下,提供了 dashboard.json 就是监控大盘的配置,alerts.json 是告警规则,可以导入夜莺使用。 | |||
| 夜莺内置了仪表盘和告警规则,克隆到自己的业务组即可使用。 | |||
| @@ -1,73 +0,0 @@ | |||
| [ | |||
| { | |||
| "cate": "prometheus", | |||
| "datasource_ids": [ | |||
| 0 | |||
| ], | |||
| "name": "Network address probe failed", | |||
| "note": "", | |||
| "prod": "metric", | |||
| "algorithm": "", | |||
| "algo_params": null, | |||
| "delay": 0, | |||
| "severity": 2, | |||
| "severities": [ | |||
| 2 | |||
| ], | |||
| "disabled": 1, | |||
| "prom_for_duration": 60, | |||
| "prom_ql": "", | |||
| "rule_config": { | |||
| "algo_params": null, | |||
| "inhibit": false, | |||
| "prom_ql": "", | |||
| "queries": [ | |||
| { | |||
| "prom_ql": "net_response_result_code != 0", | |||
| "severity": 2 | |||
| } | |||
| ], | |||
| "severity": 0 | |||
| }, | |||
| "prom_eval_interval": 15, | |||
| "enable_stime": "00:00", | |||
| "enable_stimes": [ | |||
| "00:00" | |||
| ], | |||
| "enable_etime": "23:59", | |||
| "enable_etimes": [ | |||
| "23:59" | |||
| ], | |||
| "enable_days_of_week": [ | |||
| "1", | |||
| "2", | |||
| "3", | |||
| "4", | |||
| "5", | |||
| "6", | |||
| "0" | |||
| ], | |||
| "enable_days_of_weeks": [ | |||
| [ | |||
| "1", | |||
| "2", | |||
| "3", | |||
| "4", | |||
| "5", | |||
| "6", | |||
| "0" | |||
| ] | |||
| ], | |||
| "enable_in_bg": 0, | |||
| "notify_recovered": 1, | |||
| "notify_channels": [], | |||
| "notify_repeat_step": 60, | |||
| "notify_max_number": 0, | |||
| "recover_duration": 0, | |||
| "callbacks": [], | |||
| "runbook_url": "", | |||
| "append_tags": [], | |||
| "annotations": null, | |||
| "extra_config": null | |||
| } | |||
| ] | |||
| @@ -1,131 +0,0 @@ | |||
| { | |||
| "name": "TCP detection by UlricQin", | |||
| "tags": "", | |||
| "ident": "", | |||
| "configs": { | |||
| "panels": [ | |||
| { | |||
| "type": "table", | |||
| "id": "73c6eaf9-1685-4a7a-bf53-3d52afa1792e", | |||
| "layout": { | |||
| "h": 15, | |||
| "w": 24, | |||
| "x": 0, | |||
| "y": 0, | |||
| "i": "73c6eaf9-1685-4a7a-bf53-3d52afa1792e", | |||
| "isResizable": true | |||
| }, | |||
| "version": "3.0.0", | |||
| "datasourceCate": "prometheus", | |||
| "datasourceValue": "${prom}", | |||
| "targets": [ | |||
| { | |||
| "expr": "max(net_response_result_code) by (target)", | |||
| "legend": "UP?", | |||
| "refId": "A" | |||
| }, | |||
| { | |||
| "expr": "max(net_response_response_time) by (target) * 1000", | |||
| "legend": "Latency(ms)", | |||
| "refId": "C" | |||
| } | |||
| ], | |||
| "transformations": [ | |||
| { | |||
| "id": "organize", | |||
| "options": { | |||
| "indexByName": { | |||
| "target": 0 | |||
| } | |||
| } | |||
| } | |||
| ], | |||
| "name": "Targets", | |||
| "custom": { | |||
| "showHeader": true, | |||
| "colorMode": "background", | |||
| "calc": "lastNotNull", | |||
| "displayMode": "labelValuesToRows", | |||
| "aggrDimension": "target" | |||
| }, | |||
| "options": { | |||
| "valueMappings": [], | |||
| "standardOptions": {} | |||
| }, | |||
| "overrides": [ | |||
| { | |||
| "matcher": { | |||
| "value": "A" | |||
| }, | |||
| "properties": { | |||
| "standardOptions": {}, | |||
| "valueMappings": [ | |||
| { | |||
| "match": { | |||
| "special": 0 | |||
| }, | |||
| "result": { | |||
| "color": "#2c9d3d", | |||
| "text": "UP" | |||
| }, | |||
| "type": "special" | |||
| }, | |||
| { | |||
| "match": { | |||
| "from": 1, | |||
| "special": 1 | |||
| }, | |||
| "result": { | |||
| "color": "#e90f0f", | |||
| "text": "DOWN" | |||
| }, | |||
| "type": "range" | |||
| } | |||
| ] | |||
| } | |||
| }, | |||
| { | |||
| "type": "special", | |||
| "matcher": { | |||
| "value": "C" | |||
| }, | |||
| "properties": { | |||
| "valueMappings": [ | |||
| { | |||
| "type": "range", | |||
| "result": { | |||
| "color": "#f10c0c" | |||
| }, | |||
| "match": { | |||
| "from": 1 | |||
| } | |||
| }, | |||
| { | |||
| "type": "range", | |||
| "result": { | |||
| "color": "#2c9d3d" | |||
| }, | |||
| "match": { | |||
| "to": 1 | |||
| } | |||
| } | |||
| ], | |||
| "standardOptions": { | |||
| "util": "milliseconds", | |||
| "decimals": 3 | |||
| } | |||
| } | |||
| } | |||
| ] | |||
| } | |||
| ], | |||
| "var": [ | |||
| { | |||
| "definition": "prometheus", | |||
| "name": "prom", | |||
| "type": "datasource" | |||
| } | |||
| ], | |||
| "version": "3.0.0" | |||
| } | |||
| } | |||
| @@ -1,82 +0,0 @@ | |||
| # net_response plugin | |||
| 网络探测插件,通常用于监控本机某个端口是否在监听,或远端某个端口是否能连通。因为 Prometheus 生态的时序库只能存储 float64 类型的值,所以网络探测插件探测的结果也是 float64 类型的值,不同的取值代表不同的探测结果,具体含义如下: | |||
| ``` | |||
| - 0: Success | |||
| - 1: Timeout | |||
| - 2: ConnectionFailed | |||
| - 3: ReadFailed | |||
| - 4: StringMismatch | |||
| ``` | |||
| 如果一切正常,这个值是 0,如果有异常,这个值是 1-4 之间的值,具体含义如上。这个值对应的指标名字是 `net_response_result_code`。 | |||
| ## Configuration | |||
| 配置文件位于 categraf 的 `conf/input.net_response/net_response.toml`。最核心的配置就是 targets 部分,用于指定探测的目标,示例如下: | |||
| ```toml | |||
| [[instances]] | |||
| targets = [ | |||
| "10.2.3.4:22", | |||
| "localhost:6379", | |||
| ":9090" | |||
| ] | |||
| ``` | |||
| - `10.2.3.4:22` 表示探测 10.2.3.4 这个机器的 22 端口是否可以连通 | |||
| - `localhost:6379` 表示探测本机的 6379 端口是否可以连通 | |||
| - `:9090` 表示探测本机的 9090 端口是否可以连通 | |||
| 监控数据或告警事件中只是一个 IP 和端口,接收告警的人看到了,可能不清楚这是哪个业务的模块告警了,可以附加一些更有价值的信息放到标签里,比如: | |||
| ```toml | |||
| labels = { region="cloud", product="n9e" } | |||
| ``` | |||
| 标识了这是 cloud 这个 region,n9e 这个产品,这俩标签会附到时序数据上,告警的时候自然也会报出来。 | |||
| 完整配置样例如下: | |||
| ```toml | |||
| [mappings] | |||
| # "127.0.0.1:22"= {region="local",ssh="test"} | |||
| # "127.0.0.1:22"= {region="local",ssh="redis"} | |||
| [[instances]] | |||
| targets = [ | |||
| # "127.0.0.1:22", | |||
| # "localhost:6379", | |||
| # ":9090" | |||
| ] | |||
| # # append some labels for series | |||
| # labels = { region="cloud", product="n9e" } | |||
| # # interval = global.interval * interval_times | |||
| # interval_times = 1 | |||
| ## Protocol, must be "tcp" or "udp" | |||
| ## NOTE: because the "udp" protocol does not respond to requests, it requires | |||
| ## a send/expect string pair (see below). | |||
| # protocol = "tcp" | |||
| ## Set timeout | |||
| # timeout = "1s" | |||
| ## Set read timeout (only used if expecting a response) | |||
| # read_timeout = "1s" | |||
| ## The following options are required for UDP checks. For TCP, they are | |||
| ## optional. The plugin will send the given string to the server and then | |||
| ## expect to receive the given 'expect' string back. | |||
| ## string sent to the server | |||
| # send = "ssh" | |||
| ## expected string in answer | |||
| # expect = "ssh" | |||
| ``` | |||
| ## 监控大盘和告警规则 | |||
| 夜莺内置了仪表盘和告警规则,克隆到自己的业务组即可使用。 | |||