diff --git a/integrations/AliYun/collect/cloud.toml b/integrations/AliYun/collect/cloud.toml new file mode 100644 index 00000000..86e900ef --- /dev/null +++ b/integrations/AliYun/collect/cloud.toml @@ -0,0 +1,21 @@ +# # collect interval +# interval = 60 +[[instances]] +# # endpoint region 参考 https://help.aliyun.com/document_detail/28616.html#section-72p-xhs-6qt +# region="cn-beijing" +# endpoint="metrics.cn-hangzhou.aliyuncs.com" +# access_key_id="your-access-key-id" +# access_key_secret="your-access-key-secret" +# interval_times=4 +# delay="10m" +# period="60s" +# # namespace 参考 https://help.aliyun.com/document_detail/163515.htm?spm=a2c4g.11186623.0.0.44d65c58mhgNw3 +# namespaces=["acs_ecs_dashboard"] +# [[instances.metric_filters]] +# # metric name 参考 https://help.aliyun.com/document_detail/163515.htm?spm=a2c4g.11186623.0.0.401d15c73Z0dZh +# # 参考页面中的Metric Id 填入下面的metricName ,页面中包含中文的Metric Name对应接口中的Description +# metric_names=["cpu_cores","vm.TcpCount"] +# namespace="" +# ratelimit=25 +# catch_ttl="1h" +# timeout="5s" diff --git a/integrations/Ceph/markdown/alerts.png b/integrations/Ceph/markdown/alerts.png new file mode 100644 index 00000000..69d63eda Binary files /dev/null and b/integrations/Ceph/markdown/alerts.png differ diff --git a/integrations/Ceph/markdown/ceph.png b/integrations/Ceph/markdown/ceph.png new file mode 100644 index 00000000..962449be Binary files /dev/null and b/integrations/Ceph/markdown/ceph.png differ diff --git a/integrations/ElasticSearch/collect/elasticsearch.toml b/integrations/ElasticSearch/collect/elasticsearch.toml new file mode 100644 index 00000000..225b636f --- /dev/null +++ b/integrations/ElasticSearch/collect/elasticsearch.toml @@ -0,0 +1,63 @@ +# # collect interval +# interval = 15 + +############################################################################ +# !!! 
uncomment [[instances]] to enable this plugin +[[instances]] +# # interval = global.interval * interval_times +# interval_times = 1 + +# append some labels to metrics +# labels = { cluster="cloud-n9e-es" } + +## specify a list of one or more Elasticsearch servers +# servers = ["http://localhost:9200"] +servers = [] + +## Timeout for HTTP requests to the elastic search server(s) +http_timeout = "10s" + +# either /_nodes/stats or /_nodes/_local/stats depending on this setting +local = false + +## Set cluster_health to true when you want to obtain cluster health stats +cluster_health = true + +## Adjust cluster_health_level when you want to obtain detailed health stats +## The options are +## - indices (default) +## - cluster +cluster_health_level = "cluster" + +## Set cluster_stats to true when you want to obtain cluster stats. +cluster_stats = true + +## Indices to collect; can be one or more indices names or _all +## Use of wildcards is allowed. Use a wildcard at the end to retrieve index names that end with a changing value, like a date. +# indices_include = ["zipkin*"] + +## use "shards" or blank string for indices level +indices_level = "" + +## node_stats is a list of sub-stats that you want to have gathered. Valid options +## are "indices", "os", "process", "jvm", "thread_pool", "fs", "transport", "http", +## "breaker". Per default, all stats are gathered. +node_stats = ["jvm", "breaker", "process", "os", "fs", "indices", "thread_pool", "transport"] + +## HTTP Basic Authentication username and password. +username = "elastic" +password = "password" + +## Optional TLS Config +# use_tls = false +# tls_ca = "/etc/categraf/ca.pem" +# tls_cert = "/etc/categraf/cert.pem" +# tls_key = "/etc/categraf/key.pem" +## Use TLS but skip chain & host verification +# insecure_skip_verify = true + +## Sets the number of most recent indices to return for indices that are configured with a date-stamped suffix. 
+## Each 'indices_include' entry ending with a wildcard (*) or glob matching pattern will group together all indices that match it, and +## sort them by the date or number after the wildcard. Metrics then are gathered for only the 'num_most_recent_indices' amount of most +## recent indices. +num_most_recent_indices = 1 \ No newline at end of file diff --git a/integrations/ElasticSearch/dashboards/elasticsearch_by_categraf.json b/integrations/ElasticSearch/dashboards/elasticsearch_by_categraf.json new file mode 100644 index 00000000..854f708c --- /dev/null +++ b/integrations/ElasticSearch/dashboards/elasticsearch_by_categraf.json @@ -0,0 +1,1945 @@ +{ + "name": "ElasticSearch", + "tags": "ElasticSearch Prometheus", + "ident": "", + "configs": { + "var": [ + { + "name": "service", + "type": "query", + "datasource": { + "cate": "prometheus" + }, + "definition": "label_values(elasticsearch_up, service)" + }, + { + "name": "node_host", + "type": "query", + "datasource": { + "cate": "prometheus" + }, + "definition": "label_values(elasticsearch_jvm_uptime_in_millis{service =\"$service\"}, node_host)", + "multi": true, + "allOption": true + } + ], + "panels": [ + { + "type": "stat", + "id": "f70f4198-dec2-40c0-97d9-6986c7001e73", + "layout": { + "h": 3, + "w": 4, + "x": 0, + "y": 0, + "i": "f70f4198-dec2-40c0-97d9-6986c7001e73", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "min(elasticsearch_cluster_health_status_code{service =\"$service\"})" + } + ], + "name": "", + "custom": { + "textMode": "valueAndName", + "colorMode": "background", + "calc": "lastNotNull", + "valueField": "Value", + "colSpan": 1, + "textSize": {} + }, + "options": { + "valueMappings": [ + { + "type": "special", + "match": { + "special": 0 + }, + "result": { + "text": "N/A" + } + }, + { + "type": "special", + "match": { + "special": 1 + }, + "result": { + "color": "#417505", + "text": "Green" + } + }, + { + "type": 
"special", + "match": { + "special": 2 + }, + "result": { + "color": "#f5a623", + "text": "Yellow" + } + }, + { + "type": "special", + "match": { + "special": 3 + }, + "result": { + "color": "#d0021b", + "text": "Red" + } + } + ], + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + } + }, + { + "targets": [ + { + "refId": "A", + "expr": "min(elasticsearch_cluster_health_number_of_nodes{service =\"$service\"})" + } + ], + "name": "Nodes", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 4, + "y": 0, + "i": "7dafe232-ee30-479b-a2f1-e1064572c154", + "isResizable": true + }, + "id": "7dafe232-ee30-479b-a2f1-e1064572c154" + }, + { + "targets": [ + { + "refId": "A", + "expr": "min(elasticsearch_cluster_health_number_of_data_nodes{service =\"$service\"})" + } + ], + "name": "Data Nodes", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 8, + "y": 0, + "i": "c6953ef5-3e29-44dc-bf9e-74905934e9df", + "isResizable": true + }, + "id": "447fb784-a7e4-41cf-820f-6086837590e6" + }, + { + "targets": [ + { + "refId": "A", + "expr": "max(elasticsearch_process_cpu_percent{service =\"$service\", node_host=~\"$node_host\"})" + } + ], + "name": "CPU Util Percent", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "valueMappings": [ + { + "type": "range", + "match": { + "to": 50 + }, + "result": { + "color": "#417505" + } + }, + { + "type": "range", + "match": { + "from": 50, + "to": 80 + }, + "result": { + 
"color": "#f5a623" + } + }, + { + "type": "range", + "match": { + "from": 80 + }, + "result": { + "color": "#d0021b" + } + } + ], + "standardOptions": { + "util": "percent" + } + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 12, + "y": 0, + "i": "e6cf29e7-bb5d-4c8f-8aa6-67a63fc325c7", + "isResizable": true + }, + "id": "f0375f72-4ca1-474f-81e9-ce6b64f22204" + }, + { + "targets": [ + { + "refId": "A", + "expr": "max(elasticsearch_jvm_mem_heap_used_percent{service =\"$service\", node_host=~\"$node_host\"})" + } + ], + "name": "JVM Heap Util Percent", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "valueMappings": [ + { + "type": "range", + "match": { + "to": 50 + }, + "result": { + "color": "#417505" + } + }, + { + "type": "range", + "match": { + "from": 50, + "to": 80 + }, + "result": { + "color": "#f5a623" + } + }, + { + "type": "range", + "match": { + "from": 80 + }, + "result": { + "color": "#d0021b" + } + } + ], + "standardOptions": { + "util": "percent" + } + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 16, + "y": 0, + "i": "34ae1975-6acb-48a7-adce-1d67b7c581ec", + "isResizable": true + }, + "id": "288ee5f1-b484-43f5-86bf-5b81c01b3c2c" + }, + { + "targets": [ + { + "refId": "A", + "expr": "max(elasticsearch_cluster_health_number_of_pending_tasks{service =\"$service\"})" + } + ], + "name": "Pending Tasks", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "valueMappings": [ + { + "type": "range", + "match": { + "to": 1 + }, + "result": { + "color": "#417505" + } + }, + { + "type": "range", + "match": { + "from": 1 + }, + "result": { + "color": "#d0021b" + } + } + ], + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 20, + "y": 0, + 
"i": "01c403f3-c3b0-4910-84df-a50d4968bcd6", + "isResizable": true + }, + "id": "4dd345c1-2bc1-474e-83b1-153be10a5b5b" + }, + { + "type": "row", + "id": "b398d46e-7345-4cc9-90b4-918cbd1e8d1f", + "name": "Breakers", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 3, + "i": "b398d46e-7345-4cc9-90b4-918cbd1e8d1f", + "isResizable": false + }, + "panels": [] + }, + { + "targets": [ + { + "refId": "A", + "expr": "sum({__name__=~\"elasticsearch_breakers_.+_tripped\", service =\"$service\", node_host=~\"$node_host\"}) by (node_host)", + "legend": "" + } + ], + "name": "Tripped for breakers", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.04, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 0, + "y": 4, + "i": "15882e6f-0585-4035-bfb6-71cb9caaa0a8", + "isResizable": true + }, + "id": "15882e6f-0585-4035-bfb6-71cb9caaa0a8" + }, + { + "type": "timeseries", + "id": "8adbc8e4-f630-4a25-98e3-ee03dec92011", + "layout": { + "h": 4, + "w": 12, + "x": 12, + "y": 4, + "i": "d05d16d0-022d-49f8-9b55-2388c4cbb2b1", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "{__name__=~\"elasticsearch_breakers_.+_size_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}", + "legend": "{{__name__}} {{node_host}}" + } + ], + "name": "Estimated size in bytes of breaker", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "spanNulls": false, + "lineWidth": 1, + 
"fillOpacity": 0.04, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "row", + "id": "2aadd697-3bed-4f70-bc74-4bc801ef7d1d", + "name": "Shards", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 8, + "i": "2aadd697-3bed-4f70-bc74-4bc801ef7d1d", + "isResizable": false + }, + "panels": [] + }, + { + "targets": [ + { + "refId": "A", + "expr": "max(elasticsearch_cluster_health_active_shards{service =\"$service\"})" + } + ], + "name": "Active shards", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 0, + "y": 9, + "i": "7aec074e-1672-4dbb-8529-28292f9a4221", + "isResizable": true + }, + "id": "7aec074e-1672-4dbb-8529-28292f9a4221" + }, + { + "targets": [ + { + "refId": "A", + "expr": "max(elasticsearch_cluster_health_active_primary_shards{service =\"$service\"})" + } + ], + "name": "Active primary shards", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 4, + "y": 9, + "i": "fe82bb33-7b8d-4909-adda-64a4121f29fd", + "isResizable": true + }, + "id": "f138daa7-b98f-4575-89e3-42363a8102c9" + }, + { + "targets": [ + { + "refId": "A", + "expr": "max(elasticsearch_cluster_health_initializing_shards{service =\"$service\"})" + } + ], + "name": "Initializing shards", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 8, + "y": 9, + "i": "de2c46fd-dcfd-43a3-847b-9fd1320dfaa7", + 
"isResizable": true + }, + "id": "7412543a-dba5-4624-96ff-11e30b7e8ff4" + }, + { + "targets": [ + { + "refId": "A", + "expr": "max(elasticsearch_cluster_health_relocating_shards{service =\"$service\"})" + } + ], + "name": "Relocating shards", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 12, + "y": 9, + "i": "4403206d-a491-4564-9f61-db25a6beb356", + "isResizable": true + }, + "id": "2f26f24f-2a79-4552-b79d-60b41fa3aee6" + }, + { + "targets": [ + { + "refId": "A", + "expr": "max(elasticsearch_cluster_health_delayed_unassigned_shards{service =\"$service\"})" + } + ], + "name": "Delayed Unassigned shards", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 16, + "y": 9, + "i": "e24c847a-5704-4b7c-861e-75dd4e4b59d8", + "isResizable": true + }, + "id": "394a83cc-f4e1-467e-83fa-b77d2c2be907" + }, + { + "targets": [ + { + "refId": "A", + "expr": "max(elasticsearch_cluster_health_unassigned_shards{service =\"$service\"})" + } + ], + "name": "Unassigned shards", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 4, + "x": 20, + "y": 9, + "i": "d82314d5-028c-41fb-a79f-34699d56d17a", + "isResizable": true + }, + "id": "720b9719-5c37-44d9-bce8-539308afa6ae" + }, + { + "type": "row", + "id": "a1bc2be7-723b-4fe4-b217-bfdd8248559e", + "name": "JVM", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 12, + "i": "a1bc2be7-723b-4fe4-b217-bfdd8248559e", + 
"isResizable": false + }, + "panels": [] + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_jvm_gc_collectors_old_collection_count{service =\"$service\", node_host=~\"$node_host\"}[5m])", + "legend": "old gc {{node_host}}" + }, + { + "expr": "irate(elasticsearch_jvm_gc_collectors_young_collection_count{service =\"$service\", node_host=~\"$node_host\"}[5m])", + "refId": "B", + "legend": "young gc {{node_host}}" + } + ], + "name": "GC counts / second", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 0, + "y": 13, + "i": "ee0c56e0-8f8e-4cbe-ac41-de2afad7b75a", + "isResizable": true + }, + "id": "ee0c56e0-8f8e-4cbe-ac41-de2afad7b75a" + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_jvm_gc_collectors_old_collection_time_in_millis{service =\"$service\", node_host=~\"$node_host\"}[5m])", + "legend": "old gc {{node_host}}" + }, + { + "expr": "irate(elasticsearch_jvm_gc_collectors_young_collection_time_in_millis{service =\"$service\", node_host=~\"$node_host\"}[5m])", + "refId": "B", + "legend": "young gc {{node_host}}" + } + ], + "name": "GC time in millis", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 12, + "y": 13, + "i": "4f21ebfc-b51c-469b-b149-479966750920", + "isResizable": true + }, + "id": "5c361278-8a94-4b16-afdd-e6def804b9ff" + }, + { + "type": "timeseries", + "id": 
"ff81d109-79e5-4909-8765-857a75cebf17", + "layout": { + "h": 4, + "w": 6, + "x": 0, + "y": 17, + "i": "5105f1dc-26cb-4818-a04d-90f2e5803da2", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "{__name__=~\"elasticsearch_jvm_mem_pools_young_used_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}", + "legend": "{{__name__}} {{node_name}}" + } + ], + "name": "Yong Used(Bytes)", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC", + "decimals": 1 + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + } + } + }, + { + "type": "timeseries", + "id": "d7c76456-8f34-4e1b-843b-9d174bbdfcee", + "layout": { + "h": 4, + "w": 6, + "x": 6, + "y": 17, + "i": "86841663-2a17-4858-a9e1-13c296b3bb76", + "isResizable": true + }, + "version": "2.0.0", + "datasourceCate": "prometheus", + "targets": [ + { + "refId": "A", + "expr": "{__name__=~\"elasticsearch_jvm_mem_pools_old_used_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}", + "legend": "{{__name__}} {{node_name}}" + } + ], + "name": "Old Used(Bytes)", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC", + "decimals": 1 + }, + "thresholds": { + "steps": [ + { + "color": "#634CD9", + "value": null, + "type": "base" + } + ] + } + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "spanNulls": false, + "lineWidth": 1, + "fillOpacity": 0.5, + "gradientMode": "none", + "stack": "off", + "scaleDistribution": { + "type": "linear" + 
} + } + }, + { + "targets": [ + { + "refId": "A", + "expr": "{__name__=~\"elasticsearch_jvm_mem_.+_committed_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}", + "legend": "{{__name__}} {{node_name}}" + } + ], + "name": "Committed Bytes", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC", + "decimals": 1 + }, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 12, + "y": 17, + "i": "14f655ac-9c1c-40fa-bfef-158cc8601ead", + "isResizable": true + }, + "id": "a0096936-3790-40a1-b2ad-d7805945b948" + }, + { + "type": "row", + "id": "e1d04a8c-81ee-4949-87de-3b70bc637584", + "name": "Translog", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 21, + "i": "e1d04a8c-81ee-4949-87de-3b70bc637584", + "isResizable": false + }, + "panels": [] + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_indices_translog_operations{service =\"$service\", node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Total translog operations", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 0, + "y": 22, + "i": "45aafb11-c694-4686-89ab-685068f91560", + "isResizable": true + }, + "id": "45aafb11-c694-4686-89ab-685068f91560" + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_indices_translog_size_in_bytes{service =\"$service\", node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + 
"name": "Total translog size in bytes", + "options": { + "tooltip": { + "mode": "all", + "sort": "none" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC", + "decimals": 1 + }, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 12, + "y": 22, + "i": "56806f8a-525a-4ab4-a9d3-c83559ae4828", + "isResizable": true + }, + "id": "09ca6329-8eec-4a61-b19e-9bbeea2b9712" + }, + { + "type": "row", + "id": "d9694c3f-9a14-4bde-9427-88531b0ea3a6", + "name": "Disk and Network", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 26, + "i": "d9694c3f-9a14-4bde-9427-88531b0ea3a6", + "isResizable": false + }, + "panels": [] + }, + { + "targets": [ + { + "refId": "A", + "expr": "1-(elasticsearch_fs_total_available_in_bytes{service =\"$service\",node_host=~\"$node_host\"}/elasticsearch_fs_total_total_in_bytes{service =\"$service\",node_host=~\"$node_host\"})", + "legend": "{{node_host}}" + } + ], + "name": "Disk usage %", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "percentUnit" + }, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 0, + "y": 27, + "i": "6c0d9b3c-dda5-4da9-825e-33f650dbb008", + "isResizable": true + }, + "id": "6c0d9b3c-dda5-4da9-825e-33f650dbb008" + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_transport_tx_size_in_bytes{service =\"$service\",node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}: sent" + }, + { + "expr": "-irate(elasticsearch_transport_rx_size_in_bytes{service =\"$service\",node_host=~\"$node_host\"}[5m])", 
+ "refId": "B", + "legend": "{{node_host}}: received" + } + ], + "name": "Network usage", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 12, + "y": 27, + "i": "4f7ce5a7-2771-4cbf-a569-b1a90b070b93", + "isResizable": true + }, + "id": "4f7ce5a7-2771-4cbf-a569-b1a90b070b93" + }, + { + "type": "row", + "id": "292b6c24-5471-4eeb-9d65-1a1e7a684fe3", + "name": "Documents", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 31, + "i": "292b6c24-5471-4eeb-9d65-1a1e7a684fe3", + "isResizable": false + }, + "panels": [] + }, + { + "targets": [ + { + "refId": "A", + "expr": "elasticsearch_indices_docs_count{service =\"$service\", node_host=~\"$node_host\"}", + "legend": "{{node_host}}" + } + ], + "name": "Documents count on node", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 0, + "y": 32, + "i": "e98839c4-e3f3-4e6e-be3a-c44b70e6072c", + "isResizable": true + }, + "id": "e98839c4-e3f3-4e6e-be3a-c44b70e6072c" + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_indices_indexing_index_total{service =\"$service\",node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Documents indexed rate", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": 
"lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 12, + "y": 32, + "i": "45c4e3d9-90f1-41bd-8169-1d8c0a921ba9", + "isResizable": true + }, + "id": "45c4e3d9-90f1-41bd-8169-1d8c0a921ba9" + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_indices_docs_deleted{service =\"$service\",node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Documents deleted rate", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 0, + "y": 36, + "i": "3b2a922d-4423-4845-8cfc-95970f3300d6", + "isResizable": true + }, + "id": "3b2a922d-4423-4845-8cfc-95970f3300d6" + }, + { + "targets": [ + { + "refId": "A", + "expr": "rate(elasticsearch_indices_merges_total_docs{service =\"$service\",node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Documents merged rate", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 6, + "x": 12, + "y": 36, + "i": "764fbcf7-3056-41ef-b62a-51813a6c315f", + "isResizable": true + }, + "id": "764fbcf7-3056-41ef-b62a-51813a6c315f" + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_indices_merges_total_size_in_bytes{service =\"$service\",node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Documents merged 
bytes", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC", + "decimals": 1 + }, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 6, + "x": 18, + "y": 36, + "i": "97b5d900-e91e-4e0e-8184-f508a3433bc6", + "isResizable": true + }, + "id": "7cc04ae4-946d-4837-9ea9-764a7cc2eecd" + }, + { + "type": "row", + "id": "a0545cbd-6df5-4845-90e0-88a710f738ba", + "name": "Times", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 40, + "i": "a0545cbd-6df5-4845-90e0-88a710f738ba", + "isResizable": false + }, + "panels": [] + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_indices_search_query_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Query time(Unit: ms)", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 0, + "y": 41, + "i": "ad0445b0-8539-440d-bbf4-712450132a7a", + "isResizable": true + }, + "id": "ad0445b0-8539-440d-bbf4-712450132a7a" + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_indices_indexing_index_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Indexing time(Unit: ms)", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + 
"lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 12, + "y": 41, + "i": "2af98dc1-f24e-4c7b-bd2c-723224facc5d", + "isResizable": true + }, + "id": "c3cf6c57-c4ce-4bc2-a150-df32c4951144" + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_indices_merges_total_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Merging time(Unit: ms)", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 0, + "y": 45, + "i": "553c7da8-2d83-4ea0-a6ef-b064a5101633", + "isResizable": true + }, + "id": "022db454-70ba-49f5-8c11-f89b76d145cb" + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate(elasticsearch_indices_indexing_throttle_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Indexing throttle time(Unit: ms)", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 12, + "y": 45, + "i": "51056e8d-6dc8-4c7f-91e9-9c24c056462d", + "isResizable": true + }, + "id": "f20bad4f-656c-428a-a1cf-aafb7d92137c" + }, + { + "type": "row", + "id": "2c56fb7a-85a0-4396-a317-6754d761cff2", + "name": "Thread Pool", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 49, + "i": 
"2c56fb7a-85a0-4396-a317-6754d761cff2", + "isResizable": false + }, + "panels": [] + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate\n(label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_rejected\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_rejected\")[5m:])", + "legend": "{{node_host}}: {{type}}" + } + ], + "name": "Thread Pool operations rejected", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 6, + "x": 0, + "y": 50, + "i": "cefafeb9-fc8a-4c73-92b3-648cd6f08b11", + "isResizable": true + }, + "id": "cefafeb9-fc8a-4c73-92b3-648cd6f08b11" + }, + { + "targets": [ + { + "refId": "A", + "expr": "label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_active\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_active\")", + "legend": "{{node_host}}: {{type}}" + } + ], + "name": "Thread Pool threads active", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 6, + "x": 6, + "y": 50, + "i": "793e98e7-2729-4106-940c-ecccff1d4b89", + "isResizable": true + }, + "id": "0ab67903-16ea-4001-b784-ae04d8b815c0" + }, + { + "targets": [ + { + "refId": "A", + "expr": "label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_queue\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", 
\"elasticsearch_thread_pool_(.*)_queue\")", + "legend": "{{node_host}}: {{type}}" + } + ], + "name": "Thread Pool threads queued", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 6, + "x": 12, + "y": 50, + "i": "9a14c86a-86af-4464-ac82-41f621ce7166", + "isResizable": true + }, + "id": "bb5dc07d-673b-4e2d-b44c-441acfa7c27b" + }, + { + "targets": [ + { + "refId": "A", + "expr": "irate\n(label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_completed\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_completed\")[5m:])", + "legend": "{{node_host}}: {{type}}" + } + ], + "name": "Thread Pool operations completed", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 6, + "x": 18, + "y": 50, + "i": "d5f42ea7-bdb5-44da-9ba6-7c0f09ba7c71", + "isResizable": true + }, + "id": "4cac1498-c141-483f-97c6-e1177317a2ea" + }, + { + "type": "row", + "id": "a5c3b529-c329-4a66-aab0-6caebba8be96", + "name": "Caches", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 54, + "i": "a5c3b529-c329-4a66-aab0-6caebba8be96", + "isResizable": false + }, + "panels": [] + }, + { + "targets": [ + { + "refId": "A", + "expr": "elasticsearch_indices_fielddata_memory_size_in_bytes{service =\"$service\", node_host=~\"$node_host\"}", + "legend": "{{node_host}}" + } + ], + "name": "Field data memory size", + 
"options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC", + "decimals": 0 + }, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 4, + "x": 0, + "y": 55, + "i": "5247f393-a934-4d9e-be0f-40b177d2be80", + "isResizable": true + }, + "id": "5247f393-a934-4d9e-be0f-40b177d2be80" + }, + { + "targets": [ + { + "refId": "A", + "expr": "rate(elasticsearch_indices_fielddata_evictions{service =\"$service\", node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Field data evictions", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC", + "decimals": 0 + }, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 4, + "x": 4, + "y": 55, + "i": "c33fceb6-df37-483e-ba53-4ffa5f5e5456", + "isResizable": true + }, + "id": "b87c56f7-4e50-4d15-8bcd-1218fee879d9" + }, + { + "targets": [ + { + "refId": "A", + "expr": "elasticsearch_indices_query_cache_memory_size_in_bytes{service =\"$service\", node_host=~\"$node_host\"}", + "legend": "{{node_host}}" + } + ], + "name": "Query cache size", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC", + "decimals": 0 + }, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 4, + "x": 8, + "y": 55, + "i": 
"445484f4-32d3-4569-af8d-76790d0aa56b", + "isResizable": true + }, + "id": "ae2d0a7a-b6cd-4fd5-99d4-3c4289b8b5a8" + }, + { + "targets": [ + { + "refId": "A", + "expr": "rate(elasticsearch_indices_query_cache_evictions{service =\"$service\", node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Query cache evictions", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC", + "decimals": 0 + }, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 4, + "x": 12, + "y": 55, + "i": "ce9aa255-9d5b-44ed-9071-85e9d95675ec", + "isResizable": true + }, + "id": "78e4badc-8d51-4aa6-81c5-d1c9183810a2" + }, + { + "targets": [ + { + "refId": "A", + "expr": "rate(elasticsearch_indices_request_cache_evictions{service =\"$service\", node_host=~\"$node_host\"}[5m])", + "legend": "{{node_host}}" + } + ], + "name": "Request cache evictions", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC", + "decimals": 0 + }, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 8, + "x": 16, + "y": 55, + "i": "b8b2604c-d84f-426f-b033-af9035a9e80d", + "isResizable": true + }, + "id": "296b43f1-2f33-492a-bce8-6f0fde1e7b52" + }, + { + "type": "row", + "id": "d3161bf5-27a7-4552-a7d6-7b2b7d46b611", + "name": "Segments", + "collapsed": true, + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 59, + "i": "d3161bf5-27a7-4552-a7d6-7b2b7d46b611", + "isResizable": false + } + }, + { + "targets": [ + { + "refId": "A", + "expr": 
"elasticsearch_indices_segments_count{service =\"$service\", node_host=~\"$node_host\"}", + "legend": "{{node_host}}" + } + ], + "name": "Count of index segments", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 0, + "y": 60, + "i": "1537acaa-d5ce-48c5-b740-26fd543eb120", + "isResizable": true + }, + "id": "1537acaa-d5ce-48c5-b740-26fd543eb120" + }, + { + "targets": [ + { + "refId": "A", + "expr": "elasticsearch_indices_segments_memory_in_bytes{service =\"$service\", node_host=~\"$node_host\"}", + "legend": "{{node_host}}" + } + ], + "name": "Current memory size of segments in bytes", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 4, + "w": 12, + "x": 12, + "y": 60, + "i": "9c8efed6-7ced-4805-87e0-3da3b18d2989", + "isResizable": true + }, + "id": "a1c34fa4-4549-41a6-8d31-d25e7d860106" + } + ], + "version": "2.0.0" + } +} \ No newline at end of file diff --git a/integrations/HTTP/icon/http.png b/integrations/HTTP/icon/http.png new file mode 100644 index 00000000..500fdec4 Binary files /dev/null and b/integrations/HTTP/icon/http.png differ diff --git a/integrations/IPMI/collect/conf.toml b/integrations/IPMI/collect/conf.toml new file mode 100644 index 00000000..8146b783 --- /dev/null +++ b/integrations/IPMI/collect/conf.toml @@ -0,0 +1,43 @@ +# Read metrics from the bare metal servers via IPMI +[[instances]] + ## optionally specify the path to the ipmitool 
executable + # path = "/usr/bin/ipmitool" + ## + ## Setting 'use_sudo' to true will make use of sudo to run ipmitool. + ## Sudo must be configured to allow the telegraf user to run ipmitool + ## without a password. + # use_sudo = false + ## + ## optionally force session privilege level. Can be CALLBACK, USER, OPERATOR, ADMINISTRATOR + # privilege = "ADMINISTRATOR" + ## + ## optionally specify one or more servers via a url matching + ## [username[:password]@][protocol[(address)]] + ## e.g. + ## root:passwd@lan(127.0.0.1) + ## + ## if no servers are specified, local machine sensor stats will be queried + ## + # servers = ["USERID:PASSW0RD@lan(192.168.1.1)"] + + ## Recommended: use metric 'interval' that is a multiple of 'timeout' to avoid + ## gaps or overlap in pulled data + interval = "30s" + + ## Timeout for the ipmitool command to complete. Default is 20 seconds. + timeout = "20s" + + ## Schema Version: (Optional, defaults to version 1) + metric_version = 2 + + ## Optionally provide the hex key for the IPMI connection. + # hex_key = "" + + ## If ipmitool should use a cache + ## for me ipmitool runs about 2 to 10 times faster with cache enabled on HP G10 servers (when using ubuntu20.04) + ## the cache file may not work well for you if some sensors come up late + # use_cache = false + + ## Path to the ipmitool cache file (defaults to OS temp dir) + ## The provided path must exist and must be writable + # cache_path = "" diff --git a/integrations/Kafka/collect/kafka.toml b/integrations/Kafka/collect/kafka.toml new file mode 100644 index 00000000..becaa057 --- /dev/null +++ b/integrations/Kafka/collect/kafka.toml @@ -0,0 +1,99 @@ +# # collect interval +# interval = 15 + +############################################################################ +# !!! 
uncomment [[instances]] to enable this plugin +[[instances]] +# # interval = global.interval * interval_times +# interval_times = 1 + +# append some labels to metrics +# cluster is a preferred tag with the cluster name. If none is provided, the first of kafka_uris will be used +labels = { cluster="kafka-cluster-01" } + +# log level only for kafka exporter +log_level = "error" + +# Address (host:port) of Kafka server. +# kafka_uris = ["127.0.0.1:9092","127.0.0.1:9092","127.0.0.1:9092"] +kafka_uris = [] + +# Connect using SASL/PLAIN +# Default is false +# use_sasl = false + +# Only set this to false if using a non-Kafka SASL proxy +# Default is true +# use_sasl_handshake = false + +# SASL user name +# sasl_username = "username" + +# SASL user password +# sasl_password = "password" + +# The SASL SCRAM SHA algorithm sha256 or sha512 as mechanism +# sasl_mechanism = "" + +# Connect using TLS +# use_tls = false + +# The optional certificate authority file for TLS client authentication +# ca_file = "" + +# The optional certificate file for TLS client authentication +# cert_file = "" + +# The optional key file for TLS client authentication +# key_file = "" + +# If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure +# insecure_skip_verify = true + +# Kafka broker version +# Default is 2.0.0 +# kafka_version = "2.0.0" + +# if you need to use a group from zookeeper +# Default is false +# use_zookeeper_lag = false + +# Address array (hosts) of zookeeper server. +# zookeeper_uris = [] + +# Metadata refresh interval +# Default is 1m +# metadata_refresh_interval = "1m" + +# Whether to show the offset/lag for all consumer groups; otherwise, only connected consumer groups are shown +# Default is true +# offset_show_all = true + +# If true, all scrapes will trigger kafka operations; otherwise, they will share results. 
WARN: This should be disabled on large clusters +# Default is false +# allow_concurrency = false + +# Maximum number of offsets to store in the interpolation table for a partition +# Default is 1000 +# max_offsets = 1000 + +# How frequently the interpolation table should be pruned, in seconds. +# Default is 30 +# prune_interval_seconds = 30 + +# Regex filter for topics to be monitored +# Default is ".*" +# topics_filter_regex = ".*" + +# Regex filter for consumer groups to be monitored +# Default is ".*" +# groups_filter_regex = ".*" + +# Whether to rename kafka_consumergroup_uncommitted_offsets to kafka_consumergroup_lag +# Default is false +# rename_uncommit_offset_to_lag = false + + +# Whether to disable calculating the lag rate +# Default is false +# disable_calculate_lag_rate = false \ No newline at end of file diff --git a/integrations/Kafka/markdown/alerts..png b/integrations/Kafka/markdown/alerts..png new file mode 100644 index 00000000..2dcdfc72 Binary files /dev/null and b/integrations/Kafka/markdown/alerts..png differ diff --git a/integrations/Kafka/markdown/dashboards.png b/integrations/Kafka/markdown/dashboards.png new file mode 100644 index 00000000..20685773 Binary files /dev/null and b/integrations/Kafka/markdown/dashboards.png differ diff --git a/integrations/Kubernetes/collect/kubernetes.toml b/integrations/Kubernetes/collect/kubernetes.toml new file mode 100644 index 00000000..1f86d066 --- /dev/null +++ b/integrations/Kubernetes/collect/kubernetes.toml @@ -0,0 +1,42 @@ +# # collect interval +# interval = 15 + +[[instances]] +# # append some labels for series +# labels = { region="cloud", product="n9e" } + +# # interval = global.interval * interval_times +# interval_times = 1 + +# URL for the kubelet +# url = "https://$HOSTIP:10250" +url = "" + +gather_system_container_metrics = true +gather_node_metrics = true +gather_pod_container_metrics = true +gather_pod_volume_metrics = true +gather_pod_network_metrics = true + +## Use bearer token for authorization. 
('bearer_token' takes priority) +## If both of these are empty, we'll use the default serviceaccount: +## at: /var/run/secrets/kubernetes.io/serviceaccount/token +# bearer_token = "/path/to/bearer/token" +## OR +# bearer_token_string = "abc_123" + +## Pod labels to be added as tags. An empty array for both include and +## exclude will include all labels. +# label_include = [] +# label_exclude = ["*"] + +## Set response_timeout (default 5 seconds) +# response_timeout = "5s" + +## Optional TLS Config +use_tls = true +# tls_ca = "/etc/categraf/ca.pem" +# tls_cert = "/etc/categraf/cert.pem" +# tls_key = "/etc/categraf/key.pem" +## Use TLS but skip chain & host verification +insecure_skip_verify = true \ No newline at end of file diff --git a/integrations/MinIO/markdown/alerts.png b/integrations/MinIO/markdown/alerts.png new file mode 100644 index 00000000..8d97d764 Binary files /dev/null and b/integrations/MinIO/markdown/alerts.png differ diff --git a/integrations/MinIO/markdown/minio.png b/integrations/MinIO/markdown/minio.png new file mode 100644 index 00000000..794851b0 Binary files /dev/null and b/integrations/MinIO/markdown/minio.png differ diff --git a/integrations/MongoDB/collect/mongodb.toml b/integrations/MongoDB/collect/mongodb.toml new file mode 100644 index 00000000..35ff1eb7 --- /dev/null +++ b/integrations/MongoDB/collect/mongodb.toml @@ -0,0 +1,57 @@ +[[instances]] +# log level, enum: panic, fatal, error, warn, warning, info, debug, trace, defaults to info. +log_level = "info" +# append some const labels to metrics +# NOTICE! the instance label is required for dashboards +labels = { instance="mongo-cluster-01" } + +# mongodb dsn, see https://www.mongodb.com/docs/manual/reference/connection-string/ +# mongodb_uri = "mongodb://127.0.0.1:27017" +mongodb_uri = "" +# if you don't specify the username or password in the mongodb_uri, you can set here. 
+# This will overwrite the dsn; it is helpful when special characters exist in the username or password and you don't want to encode them. +# NOTICE! this user must be granted enough rights to query needed stats, see ../inputs/mongodb/README.md +username = "username@Bj" +password = "password@Bj" +# if set to true, use a direct connection +# direct_connect = true + +# collect all means you collect all the metrics, if set, all below enable_xxx flags in this section will be ignored +collect_all = true +# if set to true, collect databases metrics +# enable_db_stats = true +# if set to true, collect getDiagnosticData metrics +# enable_diagnostic_data = true +# if set to true, collect replSetGetStatus metrics +# enable_replicaset_status = true +# if set to true, collect top metrics by admin command +# enable_top_metrics = true +# if set to true, collect index metrics. You should specify one of the coll_stats_namespaces and the discovering_mode flags. +# enable_index_stats = true +# if set to true, collect collections metrics. You should specify one of the coll_stats_namespaces and the discovering_mode flags. +# enable_coll_stats = true + +# Only get stats for the collections matching this list of namespaces. if none set, discovering_mode will be enabled. +# Example: db1.col1,db.col1 +# coll_stats_namespaces = [] +# Only get stats for index with the collections matching this list of namespaces. +# Example: db1.col1,db.col1 +# index_stats_collections = [] +# if set to true, replace -1 with DESC for label key_name of the descending_index metrics +# enable_override_descending_index = true + +# compatible mode exposes metrics with 0.1x-compatible metric names, which simplifies migration from the old version to the current version. 
+# compatible_mode = true + + +# [[instances]] +# # interval = global.interval * interval_times +# interval_times = 1 + +# log_level = "error" + +# append some labels to metrics +# labels = { instance="mongo-cluster-02" } +# mongodb_uri = "mongodb://username:password@127.0.0.1:27017" +# collect_all = true +# compatible_mode = true \ No newline at end of file diff --git a/integrations/MySQL/collect/mysql.toml b/integrations/MySQL/collect/mysql.toml new file mode 100644 index 00000000..993be854 --- /dev/null +++ b/integrations/MySQL/collect/mysql.toml @@ -0,0 +1,65 @@ +# # collect interval +# interval = 15 + +# [[queries]] +# mesurement = "users" +# metric_fields = [ "total" ] +# label_fields = [ "service" ] +# timeout = "3s" +# request = ''' +# select 'n9e' as service, count(*) as total from n9e_v5.users +# ''' + + +[[instances]] +# address = "127.0.0.1:3306" +# username = "root" +# password = "1234" + +# # set tls=custom to enable tls +# parameters = "tls=false" + +# extra_status_metrics = true +# extra_innodb_metrics = false +# gather_processlist_processes_by_state = false +# gather_processlist_processes_by_user = false +# gather_schema_size = true +# gather_table_size = false +# gather_system_table_size = false +# gather_slave_status = true + +# # timeout +# timeout_seconds = 3 + +# # interval = global.interval * interval_times +# interval_times = 1 + +# important! 
use global unique string to specify instance +# labels = { instance="n9e-10.2.3.4:3306" } + +## Optional TLS Config +# use_tls = false +# tls_min_version = "1.2" +# tls_ca = "/etc/categraf/ca.pem" +# tls_cert = "/etc/categraf/cert.pem" +# tls_key = "/etc/categraf/key.pem" +## Use TLS but skip chain & host verification +# insecure_skip_verify = true + +#[[instances.queries]] +# mesurement = "lock_wait" +# metric_fields = [ "total" ] +# timeout = "3s" +# request = ''' +#SELECT count(*) as total FROM information_schema.innodb_trx WHERE trx_state='LOCK WAIT' +#''' + +# [[instances.queries]] +# mesurement = "users" +# metric_fields = [ "total" ] +# label_fields = [ "service" ] +# # field_to_append = "" +# timeout = "3s" +# request = ''' +# select 'n9e' as service, count(*) as total from n9e_v5.users +# ''' diff --git a/integrations/MySQL/dashboards/mysql_by_categraf.json b/integrations/MySQL/dashboards/mysql_by_categraf.json new file mode 100644 index 00000000..d40b1aaa --- /dev/null +++ b/integrations/MySQL/dashboards/mysql_by_categraf.json @@ -0,0 +1,906 @@ +{ + "name": "MySQL Overview by categraf", + "tags": "Prometheus MySQL", + "ident": "", + "configs": { + "panels": [ + { + "collapsed": true, + "id": "fe0e2a5d-4e82-4eaf-b13a-6d98aa6b6860", + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 0, + "i": "fe0e2a5d-4e82-4eaf-b13a-6d98aa6b6860", + "isResizable": false + }, + "name": "Basic Info", + "type": "row" + }, + { + "custom": { + "calc": "lastNotNull", + "colSpan": 1, + "colorMode": "value", + "textMode": "value", + "textSize": {} + }, + "id": "80079949-dbff-48fe-a1eb-54b646c30135", + "layout": { + "h": 3, + "w": 6, + "x": 0, + "y": 1, + "i": "80079949-dbff-48fe-a1eb-54b646c30135", + "isResizable": true + }, + "name": "MySQL Uptime", + "options": { + "standardOptions": { + "util": "humantimeSeconds" + }, + "valueMappings": [ + { + "match": { + "to": 1800 + }, + "result": { + "color": "#ec7718" + }, + "type": "range" + }, + { + "match": { + "from": 1800 + }, + 
"result": { + "color": "#369603" + }, + "type": "range" + } + ] + }, + "targets": [ + { + "expr": "min(mysql_global_status_uptime{instance=~\"$instance\"})" + } + ], + "type": "stat", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "calc": "lastNotNull", + "colSpan": 1, + "colorMode": "value", + "textMode": "value", + "textSize": {} + }, + "description": "mysql_global_status_queries", + "id": "9fd6dd09-d131-4c0e-88ea-ed62c72baf97", + "layout": { + "h": 3, + "w": 6, + "x": 6, + "y": 1, + "i": "9fd6dd09-d131-4c0e-88ea-ed62c72baf97", + "isResizable": true + }, + "name": "Current QPS", + "options": { + "standardOptions": { + "decimals": 2 + }, + "valueMappings": [ + { + "match": { + "to": 100 + }, + "result": { + "color": "#05a31f" + }, + "type": "range" + }, + { + "match": { + "from": 100 + }, + "result": { + "color": "#ea3939" + }, + "type": "range" + } + ] + }, + "targets": [ + { + "expr": "rate(mysql_global_status_queries{instance=~\"$instance\"}[5m])" + } + ], + "type": "stat", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "calc": "lastNotNull", + "colSpan": 1, + "colorMode": "value", + "textMode": "value", + "textSize": {} + }, + "description": "**InnoDB Buffer Pool Size**\n\nInnoDB maintains a storage area called the buffer pool for caching data and indexes in memory. Knowing how the InnoDB buffer pool works, and taking advantage of it to keep frequently accessed data in memory, is one of the most important aspects of MySQL tuning. The goal is to keep the working set in memory. 
In most cases, this should be between 60%-90% of available memory on a dedicated database host, but depends on many factors.", + "id": "24913190-b86d-44b7-a8db-555351d9d3c2", + "layout": { + "h": 3, + "w": 6, + "x": 12, + "y": 1, + "i": "24913190-b86d-44b7-a8db-555351d9d3c2", + "isResizable": true + }, + "name": "InnoDB Buffer Pool", + "options": { + "standardOptions": { + "util": "bytesIEC" + } + }, + "targets": [ + { + "expr": "avg(mysql_global_variables_innodb_buffer_pool_size{instance=~\"$instance\"})" + } + ], + "type": "stat", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "calc": "lastNotNull", + "colSpan": 1, + "colorMode": "value", + "textMode": "value", + "textSize": {} + }, + "description": "**Table Locks**\n\nMySQL takes a number of different locks for varying reasons. In this graph we see how many Table level locks MySQL has requested from the storage engine. In the case of InnoDB, many times the locks could actually be row locks as it only takes table level locks in a few specific cases.\n\nIt is most useful to compare Locks Immediate and Locks Waited. If Locks waited is rising, it means you have lock contention. 
Otherwise, Locks Immediate rising and falling is normal activity.", + "id": "94a1e97e-2241-4e05-a9e9-a9b1e69d1070", + "layout": { + "h": 3, + "w": 6, + "x": 18, + "y": 1, + "i": "94a1e97e-2241-4e05-a9e9-a9b1e69d1070", + "isResizable": true + }, + "name": "Table Locks Waited(5min)", + "options": { + "standardOptions": {}, + "valueMappings": [ + { + "match": { + "from": 1 + }, + "result": { + "color": "#e70d0d" + }, + "type": "range" + }, + { + "match": { + "to": 1 + }, + "result": { + "color": "#53b503" + }, + "type": "range" + } + ] + }, + "targets": [ + { + "expr": "sum(increase(mysql_global_status_table_locks_waited{instance=~\"$instance\"}[5m]))" + } + ], + "type": "stat", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "ca82d30f-8e0d-4caa-8a00-2ed9efe4ad85", + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 4, + "i": "ca82d30f-8e0d-4caa-8a00-2ed9efe4ad85", + "isResizable": false + }, + "name": "Connections", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. 
The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.", + "id": "e2c85e72-0286-49bc-8ddb-5fba5f449b53", + "layout": { + "h": 7, + "w": 12, + "x": 0, + "y": 5, + "i": "e2c85e72-0286-49bc-8ddb-5fba5f449b53", + "isResizable": true + }, + "name": "MySQL Connections", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum(mysql_global_status_threads_connected{instance=~\"$instance\"})", + "legend": "Connections" + }, + { + "expr": "sum(mysql_global_status_max_used_connections{instance=~\"$instance\"})", + "legend": "Max Used Connections" + }, + { + "expr": "sum(mysql_global_variables_max_connections{instance=~\"$instance\"})", + "legend": "Max Connections" + }, + { + "expr": "sum(rate(mysql_global_status_aborted_connects{instance=~\"$instance\"}[5m]))", + "legend": "Aborted Connections" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "Threads Connected is the number of open connections, while Threads Running is the number of threads not sleeping.", + "id": "fbd43ac2-159d-4e55-8bc6-800d1bbfbd59", + "layout": { + "h": 7, + "w": 12, + "x": 12, + "y": 5, + "i": "fbd43ac2-159d-4e55-8bc6-800d1bbfbd59", + "isResizable": true + }, + "name": "MySQL Client Thread Activity", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "desc" + } + }, + "targets": [ + { + "expr": 
"sum(mysql_global_status_threads_connected{instance=~\"$instance\"})", + "legend": "Threads Connected" + }, + { + "expr": "sum(mysql_global_status_threads_running{instance=~\"$instance\"})", + "legend": "Threads Running" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "cb81def4-ac63-4d42-b66e-440f9061794b", + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 12, + "i": "cb81def4-ac63-4d42-b66e-440f9061794b", + "isResizable": false + }, + "name": "Query Performance", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.64, + "lineInterpolation": "smooth", + "stack": "off" + }, + "id": "5fa65a30-a49b-457f-b46a-11d2029188bd", + "layout": { + "h": 7, + "w": 12, + "x": 0, + "y": 13, + "i": "5fa65a30-a49b-457f-b46a-11d2029188bd", + "isResizable": true + }, + "name": "MySQL Temporary Objects", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum(rate(mysql_global_status_created_tmp_tables{instance=~\"$instance\"}[5m]))", + "legend": "Created Tmp Tables" + }, + { + "expr": "sum(rate(mysql_global_status_created_tmp_disk_tables{instance=~\"$instance\"}[5m]))", + "legend": "Created Tmp Disk Tables" + }, + { + "expr": "sum(rate(mysql_global_status_created_tmp_files{instance=~\"$instance\"}[5m]))", + "legend": "Created Tmp Files" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.41, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "**MySQL Select Types**\n\nAs with most relational databases, selecting based on indexes is more efficient than scanning an entire table's data. 
Here we see the counters for selects not done with indexes.\n\n* ***Select Scan*** is how many queries caused full table scans, in which all the data in the table had to be read and either discarded or returned.\n* ***Select Range*** is how many queries used a range scan, which means MySQL scanned all rows in a given range.\n* ***Select Full Join*** is the number of joins that are not joined on an index, this is usually a huge performance hit.", + "id": "20efd251-6207-4cec-aa3b-4351e8e9b125", + "layout": { + "h": 7, + "w": 12, + "x": 12, + "y": 13, + "i": "20efd251-6207-4cec-aa3b-4351e8e9b125", + "isResizable": true + }, + "name": "MySQL Select Types", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum(rate(mysql_global_status_select_full_join{ instance=~\"$instance\"}[5m]))", + "legend": "Select Full Join" + }, + { + "expr": "sum(rate(mysql_global_status_select_full_range_join{ instance=~\"$instance\"}[5m]))", + "legend": "Select Full Range Join" + }, + { + "expr": "sum(rate(mysql_global_status_select_range{ instance=~\"$instance\"}[5m]))", + "legend": "Select Range" + }, + { + "expr": "sum(rate(mysql_global_status_select_range_check{ instance=~\"$instance\"}[5m]))", + "legend": "Select Range Check" + }, + { + "expr": "sum(rate(mysql_global_status_select_scan{ instance=~\"$instance\"}[5m]))", + "legend": "Select Scan" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "**MySQL Sorts**\n\nDue to a query's structure, order, or other requirements, MySQL sorts the rows before returning them. 
For example, if a table is ordered 1 to 10 but you want the results reversed, MySQL then has to sort the rows to return 10 to 1.\n\nThis graph also shows when sorts had to scan a whole table or a given range of a table in order to return the results and which could not have been sorted via an index.", + "id": "a4d0c5fb-04e0-4627-8722-ae996d70e2aa", + "layout": { + "h": 7, + "w": 12, + "x": 0, + "y": 20, + "i": "a4d0c5fb-04e0-4627-8722-ae996d70e2aa", + "isResizable": true + }, + "name": "MySQL Sorts", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum(rate(mysql_global_status_sort_rows{instance=~\"$instance\"}[5m]))", + "legend": "Sort Rows" + }, + { + "expr": "sum(rate(mysql_global_status_sort_range{instance=~\"$instance\"}[5m]))", + "legend": "Sort Range" + }, + { + "expr": "sum(rate(mysql_global_status_sort_merge_passes{instance=~\"$instance\"}[5m]))", + "legend": "Sort Merge Passes" + }, + { + "expr": "sum(rate(mysql_global_status_sort_scan{instance=~\"$instance\"}[5m]))", + "legend": "Sort Scan" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "bars", + "fillOpacity": 0.81, + "stack": "off" + }, + "description": "**MySQL Slow Queries**\n\nSlow queries are defined as queries being slower than the long_query_time setting. 
For example, if you have long_query_time set to 3, all queries that take longer than 3 seconds to complete will show on this graph.", + "id": "2e13ada4-1128-440d-9360-028f16c3779b", + "layout": { + "h": 7, + "w": 12, + "x": 12, + "y": 20, + "i": "2e13ada4-1128-440d-9360-028f16c3779b", + "isResizable": true + }, + "name": "MySQL Slow Queries", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum(rate(mysql_global_status_slow_queries{instance=~\"$instance\"}[5m]))", + "legend": "Slow Queries" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "c9df805c-8ae7-41d7-b28b-575f478fd9ce", + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 27, + "i": "c9df805c-8ae7-41d7-b28b-575f478fd9ce", + "isResizable": false + }, + "name": "Network", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "**MySQL Network Traffic**\n\nHere we can see how much network traffic is generated by MySQL. 
Outbound is network traffic sent from MySQL and Inbound is network traffic MySQL has received.", + "id": "6107714f-bedd-437c-b6e4-d6eb74db6d30", + "layout": { + "h": 7, + "w": 24, + "x": 0, + "y": 28, + "i": "6107714f-bedd-437c-b6e4-d6eb74db6d30", + "isResizable": true + }, + "name": "MySQL Network Traffic", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "decimals": 2, + "util": "bytesSI" + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "sum(rate(mysql_global_status_bytes_received{instance=~\"$instance\"}[5m]))", + "legend": "Inbound" + }, + { + "expr": "sum(rate(mysql_global_status_bytes_sent{instance=~\"$instance\"}[5m]))", + "legend": "Outbound" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "00fd2b70-a133-4ad7-bd56-69a3c91ecf0c", + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 35, + "i": "00fd2b70-a133-4ad7-bd56-69a3c91ecf0c", + "isResizable": false + }, + "name": "Commands, Handlers", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.2, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "**Top Command Counters**\n\nThe Com_{{xxx}} statement counter variables indicate the number of times each xxx statement has been executed. There is one status variable for each type of statement. For example, Com_delete and Com_update count [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements, respectively. 
Com_delete_multi and Com_update_multi are similar but apply to [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements that use multiple-table syntax.", + "id": "f90ca2bc-0809-45f6-88b6-e258805def04", + "layout": { + "h": 7, + "w": 24, + "x": 0, + "y": 36, + "i": "f90ca2bc-0809-45f6-88b6-e258805def04", + "isResizable": true + }, + "name": "Top Command Counters", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "decimals": 2 + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "topk(10, rate(mysql_global_status_commands_total{instance=~\"$instance\"}[5m])>0)", + "legend": "Com_{{command}}" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "**MySQL Handlers**\n\nHandler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes.\n\nThis is in fact the layer between the Storage Engine and MySQL.\n\n* `read_rnd_next` is incremented when the server performs a full table scan and this is a counter you don't really want to see with a high value.\n* `read_key` is incremented when a read is done with an index.\n* `read_next` is incremented when the storage engine is asked to 'read the next index entry'. 
A high value means a lot of index scans are being done.", + "id": "74e1844d-a918-48fa-a29f-6535dc087dac", + "layout": { + "h": 7, + "w": 12, + "x": 0, + "y": 43, + "i": "74e1844d-a918-48fa-a29f-6535dc087dac", + "isResizable": true + }, + "name": "MySQL Handlers", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "decimals": 3 + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "rate(mysql_global_status_handlers_total{instance=~\"$instance\", handler!~\"commit|rollback|savepoint.*|prepare\"}[5m])", + "legend": "{{handler}}" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "id": "b2c3a13d-898f-407b-b6a9-db852072b12f", + "layout": { + "h": 7, + "w": 12, + "x": 12, + "y": 43, + "i": "b2c3a13d-898f-407b-b6a9-db852072b12f", + "isResizable": true + }, + "name": "MySQL Transaction Handlers", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "rate(mysql_global_status_handlers_total{instance=~\"$instance\", handler=~\"commit|rollback|savepoint.*|prepare\"}[5m])", + "legend": "{{handler}}" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "c32a02da-6c61-4b9e-9365-c0b56088fabc", + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 50, + "i": "c32a02da-6c61-4b9e-9365-c0b56088fabc", + "isResizable": false + }, + "name": "Open Files", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "id": "fc13eadb-890d-4184-ac16-943d54188db8", + "layout": { + "h": 7, + "w": 
24, + "x": 0, + "y": 51, + "i": "fc13eadb-890d-4184-ac16-943d54188db8", + "isResizable": true + }, + "name": "MySQL Open Files", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "mysql_global_variables_open_files_limit{instance=~\"$instance\"}", + "legend": "Open Files Limit" + }, + { + "expr": "mysql_global_status_open_files{instance=~\"$instance\"}", + "legend": "Open Files" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "6f596e65-3e4b-4d9a-aad7-a32c8c7b8239", + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 58, + "i": "6f596e65-3e4b-4d9a-aad7-a32c8c7b8239", + "isResizable": false + }, + "name": "Table Openings", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "**MySQL Table Open Cache Status**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. 
If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized MySQL 5.6 and above (ie: do not set them to any value).", + "id": "0b78fbb5-a0b4-4a1b-98b1-af15bc91779d", + "layout": { + "h": 7, + "w": 12, + "x": 0, + "y": 59, + "i": "0b78fbb5-a0b4-4a1b-98b1-af15bc91779d", + "isResizable": true + }, + "name": "Table Open Cache Hit Ratio Mysql 5.6.6+", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "percentUnit" + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "rate(mysql_global_status_table_open_cache_hits{instance=~\"$instance\"}[5m])\n/\n(\nrate(mysql_global_status_table_open_cache_hits{instance=~\"$instance\"}[5m])\n+\nrate(mysql_global_status_table_open_cache_misses{instance=~\"$instance\"}[5m])\n)", + "legend": "Table Open Cache Hit Ratio" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "**MySQL Open Tables**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. 
If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized MySQL 5.6 and above (ie: do not set them to any value).", + "id": "948ad10b-8b22-4d42-9e94-99ef09e12927", + "layout": { + "h": 7, + "w": 12, + "x": 12, + "y": 59, + "i": "948ad10b-8b22-4d42-9e94-99ef09e12927", + "isResizable": true + }, + "name": "MySQL Open Tables", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "desc" + } + }, + "targets": [ + { + "expr": "mysql_global_status_open_tables{instance=~\"$instance\"}", + "legend": "Open Tables" + }, + { + "expr": "mysql_global_variables_table_open_cache{instance=~\"$instance\"}", + "legend": "Table Open Cache" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + } + ], + "var": [ + { + "name": "prom", + "type": "datasource", + "definition": "prometheus" + }, + { + "definition": "label_values(mysql_global_status_uptime, instance)", + "name": "instance", + "type": "query", + "datasource": { + "cate": "prometheus", + "value": "${prom}" + } + } + ], + "version": "3.0.0" + } +} \ No newline at end of file diff --git a/integrations/MySQL/markdown/mysql.md b/integrations/MySQL/markdown/mysql.md new file mode 100644 index 00000000..96540194 --- /dev/null +++ b/integrations/MySQL/markdown/mysql.md @@ -0,0 +1,103 @@ +# mysql + +mysql 监控采集插件,核心原理就是连到 mysql 实例,执行一些 sql,解析输出内容,整理为监控数据上报。 + +## Configuration + +```toml +# # collect interval +# interval = 15 + +# 要监控 MySQL,首先要给出要监控的MySQL的连接地址、用户名、密码 +[[instances]] +address = "127.0.0.1:3306" +username = "root" +password = "1234" + +# # set tls=custom to enable tls +# parameters = "tls=false" + +# 通过 show global status监控mysql,默认抓取一些基础指标, +# 如果想抓取更多global status的指标,把下面的配置设置为true +extra_status_metrics = true + +# 通过show global 
variables监控mysql的全局变量,默认抓取一些常规的 +# 常规的基本够用了,扩展的部分,默认不采集,下面的配置设置为false +extra_innodb_metrics = false + +# 监控processlist,关注较少,默认不采集 +gather_processlist_processes_by_state = false +gather_processlist_processes_by_user = false + +# 监控各个数据库的磁盘占用大小 +gather_schema_size = false + +# 监控所有的table的磁盘占用大小 +gather_table_size = false + +# 是否采集系统表的大小,通常不用,所以默认设置为false +gather_system_table_size = false + +# 通过 show slave status监控slave的情况,比较关键,所以默认采集 +gather_slave_status = true + +# # timeout +# timeout_seconds = 3 + +# # interval = global.interval * interval_times +# interval_times = 1 + +# 为mysql实例附一个instance的标签,因为通过address=127.0.0.1:3306不好区分 +# important! use global unique string to specify instance +# labels = { instance="n9e-10.2.3.4:3306" } + +## Optional TLS Config +# use_tls = false +# tls_min_version = "1.2" +# tls_ca = "/etc/categraf/ca.pem" +# tls_cert = "/etc/categraf/cert.pem" +# tls_key = "/etc/categraf/key.pem" +## Use TLS but skip chain & host verification +# insecure_skip_verify = true + +# 自定义SQL,指定SQL、返回的各个列哪些是作为metric,哪些是作为label +# [[instances.queries]] +# mesurement = "users" +# metric_fields = [ "total" ] +# label_fields = [ "service" ] +# # field_to_append = "" +# timeout = "3s" +# request = ''' +# select 'n9e' as service, count(*) as total from n9e_v5.users +# ''' +``` + +## 监控多个实例 + +当主机填写为localhost时mysql会采用 unix domain socket连接 +当主机填写为127.0.0.1时mysql会采用tcp方式连接 +大家最常问的问题是如何监控多个mysql实例,实际大家对toml配置学习一下就了解了,`[[instances]]` 部分表示数组,是可以出现多个的,address参数支持通过unix路径连接 所以,举例: + +```toml +[[instances]] +address = "10.2.3.6:3306" +username = "root" +password = "1234" +labels = { instance="n9e-10.2.3.6:3306" } + +[[instances]] +address = "10.2.6.9:3306" +username = "root" +password = "1234" +labels = { instance="zbx-10.2.6.9:3306" } + +[[instances]] +address = "/tmp/mysql.sock" +username = "root" +password = "1234" +labels = { instance="zbx-localhost:3306" } +``` + +## 监控大盘和告警规则 + +本 README 的同级目录,大家可以看到alerts.json 是告警规则,导入夜莺就可以使用, dashboard-by-instance.json 
就是监控大盘(注意!监控大盘使用instance大盘变量,所以,上面的配置文件中要配置一个instance的标签,就是 `labels = { instance="n9e-10.2.3.4:3306" }` 部分),也是导入夜莺就可以使用。dashboard-by-ident是使用ident作为大盘变量,适用于先找到宿主机器,再找机器上面的mysql实例的场景 \ No newline at end of file diff --git a/integrations/Net_Response/alerts/net_response_by_categraf.json b/integrations/Net_Response/alerts/net_response_by_categraf.json new file mode 100644 index 00000000..76cb065d --- /dev/null +++ b/integrations/Net_Response/alerts/net_response_by_categraf.json @@ -0,0 +1,30 @@ +[ + { + "name": "Network address probe failed", + "note": "", + "severity": 2, + "disabled": 0, + "prom_for_duration": 60, + "prom_ql": "net_response_result_code != 0", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [] + } + ] \ No newline at end of file diff --git a/integrations/Net_Response/collect/net_response.toml b/integrations/Net_Response/collect/net_response.toml new file mode 100644 index 00000000..87070f27 --- /dev/null +++ b/integrations/Net_Response/collect/net_response.toml @@ -0,0 +1,38 @@ +# # collect interval +# interval = 15 + +[mappings] +# "127.0.0.1:22"= {region="local",ssh="test"} +# "127.0.0.1:22"= {region="local",ssh="redis"} + +[[instances]] +targets = [ +# "127.0.0.1:22", +# "localhost:6379", +# ":9090" +] + +# # append some labels for series +# labels = { region="cloud", product="n9e" } + +# # interval = global.interval * interval_times +# interval_times = 1 + +## Protocol, must be "tcp" or "udp" +## NOTE: because the "udp" protocol does not respond to requests, it requires +## a send/expect string pair (see below). 
+# protocol = "tcp" + +## Set timeout +# timeout = "1s" + +## Set read timeout (only used if expecting a response) +# read_timeout = "1s" + +## The following options are required for UDP checks. For TCP, they are +## optional. The plugin will send the given string to the server and then +## expect to receive the given 'expect' string back. +## string sent to the server +# send = "ssh" +## expected string in answer +# expect = "ssh" diff --git a/integrations/Net_Response/dashboards/net_response_by_categraf.json b/integrations/Net_Response/dashboards/net_response_by_categraf.json new file mode 100644 index 00000000..aee64fad --- /dev/null +++ b/integrations/Net_Response/dashboards/net_response_by_categraf.json @@ -0,0 +1,131 @@ +{ + "name": "TCP detection by UlricQin", + "tags": "", + "ident": "", + "configs": { + "panels": [ + { + "type": "table", + "id": "73c6eaf9-1685-4a7a-bf53-3d52afa1792e", + "layout": { + "h": 15, + "w": 24, + "x": 0, + "y": 0, + "i": "73c6eaf9-1685-4a7a-bf53-3d52afa1792e", + "isResizable": true + }, + "version": "3.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}", + "targets": [ + { + "expr": "max(net_response_result_code) by (target)", + "legend": "UP?", + "refId": "A" + }, + { + "expr": "max(net_response_response_time) by (target) * 1000", + "legend": "Latency(ms)", + "refId": "C" + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "indexByName": { + "target": 0 + } + } + } + ], + "name": "Targets", + "custom": { + "showHeader": true, + "colorMode": "background", + "calc": "lastNotNull", + "displayMode": "labelValuesToRows", + "aggrDimension": "target" + }, + "options": { + "valueMappings": [], + "standardOptions": {} + }, + "overrides": [ + { + "matcher": { + "value": "A" + }, + "properties": { + "standardOptions": {}, + "valueMappings": [ + { + "match": { + "special": 0 + }, + "result": { + "color": "#2c9d3d", + "text": "UP" + }, + "type": "special" + }, + { + "match": { + "from": 1, + "special": 
1 + }, + "result": { + "color": "#e90f0f", + "text": "DOWN" + }, + "type": "range" + } + ] + } + }, + { + "type": "special", + "matcher": { + "value": "C" + }, + "properties": { + "valueMappings": [ + { + "type": "range", + "result": { + "color": "#f10c0c" + }, + "match": { + "from": 1 + } + }, + { + "type": "range", + "result": { + "color": "#2c9d3d" + }, + "match": { + "to": 1 + } + } + ], + "standardOptions": { + "util": "milliseconds", + "decimals": 3 + } + } + } + ] + } + ], + "var": [ + { + "definition": "prometheus", + "name": "prom", + "type": "datasource" + } + ], + "version": "3.0.0" + } +} \ No newline at end of file diff --git a/integrations/Net_Response/icon/network.png b/integrations/Net_Response/icon/network.png new file mode 100644 index 00000000..a432e285 Binary files /dev/null and b/integrations/Net_Response/icon/network.png differ diff --git a/integrations/Net_Response/markdown/README.md b/integrations/Net_Response/markdown/README.md new file mode 100644 index 00000000..f06db467 --- /dev/null +++ b/integrations/Net_Response/markdown/README.md @@ -0,0 +1,40 @@ +# net_response + +网络探测插件,通常用于监控本机某个端口是否在监听,或远端某个端口是否能连通 + +## code meanings + +- 0: Success +- 1: Timeout +- 2: ConnectionFailed +- 3: ReadFailed +- 4: StringMismatch + +## Configuration + +最核心的配置就是 targets 部分,指定探测的目标,下面的例子: + +```toml +[[instances]] +targets = [ + "10.2.3.4:22", + "localhost:6379", + ":9090" +] +``` + +- `10.2.3.4:22` 表示探测 10.2.3.4 这个机器的 22 端口是否可以连通 +- `localhost:6379` 表示探测本机的 6379 端口是否可以连通 +- `:9090` 表示探测本机的 9090 端口是否可以连通 + +监控数据或告警事件中只是一个 IP 和端口,接收告警的人看到了,可能不清楚这是哪个业务的模块告警了,可以附加一些更有价值的信息放到标签里,比如例子中: + +```toml +labels = { region="cloud", product="n9e" } +``` + +标识了这是 cloud 这个 region,n9e 这个产品,这俩标签会附到时序数据上,告警的时候自然也会报出来。 + +## 监控大盘和告警规则 + +该 README 的同级目录下,提供了 dashboard.json 就是监控大盘的配置,alerts.json 是告警规则,可以导入夜莺使用。 \ No newline at end of file diff --git a/integrations/Nginx/collect/nginx.toml b/integrations/Nginx/collect/nginx.toml new file mode 100644 index 00000000..46692069 
--- /dev/null +++ b/integrations/Nginx/collect/nginx.toml @@ -0,0 +1,36 @@ +# # collect interval +# interval = 15 + +[[instances]] +## An array of Nginx stub_status URI to gather stats. +urls = [ +# "http://192.168.0.216:8000/nginx_status", +# "https://www.baidu.com/ngx_status" +] + +## append some labels for series +# labels = { region="cloud", product="n9e" } + +## interval = global.interval * interval_times +# interval_times = 1 + +## Set response_timeout (default 5 seconds) +response_timeout = "5s" + +## Whether to follow redirects from the server (defaults to false) +# follow_redirects = false + +## Optional HTTP Basic Auth Credentials +#username = "admin" +#password = "admin" + +## Optional headers +# headers = ["X-From", "categraf", "X-Xyz", "abc"] + +## Optional TLS Config +# use_tls = false +# tls_ca = "/etc/categraf/ca.pem" +# tls_cert = "/etc/categraf/cert.pem" +# tls_key = "/etc/categraf/key.pem" +## Use TLS but skip chain & host verification +# insecure_skip_verify = false \ No newline at end of file diff --git a/integrations/Oracle/collect/metric.toml b/integrations/Oracle/collect/metric.toml new file mode 100644 index 00000000..16709483 --- /dev/null +++ b/integrations/Oracle/collect/metric.toml @@ -0,0 +1,106 @@ +[[metrics]] +mesurement = "sessions" +label_fields = [ "status", "type" ] +metric_fields = [ "value" ] +timeout = "3s" +request = ''' +SELECT status, type, COUNT(*) as value FROM v$session GROUP BY status, type +''' + +[[metrics]] +mesurement = "lock" +metric_fields = [ "cnt" ] +timeout = "3s" +request = ''' +SELECT COUNT(*) AS cnt + FROM ALL_OBJECTS A, V$LOCKED_OBJECT B, SYS.GV_$SESSION C + WHERE A.OBJECT_ID = B.OBJECT_ID + AND B.PROCESS = C.PROCESS +''' + +[[metrics]] +mesurement = "slow_queries" +metric_fields = [ "p95_time_usecs" , "p99_time_usecs"] +timeout = "3s" +request = ''' +select percentile_disc(0.95) within group (order by elapsed_time) as p95_time_usecs, + percentile_disc(0.99) within group (order by elapsed_time) as 
p99_time_usecs +from v$sql where last_active_time >= sysdate - 5/(24*60) +''' + +[[metrics]] +mesurement = "resource" +label_fields = [ "resource_name" ] +metric_fields = [ "current_utilization", "limit_value" ] +timeout = "3s" +request = ''' +SELECT resource_name,current_utilization,CASE WHEN TRIM(limit_value) LIKE 'UNLIMITED' THEN '-1' ELSE TRIM(limit_value) END as limit_value FROM v$resource_limit +''' + +[[metrics]] +mesurement = "asm_diskgroup" +label_fields = [ "name" ] +metric_fields = [ "total", "free" ] +timeout = "3s" +request = ''' +SELECT name,total_mb*1024*1024 as total,free_mb*1024*1024 as free FROM v$asm_diskgroup_stat where exists (select 1 from v$datafile where name like '+%') +''' +IgnoreZeroResult = true + +[[metrics]] +mesurement = "activity" +metric_fields = [ "value" ] +field_to_append = "name" +timeout = "3s" +request = ''' +SELECT name, value FROM v$sysstat WHERE name IN ('parse count (total)', 'execute count', 'user commits', 'user rollbacks') +''' + +[[metrics]] +mesurement = "process" +metric_fields = [ "count" ] +timeout = "3s" +request = ''' +SELECT COUNT(*) as count FROM v$process +''' + +[[metrics]] +mesurement = "wait_time" +metric_fields = [ "value" ] +label_fields = ["wait_class"] +timeout = "3s" +request = ''' +SELECT + n.wait_class as WAIT_CLASS, + round(m.time_waited/m.INTSIZE_CSEC,3) as VALUE +FROM + v$waitclassmetric m, v$system_wait_class n +WHERE + m.wait_class_id=n.wait_class_id AND n.wait_class != 'Idle' +''' + +[[metrics]] +mesurement = "tablespace" +label_fields = [ "tablespace", "type" ] +metric_fields = [ "bytes", "max_bytes", "free" ] +timeout = "3s" +request = ''' +SELECT + dt.tablespace_name as tablespace, + dt.contents as type, + dt.block_size * dtum.used_space as bytes, + dt.block_size * dtum.tablespace_size as max_bytes, + dt.block_size * (dtum.tablespace_size - dtum.used_space) as free +FROM dba_tablespace_usage_metrics dtum, dba_tablespaces dt +WHERE dtum.tablespace_name = dt.tablespace_name +ORDER by 
tablespace +''' + +[[metrics]] +mesurement = "sysmetric" +metric_fields = [ "value" ] +field_to_append = "metric_name" +timeout = "3s" +request = ''' +select METRIC_NAME,VALUE from v$sysmetric where group_id=2 +''' diff --git a/integrations/Oracle/collect/oracle.toml b/integrations/Oracle/collect/oracle.toml new file mode 100644 index 00000000..9c7730be --- /dev/null +++ b/integrations/Oracle/collect/oracle.toml @@ -0,0 +1,33 @@ +# # collect interval +# interval = 15 + +#[[instances]] +# address = "10.1.2.3:1521/orcl" +# username = "monitor" +# password = "123456" +# is_sys_dba = false +# is_sys_oper = false +# disable_connection_pool = false +# max_open_connections = 5 +# # interval = global.interval * interval_times +# interval_times = 1 +# labels = { region="cloud" } + +# [[instances.metrics]] +# mesurement = "sessions" +# label_fields = [ "status", "type" ] +# metric_fields = [ "value" ] +# timeout = "3s" +# request = ''' +# SELECT status, type, COUNT(*) as value FROM v$session GROUP BY status, type +# ''' + +# [[instances]] +# address = "192.168.10.10:1521/orcl" +# username = "monitor" +# password = "123456" +# is_sys_dba = false +# is_sys_oper = false +# disable_connection_pool = false +# max_open_connections = 5 +# # labels = { region="local" } diff --git a/integrations/Ping/collect/ping.toml b/integrations/Ping/collect/ping.toml new file mode 100644 index 00000000..187afae3 --- /dev/null +++ b/integrations/Ping/collect/ping.toml @@ -0,0 +1,43 @@ +# # collect interval +# interval = 15 + +[[instances]] +# send ping packets to +targets = [ +# "www.baidu.com", +# "127.0.0.1", +# "10.4.5.6", +# "10.4.5.7" +] + +# # append some labels for series +# labels = { region="cloud", product="n9e" } + +# # interval = global.interval * interval_times +# interval_times = 1 + +## Number of ping packets to send per interval. Corresponds to the "-c" +## option of the ping command. +# count = 1 + +## Time to wait between sending ping packets in seconds. 
Operates like the +## "-i" option of the ping command. +# ping_interval = 1.0 + +## If set, the time to wait for a ping response in seconds. Operates like +## the "-W" option of the ping command. +# timeout = 3.0 + +## Interface or source address to send ping from. Operates like the -I or -S +## option of the ping command. +# interface = "" + +## Use only IPv6 addresses when resolving a hostname. +# ipv6 = false + +## Number of data bytes to be sent. Corresponds to the "-s" +## option of the ping command. +# size = 56 + +# max concurrency coroutine +# concurrency = 50 diff --git a/integrations/Ping/collect/ping2.toml b/integrations/Ping/collect/ping2.toml new file mode 100644 index 00000000..187afae3 --- /dev/null +++ b/integrations/Ping/collect/ping2.toml @@ -0,0 +1,43 @@ +# # collect interval +# interval = 15 + +[[instances]] +# send ping packets to +targets = [ +# "www.baidu.com", +# "127.0.0.1", +# "10.4.5.6", +# "10.4.5.7" +] + +# # append some labels for series +# labels = { region="cloud", product="n9e" } + +# # interval = global.interval * interval_times +# interval_times = 1 + +## Number of ping packets to send per interval. Corresponds to the "-c" +## option of the ping command. +# count = 1 + +## Time to wait between sending ping packets in seconds. Operates like the +## "-i" option of the ping command. +# ping_interval = 1.0 + +## If set, the time to wait for a ping response in seconds. Operates like +## the "-W" option of the ping command. +# timeout = 3.0 + +## Interface or source address to send ping from. Operates like the -I or -S +## option of the ping command. +# interface = "" + +## Use only IPv6 addresses when resolving a hostname. +# ipv6 = false + +## Number of data bytes to be sent. Corresponds to the "-s" +## option of the ping command. 
+# size = 56 + +# max concurrency coroutine +# concurrency = 50 diff --git a/integrations/Ping/dashboards/ping_by_categraf.json b/integrations/Ping/dashboards/ping_by_categraf.json new file mode 100644 index 00000000..23a48efa --- /dev/null +++ b/integrations/Ping/dashboards/ping_by_categraf.json @@ -0,0 +1,171 @@ +{ + "name": "PING detection by UlricQin", + "tags": "", + "ident": "", + "configs": { + "panels": [ + { + "type": "table", + "id": "1677138f-0f33-485c-8ee1-2db24cabbf54", + "layout": { + "h": 15, + "w": 24, + "x": 0, + "y": 0, + "i": "1677138f-0f33-485c-8ee1-2db24cabbf54", + "isResizable": true + }, + "version": "3.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}", + "targets": [ + { + "expr": "max(ping_result_code) by (target)", + "legend": "UP?", + "refId": "A" + }, + { + "expr": "max(ping_percent_packet_loss) by (target)", + "legend": "Packet Loss %", + "refId": "B" + }, + { + "expr": "max(ping_maximum_response_ms) by (target) ", + "legend": "Latency(ms)", + "refId": "C" + } + ], + "transformations": [ + { + "id": "organize", + "options": {} + } + ], + "name": "Ping", + "custom": { + "showHeader": true, + "colorMode": "background", + "calc": "lastNotNull", + "displayMode": "labelValuesToRows", + "aggrDimension": "target" + }, + "options": { + "valueMappings": [], + "standardOptions": {} + }, + "overrides": [ + { + "matcher": { + "value": "A" + }, + "properties": { + "standardOptions": {}, + "valueMappings": [ + { + "match": { + "special": 0 + }, + "result": { + "color": "#2c9d3d", + "text": "UP" + }, + "type": "special" + }, + { + "match": { + "from": 1, + "special": 1 + }, + "result": { + "color": "#e90f0f", + "text": "DOWN" + }, + "type": "range" + } + ] + } + }, + { + "type": "special", + "matcher": { + "value": "B" + }, + "properties": { + "valueMappings": [ + { + "type": "range", + "result": { + "color": "#f30a0a" + }, + "match": { + "from": 1 + } + }, + { + "type": "special", + "result": { + "color": "#2c9d3d" + }, + 
"match": { + "special": 0 + } + } + ], + "standardOptions": {} + } + }, + { + "type": "special", + "matcher": { + "value": "C" + }, + "properties": { + "valueMappings": [ + { + "type": "range", + "result": { + "color": "#2c9d3d" + }, + "match": { + "from": null, + "to": 100 + } + }, + { + "type": "range", + "result": { + "color": "#ff8286" + }, + "match": { + "to": 300 + } + }, + { + "type": "range", + "result": { + "color": "#f00808" + }, + "match": { + "to": null, + "from": 1000 + } + } + ], + "standardOptions": { + "util": "milliseconds" + } + } + } + ] + } + ], + "var": [ + { + "definition": "prometheus", + "name": "prom", + "type": "datasource" + } + ], + "version": "3.0.0" + } +} \ No newline at end of file diff --git a/integrations/PostgreSQL/collect/postgresql.toml b/integrations/PostgreSQL/collect/postgresql.toml new file mode 100644 index 00000000..995de33c --- /dev/null +++ b/integrations/PostgreSQL/collect/postgresql.toml @@ -0,0 +1,49 @@ +# Read metrics from one or many postgresql servers +# # collect interval +# interval = 15 + +[[instances]] + ## specify address via a url matching: + ## postgres://[pqgotest[:password]]@localhost[/dbname]?sslmode=[disable|verify-ca|verify-full] + ## or a simple string: + ## host=localhost user=pqgotest password=... sslmode=... dbname=app_production + ## + ## All connection parameters are optional. + ## + ## Without the dbname parameter, the driver will default to a database + ## with the same name as the user. This dbname is just for instantiating a + ## connection with the server and doesn't restrict the databases we are trying + ## to grab metrics for. + ## + # address = "host=localhost user=postgres sslmode=disable" + + ## A custom name for the database that will be used as the "server" tag in the + ## measurement output. If not specified, a default one generated from + ## the connection address is used. + # outputaddress = "db01" + + ## connection configuration. 
+ ## maxlifetime - specify the maximum lifetime of a connection. + ## default is forever (0s) + # max_lifetime = "0s" + + ## A list of databases to explicitly ignore. If not specified, metrics for all + ## databases are gathered. Do NOT use with the 'databases' option. + # ignored_databases = ["postgres", "template0", "template1"] + + ## A list of databases to pull metrics about. If not specified, metrics for all + ## databases are gathered. Do NOT use with the 'ignored_databases' option. + # databases = ["app_production", "testing"] + + ## Whether to use prepared statements when connecting to the database. + ## This should be set to false when connecting through a PgBouncer instance + ## with pool_mode set to transaction. + #prepared_statements = true + # [[instances.metrics]] + # mesurement = "sessions" + # label_fields = [ "status", "type" ] + # metric_fields = [ "value" ] + # timeout = "3s" + # request = ''' + # SELECT status, type, COUNT(*) as value FROM v$session GROUP BY status, type + # ''' \ No newline at end of file diff --git a/integrations/PostgreSQL/markdown/alerts.png b/integrations/PostgreSQL/markdown/alerts.png new file mode 100644 index 00000000..d0b1230a Binary files /dev/null and b/integrations/PostgreSQL/markdown/alerts.png differ diff --git a/integrations/PostgreSQL/markdown/postgresql.png b/integrations/PostgreSQL/markdown/postgresql.png new file mode 100644 index 00000000..6ac15c2d Binary files /dev/null and b/integrations/PostgreSQL/markdown/postgresql.png differ diff --git a/integrations/Process/alerts/process_by_exporter.json b/integrations/Process/alerts/process_by_exporter.json new file mode 100644 index 00000000..6e9b6742 --- /dev/null +++ b/integrations/Process/alerts/process_by_exporter.json @@ -0,0 +1,221 @@ +[ + { + "cate": "prometheus", + "datasource_ids": [ + 0 + ], + "name": "Process X high number of open files - exporter", + "note": "", + "prod": "metric", + "algorithm": "", + "algo_params": null, + "delay": 0, + "severity": 2, 
+ "severities": [ + 2 + ], + "disabled": 1, + "prom_for_duration": 60, + "prom_ql": "", + "rule_config": { + "algo_params": null, + "inhibit": false, + "prom_ql": "", + "queries": [ + { + "prom_ql": "avg by (instance) (namedprocess_namegroup_worst_fd_ratio{groupname=\"X\"}) * 100 > 80", + "severity": 2 + } + ], + "severity": 0 + }, + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_stimes": [ + "00:00" + ], + "enable_etime": "23:59", + "enable_etimes": [ + "23:59" + ], + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_days_of_weeks": [ + [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ] + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "notify_max_number": 0, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [ + "alertname=ProcessHighOpenFiles" + ], + "annotations": null, + "extra_config": null + }, + { + "cate": "prometheus", + "datasource_ids": [ + 0 + ], + "name": "Process X is down - exporter", + "note": "", + "prod": "metric", + "algorithm": "", + "algo_params": null, + "delay": 0, + "severity": 1, + "severities": [ + 1 + ], + "disabled": 1, + "prom_for_duration": 0, + "prom_ql": "", + "rule_config": { + "algo_params": null, + "inhibit": false, + "prom_ql": "", + "queries": [ + { + "prom_ql": "sum by (instance) (namedprocess_namegroup_num_procs{groupname=\"X\"}) == 0", + "severity": 1 + } + ], + "severity": 0 + }, + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_stimes": [ + "00:00" + ], + "enable_etime": "23:59", + "enable_etimes": [ + "23:59" + ], + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_days_of_weeks": [ + [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ] + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "notify_max_number": 0, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", 
+ "append_tags": [ + "alertname=ProcessNotRunning" + ], + "annotations": null, + "extra_config": null + }, + { + "cate": "prometheus", + "datasource_ids": [ + 0 + ], + "name": "Process X is restarted - exporter", + "note": "", + "prod": "metric", + "algorithm": "", + "algo_params": null, + "delay": 0, + "severity": 3, + "severities": [ + 3 + ], + "disabled": 1, + "prom_for_duration": 0, + "prom_ql": "", + "rule_config": { + "algo_params": null, + "inhibit": false, + "prom_ql": "", + "queries": [ + { + "prom_ql": "namedprocess_namegroup_oldest_start_time_seconds{groupname=\"X\"} > time() - 60 ", + "severity": 3 + } + ], + "severity": 0 + }, + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_stimes": [ + "00:00" + ], + "enable_etime": "23:59", + "enable_etimes": [ + "23:59" + ], + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_days_of_weeks": [ + [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ] + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "notify_max_number": 0, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [ + "alertname=ProcessRestarted" + ], + "annotations": null, + "extra_config": null + } +] \ No newline at end of file diff --git a/integrations/Process/alerts/procstat_by_categraf.json b/integrations/Process/alerts/procstat_by_categraf.json new file mode 100644 index 00000000..bfc0e183 --- /dev/null +++ b/integrations/Process/alerts/procstat_by_categraf.json @@ -0,0 +1,152 @@ +[ + { + "cate": "prometheus", + "datasource_ids": [ + 0 + ], + "name": "process handle limit is too low", + "note": "", + "prod": "metric", + "algorithm": "", + "algo_params": null, + "delay": 0, + "severity": 3, + "severities": [ + 3 + ], + "disabled": 1, + "prom_for_duration": 60, + "prom_ql": "", + "rule_config": { + "algo_params": null, + "inhibit": false, + "prom_ql": "", + "queries": [ + { + "prom_ql": 
"procstat_rlimit_num_fds_soft < 2048", + "severity": 3 + } + ], + "severity": 0 + }, + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_stimes": [ + "00:00" + ], + "enable_etime": "23:59", + "enable_etimes": [ + "23:59" + ], + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_days_of_weeks": [ + [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ] + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [ + "email", + "dingtalk", + "wecom" + ], + "notify_repeat_step": 60, + "notify_max_number": 0, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [], + "annotations": null, + "extra_config": null + }, + { + "cate": "prometheus", + "datasource_ids": [ + 0 + ], + "name": "there is a process count of 0, indicating that a certain process may have crashed", + "note": "", + "prod": "metric", + "algorithm": "", + "algo_params": null, + "delay": 0, + "severity": 1, + "severities": [ + 1 + ], + "disabled": 1, + "prom_for_duration": 60, + "prom_ql": "", + "rule_config": { + "algo_params": null, + "inhibit": false, + "prom_ql": "", + "queries": [ + { + "prom_ql": "procstat_lookup_count == 0", + "severity": 1 + } + ], + "severity": 0 + }, + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_stimes": [ + "00:00" + ], + "enable_etime": "23:59", + "enable_etimes": [ + "23:59" + ], + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_days_of_weeks": [ + [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ] + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [ + "email", + "dingtalk", + "wecom" + ], + "notify_repeat_step": 60, + "notify_max_number": 0, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [], + "annotations": null, + "extra_config": null + } +] \ No newline at end of file diff --git a/integrations/Process/dashboards/process_by_exporter.json 
b/integrations/Process/dashboards/process_by_exporter.json new file mode 100644 index 00000000..8da2236e --- /dev/null +++ b/integrations/Process/dashboards/process_by_exporter.json @@ -0,0 +1,1106 @@ +{ + "name": "Linux Process - exporter", + "tags": "Prometheus Process", + "ident": "", + "configs": { + "panels": [ + { + "collapsed": true, + "id": "021aedd5-dac7-4431-aeea-50c0c594e784", + "layout": { + "h": 1, + "i": "021aedd5-dac7-4431-aeea-50c0c594e784", + "w": 24, + "x": 0, + "y": 0 + }, + "name": "Cpu Usage", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "进程占用CPU时间(用户态+内核态),倒排前5", + "id": "0f55cad5-ae67-44de-bba5-f6e1bd23066a", + "layout": { + "h": 7, + "i": "0f55cad5-ae67-44de-bba5-f6e1bd23066a", + "w": 12, + "x": 0, + "y": 1 + }, + "name": "Top processes by Total CPU cores used", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "percentUnit" + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,(rate(namedprocess_namegroup_cpu_seconds_total{mode=\"user\",groupname=~\"$processes\",instance=~\"$instance\"}[5m]) + ignoring(mode) rate(namedprocess_namegroup_cpu_seconds_total{mode=\"system\",groupname=~\"$processes\",instance=~\"$instance\"}[5m])) or (irate(namedprocess_namegroup_cpu_seconds_total{mode=\"user\",groupname=~\"$processes\",instance=~\"$instance\"}[5m]) + ignoring(mode) irate(namedprocess_namegroup_cpu_seconds_total{mode=\"system\",groupname=~\"$processes\",instance=~\"$instance\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "进程占用CPU时间(内核态),倒排前5", + "id": 
"e200eb2a-9cd6-4954-880f-9a354c1af2cc", + "layout": { + "h": 7, + "i": "e200eb2a-9cd6-4954-880f-9a354c1af2cc", + "w": 12, + "x": 12, + "y": 1 + }, + "name": "Top processes by System CPU cores used", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "percentUnit" + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5, rate(namedprocess_namegroup_cpu_seconds_total{mode=\"system\",groupname=~\"$processes\",instance=~\"$instance\"}[5m]) or ( irate(namedprocess_namegroup_cpu_seconds_total{mode=\"system\",groupname=~\"$processes\",instance=~\"$instance\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "3f0a9068-51ab-4889-82e1-444cd23c56e8", + "layout": { + "h": 1, + "i": "3f0a9068-51ab-4889-82e1-444cd23c56e8", + "w": 24, + "x": 0, + "y": 8 + }, + "name": "Memory Usage", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "进程常驻内存与交换空间平均占用容量之和,倒排前5", + "id": "9cb0cc2f-c122-4f5e-98ed-8bf310bd612a", + "layout": { + "h": 7, + "i": "9cb0cc2f-c122-4f5e-98ed-8bf310bd612a", + "w": 12, + "x": 0, + "y": 9 + }, + "name": "Top processes by Used memory", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC" + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,( (avg_over_time(namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"swapped\",instance=~\"$instance\"}[5m])+ ignoring (memtype) avg_over_time(namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"resident\",instance=~\"$instance\"}[5m])) or 
(avg_over_time(namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"swapped\",instance=~\"$instance\"}[5m])+ ignoring (memtype) avg_over_time(namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"resident\",instance=~\"$instance\"}[5m])) ))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "进程常驻内存平均占用容量之和,倒排前5", + "id": "7aceae00-23be-447f-87df-47f74b278190", + "layout": { + "h": 7, + "i": "7aceae00-23be-447f-87df-47f74b278190", + "w": 12, + "x": 12, + "y": 9 + }, + "name": "Top processes by Resident Memory", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC" + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5, (avg_over_time(namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"resident\",instance=~\"$instance\"}[5m]) or avg_over_time(namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"resident\",instance=~\"$instance\"}[5m]) ))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "进程交换内存平均占用容量,倒排前5", + "id": "55b255d8-756e-421c-9690-675d7c174e86", + "layout": { + "h": 7, + "i": "55b255d8-756e-421c-9690-675d7c174e86", + "w": 12, + "x": 0, + "y": 11 + }, + "name": "Top processes by Swapped Memory", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC" + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": 
"none" + } + }, + "targets": [ + { + "expr": "topk(5,( avg_over_time(namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"swapped\",instance=~\"$instance\"}[5m]) or avg_over_time(namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"swapped\",instance=~\"$instance\"}[5m]))) ", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "进程虚拟内存平均占用容量,倒排前5", + "id": "3c900819-5955-4bef-b419-ea8c47107b0d", + "layout": { + "h": 7, + "i": "3c900819-5955-4bef-b419-ea8c47107b0d", + "w": 12, + "x": 12, + "y": 11 + }, + "name": "Top processes by Virtual Memory", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesIEC" + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,( avg_over_time(namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"virtual\",instance=~\"$instance\"}[5m]) or avg_over_time(namedprocess_namegroup_memory_bytes{groupname=~\"$processes\", memtype=\"virtual\",instance=~\"$instance\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "4d72a763-48f3-480b-9701-bc297be1bbfb", + "layout": { + "h": 1, + "i": "4d72a763-48f3-480b-9701-bc297be1bbfb", + "w": 24, + "x": 0, + "y": 18 + }, + "name": "Disk IO Usage", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "进程写数据量,倒排前5", + "id": "4cb30905-65f6-42aa-895e-ef3679905ef8", + "layout": { + "h": 7, + "i": "4cb30905-65f6-42aa-895e-ef3679905ef8", + 
"w": 12, + "x": 0, + "y": 19 + }, + "name": "Top processes by Bytes Written", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesSI" + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,(rate(namedprocess_namegroup_write_bytes_total{groupname=~\"$processes\",instance=~\"$instance\"}[5m]) or irate(namedprocess_namegroup_write_bytes_total{groupname=~\"$processes\",instance=~\"$instance\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "进程读数据量,倒排前5", + "id": "d74f4d5f-db71-4781-b539-0df17dfcd42f", + "layout": { + "h": 7, + "i": "d74f4d5f-db71-4781-b539-0df17dfcd42f", + "w": 12, + "x": 12, + "y": 19 + }, + "name": "Top processes by Bytes Read", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "bytesSI" + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,(rate(namedprocess_namegroup_read_bytes_total{groupname=~\"$processes\",instance=~\"$instance\"}[5m]) or irate(namedprocess_namegroup_read_bytes_total{groupname=~\"$processes\",instance=~\"$instance\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "e460097e-c3c8-45da-8202-dee1de996ddf", + "layout": { + "h": 1, + "i": "e460097e-c3c8-45da-8202-dee1de996ddf", + "w": 24, + "x": 0, + "y": 26 + }, + "name": "Process and Thread Counts", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + 
"description": "同名进程数,倒排前5", + "id": "1b44ce6f-ca65-49f5-892e-05444c9aa580", + "layout": { + "h": 7, + "i": "1b44ce6f-ca65-49f5-892e-05444c9aa580", + "w": 12, + "x": 0, + "y": 27 + }, + "name": "Top processes by number of processes instances", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,(max_over_time(namedprocess_namegroup_num_procs{groupname=~\"$processes\",instance=~\"$instance\"}[5m]) or max_over_time(namedprocess_namegroup_num_procs{groupname=~\"$processes\",instance=~\"$instance\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "进程内线程数,倒排前5", + "id": "24975aae-a573-4998-bdab-ffcb19cee2ae", + "layout": { + "h": 7, + "i": "24975aae-a573-4998-bdab-ffcb19cee2ae", + "w": 12, + "x": 12, + "y": 27 + }, + "name": "Top processes by number of threads", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,(max_over_time(namedprocess_namegroup_num_threads{groupname=~\"$processes\",instance=~\"$instance\"}[5m]) or max_over_time(namedprocess_namegroup_num_threads{groupname=~\"$processes\",instance=~\"$instance\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "3df9fc7b-bf9c-45bc-baa1-72ef83bc6b34", + "layout": { + "h": 1, + "i": "3df9fc7b-bf9c-45bc-baa1-72ef83bc6b34", + "w": 24, + "x": 0, + "y": 34 + }, + "name": "Context Switches", + "type": "row" + }, 
+ { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "进程自愿中断(如I/O完成)次数,倒排前5", + "id": "3775f2e7-fa98-4458-9659-0d1221b3cc03", + "layout": { + "h": 7, + "i": "3775f2e7-fa98-4458-9659-0d1221b3cc03", + "w": 12, + "x": 0, + "y": 35 + }, + "name": "Top Processes by Voluntary Context Switches", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,( rate(namedprocess_namegroup_context_switches_total{groupname=~\"$processes\",instance=~\"$instance\",ctxswitchtype=\"voluntary\"}[5m]) or irate(namedprocess_namegroup_context_switches_total{groupname=~\"$processes\",instance=~\"$instance\",ctxswitchtype=\"voluntary\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "进程被迫中断(如CPU时间耗尽)次数,倒排前5", + "id": "a0e912d2-5b0d-42d0-bcf6-27bd54b5f07d", + "layout": { + "h": 7, + "i": "a0e912d2-5b0d-42d0-bcf6-27bd54b5f07d", + "w": 12, + "x": 12, + "y": 35 + }, + "name": "Top Processes by Non-Voluntary Context Switches", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,( rate(namedprocess_namegroup_context_switches_total{groupname=~\"$processes\",instance=~\"$instance\",ctxswitchtype=\"nonvoluntary\"}[5m]) or irate(namedprocess_namegroup_context_switches_total{groupname=~\"$processes\",instance=~\"$instance\",ctxswitchtype=\"nonvoluntary\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + 
"datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "79aedbba-da75-4393-b322-1565655017c0", + "layout": { + "h": 1, + "i": "79aedbba-da75-4393-b322-1565655017c0", + "w": 24, + "x": 0, + "y": 42 + }, + "name": "File Descriptors", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "打开文件数,倒排前5", + "id": "73dc1195-8ffd-4c6e-b963-d10cde9ae7ba", + "layout": { + "h": 7, + "i": "73dc1195-8ffd-4c6e-b963-d10cde9ae7ba", + "w": 12, + "x": 0, + "y": 43 + }, + "name": "Top processes by Open File Descriptors", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,(max_over_time(namedprocess_namegroup_open_filedesc{groupname=~\"$processes\",instance=~\"$instance\"}[5m]) or max_over_time(namedprocess_namegroup_open_filedesc{groupname=~\"$processes\",instance=~\"$instance\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "已打开文件数与允许打开文件数占比,倒排前5", + "id": "5ffa8032-5b55-4841-a12e-81c1fc8443dd", + "layout": { + "h": 7, + "i": "5ffa8032-5b55-4841-a12e-81c1fc8443dd", + "w": 12, + "x": 12, + "y": 43 + }, + "name": "Top processes by File Descriptor Usage Percent", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "percent" + }, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,( max_over_time(namedprocess_namegroup_worst_fd_ratio{groupname=~\"$processes\",instance=~\"$instance\"}[5m]) or 
max_over_time(namedprocess_namegroup_worst_fd_ratio{groupname=~\"$processes\",instance=~\"$instance\"}[5m]) ))*100", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "fe967a6c-63bc-48d6-9632-07da1675581b", + "layout": { + "h": 1, + "i": "fe967a6c-63bc-48d6-9632-07da1675581b", + "w": 24, + "x": 0, + "y": 50 + }, + "name": "Page Faults", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "主要页缺失次数,倒排前5", + "id": "8944f17d-51e6-435d-808a-8301b8539f08", + "layout": { + "h": 7, + "i": "8944f17d-51e6-435d-808a-8301b8539f08", + "w": 12, + "x": 0, + "y": 51 + }, + "name": "Top processes by Major Page Faults", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,( rate(namedprocess_namegroup_major_page_faults_total{groupname=~\"$processes\",instance=~\"$instance\"}[5m]) or irate(namedprocess_namegroup_major_page_faults_total{groupname=~\"$processes\",instance=~\"$instance\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "次要页缺失次数,倒排前5", + "id": "b9bed634-d25a-4542-8fd9-4c63e9db766c", + "layout": { + "h": 7, + "i": "b9bed634-d25a-4542-8fd9-4c63e9db766c", + "w": 12, + "x": 12, + "y": 51 + }, + "name": "Top processes by Minor Page Faults", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { 
+ "expr": "topk(5,( rate(namedprocess_namegroup_minor_page_faults_total{groupname=~\"$processes\",instance=~\"$instance\"}[5m]) or irate(namedprocess_namegroup_minor_page_faults_total{groupname=~\"$processes\",instance=~\"$instance\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "76910d2f-d2c7-48c1-ad05-9651110f0ff1", + "layout": { + "h": 1, + "i": "76910d2f-d2c7-48c1-ad05-9651110f0ff1", + "w": 24, + "x": 0, + "y": 58 + }, + "name": "Statuses", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "运行态同名进程数量,倒排前5", + "id": "7c0a07a2-f936-41c1-a03e-328d8bc1feea", + "layout": { + "h": 7, + "i": "7c0a07a2-f936-41c1-a03e-328d8bc1feea", + "w": 12, + "x": 0, + "y": 59 + }, + "name": "Top running processes", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,( max_over_time(namedprocess_namegroup_states{instance=~\"$instance\", groupname=~\"$processes\", state=\"Running\"}[5m]) or max_over_time(namedprocess_namegroup_states{instance=~\"$instance\", groupname=~\"$processes\", state=\"Running\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "等待IO状态同名进程数量,倒排前5", + "id": "31829466-def5-496f-83e8-10806b19410b", + "layout": { + "h": 7, + "i": "31829466-def5-496f-83e8-10806b19410b", + "w": 12, + "x": 12, + "y": 59 + }, + "name": "Top of processes waiting on IO", + "options": { + "legend": { + 
"displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,( max_over_time(namedprocess_namegroup_states{instance=~\"$instance\", groupname=~\"$processes\", state=\"Waiting\"}[5m]) or max_over_time(namedprocess_namegroup_states{instance=~\"$instance\", groupname=~\"$processes\", state=\"Waiting\"}[5m])))", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "9c79f06d-0262-4284-a65e-bcd0d84130be", + "layout": { + "h": 1, + "i": "9c79f06d-0262-4284-a65e-bcd0d84130be", + "w": 24, + "x": 0, + "y": 66 + }, + "name": "Kernel Waits(WCHAN)", + "type": "row" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "内核函数等待线程数量,倒排前5", + "id": "5decb873-5452-4084-8bbf-7bd3c9fa33f4", + "layout": { + "h": 7, + "i": "5decb873-5452-4084-8bbf-7bd3c9fa33f4", + "w": 12, + "x": 0, + "y": 67 + }, + "name": "Kernel waits for All", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,sum(avg_over_time(namedprocess_namegroup_threads_wchan{instance=~\"$instance\", groupname=~\"$processes\"}[5m])) by (wchan) )", + "legend": "{{wchan}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "custom": { + "drawStyle": "lines", + "fillOpacity": 0.5, + "lineInterpolation": "smooth", + "stack": "off" + }, + "description": "内核函数等待线程数量按进程统计,倒排前5", + "id": "20f14e96-5ac8-43f8-93a8-3f7696f5d8d5", + "layout": { + "h": 7, + "i": "20f14e96-5ac8-43f8-93a8-3f7696f5d8d5", + "w": 12, + "x": 12, + "y": 67 + }, + "name": "Kernel wait 
Details for All", + "options": { + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {}, + "tooltip": { + "mode": "all", + "sort": "none" + } + }, + "targets": [ + { + "expr": "topk(5,sum(avg_over_time(namedprocess_namegroup_threads_wchan{instance=~\"$instance\", groupname=~\"$processes\"}[5m])) by (wchan,groupname) )", + "legend": "{{wchan}}", + "refId": "A" + } + ], + "type": "timeseries", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + }, + { + "collapsed": true, + "id": "972e68da-d03d-4638-81a2-fba73d215788", + "layout": { + "h": 1, + "i": "972e68da-d03d-4638-81a2-fba73d215788", + "w": 24, + "x": 0, + "y": 74 + }, + "name": "Uptime", + "type": "row" + }, + { + "custom": { + "calc": "lastNotNull", + "displayMode": "seriesToRows", + "showHeader": true + }, + "description": "进程启动时长", + "id": "eff1ee11-1d70-4c79-a159-588a363a2b60", + "layout": { + "h": 7, + "i": "eff1ee11-1d70-4c79-a159-588a363a2b60", + "w": 24, + "x": 0, + "y": 75 + }, + "name": "Processes by uptime", + "options": { + "standardOptions": { + "util": "seconds" + } + }, + "overrides": [ + { + "properties": { + "standardOptions": { + "util": "seconds" + }, + "valueMappings": [ + { + "match": { + "to": 1800 + }, + "result": { + "color": "#f91010" + }, + "type": "range" + }, + { + "match": { + "from": 1800 + }, + "result": { + "color": "#21f312" + }, + "type": "range" + } + ] + } + } + ], + "targets": [ + { + "expr": "time()-(namedprocess_namegroup_oldest_start_time_seconds{instance=~\"$instance\",groupname=~\"$processes\"}>0)", + "legend": "{{groupname}}", + "refId": "A" + } + ], + "type": "table", + "version": "2.0.0", + "datasourceCate": "prometheus", + "datasourceValue": "${prom}" + } + ], + "var": [ + { + "name": "prom", + "type": "datasource", + "definition": "prometheus" + }, + { + "definition": "label_values(namedprocess_namegroup_num_procs, instance)", + "multi": false, + "name": "instance", + "options": [ + 
"tt-fc-es02.nj:12346" + ], + "type": "query", + "datasource": { + "cate": "prometheus", + "value": "${prom}" + } + }, + { + "allOption": true, + "definition": "label_values(namedprocess_namegroup_cpu_seconds_total{instance=~\"$instance\"},groupname)", + "multi": true, + "name": "processes", + "options": [ + "(sd-pam)", + "NetworkManager", + "YDLive", + "YDPython", + "YDService", + "agent", + "agetty", + "atd", + "auditd", + "barad_agent", + "bash", + "chronyd", + "crond", + "dbus-daemon", + "fc-agent", + "fc-alert", + "fc-checker", + "gpg-agent", + "less", + "lsmd", + "mongod", + "mysql", + "mysqld", + "nginx", + "ngo", + "node", + "openvpn", + "podman pause", + "polkitd", + "process-agent", + "redis-server", + "rsyslogd", + "sedispatch", + "sgagent", + "sh", + "ssh-agent", + "sshd", + "sssd", + "sssd_be", + "sssd_nss", + "su", + "sudo", + "systemd", + "systemd-journal", + "systemd-logind", + "systemd-udevd", + "tat_agent", + "trace-agent", + "tuned", + "unbound-anchor", + "vminsert-prod", + "vmselect-prod", + "vmstorage-prod" + ], + "type": "query", + "datasource": { + "cate": "prometheus", + "value": "${prom}" + } + } + ], + "version": "3.0.0" + } +} \ No newline at end of file diff --git a/integrations/Process/icon/linux.png b/integrations/Process/icon/linux.png new file mode 100644 index 00000000..906c61b3 Binary files /dev/null and b/integrations/Process/icon/linux.png differ diff --git a/integrations/Processes/collect/processes.toml b/integrations/Processes/collect/processes.toml new file mode 100644 index 00000000..af7436b6 --- /dev/null +++ b/integrations/Processes/collect/processes.toml @@ -0,0 +1,8 @@ +# # collect interval +# interval = 15 + +# # force use ps command to gather +# force_ps = false + +# # force use /proc to gather +# force_proc = false \ No newline at end of file diff --git a/integrations/Procstat/collect/procstat.toml b/integrations/Procstat/collect/procstat.toml new file mode 100644 index 00000000..8cfa10cf --- /dev/null +++ 
b/integrations/Procstat/collect/procstat.toml @@ -0,0 +1,42 @@ +# # collect interval +# interval = 15 + +# [[instances]] +# # executable name (i.e., pgrep <search_exec_substring>) +# search_exec_substring = "nginx" + +# # pattern as argument for pgrep (i.e., pgrep -f <search_cmdline_substring>) +# search_cmdline_substring = "n9e server" + +# # windows service name +# search_win_service = "" + +# # search processes of a specific user; optional, used together with search_exec_substring or search_cmdline_substring +# search_user = "" + +# # append some labels for series +# labels = { region="cloud", product="n9e" } + +# # interval = global.interval * interval_times +# interval_times = 1 + +# # mode to use when calculating CPU usage. can be one of 'solaris' or 'irix' +# mode = "irix" + +# sum of threads/fd/io/cpu/mem, min of uptime/limit +gather_total = true + +# will append pid as tag +gather_per_pid = false + +# gather jvm metrics only when jstat is ready +# gather_more_metrics = [ +# "threads", +# "fd", +# "io", +# "uptime", +# "cpu", +# "mem", +# "limit", +# "jvm" +# ] diff --git a/integrations/RabbitMQ/collect/rabbitmq.toml b/integrations/RabbitMQ/collect/rabbitmq.toml new file mode 100644 index 00000000..41b9707b --- /dev/null +++ b/integrations/RabbitMQ/collect/rabbitmq.toml @@ -0,0 +1,68 @@ +# As of 3.8.0, RabbitMQ ships with built-in Prometheus & Grafana support. +# Support for Prometheus metric collector ships in the rabbitmq_prometheus plugin. +# The plugin exposes all RabbitMQ metrics on a dedicated TCP port, in Prometheus text format. 
+# +# enable prometheus plugin: +# `rabbitmq-plugins enable rabbitmq_prometheus` +# `curl http://localhost:15692/metrics` +# +# then use categraf prometheus plugin scrape http://localhost:15692/metrics instead of this rabbitmq plugin + +# # collect interval +# interval = 15 + +[[instances]] +# # Management Plugin url +# url = "http://localhost:15672" +# username = "guest" +# password = "guest" + +## Optional TLS Config +# use_tls = false +# tls_min_version = "1.2" +# tls_ca = "/etc/categraf/ca.pem" +# tls_cert = "/etc/categraf/cert.pem" +# tls_key = "/etc/categraf/key.pem" +## Use TLS but skip chain & host verification +# insecure_skip_verify = true + +## Optional request timeouts +## +## ResponseHeaderTimeout, if non-zero, specifies the amount of time to wait +## for a server's response headers after fully writing the request. +# header_timeout = "3s" +## +## client_timeout specifies a time limit for requests made by this client. +## Includes connection time, any redirects, and reading the response body. +# client_timeout = "4s" + +## A list of nodes to gather as the rabbitmq_node measurement. If not +## specified, metrics for all nodes are gathered. +# nodes = ["rabbit@node1", "rabbit@node2"] + +## A list of exchanges to gather as the rabbitmq_exchange measurement. If not +## specified, metrics for all exchanges are gathered. +# exchanges = ["categraf"] + +## Metrics to include and exclude. Globs accepted. +## Note that an empty array for both will include all metrics +## Currently the following metrics are supported: "exchange", "federation", "node", "overview", "queue" +# metric_include = [] +# metric_exclude = [] + +## Queues to include and exclude. Globs accepted. +## Note that an empty array for both will include all queues +# queue_name_include = [] +# queue_name_exclude = [] + +## Federation upstreams to include and exclude specified as an array of glob +## pattern strings. Federation links can also be limited by the queue and +## exchange filters. 
+# federation_upstream_include = [] +# federation_upstream_exclude = [] + +# # interval = global.interval * interval_times +# interval_times = 1 + +# important! use global unique string to specify instance +# labels = { instance="rabbitmq-001" } \ No newline at end of file diff --git a/integrations/Redis/collect/redis.toml b/integrations/Redis/collect/redis.toml new file mode 100644 index 00000000..bd5c0261 --- /dev/null +++ b/integrations/Redis/collect/redis.toml @@ -0,0 +1,29 @@ +# # collect interval +# interval = 15 + +[[instances]] +# address = "127.0.0.1:6379" +# username = "" +# password = "" +# pool_size = 2 + +# # Optional. Specify redis commands to retrieve values +# commands = [ +# {command = ["get", "sample-key1"], metric = "custom_metric_name1"}, +# {command = ["get", "sample-key2"], metric = "custom_metric_name2"} +# ] + +# # interval = global.interval * interval_times +# interval_times = 1 + +# important! use global unique string to specify instance +# labels = { instance="n9e-10.2.3.4:6379" } + +## Optional TLS Config +# use_tls = false +# tls_min_version = "1.2" +# tls_ca = "/etc/categraf/ca.pem" +# tls_cert = "/etc/categraf/cert.pem" +# tls_key = "/etc/categraf/key.pem" +## Use TLS but skip chain & host verification +# insecure_skip_verify = true diff --git a/integrations/SMART/collect/smart.toml b/integrations/SMART/collect/smart.toml new file mode 100644 index 00000000..d87860b8 --- /dev/null +++ b/integrations/SMART/collect/smart.toml @@ -0,0 +1,50 @@ +# Read metrics from storage devices supporting S.M.A.R.T. +[[instances]] + ## Optionally specify the path to the smartctl executable + # path_smartctl = "/usr/bin/smartctl" + + ## Optionally specify the path to the nvme-cli executable + # path_nvme = "/usr/bin/nvme" + + ## Optionally specify if vendor specific attributes should be propagated for NVMe disk case + ## ["auto-on"] - automatically find and enable additional vendor specific disk info + ## ["vendor1", "vendor2", ...] - e.g. 
"Intel" enable additional Intel specific disk info + # enable_extensions = ["auto-on"] + + ## On most platforms the cli utilities used require root access. + ## Setting 'use_sudo' to true will make use of sudo to run smartctl or nvme-cli. + ## Sudo must be configured to allow the user running categraf to run smartctl or nvme-cli + ## without a password. + # use_sudo = true + + ## Skip checking disks in this power mode. Defaults to + ## "standby" to not wake up disks that have stopped rotating. + ## See --nocheck in the man pages for smartctl. + ## smartctl version 5.41 and 5.42 have faulty detection of + ## power mode and might require changing this value to + ## "never" depending on your disks. + # nocheck = "standby" + + ## Gather all returned S.M.A.R.T. attribute metrics and the detailed + ## information from each drive into the 'smart_attribute' measurement. + # attributes = true + + ## Optionally specify devices to exclude from reporting if disks auto-discovery is performed. + # excludes = [ "/dev/pass6" ] + + ## Optionally specify devices and device type, if unset + ## a scan (smartctl --scan and smartctl --scan -d nvme) for S.M.A.R.T. devices will be done + ## and all found will be included except for the excluded in excludes. + # devices = [ "/dev/ada0 -d atacam", "/dev/nvme0"] + # devices = ["/dev/nvme0 -d nvme", "/dev/nvme0"] + + ## Timeout for the cli command to complete. + timeout = "30s" + + ## Optionally call smartctl and nvme-cli with a specific concurrency policy. + ## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes. + ## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of + ## SMART data - one individual array drive at a time. In such a case please set this configuration option + ## to "sequential" to get readings for all drives.
+ ## valid options: concurrent, sequential + # read_method = "concurrent" diff --git a/integrations/SNMP/collect/snmp.toml b/integrations/SNMP/collect/snmp.toml new file mode 100644 index 00000000..7ec0b5c5 --- /dev/null +++ b/integrations/SNMP/collect/snmp.toml @@ -0,0 +1,87 @@ +# Retrieves SNMP values from remote agents +[[instances]] +## Agent addresses to retrieve values from. +## format: agents = ["<scheme://><hostname>:<port>"] +## scheme: optional, either udp, udp4, udp6, tcp, tcp4, tcp6. +## default is udp +## port: optional +## example: agents = ["udp://127.0.0.1:161"] +## agents = ["tcp://127.0.0.1:161"] +## agents = ["udp4://v4only-snmp-agent"] +#agents = ["udp://127.0.0.1:161"] +agents = [ + # +] + +## Timeout for each request. +# timeout = "5s" + +## SNMP version; can be 1, 2, or 3. +# version = 2 + +## Unconnected UDP socket +## When true, SNMP responses are accepted from any address not just +## the requested address. This can be useful when gathering from +## redundant/failover systems. +# unconnected_udp_socket = false + +## Path to mib files +## Used by the gosmi translator. +## To add paths when translating with netsnmp, use the MIBDIRS environment variable +# path = ["/usr/share/snmp/mibs"] + +## SNMP community string. +# community = "public" + +## Agent host tag +# agent_host_tag = "agent_host" + +## Number of retries to attempt. +# retries = 3 + +## The GETBULK max-repetitions parameter. +# max_repetitions = 10 + +## SNMPv3 authentication and encryption options. +## +## Security Name. +# sec_name = "myuser" +## Authentication protocol; one of "MD5", "SHA", "SHA224", "SHA256", "SHA384", "SHA512" or "". +# auth_protocol = "MD5" +## Authentication password. +# auth_password = "pass" +## Security Level; one of "noAuthNoPriv", "authNoPriv", or "authPriv". +# sec_level = "authNoPriv" +## Context Name. +# context_name = "" +## Privacy protocol used for encrypted messages; one of "DES", "AES", "AES192", "AES192C", "AES256", "AES256C", or "".
+### Protocols "AES192", "AES192C", "AES256", and "AES256C" require the underlying net-snmp tools +### to be compiled with --enable-blumenthal-aes (http://www.net-snmp.org/docs/INSTALL.html) +# priv_protocol = "" +## Privacy password used for encrypted messages. +# priv_password = "" + +## Add fields and tables defining the variables you wish to collect. This +## example collects the system uptime and interface variables. Reference the +## full plugin documentation for configuration details. + +#[[instances.field]] +#oid = "RFC1213-MIB::sysUpTime.0" +#name = "uptime" + +#[[instances.field]] +#oid = "RFC1213-MIB::sysName.0" +#name = "source" +#is_tag = true +# filters = ["A:ifIndex:^2$","B:ifOperStatus:1", "C:ifDescr:^eno*"] +# filters_expression = "(A && B) || C" + +#[[instances.table]] +#oid = "IF-MIB::ifTable" +#name = "interface" +#inherit_tags = ["source"] + +#[[instances.table.field]] +#oid = "IF-MIB::ifDescr" +#name = "ifDescr" +#is_tag = true diff --git a/integrations/SNMP/collect/snmp.toml.example b/integrations/SNMP/collect/snmp.toml.example new file mode 100644 index 00000000..2e7f5b36 --- /dev/null +++ b/integrations/SNMP/collect/snmp.toml.example @@ -0,0 +1,134 @@ +# Retrieves SNMP values from remote agents +[[instances]] +## Agent addresses to retrieve values from. +## format: agents = ["<scheme://><hostname>:<port>"] +## scheme: optional, either udp, udp4, udp6, tcp, tcp4, tcp6. +## default is udp +## port: optional +## example: agents = ["udp://127.0.0.1:161"] +## agents = ["tcp://127.0.0.1:161"] +## agents = ["udp4://v4only-snmp-agent"] +#agents = ["udp://127.0.0.1:161"] +# metrics_pass = ["*2$"] +agents = [ +# "udp://10.206.0.16:161", +] + +## Timeout for each request. + timeout = "5s" + +## SNMP version; can be 1, 2, or 3. + version = 2 + +## Unconnected UDP socket +## When true, SNMP responses are accepted from any address not just +## the requested address. This can be useful when gathering from +## redundant/failover systems.
+# unconnected_udp_socket = false + +## Path to mib files +## Used by the gosmi translator. +## To add paths when translating with netsnmp, use the MIBDIRS environment variable +# path = ["/usr/share/snmp/mibs"] + +## SNMP community string. + community = "public" + +## Agent host tag + agent_host_tag = "agent_hostname" + +## Number of retries to attempt. + retries = 3 + +## The GETBULK max-repetitions parameter. +# max_repetitions = 10 + +## SNMPv3 authentication and encryption options. +## +## Security Name. +# sec_name = "myuser" +## Authentication protocol; one of "MD5", "SHA", "SHA224", "SHA256", "SHA384", "SHA512" or "". +# auth_protocol = "MD5" +## Authentication password. +# auth_password = "pass" +## Security Level; one of "noAuthNoPriv", "authNoPriv", or "authPriv". +# sec_level = "authNoPriv" +## Context Name. +# context_name = "" +## Privacy protocol used for encrypted messages; one of "DES", "AES", "AES192", "AES192C", "AES256", "AES256C", or "". +### Protocols "AES192", "AES192C", "AES256", and "AES256C" require the underlying net-snmp tools +### to be compiled with --enable-blumenthal-aes (http://www.net-snmp.org/docs/INSTALL.html) +# priv_protocol = "" +## Privacy password used for encrypted messages. +# priv_password = "" + +## Add fields and tables defining the variables you wish to collect. This +## example collects the system uptime and interface variables. Reference the +## full plugin documentation for configuration details.
+ + [[instances.field]] + #oid = "RFC1213-MIB::sysUpTime.0" + oid = ".1.3.6.1.2.1.1.3.0" + name = "uptime" + + [[instances.field]] + oid = ".1.3.6.1.4.1.2021.11.9.0" # % + #oid = "UCD-SNMP-MIB::ssCpuUser.0" + name = "cpu_user" + + [[instances.field]] + oid = ".1.3.6.1.4.1.2021.11.10.0" # % + name = "cpu_sys" + + [[instances.field]] + oid = "1.3.6.1.4.1.2021.11.11.0" # % + name = "cpu_idle" + + [[instances.field]] + oid = ".1.3.6.1.2.1.25.2.2.0" + name = "mem_total" + + [[instances.field]] + oid = ".1.3.6.1.4.1.2021.4.11.0" + name = "mem_free" + + [[instances.field]] + oid = ".1.3.6.1.4.1.2021.4.13.0" + name = "mem_shared" + + [[instances.field]] + oid = ".1.3.6.1.4.1.2021.4.14.0" + name = "mem_buffer" + + [[instances.field]] + oid = ".1.3.6.1.4.1.2021.4.15.0" + name = "mem_cached" + + [[instances.field]] + oid = ".1.3.6.1.4.1.2021.10.1.3.1" + name = "cpu_load1" + + [[instances.field]] + oid = ".1.3.6.1.4.1.2021.10.1.3.2" + name = "cpu_load5" + + [[instances.field]] + oid = ".1.3.6.1.4.1.2021.10.1.3.3" + name = "cpu_load15" + + # network + [[instances.table]] + oid = "IF-MIB::ifTable" + name = "interface" + inherit_tags = ["source"] + index_as_tag = true + include_filter = ["ifIndex:2","ifIndex:4"] + + [[instances.table.field]] + oid = "IF-MIB::ifDescr" + name = "ifDescr" + is_tag = true + [[instances.table.field]] + oid = "IF-MIB::ifPhysAddress" + name = "ifPhysAddress" + is_tag = true diff --git a/integrations/SNMP/dashboards/placeholder.json b/integrations/SNMP/dashboards/placeholder.json new file mode 100644 index 00000000..5ffe6a0f --- /dev/null +++ b/integrations/SNMP/dashboards/placeholder.json @@ -0,0 +1,10 @@ +{ + "name": "占位的,等待老炮 PR", + "tags": "", + "ident": "", + "configs": { + "var": [], + "panels": [], + "version": "3.0.0" + } +} \ No newline at end of file diff --git a/integrations/Tomcat/collect/tomcat.toml b/integrations/Tomcat/collect/tomcat.toml new file mode 100644 index 00000000..8aed7600 --- /dev/null +++ 
b/integrations/Tomcat/collect/tomcat.toml @@ -0,0 +1,30 @@ +# # collect interval +# interval = 15 + +# Gather metrics from the Tomcat server status page. +[[instances]] +## URL of the Tomcat server status +# url = "http://127.0.0.1:8080/manager/status/all?XML=true" +url = "" + +## HTTP Basic Auth Credentials +# username = "tomcat" +# password = "s3cret" + +## Request timeout +# timeout = "5s" + +# # interval = global.interval * interval_times +# interval_times = 1 + +# important! use global unique string to specify instance +# labels = { instance="192.168.1.2:8080", url="-" } + +## Optional TLS Config +# use_tls = false +# tls_min_version = "1.2" +# tls_ca = "/etc/categraf/ca.pem" +# tls_cert = "/etc/categraf/cert.pem" +# tls_key = "/etc/categraf/key.pem" +## Use TLS but skip chain & host verification +# insecure_skip_verify = true diff --git a/integrations/VictoriaMetrics/markdown/alerts.png b/integrations/VictoriaMetrics/markdown/alerts.png new file mode 100644 index 00000000..8e9a5344 Binary files /dev/null and b/integrations/VictoriaMetrics/markdown/alerts.png differ diff --git a/integrations/VictoriaMetrics/markdown/dashboard.png b/integrations/VictoriaMetrics/markdown/dashboard.png new file mode 100644 index 00000000..f2d99d53 Binary files /dev/null and b/integrations/VictoriaMetrics/markdown/dashboard.png differ diff --git a/integrations/ZooKeeper/collect/zookeeper.toml b/integrations/ZooKeeper/collect/zookeeper.toml new file mode 100644 index 00000000..e96bf6cc --- /dev/null +++ b/integrations/ZooKeeper/collect/zookeeper.toml @@ -0,0 +1,19 @@ +# # collect interval +# interval = 15 + +[[instances]] +# cluster_name = "dev-zk-cluster" +# addresses = "127.0.0.1:2181" +# timeout = 10 + +# important! 
use global unique string to specify instance +# labels = { instance="n9e-10.2.3.4:2181" } + +## Optional TLS Config +# use_tls = false +# tls_min_version = "1.2" +# tls_ca = "/etc/categraf/ca.pem" +# tls_cert = "/etc/categraf/cert.pem" +# tls_key = "/etc/categraf/key.pem" +## Use TLS but skip chain & host verification +# insecure_skip_verify = true \ No newline at end of file