Browse Source

refactor

tags/v6.0.0-ga12
Ulric Qin 3 years ago
parent
commit
e8fd80b6d5
8 changed files with 153 additions and 222 deletions
  1. +8
    -0
      docker/categraf/conf/config.toml
  2. +0
    -35
      docker/categraf/conf/logs.toml
  3. +5
    -23
      docker/docker-compose.yaml
  4. +0
    -38
      docker/ibexetc/agentd.conf
  5. +1
    -55
      docker/n9eetc/config.toml
  6. +62
    -52
      integrations/linux/alerts/linux_by_categraf.json
  7. +49
    -9
      integrations/linux/alerts/linux_by_telegraf.json
  8. +28
    -10
      memsto/notify_config.go

+ 8
- 0
docker/categraf/conf/config.toml View File

@@ -73,3 +73,11 @@ timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100

[ibex]
enable = true
## ibex flush interval
interval = "1000ms"
## n9e ibex server rpc address
servers = ["ibex:20090"]
## temp script dir
meta_dir = "./meta"

+ 0
- 35
docker/categraf/conf/logs.toml View File

@@ -1,35 +0,0 @@
[logs]
## API key (placeholder)
api_key = "ef4ahfbwzwwtlwfpbertgq1i6mq0ab1q"
## whether to enable log collection
enable = false
## address of the server that receives the logs
send_to = "127.0.0.1:17878"
## protocol used to send logs: http/tcp
send_type = "http"
## whether to compress logs before sending
use_compress = false
## whether to send over SSL/TLS
send_with_tls = false
##
batch_wait = 5
## directory where log offset information is stored
run_path = "/opt/categraf/run"
## maximum number of log files collected at the same time
open_files_limit = 100
## periodically scan the directory for newly added logs
scan_period = 10
##
frame_size = 10
##
collect_container_all = true
## global processing rules
[[logs.Processing_rules]]
## configuration of a single log collection item
[[logs.items]]
## file/journald
type = "file"
## path is required when type=file; port is required when type=journald
path = "/opt/tomcat/logs/*.txt"
source = "tomcat"
service = "my_service"

+ 5
- 23
docker/docker-compose.yaml View File

@@ -12,7 +12,7 @@ services:
hostname: mysql
restart: always
ports:
- "3306:3306"
- "3406:3306"
environment:
TZ: Asia/Shanghai
MYSQL_ROOT_PASSWORD: 1234
@@ -80,7 +80,7 @@ services:
sh -c "/wait && /app/ibex server"

n9e:
image: flashcatcloud/nightingale:6.0.0-beta.1
image: flashcatcloud/nightingale:latest
container_name: n9e
hostname: n9e
restart: always
@@ -122,31 +122,13 @@ services:
- ./categraf/conf:/etc/categraf/conf
- /:/hostfs
- /var/run/docker.sock:/var/run/docker.sock
ports:
- "9100:9100/tcp"
# ports:
# - "9100:9100/tcp"
networks:
- nightingale
depends_on:
- n9e
links:
- n9e:n9e

agentd:
image: ulric2019/ibex:0.3
container_name: agentd
hostname: agentd
restart: always
environment:
GIN_MODE: release
TZ: Asia/Shanghai
volumes:
- ./ibexetc:/app/etc
networks:
- nightingale
depends_on:
- ibex
links:
- n9e:n9e
- ibex:ibex
command:
- "/app/ibex"
- "agentd"

+ 0
- 38
docker/ibexetc/agentd.conf View File

@@ -1,38 +0,0 @@
# debug, release
RunMode = "release"

# task meta storage dir
MetaDir = "./meta"

[HTTP]
Enable = true
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 2090
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = true
# whether enable pprof
PProf = false
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120

[Heartbeat]
# unit: ms
Interval = 1000
# rpc servers
Servers = ["ibex:20090"]
# $ip or $hostname or specified string
Host = "categraf01"

+ 1
- 55
docker/n9eetc/config.toml View File

@@ -118,15 +118,6 @@ IP = ""
Interval = 1000
ClusterName = "default"

# [Alert.SMTP]
# Host = "smtp.163.com"
# Port = 994
# User = "username"
# Pass = "password"
# From = "username@163.com"
# InsecureSkipVerify = true
# Batch = 5

# [Alert.Alerting]
# NotifyConcurrency = 10

@@ -147,52 +138,7 @@ BasicAuthPass = "ibex"
Timeout = 3000

[Pushgw]
# use target labels in database instead of in series
LabelRewrite = true
# # default busigroup key name
# BusiGroupLabelKey = "busigroup"
# ForceUseServerTS = false

# [Pushgw.DebugSample]
# ident = "xx"
# __name__ = "xx"

# [Pushgw.WriterOpt]
# # Writer Options
# QueueCount = 1000
# QueueMaxSize = 1000000
# QueuePopSize = 1000
# # ident or metric
# ShardingKey = "ident"

[[Pushgw.Writers]]
# Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write"
Url = "http://prometheus:9090/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# request headers, followed by timeout settings (unit: ms)
Headers = ["X-From", "n9e"]
Timeout = 10000
DialTimeout = 3000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
## Optional TLS Config
# UseTLS = false
# TLSCA = "/etc/n9e/ca.pem"
# TLSCert = "/etc/n9e/cert.pem"
# TLSKey = "/etc/n9e/key.pem"
# InsecureSkipVerify = false
# [[Writers.WriteRelabels]]
# Action = "replace"
# SourceLabels = ["__address__"]
# Regex = "([^:]+)(?::\\d+)?"
# Replacement = "$1:80"
# TargetLabel = "__address__"
Url = "http://prometheus:9090/api/v1/write"

+ 62
- 52
integrations/linux/alerts/linux_by_categraf.json View File

@@ -1,45 +1,54 @@
[
{
"name": "Lost connection with monitoring target - categraf",
"note": "",
"severity": 1,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "max_over_time(target_up[130s]) == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"callbacks": [],
"runbook_url": "",
"append_tags": []
{
"cate": "host",
"datasource_ids": null,
"name": "Lost connection with monitoring target - categraf",
"note": "",
"prod": "host",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"key": "all_hosts",
"op": "==",
"values": []
}
],
"triggers": [
{
"duration": 60,
"severity": 2,
"type": "target_miss"
}
]
},
{
"name": "Machine load - high CPU, please pay attention - categraf",
"note": "",
"severity": 3,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "cpu_usage_idle{cpu=\"cpu-total\"} < 25",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
@@ -47,18 +56,19 @@
"5",
"6",
"0"
],
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_repeat_step": 60,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {}
},
{
"name": "Machine load - high memory, please pay attention - categraf",
"note": "",


+ 49
- 9
integrations/linux/alerts/linux_by_telegraf.json View File

@@ -30,15 +30,44 @@
"append_tags": []
},
{
"cate": "host",
"datasource_ids": null,
"name": "Lost connection with monitoring target - telegraf",
"note": "",
"severity": 1,
"prod": "host",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "target_up != 1",
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"key": "all_hosts",
"op": "==",
"values": []
}
],
"triggers": [
{
"duration": 60,
"severity": 2,
"type": "target_miss"
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
@@ -48,16 +77,27 @@
"6",
"0"
],
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
"append_tags": [],
"annotations": {}
},
{
"name": "Port detection failed, please pay attention - telegraf",


+ 28
- 10
memsto/notify_config.go View File

@@ -2,6 +2,7 @@ package memsto

import (
"encoding/json"
"strings"
"sync"
"time"

@@ -57,7 +58,14 @@ func (w *NotifyConfigCacheType) syncNotifyConfigs() error {
if err != nil {
return err
}
json.Unmarshal([]byte(cval), &w.webhooks)

if strings.TrimSpace(cval) != "" {
err = json.Unmarshal([]byte(cval), &w.webhooks)
if err != nil {
logger.Errorf("failed to unmarshal webhooks:%s config:", cval, err)
}
}

logger.Infof("timer: sync wbhooks done number: %d", len(w.webhooks))

cval, err = models.ConfigsGet(w.ctx, models.SMTP)
@@ -65,9 +73,11 @@ func (w *NotifyConfigCacheType) syncNotifyConfigs() error {
return err
}

err = toml.Unmarshal([]byte(cval), &w.smtp)
if err != nil {
logger.Errorf("failed to unmarshal smtp:%s config:", cval, err)
if strings.TrimSpace(cval) != "" {
err = toml.Unmarshal([]byte(cval), &w.smtp)
if err != nil {
logger.Errorf("failed to unmarshal smtp:%s config:", cval, err)
}
}

logger.Infof("timer: sync smtp:%+v done", w.smtp)
@@ -76,20 +86,28 @@ func (w *NotifyConfigCacheType) syncNotifyConfigs() error {
if err != nil {
return err
}
err = json.Unmarshal([]byte(cval), &w.script)
if err != nil {
logger.Errorf("failed to unmarshal notify script:%s config:", cval, err)

if strings.TrimSpace(cval) != "" {
err = json.Unmarshal([]byte(cval), &w.script)
if err != nil {
logger.Errorf("failed to unmarshal notify script:%s config:", cval, err)
}
}

logger.Infof("timer: sync notify script done")

cval, err = models.ConfigsGet(w.ctx, models.IBEX)
if err != nil {
return err
}
err = toml.Unmarshal([]byte(cval), &w.ibex)
if err != nil {
logger.Errorf("failed to unmarshal ibex:%s config:", cval, err)

if strings.TrimSpace(cval) != "" {
err = toml.Unmarshal([]byte(cval), &w.ibex)
if err != nil {
logger.Errorf("failed to unmarshal ibex:%s config:", cval, err)
}
}

logger.Infof("timer: sync ibex done")

return nil


Loading…
Cancel
Save