From 0fb4e4b723a124862d64af29ffb3a6db041d64ec Mon Sep 17 00:00:00 2001 From: ning <710leo@gmail.com> Date: Tue, 27 May 2025 11:27:33 +0800 Subject: [PATCH] refactor: add eval duration --- alert/astats/stats.go | 10 ++++++++++ alert/eval/eval.go | 12 ++++++++++++ 2 files changed, 22 insertions(+) diff --git a/alert/astats/stats.go b/alert/astats/stats.go index 3d27e095..9d2e6de3 100644 --- a/alert/astats/stats.go +++ b/alert/astats/stats.go @@ -25,6 +25,7 @@ type Stats struct { CounterHeartbeatErrorTotal *prometheus.CounterVec CounterSubEventTotal *prometheus.CounterVec GaugeQuerySeriesCount *prometheus.GaugeVec + GaugeRuleEvalDuration *prometheus.GaugeVec GaugeNotifyRecordQueueSize prometheus.Gauge } @@ -136,6 +137,13 @@ func NewSyncStats() *Stats { Help: "The size of notify record queue.", }) + GaugeRuleEvalDuration := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "rule_eval_duration_ms", + Help: "Duration of rule eval in milliseconds.", + }, []string{"rule_id", "datasource_id"}) + CounterVarFillingQuery := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, @@ -158,6 +166,7 @@ func NewSyncStats() *Stats { CounterHeartbeatErrorTotal, CounterSubEventTotal, GaugeQuerySeriesCount, + GaugeRuleEvalDuration, GaugeNotifyRecordQueueSize, CounterVarFillingQuery, ) @@ -177,6 +186,7 @@ func NewSyncStats() *Stats { CounterHeartbeatErrorTotal: CounterHeartbeatErrorTotal, CounterSubEventTotal: CounterSubEventTotal, GaugeQuerySeriesCount: GaugeQuerySeriesCount, + GaugeRuleEvalDuration: GaugeRuleEvalDuration, GaugeNotifyRecordQueueSize: GaugeNotifyRecordQueueSize, CounterVarFillingQuery: CounterVarFillingQuery, } diff --git a/alert/eval/eval.go b/alert/eval/eval.go index 08cc0ff1..ea513c6f 100644 --- a/alert/eval/eval.go +++ b/alert/eval/eval.go @@ -232,6 +232,10 @@ func (arw *AlertRuleWorker) Stop() { func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) ([]models.AnomalyPoint, error) { var lst []models.AnomalyPoint + start := time.Now() + defer func() { + arw.Processor.Stats.GaugeRuleEvalDuration.WithLabelValues(fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId())).Set(float64(time.Since(start).Milliseconds())) + }() var rule *models.PromRuleConfig if err := json.Unmarshal([]byte(ruleConfig), &rule); err != nil { @@ -742,6 +746,10 @@ func combine(paramKeys []string, paraMap map[string][]string, index int, current func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) ([]models.AnomalyPoint, error) { var lst []models.AnomalyPoint + start := time.Now() + defer func() { + arw.Processor.Stats.GaugeRuleEvalDuration.WithLabelValues(fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId())).Set(float64(time.Since(start).Milliseconds())) + }() var rule *models.HostRuleConfig if err := json.Unmarshal([]byte(ruleConfig), &rule); err != nil { @@ -1416,6 +1424,10 @@ func fillVar(curRealQuery string, paramKey string, val string) string { func (arw *AlertRuleWorker) GetAnomalyPoint(rule *models.AlertRule, dsId int64) ([]models.AnomalyPoint, []models.AnomalyPoint) { // 获取查询和规则判断条件 + start := time.Now() + defer func() { + arw.Processor.Stats.GaugeRuleEvalDuration.WithLabelValues(fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId())).Set(float64(time.Since(start).Milliseconds())) + }() points := []models.AnomalyPoint{} recoverPoints := []models.AnomalyPoint{} ruleConfig := strings.TrimSpace(rule.RuleConfig)