05-Kapacitor监控案例-C-创建任务模板脚本

创建任务模板脚本

// Measurement to read points from (required)
var measurement string

// Field whose mean is computed (required)
var field string

// Optional filter expression; the default accepts every point
var where_filter = lambda: TRUE

// Optional group-by dimensions; the default is the wildcard [*]
var groups = [*]

// Amount of data per window; also used as the emit interval
var window = 5m

// Warning condition, evaluated against the 'mean' field (required)
var warn lambda

// Critical condition, evaluated against the 'mean' field (required)
var crit lambda

// Slack channel that receives the alerts
var slack_channel = '#alerts'

stream
    // Select and group the incoming points
    |from()
        .measurement(measurement)
        .where(where_filter)
        .groupBy(groups)
    // Buffer points; period and every are equal, so windows do not overlap
    |window()
        .period(window)
        .every(window)
    // Reduce each window to a single 'mean' value
    |mean(field)
    // Evaluate the thresholds and notify Slack
    |alert()
        .warn(warn)
        .crit(crit)
        .slack()
        .channel(slack_channel)

声明模板

kapacitor define-template generic_mean_alert -tick path/to/template_script.tick

查看模板

kapacitor show-template generic_mean_alert

使用JSON变量文件来定义新任务

01 通过 -template 和 -vars 定义新任务以触发CPU使用率警报

{
    "measurement": {
        "type": "string",
        "value": "cpu"
    },
    "where_filter": {
        "type": "lambda",
        "value": "\"cpu\" == 'cpu-total'"
    },
    "groups": {
        "type": "list",
        "value": [
            {"type": "string", "value": "host"},
            {"type": "string", "value": "dc"}
        ]
    },
    "field": {
        "type": "string",
        "value": "usage_idle"
    },
    "warn": {
        "type": "lambda",
        "value": "\"mean\" < 30.0"
    },
    "crit": {
        "type": "lambda",
        "value": "\"mean\" < 10.0"
    },
    "window": {
        "type": "duration",
        "value": "1m"
    },
    "slack_channel": {
        "type": "string",
        "value": "#alerts_testing"
    }
}

运行 define 命令时同时指定 -template 和 -vars 两个参数,分别传入模板 ID 和 JSON 变量文件(cpu_vars.json)

kapacitor define cpu_alert -template generic_mean_alert -vars cpu_vars.json -dbrp telegraf.autogen

查看

kapacitor show cpu_alert

02 通过 -file 定义新任务以触发内存使用率警报

{
    "template-id": "generic_mean_alert",
    "dbrps": [
        {"db": "telegraf", "rp": "autogen"}
    ],
    "vars": {
        "measurement": {
            "type": "string",
            "value": "mem"
        },
        "groups": {
            "type": "list",
            "value": [
                {"type": "star", "value": "*"}
            ]
        },
        "field": {
            "type": "string",
            "value": "used_percent"
        },
        "warn": {
            "type": "lambda",
            "value": "\"mean\" > 80.0"
        },
        "crit": {
            "type": "lambda",
            "value": "\"mean\" > 90.0"
        },
        "window": {
            "type": "duration",
            "value": "10m"
        },
        "slack_channel": {
            "type": "string",
            "value": "#alerts_testing"
        }
    }
}

之后只需使用 -file 参数即可定义任务:任务定义文件中的内容取代了命令行参数 -template、-dbrp 和 -vars。

kapacitor define mem_alert -file mem_template_task.json

使用YAML变量文件来定义新任务

使用YAML,任务定义文件mem_template_task.yaml如下所示:

示例:YAML中的任务定义文件

# Task definition file: creates a task from the generic_mean_alert template.
# Nesting is significant in YAML; every variable lives under `vars`, and
# each variable is a mapping with a `type` and a `value`.
template-id: generic_mean_alert
# Database / retention policy pairs the task reads from.
dbrps:
  - db: telegraf
    rp: autogen
vars:
  measurement:
    type: string
    value: mem
  # A single `star` element expresses the wildcard group-by [*].
  groups:
    type: list
    value:
      - type: star
        value: "*"
  field:
    type: string
    value: used_percent
  # Lambda expressions are single-quoted so the inner double quotes
  # (which reference the "mean" field) need no escaping.
  warn:
    type: lambda
    value: '"mean" > 80.0'
  crit:
    type: lambda
    value: '"mean" > 90.0'
  window:
    type: duration
    value: 10m
  # Quoted because an unquoted leading '#' would start a YAML comment.
  slack_channel:
    type: string
    value: "#alerts_testing"

然后可以像前面一样,使用 -file 参数来定义任务。

kapacitor define mem_alert -file mem_template_task.yaml

课堂练习

要求:编写一个任务模板,计算指定字段的平均值并与给定阈值比较,满足条件时触发告警;然后通过 YAML(.yml)变量文件来定义任务。

时序数据

InfluxDB shell version: 1.7.7
> use telegraf
Using database telegraf
> show field keys from cpu
name: cpu
fieldKey fieldType
-------- ---------
usage_guest float
usage_guest_nice float
usage_idle float
usage_iowait float
usage_irq float
usage_nice float
usage_softirq float
usage_steal float
usage_system float
usage_user float
> show tag keys from cpu
name: cpu
tagKey
------
cpu
host

模板template_alert.tick

// 测量名
var measurement string

// 过滤条件
var where_filter = lambda: TRUE

// 分组
var groups = [*]

// 时间窗口
var window duration

// 执行频率
var every duration

// 字段
var field string

// 触发告警的表达式
var crit lambda

// 告警描述
var message string

stream
    // Select, filter and group the incoming points
    |from()
        .measurement(measurement)
        .where(where_filter)
        .groupBy(groups)
    // Emit the last `window` of data once per `every`
    |window()
        .period(window)
        .every(every)
    // Reduce each window to a single 'mean' value
    |mean(field)
    // Raise a critical alert, only when the alert state changes
    |alert()
        .crit(crit)
        .stateChangesOnly()
        .message(message)

cpu空闲告警的变量文件cpu_idle.yml

# Task definition file: creates the CPU idle alert from the mean_alert template.
# Nesting is significant in YAML; every variable lives under `vars`, and
# each variable is a mapping with a `type` and a `value`.
template-id: mean_alert
# Database / retention policy pairs the task reads from.
dbrps:
  - db: telegraf
    rp: autogen
vars:
  measurement:
    type: string
    value: cpu
  # A single `star` element expresses the wildcard group-by [*].
  groups:
    type: list
    value:
      - type: star
        value: "*"
  field:
    type: string
    value: usage_idle
  # Critical when the mean of usage_idle over the window drops below 10.
  crit:
    type: lambda
    value: '"mean" < 10.0'
  window:
    type: duration
    value: 5m
  every:
    type: duration
    value: 1m
  # The template pipes the data through |mean(field), so the alert sees the
  # aggregated value under the field name "mean" (the same name the crit
  # lambda uses), not under a name such as "cpu_usage". The value is the mean
  # of usage_idle, so the text reports it as the idle rate.
  message:
    type: string
    value: '{
      "type": "cpu_usage",
      "id": "主机 CPU 使用率大于90%",
      "host": "{{ index .Tags "host" }}",
      "description": "主机{{ index .Tags "host" }} CPU空闲率{{ index .Fields "mean" | printf "%0.2f" }}%(即使用率大于90%),CPU负载过高会导致服务器运行缓慢,应用无法正常使用等问题。请查看CPU使用率高的进程/应用是否为异常导致"
      }'

操作过程:

[root@tick:/var/lib/kapacitor/task]# vim template_alert.tick
[root@tick:/var/lib/kapacitor/task]# kapacitor define-template mean_alert -tick template_alert.tick
[root@tick:/var/lib/kapacitor/task]# kapacitor show-template mean_alert
ID: mean_alert
Error:
Type: stream
Created: 09 Sep 19 21:46 CST
Modified: 09 Sep 19 21:50 CST
TICKscript:
// 测量名
var measurement string

// 过滤条件
var where_filter = lambda: TRUE

// 分组
var groups = [*]

// 时间窗口
var window duration

// 执行频率
var every duration

// 字段
var field string

// 触发告警的表达式
var crit lambda

// 告警描述
var message string

stream
|from()
.measurement(measurement)
.where(where_filter)
.groupBy(groups)
|window()
.period(window)
.every(every)
|mean(field)
|alert()
.crit(crit)
.stateChangesOnly()
.message(message)

Vars:
Name Type Default Value Description
crit lambda <required> 触发告警的表达式
every duration <required> 执行频率
field string <required> 字段
groups list [*] 分组
measurement string <required> 测量名
message string <required> 告警描述
where_filter lambda TRUE 过滤条件
window duration <required> 时间窗口
DOT:
digraph mean_alert {
stream0 -> from1;
from1 -> window2;
window2 -> mean3;
mean3 -> alert4;
}

[root@tick:/var/lib/kapacitor/task]# vim cpu_idle.yml
[root@tick:/var/lib/kapacitor/task]# kapacitor define cpu_idle_alert -file cpu_idle.yml
[root@tick:/var/lib/kapacitor/task]# kapacitor list tasks
ID Type Status Executing Databases and Retention Policies
cpu_idle_alert stream disabled false ["telegraf"."autogen"]
[root@tick:/var/lib/kapacitor/task]# kapacitor enable cpu_idle_alert
[root@tick:/var/lib/kapacitor/task]# kapacitor show cpu_idle_alert
ID: cpu_idle_alert
Error:
Template: mean_alert
Type: stream
Status: enabled
Executing: true
Created: 09 Sep 19 22:04 CST
Modified: 09 Sep 19 22:05 CST
LastEnabled: 09 Sep 19 22:05 CST
Databases Retention Policies: ["telegraf"."autogen"]
TICKscript:
// 测量名
var measurement string

// 过滤条件
var where_filter = lambda: TRUE

// 分组
var groups = [*]

// 时间窗口
var window duration

// 执行频率
var every duration

// 字段
var field string

// 触发告警的表达式
var crit lambda

// 告警描述
var message string

stream
|from()
.measurement(measurement)
.where(where_filter)
.groupBy(groups)
|window()
.period(window)
.every(every)
|mean(field)
|alert()
.crit(crit)
.stateChangesOnly()
.message(message)

Vars:
Name Type Value
crit lambda "mean" < 10.0
every duration 1m0s
field string usage_idle
groups list [*]
measurement string cpu
message string { "type": "cpu_usage", "id": "主机 CPU 使用率大于90%", "host": "{{ index .Tags "host" }}", "description": "主机{{ index .Tags "host" }} CPU使用率{{ index .Fields "cpu_usage" | printf "%0.2f" }}%,CPU负载过高会导致服务器运行缓慢,应用无法正常使用等问题。请查看CPU使用率高的进程/应用是否为异常导致" }
window duration 5m0s
DOT:
digraph cpu_idle_alert {
graph [throughput="0.00 points/s"];

stream0 [avg_exec_time_ns="0s" errors="0" working_cardinality="0" ];
stream0 -> from1 [processed="0"];

from1 [avg_exec_time_ns="0s" errors="0" working_cardinality="0" ];
from1 -> window2 [processed="0"];

window2 [avg_exec_time_ns="0s" errors="0" working_cardinality="0" ];
window2 -> mean3 [processed="0"];

mean3 [avg_exec_time_ns="0s" errors="0" working_cardinality="0" ];
mean3 -> alert4 [processed="0"];

alert4 [alerts_inhibited="0" alerts_triggered="0" avg_exec_time_ns="0s" crits_triggered="0" errors="0" infos_triggered="0" oks_triggered="0" warns_triggered="0" working_cardinality="0" ];
}