前面博主已将商品中心服务接入到skywalking,实现了链路追踪功能。而在运维过程中,我们还需要配置监控来触发告警,让故障信息尽快通知到相关人员进行分析,所以这里我们就给我们的服务加上监控和告警配置

告警指标

对域skywalking的告警指标,默认路径在skywalking服务的config/oal/core.oal内

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
root@8e2113b4496c:/skywalking# cat config/oal/core.oal
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

// For services using protocols HTTP 1/2, gRPC, RPC, etc., the cpm metrics means "calls per minute",
// for services that are built on top of TCP, the cpm means "packages per minute".

// Service scope metrics
service_resp_time = from(Service.latency).longAvg();
service_sla = from(Service.*).percent(status == true);
service_cpm = from(Service.*).cpm();
service_percentile = from(Service.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_apdex = from(Service.latency).apdex(name, status);
service_mq_consume_count = from(Service.*).filter(type == RequestType.MQ).count();
service_mq_consume_latency = from((str->long)Service.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();

// Service relation scope metrics for topology
service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

// Service Instance relation scope metrics for topology
service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

// Service Instance Scope metrics
service_instance_sla = from(ServiceInstance.*).percent(status == true);
service_instance_resp_time= from(ServiceInstance.latency).longAvg();
service_instance_cpm = from(ServiceInstance.*).cpm();

// Endpoint scope metrics
endpoint_cpm = from(Endpoint.*).cpm();
endpoint_resp_time = from(Endpoint.latency).longAvg();
endpoint_sla = from(Endpoint.*).percent(status == true);
endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
endpoint_mq_consume_count = from(Endpoint.*).filter(type == RequestType.MQ).count();
endpoint_mq_consume_latency = from((str->long)Endpoint.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();

// Endpoint relation scope metrics
endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();
endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

database_access_resp_time = from(DatabaseAccess.latency).longAvg();
database_access_sla = from(DatabaseAccess.*).percent(status == true);
database_access_cpm = from(DatabaseAccess.*).cpm();
database_access_percentile = from(DatabaseAccess.latency).percentile(10);
  • service_resp_time #服务的响应时间
  • service_sla #服务的http请求成功率SLA,比如99%等。
  • service_cpm #表示每分钟的吞吐量.
  • service_apdex : 应用性能指数是0.8是0.x
  • service_percentile: 指定最近多少数据范围内的响应时间百分比,即p99, p95, p90, p75, p50在内的数据统计结果
  • endpoint_relation_cpm #端点的每分钟的吞吐量
  • endpoint_relation_resp_time #端点的响应时间
  • endpoint_relation_sla #端点的http请求成功率SLA,比如99%等。
  • endpoint_relation_percentile ##端点的最近多少数据范围内的响应时间百分比,即p99、 p95、 p90、 p75、p50在内的数据统计结果

如果指标不满足自己的业务的需求,可以参考上面去定制

告警规则

skywalking的告警配置文件为config/alarm-settings.yml

普通告警

  • 规则名称:在告警信息中显示的唯一名称,必须以_rule结尾。

  • metrics-name:度量名称,也是OAL脚本中的度量名。默认配置中可以用于告警的度量有:服务实例端点服务关系实例关系端点关系。它只支持long,double和int类型。

  • include-names:包含在此规则之内的实体名称列表。

  • exclude-names:排除在此规则以外的实体名称列表。

  • include-names-regex:提供一个正则表达式来包含实体名称。如果同时设置包含名称列表和包含名称的正则表达式,则两个规则都将生效。

  • exclude-names-regex:提供一个正则表达式来排除实体名称。如果同时设置排除名称列表和排除名称的正则表达式,则两个规则都将生效。

  • include-labels:包含在此规则之内的标签。

  • exclude-labels:排除在此规则以外的标签。

  • include-labels-regex:提供一个正则表达式来包含标签。如果同时设置包含标签列表和包含标签的正则表达式,则两个规则都将生效。

  • exclude-labels-regex:提供一个正则表达式来排除标签。如果同时设置排除标签列表和排除标签的正则表达式,则两个规则都将生效。

  • threshold:阈值。对于多个值指标,例如percentile,阈值是一个数组。像value1 value2 value3 value4 value5这样描述。 每个值可以作为度量中每个值的阈值。如果不想通过此值或某些值触发警报,则将值设置为 -。 例如在percentile中,value1是P50的阈值,value2是P75的阈值,那么-,-,value3, value4, value5的意思是,没有阈值的P50和P75的percentile告警规则。

  • op:操作符,支持>, >=, <, <=, =

  • period:多久告警规则需要被检查一下。这是一个时间窗口,与后端部署环境时间相匹配。

  • count:在一个周期窗口中,如果按op计算超过阈值的次数达到count,则发送告警。

  • only-as-conditiontrue或者false,指定规则是否可以发送告警,或者仅作为复合规则的条件。

  • silence-period:在时间N中触发报警后,在N -> N + silence-period这段时间内不告警。默认情况下,它和period一样,这意味着相同的告警(同一个度量名称拥有相同的Id)在同一个周期内只会触发一次。

  • message:该规则触发时,发送的通知消息。

下面是两条告警示例:

  • 服务在2分钟内调用次数大于1,一次就触发,触发告警后静默2分钟
  • 服务响应时间在2分钟内超过100ms,一次就触发,触发告警后静默2分钟
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
root@8e2113b4496c:/skywalking# cat config/alarm-settings.yml
rules:
service_cpm_rule:
# 服务调用次数
metrics-name: service_cpm
op: ">"
threshold: 1
period: 2
count: 1
silence-period: 2
message: 服务 {name} 访问次数大于1
# Rule unique name, must be ended with `_rule`.
service_resp_time_rule:
metrics-name: service_resp_time
op: ">"
threshold: 100
period: 2
count: 1
silence-period: 2
message: Response time of service {name} is more than 100ms in last 2 minutes.
# webhook

官方的一些样例规则

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
rules:
# Rule unique name, must be ended with `_rule`.
endpoint_percent_rule:
# Metrics value need to be long, double or int
metrics-name: endpoint_percent
threshold: 75
op: <
# The length of time to evaluate the metrics
period: 10
# How many times after the metrics match the condition, will trigger alarm
count: 3
# How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
silence-period: 10
# Specify if the rule can send notification or just as an condition of composite rule
only-as-condition: false
tags:
level: WARNING
service_percent_rule:
metrics-name: service_percent
# [Optional] Default, match all services in this metrics
include-names:
- service_a
- service_b
exclude-names:
- service_c
# Single value metrics threshold.
threshold: 85
op: <
period: 10
count: 4
only-as-condition: false
service_resp_time_percentile_rule:
# Metrics value need to be long, double or int
metrics-name: service_percentile
op: ">"
# Multiple value metrics threshold. Thresholds for P50, P75, P90, P95, P99.
threshold: 1000,1000,1000,1000,1000
period: 10
count: 3
silence-period: 5
message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
only-as-condition: false
meter_service_status_code_rule:
metrics-name: meter_status_code
exclude-labels:
- "200"
op: ">"
threshold: 10
period: 10
count: 3
silence-period: 5
message: The request number of entity {name} non-200 status is more than expected.
only-as-condition: false

读者自行根据上面的用法编写合适业务的告警监控规则,注意: endpoint 规则,相比 service、instance 规则耗费更多内存及资源~

复合规则

就是将多个规则进行判断:

1
2
3
4
5
6
7
composite-rules:
comp_rule:
# Must satisfied percent rule and resp time rule
expression: service_percent_rule && service_resp_time_percentile_rule
message: Service {name} successful rate is less than 80% and P50 of response time is over 1000ms
tags:
level: CRITICAL
  • 规则名称:在告警信息中显示的唯一名称,必须以_rule结尾
  • expression:指定如何组成规则,支持&&, , ()操作符
  • message:该规则触发时,发送的通知消息

告警通知

钉钉告警

1
2
3
4
5
6
7
8
9
10
11
12
13
14
root@8e2113b4496c:/skywalking# cat config/alarm-settings.yml
# 略

dingtalkHooks:
textTemplate: -
{
"msgtype": "text",
"text": {
"content": "Apache SkyWalking Alarm: \n %s."
}
}
webhooks:
- url: https://oapi.dingtalk.com/robot/send?access_token=ceb8f51dddddddddddaf640cae1db92
secret: SEC80939ddddddddd21bcb6e2652f

配置完规则重启服务,测试下接口访问

在告警或事件中可以看到触发信息

回到钉钉群中可以看到告警信息以推送过来

微信告警

微信告警配置样例

1
2
3
4
5
6
7
8
9
10
wechatHooks:
textTemplate: -
{
"msgtype": "text",
"text": {
"content": "Apache SkyWalking Alarm: \n %s."
}
}
webhooks:
- https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key

飞书告警

飞书告警样例配置

1
2
3
4
5
6
7
8
9
10
11
12
feishuHooks:
textTemplate: -
{
"msg_type": "text",
"content": {
"text": "Apache SkyWalking Alarm: \n %s."
},
"ats":"feishu_user_id_1,feishu_user_id_2"
}
webhooks:
- url: https://open.feishu.cn/open-apis/bot/v2/hook/dummy_token
secret: dummysecret

参考文档