Birdhk
是自定义的,也是通过 ServiceMonitor 添加的监控。不过内置的告警,我看 startsAt 的时间也不对。
# Cluster-scoped alert rule group for the ingress-nginx controller.
# Each rule is a PromQL expression; an alert fires once the expression has
# returned a result continuously for the duration given in `for`.
apiVersion: alerting.kubesphere.io/v2beta1
kind: ClusterRuleGroup
metadata:
  annotations: {}
  labels:
    # Left disabled: this is a custom rule group, not a built-in one.
    # alerting.kubesphere.io/builtin: "true"
    alerting.kubesphere.io/data_source: default
    alerting.kubesphere.io/enable: "true"
    alerting.kubesphere.io/owner_cluster: host
  name: host.ingress-nginx
spec:
  rules:
    # Share of 5xx responses per ingress above 5% over a 5-minute window.
    - alert: NginxIngressHigh5xxRate
      id: 9i0j1k2l-3m4n-5o6p-7q8r-9s0t1u2v3w4x
      expr: |
        sum(rate(nginx_ingress_controller_requests{status=~"5.*"}[5m])) by (ingress)
          /
        sum(rate(nginx_ingress_controller_requests[5m])) by (ingress)
          > 0.05
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Ingress {{ $labels.ingress }} 的 5xx 错误率过高"
        description: "Ingress {{ $labels.ingress }} 的 5xx 错误率超过了 5%"

    # 99th-percentile request duration per ingress above 1 second.
    - alert: NginxIngressHighLatency
      id: 2l3m4n5o-6p7q-8r9s-0t1u-2v3w4x5y6z7a
      expr: |
        histogram_quantile(
          0.99,
          sum(rate(nginx_ingress_controller_request_duration_seconds_bucket[5m])) by (le, ingress)
        ) > 1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Ingress {{ $labels.ingress }} 请求延迟过高"
        description: "Ingress {{ $labels.ingress }} 的 99% 请求延迟超过了 1 秒"

    # Share of 4xx responses per ingress above 10% but strictly below 100%.
    # NOTE(review): the `< 1` guard skips ingresses where every request is a
    # 4xx; presumably intentional, confirm with the rule owner.
    - alert: NginxIngressHigh4xxRate
      id: 1k2l3m4n-5o6p-7q8r-9s0t-1u2v3w4x5y6z
      expr: |
        (
          sum(rate(nginx_ingress_controller_requests{status=~"4.*"}[5m])) by (ingress)
            /
          sum(rate(nginx_ingress_controller_requests[5m])) by (ingress)
            > 0.10
        )
        and
        (
          sum(rate(nginx_ingress_controller_requests{status=~"4.*"}[5m])) by (ingress)
            /
          sum(rate(nginx_ingress_controller_requests[5m])) by (ingress)
            < 1
        )
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Ingress {{ $labels.ingress }} 的 4xx 错误率过高"
        description: "Ingress {{ $labels.ingress }} 的 4xx 错误率超过了 10%"

    # More than 4 controller container restarts within a 10-minute window.
    # The description states 10 minutes so that it matches the range selector.
    - alert: NginxIngressPodRestarting
      id: 7g8h9i0j-1k2l-3m4n-5o6p-7q8r9s0t1u2v
      expr: |
        increase(kube_pod_container_status_restarts_total{namespace="ingress-nginx", container="controller"}[10m]) > 4
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "Nginx Ingress 控制器 Pod 频繁重启"
        description: "ingress-nginx 命名空间中的 Nginx Ingress 控制器 Pod 在过去 10 分钟内重启次数超过 4 次"

    # No scrape target of the job reports up == 1 (all down, or none present).
    # absent() yields either nothing or a single sample with value 1, so the
    # previous `absent(...) == 0` comparison could never match.
    # NOTE(review): confirm "nginx-ingress" is the job label that the
    # ServiceMonitor actually produces.
    - alert: NginxIngressDown
      id: 6f7g8h9i-0j1k-2l3m-4n5o-6p7q8r9s0t1u
      expr: |
        absent(up{job="nginx-ingress"} == 1)
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Nginx Ingress 控制器已停止运行"
        description: "Nginx Ingress 控制器已停止运行超过 1 分钟"

    # Request rate per ingress above 20 requests per second.
    # rate() is a per-second average, so the description says "per second".
    - alert: NginxIngressHighUV
      id: 4d5e6f7g-8h9i-0j1k-2l3m-4n5o6p7q8r9s
      expr: |
        sum(rate(nginx_ingress_controller_requests[1m])) by (ingress) > 20
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Ingress {{ $labels.ingress }} 用户请求量过高"
        description: "Ingress {{ $labels.ingress }} 的每秒请求数超过了 20 次"