# Alerting rules for the PowerDNS Recursor (PDNS-Recursor)
groups:
  # Alert rules for the PowerDNS Recursor. Every rule attaches the label
  # cluster_name="PowerDNS-Recursor-DEV&STG" to the alerts it produces.
  - name: powerdns-recursor.rules
    rules:
      # Fires once powerdns_recursor_up has been 0 for 2 minutes.
      - alert: DNS Recursor Down
        expr: powerdns_recursor_up == 0
        for: 2m
        labels:
          severity: critical
          cluster_name: PowerDNS-Recursor-DEV&STG
        annotations:
          summary: "DNS Recursor Down"
          description: "DNS Recursor Down, indicates a recursion fault."

      # Fires when the 5-minute per-second rate of incoming queries stays
      # above 1500 for 2 minutes.
      # NOTE(review): the selector has no `net` matcher (the TCP rule below
      # uses net="tcp"), so every series of the metric is evaluated, while
      # the description speaks of UDP only -- confirm which is intended.
      - alert: DNS Query Rate Alert
        expr: rate(powerdns_recursor_incoming_queries_total[5m]) > 1500
        for: 2m
        labels:
          severity: warning
          cluster_name: PowerDNS-Recursor-DEV&STG
        annotations:
          summary: "High DNS Query Rate detected"
          description: "The rate of incoming DNS queries over UDP is very high, qps over 1500, indicating potential abnormal traffic or a DDoS attack."

      # Fires when the 5-minute per-second rate of incoming queries with
      # net="tcp" stays above 100 for 5 minutes.
      - alert: DNS Incoming Queries TCP Alert
        expr: rate(powerdns_recursor_incoming_queries_total{net="tcp"}[5m]) > 100
        for: 5m
        labels:
          severity: warning
          cluster_name: PowerDNS-Recursor-DEV&STG
        annotations:
          summary: "High rate of incoming TCP DNS queries detected"
          description: "The rate of incoming DNS queries over TCP is very high, qps over 100, indicating potential issues with UDP traffic handling."

      # Computes the cache hit percentage, 100 * hits / (hits + misses), from
      # 5-minute rates and fires when it stays below 15 for 5 minutes.
      # NOTE(review): this is the only rule in the group scoped by a `job`
      # matcher; the other rules match every job -- confirm this is intended.
      - alert: DNS Cache Miss Alert
        expr: |
          (
            100 * sum(rate(powerdns_recursor_cache_lookups_total{result="hit",job="PowerDNS-Recursor-DEV&STG"}[5m]))
            /
            (
              sum(rate(powerdns_recursor_cache_lookups_total{result="hit",job="PowerDNS-Recursor-DEV&STG"}[5m]))
              +
              sum(rate(powerdns_recursor_cache_lookups_total{result="miss",job="PowerDNS-Recursor-DEV&STG"}[5m]))
            )
          ) < 15
        for: 5m
        labels:
          cluster_name: PowerDNS-Recursor-DEV&STG
          severity: critical
        annotations:
          summary: "High DNS Cache Miss Rate detected"
          description: "The percentage of DNS cache misses is high,The percentage of cache hit requests is under 15% in the last 5 minutes, indicating potential performance issues or DNS resolution problems."

      # Fires when the 95th percentile of the response-time histogram,
      # aggregated over all series by bucket boundary (le), stays above 1
      # for 5 minutes.
      - alert: DNS Response Time Outliers
        expr: histogram_quantile(0.95, sum by (le) (rate(powerdns_recursor_response_time_seconds_bucket[5m]))) > 1
        for: 5m
        labels:
          cluster_name: PowerDNS-Recursor-DEV&STG
          severity: warning
        annotations:
          summary: "High DNS Response Time outliers detected"
          description: "The 95th percentile of DNS response times is higher than expected,DNS recursive response time is greater than 1000 ms, indicating potential performance issues or network latency problems."
# Alerting rules for the PowerDNS Authoritative server (PDNS-Authoritative)
# NOTE(review): a YAML mapping may hold only one `groups` key. If this group
# shares a file with another top-level `groups:` list, move the group below
# into that list -- confirm whether this is meant to be a separate rule file.
groups:
  # Alert rules for the PowerDNS Authoritative server. Every rule attaches
  # the label cluster_name="PowerDNS-Authoritative-DEV&STG" to its alerts.
  - name: powerdns_authoritative_alerts
    rules:
      # Fires when the 5-minute per-second query rate stays above 1000 for
      # 5 minutes.
      - alert: DNS Query Rate Increase
        expr: rate(powerdns_authoritative_queries_total[5m]) > 1000
        for: 5m
        labels:
          cluster_name: PowerDNS-Authoritative-DEV&STG
          severity: warning
        annotations:
          summary: "High DNS query rate detected"
          description: "The rate of DNS queries has exceeded the threshold of 1000 queries per second for the past 5 minutes."

      # Fires once powerdns_authoritative_up has been 0 for 5 minutes.
      - alert: PowerDNS Status of the authorization server
        expr: powerdns_authoritative_up == 0
        for: 5m
        labels:
          cluster_name: PowerDNS-Authoritative-DEV&STG
          severity: critical
        annotations:
          summary: "PowerDNS Authoritative server down"
          description: "PowerDNS Authoritative server down."

      # rate() yields a per-second increase, so the condition is: the
      # type="user" CPU counter grows by more than 100 per second, averaged
      # over 5 minutes, and this holds for 5 minutes.
      - alert: High Cpu Milliseconds User
        expr: rate(powerdns_authoritative_cpu_milliseconds{type="user"}[5m]) > 100
        for: 5m
        labels:
          cluster_name: PowerDNS-Authoritative-DEV&STG
          severity: critical
        annotations:
          summary: "High CPU milliseconds in user space"
          description: "The number of CPU milliseconds spent in user space in the PowerDNS Authoritative server has increased significantly. The average CPU millisecond in five minutes is over 100 milliseconds. This may indicate a potential performance issue that needs investigation."

      # Same condition as the rule above, applied to the type="sys" series.
      - alert: High Cpu Milliseconds Sys
        expr: rate(powerdns_authoritative_cpu_milliseconds{type="sys"}[5m]) > 100
        for: 5m
        labels:
          cluster_name: PowerDNS-Authoritative-DEV&STG
          severity: critical
        annotations:
          summary: "High CPU milliseconds in kernel space"
          description: "The number of CPU milliseconds spent in kernel space in the PowerDNS Authoritative server has increased significantly. The average CPU millisecond in five minutes is over 100 milliseconds. This may indicate a potential performance issue that needs investigation."

      # Fires when the error="udp_noport_errors" exception counter shows an
      # increase over the trailing 5 minutes at every evaluation for
      # 5 minutes.
      - alert: UDP NoPort Errors
        expr: increase(powerdns_authoritative_exceptions_total{error="udp_noport_errors"}[5m]) > 0
        for: 5m
        labels:
          cluster_name: PowerDNS-Authoritative-DEV&STG
          severity: critical
        annotations:
          summary: "UDP no port errors detected"
          description: "Errors related to UDP no port have been detected in the past 5 minutes."

      # Divides resident memory in bytes by 1024 twice (i.e. converts to MiB)
      # and fires when the result stays above 1000 for 5 minutes. Only series
      # with job="PowerDNS-Authoritative-DEV&STG" are considered.
      - alert: Process Memory Usage Increase
        expr: (process_resident_memory_bytes{job="PowerDNS-Authoritative-DEV&STG"} / 1024 / 1024) > 1000
        for: 5m
        labels:
          cluster_name: PowerDNS-Authoritative-DEV&STG
          severity: warning
        annotations:
          summary: "High process memory usage detected"
          description: "The resident memory size of the PowerDNS process has exceeded 1000 MB for the past 5 minutes."