diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index 9e5e7326..1efb63a3 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -175,6 +175,28 @@ groups:
           summary: "188 Host 備份超過 25 小時未成功"
           description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊"
 
+      # ADR-075: CoreDNS resolution failures (2026-04-12 ogt)
+      # Fires when the overall share of SERVFAIL responses stays above 5%.
+      # The expression is a ratio (SERVFAIL rate / total response rate) so it
+      # agrees with the "5%" wording and the humanizePercentage formatting in
+      # the annotations; a bare rate() is responses per second, not a fraction.
+      - alert: CoreDNSResolutionFailed
+        expr: |
+          sum(rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]))
+            / sum(rate(coredns_dns_responses_total[5m])) > 0.05
+        for: 5m
+        labels:
+          severity: critical
+          layer: k8s
+          team: ops
+          auto_repair: "true"
+          alert_category: kubernetes
+          notification_type: TYPE-3
+        annotations:
+          summary: "CoreDNS SERVFAIL 率過高 {{ $value | humanizePercentage }}"
+          description: "CoreDNS 在 5 分鐘內 SERVFAIL 回應率超過 5%,K8s 服務間 DNS 解析可能失敗"
+          runbook: "kubectl -n kube-system get pods -l k8s-app=kube-dns && kubectl -n kube-system logs -l k8s-app=kube-dns"
+
   # =========================================================================
   # 資料庫告警 (database_alerts)
   # =========================================================================
@@ -711,6 +733,25 @@ groups:
         annotations:
           summary: "Gemini API 錯誤率過高: {{ $value | humanizePercentage }}"
           description: "Gemini API 5 分鐘錯誤率超過 20%,AI 降級可能失效"
+      # ADR-075: business scraper health (2026-04-12 ogt)
+      # Both sides are aggregated with sum() so their label sets match; dividing
+      # the raw vectors pairs each status="success" series only with itself, so
+      # the ratio would be 1 and the alert could never fire.
+      # "or vector(0)" makes the ratio 0 when no success series exists at all.
+      - alert: MomoScraperSuccessLow
+        expr: |
+          (sum(rate(momo_scraper_requests_total{status="success"}[5m])) or vector(0))
+            / sum(rate(momo_scraper_requests_total[5m])) < 0.9
+        for: 10m
+        labels:
+          severity: warning
+          layer: docker-110
+          auto_repair: "false"
+          alert_category: business
+          notification_type: TYPE-3
+        annotations:
+          summary: "Momo 抓取成功率跌至 {{ $value | humanizePercentage }}"
+          description: "Momo 爬蟲成功率低於 90%,業務資料可能缺失"
 
   - name: awoooi_flywheel_meta_alerts
     interval: 60s