From bd75aca727acc7d97f3b86c010de32891677ac5a Mon Sep 17 00:00:00 2001
From: OG T
Date: Sun, 12 Apr 2026 21:59:11 +0800
Subject: [PATCH] =?UTF-8?q?feat(adr-075):=20=E8=A3=9C=E5=85=A8=202=20?=
 =?UTF-8?q?=E5=80=8B=E6=AC=A0=E7=BC=BA=E7=9A=84=20Prometheus=20=E5=91=8A?=
 =?UTF-8?q?=E8=AD=A6=E8=A6=8F=E5=89=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- MomoScraperSuccessLow: 業務爬蟲成功率 <90% (business group)
- CoreDNSResolutionFailed: CoreDNS SERVFAIL 率 >5% (kubernetes group)

ADR-075 Phase 3 完成

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (this section is not part of the commit message):
- Both alert expressions were rewritten as sum()/sum() ratios; the reasons
  are in the comments above each added rule.
- Hunk lengths, the second hunk's start line and the diffstat were updated
  for the added lines. The post-image blob id on the "index" line was
  carried over unchanged and no longer matches the new content.
- Line breaks and YAML indentation were reconstructed from a
  whitespace-collapsed copy of this patch; verify the context lines against
  the target file before applying.

 ops/monitoring/alerts-unified.yml | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index 9e5e7326..1efb63a3 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -175,6 +175,28 @@ groups:
           summary: "188 Host 備份超過 25 小時未成功"
           description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊"
 
+      # ADR-075: CoreDNS resolution failures (2026-04-12 ogt)
+      # Fires when the share of SERVFAIL answers across all CoreDNS series
+      # stays above 5% for 5 minutes. Both sides are aggregated with sum() so
+      # that $value is a ratio, which is what humanizePercentage in the
+      # summary expects; a bare rate() would be responses per second.
+      - alert: CoreDNSResolutionFailed
+        expr: |
+          sum(rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]))
+          / sum(rate(coredns_dns_responses_total[5m])) > 0.05
+        for: 5m
+        labels:
+          severity: critical
+          layer: k8s
+          team: ops
+          auto_repair: "true"
+          alert_category: kubernetes
+          notification_type: TYPE-3
+        annotations:
+          summary: "CoreDNS SERVFAIL 率過高 {{ $value | humanizePercentage }}"
+          description: "CoreDNS 在 5 分鐘內 SERVFAIL 回應率超過 5%,K8s 服務間 DNS 解析可能失敗"
+          runbook: "kubectl -n kube-system get pods -l k8s-app=kube-dns && kubectl -n kube-system logs -l k8s-app=kube-dns"
+
   # =========================================================================
   # 資料庫告警 (database_alerts)
   # =========================================================================
@@ -711,6 +733,26 @@ groups:
         annotations:
           summary: "Gemini API 錯誤率過高: {{ $value | humanizePercentage }}"
           description: "Gemini API 5 分鐘錯誤率超過 20%,AI 降級可能失效"
+      # ADR-075: business scraper health (2026-04-12 ogt)
+      # Success ratio over all momo_scraper_requests_total series. Both sides
+      # are aggregated with sum(): dividing the raw vectors would pair each
+      # status="success" series only with itself, so the ratio would always
+      # be 1 and the alert could never fire. "or vector(0)" keeps the
+      # numerator defined when no success series is present.
+      - alert: MomoScraperSuccessLow
+        expr: |
+          (sum(rate(momo_scraper_requests_total{status="success"}[5m])) or vector(0))
+          / sum(rate(momo_scraper_requests_total[5m])) < 0.9
+        for: 10m
+        labels:
+          severity: warning
+          layer: docker-110
+          auto_repair: "false"
+          alert_category: business
+          notification_type: TYPE-3
+        annotations:
+          summary: "Momo 抓取成功率跌至 {{ $value | humanizePercentage }}"
+          description: "Momo 爬蟲成功率低於 90%,業務資料可能缺失"
 
   - name: awoooi_flywheel_meta_alerts
     interval: 60s