Files
awoooi/ops/monitoring/alerts-unified.yml
OG T 946fe1fa7c
All checks were successful
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 44s
fix(monitoring): 合併重複飛輪告警 group + 補 notification_type: TYPE-8M
awoooi_flywheel_health (重複) 合入 awoooi_flywheel_meta_alerts:
- 所有 5 條規則加 notification_type: TYPE-8M
- 新增 FlywheelAlertnameNullHigh(原僅在舊 group)
- 刪除重複 group,消除 Prometheus 同名告警衝突

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 22:43:02 +08:00

932 lines
34 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ops/monitoring/alerts-unified.yml
# AWOOOI 統一 Prometheus 告警規則
# 2026-04-05 Claude Code: 整合所有規則,加入統一 layer 標籤
# 2026-04-08 Claude Sonnet 4.6: 補 database_detail_alerts 群組 (6條詳細規則)
# 2026-04-12 Claude Sonnet 4.6: 補回 4 個僅存在主機的群組 (backup/flywheel/connectivity/infra-detailed)
# 部署目標: 192.168.0.110:/home/wooo/monitoring/alerts.yml
# 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
#
# 標籤規範:
# layer: k8s | docker | docker-110 | docker-188 | systemd-188 | external
# component: 服務名稱
# team: ops | backend | ai | platform | security | finops | aiops
# host: "110" | "188" | "120" | "121" | "112"
# auto_repair: "true" | "false"
groups:
# =========================================================================
# 主機層告警 (host_alerts)
# =========================================================================
- name: host_alerts
  rules:
  # A node-exporter scrape target reports up == 0 for one minute.
  - alert: HostDown
    expr: up{job=~"node-exporter.*"} == 0
    for: 1m
    labels:
      severity: critical
      layer: systemd-188
      team: ops
      auto_repair: "false"
    annotations:
      summary: "主機 {{ $labels.host }} 不可達"
      description: "Node Exporter 無回應超過 1 分鐘"
  # Non-idle CPU share, averaged per `host` label over 5m, is above 80%.
  - alert: HostHighCpuLoad
    expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      team: ops
      auto_repair: "true"
      # MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP routing labels
      mcp_provider: "ssh_host"
      host_type: "bare_metal"
      alert_category: "host_resource"
    annotations:
      summary: "主機 {{ $labels.host }} CPU 高負載"
      description: "CPU 使用率超過 80%"
  # Used memory (1 - MemAvailable / MemTotal) is above 85%.
  - alert: HostOutOfMemory
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      team: ops
      auto_repair: "false"
      mcp_provider: "ssh_host"
      host_type: "bare_metal"
      alert_category: "host_resource"
    annotations:
      summary: "主機 {{ $labels.host }} 記憶體不足"
      description: "記憶體使用率超過 85%"
  # Any filesystem other than tmpfs/overlay is more than 85% full.
  - alert: HostOutOfDiskSpace
    expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      team: ops
      auto_repair: "false"
      mcp_provider: "ssh_host"
      host_type: "bare_metal"
      alert_category: "host_resource"
    annotations:
      summary: "主機 {{ $labels.host }} 磁碟空間不足"
      description: "磁碟使用率超過 85%"
# =========================================================================
# K8s 叢集告警 (kubernetes_alerts)
# =========================================================================
- name: kubernetes_alerts
  rules:
  # A node's Ready=true condition series is 0 for two minutes.
  - alert: K3sNodeNotReady
    expr: kube_node_status_condition{condition="Ready", status="true"} == 0
    for: 2m
    labels:
      severity: critical
      layer: k8s
      team: ops
      auto_repair: "false"
    annotations:
      summary: "K3s 節點 {{ $labels.node }} 未就緒"
      description: "節點超過 2 分鐘未達到 Ready 狀態"
  # Any container restart in awoooi-prod within the 15m rate window.
  - alert: KubePodCrashLooping
    expr: rate(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m]) > 0
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 持續重啟"
      description: "Pod 在過去 15 分鐘內重啟次數異常"
  # A pod in awoooi-prod has ready{condition="true"} == 0 for five minutes.
  # NOTE(review): the description mentions Running pods, but the expression
  # does not filter on pod phase — confirm whether completed pods should match.
  - alert: KubePodNotReady
    expr: kube_pod_status_ready{condition="true",namespace="awoooi-prod"} == 0
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 未就緒"
      description: "Running 中的 Pod 超過 5 分鐘未達到 Ready 狀態"
  # Desired replicas differ from available replicas for ten minutes.
  - alert: KubeDeploymentReplicasMismatch
    expr: kube_deployment_spec_replicas{namespace="awoooi-prod"} != kube_deployment_status_replicas_available{namespace="awoooi-prod"}
    for: 10m
    labels:
      severity: warning
      layer: k8s
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} 副本數不匹配"
      description: "期望副本數與可用副本數不一致超過 10 分鐘"
  # The Velero failure counter increased during the last 24h.
  - alert: VeleroBackupFailed
    expr: increase(velero_backup_failure_total[24h]) > 0
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: ops
      component: velero
      auto_repair: "false"
    annotations:
      summary: "Velero 備份失敗"
      description: "過去 24 小時有備份失敗"
  # Last successful Velero backup is older than 86400s (24h).
  - alert: VeleroBackupNotRun
    expr: time() - velero_backup_last_successful_timestamp > 86400
    for: 10m
    labels:
      severity: critical
      layer: k8s
      team: ops
      component: velero
      auto_repair: "false"
    annotations:
      summary: "Velero 超過 24 小時未成功備份"
      description: "最後一次成功備份超過 24 小時"
  # Sprint C-2 host rsync backup alert (2026-04-11 Claude Sonnet 4.6).
  # backup-from-110.sh writes /var/run/backup-110.last_success on success;
  # the node-exporter textfile collector reads that file to expose the metric.
  #
  # Fires when the success timestamp is missing or older than 90000s (25h).
  # The previous expression had two defects:
  #   * `time() - node_textfile_scrape_error{...} == 0` parses as
  #     `(time() - x) == 0`, which can never be true.
  #   * `absent(node_textfile_scrape_error{collector="backup_110"})` matches
  #     whenever no series carries that `collector` label; node_exporter is
  #     believed to expose this metric without such a label (verify against
  #     the deployed exporter), which would keep the alert firing permanently.
  # absent() on the backup metric itself covers the "metric file missing or
  # unreadable" case the old clauses were aiming at.
  - alert: HostBackupFailed
    expr: |
      absent(backup_110_last_success_timestamp)
      or (time() - backup_110_last_success_timestamp > 90000)
    for: 10m
    labels:
      severity: warning
      layer: docker-188
      team: ops
      component: backup
      host: "188"
      auto_repair: "false"
      alert_category: host_resource
    annotations:
      summary: "188 Host 備份超過 25 小時未成功"
      description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊"
  # ADR-075: CoreDNS resolution failure (2026-04-12 ogt).
  # Share of SERVFAIL responses among all CoreDNS responses over 5m.
  # The previous expression compared the absolute SERVFAIL rate (responses
  # per second) with 0.05, although the annotations describe a 5% ratio and
  # render $value with humanizePercentage; dividing by the total response
  # rate makes the threshold and the rendered value a real fraction.
  - alert: CoreDNSResolutionFailed
    expr: |
      sum(rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]))
      / sum(rate(coredns_dns_responses_total[5m])) > 0.05
    for: 5m
    labels:
      severity: critical
      layer: k8s
      team: ops
      auto_repair: "true"
      alert_category: kubernetes
      notification_type: TYPE-3
    annotations:
      summary: "CoreDNS SERVFAIL 率過高 {{ $value | humanizePercentage }}"
      description: "CoreDNS 在 5 分鐘內 SERVFAIL 回應率超過 5%K8s 服務間 DNS 解析可能失敗"
      runbook: "kubectl -n kube-system get pods -l k8s-app=kube-dns && kubectl -n kube-system logs -l k8s-app=kube-dns"
# =========================================================================
# 資料庫告警 (database_alerts)
# =========================================================================
- name: database_alerts
  rules:
  # Either the postgres-exporter target is down or it reports pg_up == 0.
  - alert: PostgreSQLDown
    expr: up{job="postgres-exporter"} == 0 or pg_up == 0
    for: 1m
    labels:
      severity: critical
      layer: systemd-188
      component: postgres
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "PostgreSQL 資料庫離線"
      description: "PostgreSQL Exporter 無法連接資料庫超過 1 分鐘"
  # Either the redis-exporter target is down or it reports redis_up == 0.
  - alert: RedisDown
    expr: up{job="redis-exporter"} == 0 or redis_up == 0
    for: 1m
    labels:
      severity: critical
      layer: systemd-188
      component: redis
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Redis 快取服務離線"
      description: "Redis Exporter 無法連接 Redis 超過 1 分鐘"
  # Any pg_stat_activity_count series exceeds 80.
  - alert: PostgreSQLHighConnections
    expr: pg_stat_activity_count > 80
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      component: postgres
      team: ops
      auto_repair: "false"
    annotations:
      summary: "PostgreSQL 連接數過高"
      description: "當前連接數 {{ $value }} 超過 80"
  # Used/max memory above 80%; the `> 0` guard skips instances where
  # redis_memory_max_bytes is 0.
  - alert: RedisMemoryHigh
    expr: redis_memory_max_bytes > 0 and redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      component: redis
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Redis 記憶體使用過高"
      description: "Redis 記憶體使用率超過 80%"
# =========================================================================
# Sprint 5.2 Plan B: 資料庫詳細指標告警 (database_detail_alerts)
# 前置: postgres-exporter:9187 + redis-exporter:9121 on 192.168.0.188
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
# =========================================================================
- name: database_detail_alerts
  rules:
  # ---- PostgreSQL detailed metrics ----
  # Longest open transaction in awoooi_prod exceeds 60 seconds.
  - alert: PostgreSQLSlowQueries
    expr: pg_stat_activity_max_tx_duration{datname="awoooi_prod"} > 60
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      component: postgres
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "PostgreSQL 有慢查詢 (>60s)"
      description: "awoooi_prod 資料庫最長事務超過 60 秒"
  # The deadlock counter for awoooi_prod increased within the last 5m.
  - alert: PostgreSQLDeadlocks
    expr: increase(pg_stat_database_deadlocks{datname="awoooi_prod"}[5m]) > 0
    for: 1m
    labels:
      severity: warning
      layer: systemd-188
      component: postgres
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "PostgreSQL 死鎖發生"
      description: "過去 5 分鐘 awoooi_prod 資料庫有死鎖"
  # Connection count for awoooi_prod exceeds 50.
  - alert: PostgreSQLTooManyConnections
    expr: pg_stat_activity_count{datname="awoooi_prod"} > 50
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      component: postgres
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "PostgreSQL 連接數過高 ({{ $value }})"
      description: "awoooi_prod 連接數超過 50"
  # ---- Redis detailed metrics ----
  # The evicted-keys counter increased within the last 5m.
  - alert: RedisKeyEviction
    expr: increase(redis_evicted_keys_total[5m]) > 0
    for: 1m
    labels:
      severity: warning
      layer: systemd-188
      component: redis
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Redis 發生 Key 驅逐"
      description: "過去 5 分鐘有 Key 被驅逐,可能記憶體不足"
  # More than 100 connected clients.
  - alert: RedisConnectionsHigh
    expr: redis_connected_clients > 100
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      component: redis
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Redis 連接數過高 ({{ $value }})"
      description: "Redis 連接數超過 100"
  # Mean command latency over the last 5m exceeds 10ms.
  # Both metrics are cumulative counters; dividing their raw values gives the
  # average since the exporter/Redis started, which barely moves during a
  # latency spike. Taking rate() of each side yields seconds spent per
  # command over the recent window. When no commands ran the division is
  # 0/0 = NaN, which does not satisfy the comparison.
  - alert: RedisCommandLatencyHigh
    expr: |
      rate(redis_commands_duration_seconds_total[5m])
      / rate(redis_commands_processed_total[5m]) > 0.01
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      component: redis
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Redis 命令平均延遲過高 (>10ms)"
      description: "Redis 命令平均延遲超過 10ms"
# =========================================================================
# 服務可用性告警 (service_alerts) — 含 layer 標籤決定修復路徑
# =========================================================================
- name: service_alerts
  rules:
  # ---- 188 Docker layer ----
  # Scrape target of job "clawbot" is down.
  - alert: OpenClawDown
    # 2026-04-05 Claude Code: renamed legacy ClawBotDown → OpenClawDown
    expr: up{job="clawbot"} == 0
    for: 2m
    labels:
      severity: critical
      layer: docker-188
      component: openclaw
      host: "188"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "OpenClaw 服務離線"
      description: "OpenClaw (192.168.0.188:8088) 已離線超過 2 分鐘"
  # HTTP probe fails for an instance matching .*3301.*
  - alert: SignOzDown
    expr: probe_success{job="blackbox-http", instance=~".*3301.*"} == 0
    for: 2m
    labels:
      severity: warning
      layer: docker-188
      component: signoz
      host: "188"
      team: ops
      auto_repair: "true"
      mcp_provider: "ssh_host"
      target_host: "192.168.0.188"
      alert_category: "devops_tool"
    annotations:
      summary: "SignOz 服務離線"
      description: "SignOz (192.168.0.188:3301) 已離線超過 2 分鐘"
  # ---- 110 Docker layer ----
  # TCP probe to 192.168.0.110:9000 fails.
  - alert: SentryDown
    expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9000"} == 0
    for: 2m
    labels:
      severity: warning
      layer: docker-110
      component: sentry
      host: "110"
      team: ops
      auto_repair: "true"
      mcp_provider: "ssh_host"
      target_host: "192.168.0.110"
      alert_category: "devops_tool"
    annotations:
      summary: "Sentry 服務離線"
      description: "Sentry (192.168.0.110:9000) 已離線超過 2 分鐘"
  # HTTP probe fails for an instance matching .*5000.*
  - alert: HarborDown
    expr: probe_success{job="blackbox-http", instance=~".*5000.*"} == 0
    for: 2m
    labels:
      severity: critical
      layer: docker-110
      component: harbor
      host: "110"
      team: ops
      auto_repair: "true"
      mcp_provider: "ssh_host"
      target_host: "192.168.0.110"
      alert_category: "devops_tool"
    annotations:
      summary: "Harbor Registry 離線"
      description: "Harbor (192.168.0.110:5000) 已離線超過 2 分鐘CD pipeline 將無法拉取映像"
  # HTTP probe to http://192.168.0.110:3001 fails.
  - alert: GiteaDown
    expr: probe_success{job="blackbox-http", instance="http://192.168.0.110:3001"} == 0
    for: 2m
    labels:
      severity: critical
      layer: docker-110
      component: gitea
      host: "110"
      team: ops
      auto_repair: "true"
      mcp_provider: "ssh_host"
      target_host: "192.168.0.110"
      alert_category: "devops_tool"
    annotations:
      summary: "Gitea Git 服務離線"
      description: "Gitea (192.168.0.110:3001) 已離線超過 2 分鐘CD pipeline 失效"
  # TCP probe to 192.168.0.110:9093 fails.
  - alert: AlertmanagerDown
    expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9093"} == 0
    for: 2m
    labels:
      severity: critical
      layer: docker-110
      component: alertmanager
      host: "110"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Alertmanager 離線"
      description: "Alertmanager (192.168.0.110:9093) 已離線,所有告警將靜默"
# =========================================================================
# 告警鏈路監控 (alert_chain) — 防止 2026-03-26/04-05 事故重演
# =========================================================================
- name: alert_chain
  rules:
  # Non-success share of Alertmanager webhook requests exceeds 10% over 5m.
  - alert: AlertChainBroken_Alertmanager
    expr: |
      sum(rate(awoooi_webhook_requests_total{source="alertmanager",status!="success"}[5m]))
      / sum(rate(awoooi_webhook_requests_total{source="alertmanager"}[5m])) > 0.1
    for: 10m
    labels:
      severity: critical
      layer: k8s
      team: platform
      auto_repair: "false"
    annotations:
      summary: "Alertmanager Webhook 錯誤率 > 10%"
      description: "告警鏈路可能斷裂,請執行 E2E 驗證"
  # Non-success share of Sentry webhook requests exceeds 10% over 5m.
  - alert: AlertChainBroken_Sentry
    expr: |
      sum(rate(awoooi_webhook_requests_total{source="sentry",status!="success"}[5m]))
      / sum(rate(awoooi_webhook_requests_total{source="sentry"}[5m])) > 0.1
    for: 10m
    labels:
      severity: warning
      layer: k8s
      team: platform
      auto_repair: "false"
    annotations:
      summary: "Sentry Webhook 錯誤率 > 10%"
      description: "Sentry 錯誤可能無法正確處理"
  # Per source, the newest last-success timestamp is older than 7200s (2h).
  - alert: NoAlertsReceived2Hours
    expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: platform
      auto_repair: "false"
    annotations:
      summary: "2 小時內未收到任何告警 ({{ $labels.source }})"
      description: "可能是告警鏈路問題,請執行 Smoke Test"
  # The chain health gauge reads 0.
  - alert: AlertChainUnhealthy
    expr: awoooi_alert_chain_healthy == 0
    for: 5m
    labels:
      severity: critical
      layer: k8s
      team: platform
      auto_repair: "false"
    annotations:
      summary: "告警鏈路不健康 ({{ $labels.source }})"
      description: "告警鏈路標記為不健康,最近處理失敗"
# =========================================================================
# 自動修復監控 (auto_repair)
# =========================================================================
- name: auto_repair
  rules:
  # Reported auto-repair success rate stays below 0.3 for 30 minutes.
  - alert: AutoRepairLowSuccessRate
    expr: awoooi_auto_repair_success_rate < 0.3
    for: 30m
    labels:
      severity: warning
      layer: k8s
      team: backend
      auto_repair: "false"
    annotations:
      summary: "自動修復成功率過低 ({{ $value | humanizePercentage }})"
      description: "動作 {{ $labels.action }} 的成功率低於 30%,建議檢查 Playbook"
  # Any PERMANENT_FIX escalation counted within the 1h rate window.
  - alert: PermanentFixRequired
    expr: sum(rate(awoooi_anomaly_escalation_total{level="PERMANENT_FIX"}[1h])) > 0
    for: 1m
    labels:
      severity: critical
      layer: k8s
      team: backend
      auto_repair: "false"
    annotations:
      summary: "需要永久修復的異常升級"
      description: "有異常升級到 PERMANENT_FIX 級別,需要根本修復"
# =========================================================================
# Sprint 5.1: Docker 容器健康監控(docker-health-monitor 感知層接入)
# 由 docker-health-monitor.sh 送 Alertmanager 格式 webhook
# 或 Prometheus 自訂 exporter 上報時使用。
# auto_repair: "true" 代表允許 AWOOOI Guardrail 決策(非直接修復)
# 實際修復動作由 Service Registry 分級決定(ADR-062)
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
# =========================================================================
- name: docker_health_alerts
  rules:
  # Health gauge from the docker-health-monitor job reads 0 for two minutes.
  - alert: DockerContainerUnhealthy
    expr: container_health_status{job="docker-health-monitor"} == 0
    for: 2m
    labels:
      severity: warning
      layer: docker
      team: ops
      auto_repair: "true"
      mcp_provider: "ssh_host"
      alert_category: "devops_tool"
    annotations:
      summary: "容器 {{ $labels.container }} 健康檢查失敗"
      description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 健康狀態異常,持續 2 分鐘"
  # Running gauge from the docker-health-monitor job reads 0 for one minute.
  - alert: DockerContainerExited
    expr: container_running_status{job="docker-health-monitor"} == 0
    for: 1m
    labels:
      severity: critical
      layer: docker
      team: ops
      auto_repair: "true"
      mcp_provider: "ssh_host"
      alert_category: "devops_tool"
    annotations:
      summary: "容器 {{ $labels.container }} 已停止"
      description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead持續 1 分鐘"
# =========================================================================
# MinIO / Kali 告警
# =========================================================================
- name: minio_kali_alerts
  rules:
  # HTTP probe fails for an instance matching .*9001.*
  - alert: MinIODown
    expr: probe_success{job="blackbox-http", instance=~".*9001.*"} == 0
    for: 2m
    labels:
      severity: warning
      layer: docker-188
      component: minio
      host: "188"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "MinIO (Velero 備份) 離線"
      description: "MinIO (192.168.0.188:9001) 已離線超過 2 分鐘Velero 備份可能失敗"
  # HTTP probe fails for an instance matching .*192.168.0.112.*
  # NOTE(review): layer is docker-188 while host is "112" — confirm intent.
  - alert: KaliScannerDown
    expr: probe_success{job="blackbox-http", instance=~".*192.168.0.112.*"} == 0
    for: 5m
    labels:
      severity: info
      layer: docker-188
      component: kali
      host: "112"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Kali Scanner 離線"
      description: "Kali (192.168.0.112:8080) 離線,安全掃描功能暫停"
# =========================================================================
# Plan C — 外部網站監控 (Sprint 5.2, 2026-04-09 Claude Sonnet 4.6 Asia/Taipei)
# blackbox-http 已涵蓋 4 個外部網站,此群組提供結構化告警
# auto_repair: "true" — 由 AWOOOI Guardrail 決策(Service Registry 分級)
# =========================================================================
- name: external_website_alerts
  rules:
  # HTTP probe to https://mo.wooo.work fails for three minutes.
  - alert: MoWoooWorkDown
    expr: probe_success{job="blackbox-http", instance="https://mo.wooo.work"} == 0
    for: 3m
    labels:
      severity: critical
      layer: external
      component: momo-app
      host: "188"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "外部網站 mo.wooo.work 離線"
      description: "mo.wooo.work 探測失敗超過 3 分鐘,容器 momo-app (188) 可能需要重啟"
  # HTTP probe to https://www.tsenyang.com fails for three minutes.
  - alert: TsenyangWebsiteDown
    expr: probe_success{job="blackbox-http", instance="https://www.tsenyang.com"} == 0
    for: 3m
    labels:
      severity: critical
      layer: external
      component: tsenyang-website
      host: "188"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "外部網站 tsenyang.com 離線"
      description: "tsenyang.com 探測失敗超過 3 分鐘,容器 tsenyang-website (188) 可能需要重啟"
  # HTTP probe to http://stock.wooo.work fails for three minutes.
  - alert: StockWoooWorkDown
    expr: probe_success{job="blackbox-http", instance="http://stock.wooo.work"} == 0
    for: 3m
    labels:
      severity: critical
      layer: external
      component: stock-platform
      host: "110"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "外部網站 stock.wooo.work 離線"
      description: "stock.wooo.work 探測失敗超過 3 分鐘,容器 stock-platform (110) 可能需要重啟"
  # HTTP probe to https://bitan.wooo.work fails for three minutes.
  - alert: BitanWoooWorkDown
    expr: probe_success{job="blackbox-http", instance="https://bitan.wooo.work"} == 0
    for: 3m
    labels:
      severity: critical
      layer: external
      component: bitan-app
      host: "188"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "外部網站 bitan.wooo.work 離線"
      description: "bitan.wooo.work 探測失敗超過 3 分鐘,容器 bitan-app (188) 可能需要重啟"
  # Earliest certificate expiry of any blackbox-http target is under 14 days away.
  - alert: ExternalSiteSSLExpiringSoon
    expr: probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time() < 14 * 24 * 3600
    for: 1h
    labels:
      severity: warning
      layer: external
      component: ssl
      team: ops
      auto_repair: "false"
    annotations:
      summary: "SSL 憑證即將到期: {{ $labels.instance }}"
      description: "{{ $labels.instance }} SSL 憑證將在 14 天內到期,請手動更新"
# =============================================================================
# ADR-075 新增規則群組 (2026-04-12 ogt)
# =============================================================================
- name: awoooi_secops_alerts
  interval: 60s
  rules:
  # More than 10 failed auth attempts counted within 5 minutes.
  - alert: UnauthorizedSSHLogin
    expr: increase(node_failed_auth_attempts_total[5m]) > 10
    for: 1m
    labels:
      severity: critical
      layer: systemd-188
      team: security
      auto_repair: "false"
      alert_category: secops
    annotations:
      summary: "異常 SSH 登入嘗試: {{ $labels.instance }}"
      description: "5 分鐘內失敗登入 {{ $value }} 次,可能遭受暴力破解"
- name: awoooi_business_alerts
  interval: 60s
  rules:
  # AI token cost counter grew by more than 10 (USD) within one hour.
  - alert: AITokenCostSpike
    expr: increase(awoooi_ai_token_cost_usd_total[1h]) > 10
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: finops
      auto_repair: "false"
      alert_category: business
    annotations:
      summary: "AI Token 費用 1 小時內暴增 ${{ $value | humanize }}"
      description: "AI API 調用費用異常,請檢查是否有迴圈或濫用"
  # Gemini error rate divided by Gemini request rate exceeds 20% over 5m.
  - alert: GeminiAPIErrorRateHigh
    expr: rate(awoooi_ai_request_errors_total{provider="gemini"}[5m]) / rate(awoooi_ai_requests_total{provider="gemini"}[5m]) > 0.2
    for: 10m
    labels:
      severity: warning
      layer: k8s
      team: finops
      auto_repair: "false"
      alert_category: business
    annotations:
      summary: "Gemini API 錯誤率過高: {{ $value | humanizePercentage }}"
      description: "Gemini API 5 分鐘錯誤率超過 20%AI 降級可能失效"
  # ADR-075: business scraper health (2026-04-12 ogt).
  # Success share of Momo scraper requests drops below 90% over 5m.
  # Both sides are aggregated with sum(): dividing the raw vectors uses
  # one-to-one label matching, so the status="success" series on the left
  # only pairs with the identical series on the right and the ratio is
  # always 1 — the alert could never fire.
  - alert: MomoScraperSuccessLow
    expr: |
      sum(rate(momo_scraper_requests_total{status="success"}[5m]))
      / sum(rate(momo_scraper_requests_total[5m])) < 0.9
    for: 10m
    labels:
      severity: warning
      layer: docker-110
      auto_repair: "false"
      alert_category: business
      notification_type: TYPE-3
    annotations:
      summary: "Momo 抓取成功率跌至 {{ $value | humanizePercentage }}"
      description: "Momo 爬蟲成功率低於 90%,業務資料可能缺失"
- name: awoooi_flywheel_meta_alerts
  interval: 60s
  rules:
  # Playbook count gauge reads 0 for one hour.
  - alert: FlywheelPlaybookZero
    expr: awoooi_flywheel_playbook_count == 0
    for: 1h
    labels:
      severity: critical
      layer: k8s
      team: aiops
      auto_repair: "false"
      alert_category: flywheel_health
      notification_type: TYPE-8M
    annotations:
      summary: "飛輪 Playbook 數量為零AI 修復完全依賴 LLM"
      description: "Redis 中無任何已批准 Playbook自動修復能力大幅降低"
      runbook: "執行 scripts/cold_start_playbooks.py 冷啟動"
  # Execution success rate gauge stays below 0.1 for two hours.
  - alert: FlywheelExecutionSuccessLow
    expr: awoooi_flywheel_execution_success_rate < 0.1
    for: 2h
    labels:
      severity: warning
      layer: k8s
      team: aiops
      auto_repair: "false"
      alert_category: flywheel_health
      notification_type: TYPE-8M
    annotations:
      summary: "飛輪執行成功率 {{ $value | humanizePercentage }} 低於 10%"
      description: "連續 2 小時執行成功率不足 10%Playbook 可能已過時"
      runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態"
  # More than 10 unvectorized KM entries for 30 minutes.
  - alert: FlywheelKMVectorizationLow
    expr: awoooi_flywheel_km_unvectorized_count > 10
    for: 30m
    labels:
      severity: warning
      layer: k8s
      team: aiops
      auto_repair: "false"
      alert_category: flywheel_health
      notification_type: TYPE-8M
    annotations:
      summary: "{{ $value }} 筆 KM 未向量化RAG 查詢命中率下降"
      description: "knowledge_entries 中 embedding IS NULL 超過 10 筆且持續 30 分鐘"
      runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態"
  # alertname NULL-rate gauge stays above 0.05 for 30 minutes.
  - alert: FlywheelAlertnameNullHigh
    expr: awoooi_flywheel_alertname_null_rate > 0.05
    for: 30m
    labels:
      severity: warning
      layer: k8s
      team: aiops
      auto_repair: "false"
      alert_category: flywheel_health
      notification_type: TYPE-8M
    annotations:
      summary: "飛輪 alertname NULL 率超過 5%"
      description: "alertname NULL 率 {{ $value | humanizePercentage }},影響路由準確性。"
      runbook: "執行 scripts/backfill_alertname.py 回填"
  # More than 5 stuck incidents for ten minutes.
  - alert: FlywheelIncidentsStuck
    expr: awoooi_flywheel_incidents_stuck > 5
    for: 10m
    labels:
      severity: warning
      layer: k8s
      team: aiops
      auto_repair: "false"
      alert_category: flywheel_health
      notification_type: TYPE-8M
    annotations:
      summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24h"
      description: "飛輪推理匹配節點可能堵塞,需人工清理或重新觸發診斷"
# =========================================================================
# 備份還原告警 (awoooi_backup_restore) — 從主機補回 2026-04-12
# =========================================================================
- name: awoooi_backup_restore
  # NOTE(review): this group is evaluated hourly, so the short `for:` values
  # below cannot be confirmed before the next evaluation — verify that the
  # resulting delay is intended.
  interval: 1h
  rules:
  # Restore-test success gauge reads 0.
  - alert: BackupRestoreTestFailed
    expr: awoooi_backup_restore_test_success == 0
    for: 5m
    labels:
      severity: critical
      alert_category: infrastructure
      notification_type: TYPE-3
      auto_repair: "false"
    annotations:
      summary: "備份還原 dry-run 測試失敗"
      description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。"
      runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"
  # Last restore test is older than 691200s (8 days).
  - alert: BackupRestoreTestStale
    expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
    for: 10m
    labels:
      severity: warning
      alert_category: infrastructure
      notification_type: TYPE-3
      auto_repair: "false"
    annotations:
      summary: "備份還原測試超過 8 天未執行"
      description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
      runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
# =========================================================================
# 基礎設施詳細告警 (awoooi_infrastructure_detailed) — 從主機補回 2026-04-12
# =========================================================================
- name: awoooi_infrastructure_detailed
  interval: 60s
  rules:
  # A container on 192.168.0.188 was last seen more than 120s ago, or the
  # per-container count of running-state series is 0.
  # NOTE(review): count() over an empty selection returns no series rather
  # than 0, so the first clause looks unable to match — confirm and consider
  # removing it.
  - alert: DockerContainerUnhealthyDetailed
    expr: |
      count by (name, instance) (
        container_tasks_state{state="running", instance=~"192.168.0.188.*"}
      ) == 0
      or
      container_last_seen{instance=~"192.168.0.188.*", name!=""} < (time() - 120)
    for: 5m
    labels:
      severity: warning
      layer: docker-188
      alert_category: infrastructure
      notification_type: TYPE-3
      auto_repair: "false"
    annotations:
      summary: "188 主機容器 {{ $labels.name }} 異常"
      description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動或不在 running 狀態。"
      runbook: "SSH 到 192.168.0.188docker inspect {{ $labels.name }} 確認健康狀態"
  # A Redis stream holds more than 500 entries for ten minutes.
  - alert: RedisStreamBacklogHigh
    expr: awoooi_redis_stream_len > 500
    for: 10m
    labels:
      severity: warning
      layer: docker-188
      alert_category: infrastructure
      notification_type: TYPE-3
      auto_repair: "false"
    annotations:
      summary: "Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆"
      description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。"
      runbook: "檢查 consumer group lagXINFO GROUPS <stream-key>"
  # Used bytes on "/" of 192.168.0.188:9100 grew by more than 524288000
  # (500 MiB) compared with one hour earlier.
  - alert: PostgreSQLDiskGrowthRate
    expr: |
      (
        node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"}
        - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"}
      )
      - (
        node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
        - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
      )
      > 524288000
    for: 5m
    labels:
      severity: warning
      layer: docker-188
      alert_category: infrastructure
      notification_type: TYPE-3
      auto_repair: "false"
    annotations:
      summary: "188 主機磁碟 1 小時增長超過 500MB"
      description: "磁碟在過去 1 小時增長 {{ $value | humanize1024 }}B可能是 PostgreSQL WAL 或日誌暴增。"
      runbook: "SSH 188df -h / && du -sh /var/lib/postgresql/*/pg_wal"
# =========================================================================
# 主機連通性告警 (awoooi_host_connectivity) — 從主機補回 2026-04-12
# =========================================================================
- name: awoooi_host_connectivity
  interval: 60s
  rules:
  # Probe from the host-connectivity job fails for five minutes.
  - alert: HostNetworkPartition
    expr: probe_success{job="host-connectivity"} == 0
    for: 5m
    labels:
      severity: critical
      layer: systemd-188
      alert_category: infrastructure
      notification_type: TYPE-3
      auto_repair: "false"
    annotations:
      summary: "主機 {{ $labels.instance }} 無法連通"
      description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。"
      runbook: "SSH 檢查路由和防火牆規則"