# ops/monitoring/alerts-unified.yml
# AWOOOI unified Prometheus alerting rules
# 2026-04-05 Claude Code: consolidated all rules and added the unified `layer` label
# 2026-04-08 Claude Sonnet 4.6: added the database_detail_alerts group (6 detailed rules)
# 2026-04-12 Claude Sonnet 4.6: restored 4 groups that only existed on the host (backup/flywheel/connectivity/infra-detailed)
# Deploy target: 192.168.0.110:/home/wooo/monitoring/alerts.yml
# Deploy method: scripts/ops/deploy-alerts.sh (automatic CD deployment)
#
# Label conventions (values listed are the ones actually used in this file):
#   layer: k8s | docker | docker-110 | docker-188 | systemd-110 | systemd-188 | external
#   component: service name
#   team: ops | backend | ai | platform | security | finops | aiops
#   host: "110" | "188" | "120" | "121" | "112"
#   auto_repair: "true" | "false"
#
# Templating note: annotation templates only see the labels of the series returned
# by `expr` ($labels), not the static labels declared under `labels:`. When an
# expression aggregates labels away (or uses absent()), those labels render empty.

groups:
  # =========================================================================
  # Host-level alerts (host_alerts)
  # =========================================================================
  - name: host_alerts
    rules:
      - alert: HostDown
        expr: up{job=~"node-exporter.*"} == 0
        for: 1m
        labels:
          severity: critical
          layer: systemd-188
          team: ops
          auto_repair: "false"
        annotations:
          summary: "主機 {{ $labels.host }} 不可達"
          description: "Node Exporter 無回應超過 1 分鐘"

      - alert: HostHighCpuLoad
        # 2026-05-05 ogt + Codex: keep this as early warning only.
        # Sustained overload/root-cause automation is handled by HostLoadAverageSustainedHigh.
        expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
        for: 10m
        labels:
          severity: warning
          layer: systemd-188
          team: ops
          auto_repair: "true"
          # MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP routing labels
          mcp_provider: "ssh_host"
          host_type: "bare_metal"
          alert_category: "host_resource"
        annotations:
          summary: "主機 {{ $labels.host }} CPU 高負載"
          description: "CPU 使用率超過 90% 持續 10 分鐘;若 load5/core 未超過 1.5,先視為容量觀察與診斷,不直接修復。"
          # 2026-05-02 ogt + Claude Sonnet 4.6: steer the LLM toward SSH diagnostics instead of kubectl.
          # The expr aggregates with `avg by(host)`, so `host` is the only label left on the
          # result; the SSH target is therefore built from `host`, as in HostLoadAverageSustainedHigh.
          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷;禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)"
          runbook: "host CPU 高負載排查:先 SSH ps aux 看 top 進程;若為第三方服務(Sentry/ClickHouse 等)寫 ADR 升級資源或調 limit,禁止 kubectl restart 跨 domain"

      - alert: HostLoadAverageSustainedHigh
        # 2026-05-05 ogt + Codex: sustained-overload baseline for hosts 110/188.
        # Why: CPU% only measures busy time and cannot fully express the runnable queue
        # caused by ClickHouse merges, Kafka backfill, or Chrome/Ollama.
        # The denominator counts distinct `cpu` label values per host, i.e. the core count.
        expr: node_load5{host=~"110|188"} / on(host) count by(host) (count by(host,cpu) (node_cpu_seconds_total{mode="idle",host=~"110|188"})) > 1.5
        for: 15m
        labels:
          severity: critical
          layer: systemd-188
          team: ops
          auto_repair: "true"
          mcp_provider: "ssh_host"
          host_type: "bare_metal"
          alert_category: "host_resource"
        annotations:
          summary: "主機 {{ $labels.host }} load5/core 長時間過高"
          description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。"
          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'"
          runbook: "先判斷高 load 來源:ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter;只允許 read-only 診斷,自動修復需走服務專屬 playbook。"

      - alert: HostOutOfMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
          layer: systemd-188
          team: ops
          auto_repair: "true"
          mcp_provider: "ssh_host"
          host_type: "bare_metal"
          alert_category: "host_resource"
        annotations:
          summary: "主機 {{ $labels.host }} 記憶體不足"
          description: "記憶體使用率超過 85%"
          # 2026-05-02 ogt + Claude Sonnet 4.6: steer the LLM toward SSH diagnostics
          auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%mem | head -20' (host 記憶體診斷;禁 kubectl restart — 主因常為第三方服務)"
          runbook: "host 記憶體不足排查:SSH 看 top 進程;若為第三方服務需擴容或調 limit"

      - alert: HostOutOfDiskSpace
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85
        for: 5m
        labels:
          severity: warning
          layer: systemd-188
          team: ops
          auto_repair: "true"
          mcp_provider: "ssh_host"
          host_type: "bare_metal"
          alert_category: "host_resource"
        annotations:
          summary: "主機 {{ $labels.host }} 磁碟空間不足"
          description: "磁碟使用率超過 85%"
          auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"

  # =========================================================================
  # K8s cluster alerts (kubernetes_alerts)
  # =========================================================================
  - name: kubernetes_alerts
    rules:
      - alert: K3sNodeNotReady
        expr: kube_node_status_condition{condition="Ready", status="true"} == 0
        for: 2m
        labels:
          severity: critical
          layer: k8s
          team: ops
          auto_repair: "false"
        annotations:
          summary: "K3s 節點 {{ $labels.node }} 未就緒"
          description: "節點超過 2 分鐘未達到 Ready 狀態"

      - alert: KubePodCrashLooping
        expr: rate(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m]) > 0
        for: 5m
        labels:
          severity: warning
          layer: k8s
          team: ops
          auto_repair: "true"
        annotations:
          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 持續重啟"
          description: "Pod 在過去 15 分鐘內重啟次數異常"

      - alert: KubePodNotReady
        expr: kube_pod_status_ready{condition="true",namespace="awoooi-prod"} == 0
        for: 5m
        labels:
          severity: warning
          layer: k8s
          team: ops
          auto_repair: "true"
        annotations:
          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 未就緒"
          description: "Running 中的 Pod 超過 5 分鐘未達到 Ready 狀態"

      - alert: KubeDeploymentReplicasMismatch
        expr: kube_deployment_spec_replicas{namespace="awoooi-prod"} != kube_deployment_status_replicas_available{namespace="awoooi-prod"}
        for: 10m
        labels:
          severity: warning
          layer: k8s
          team: ops
          auto_repair: "true"
        annotations:
          summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} 副本數不匹配"
          description: "期望副本數與可用副本數不一致超過 10 分鐘"

      - alert: VeleroBackupFailed
        expr: increase(velero_backup_failure_total[24h]) > 0
        for: 5m
        labels:
          severity: warning
          layer: k8s
          team: ops
          component: velero
          auto_repair: "false"
        annotations:
          summary: "Velero 備份失敗"
          description: "過去 24 小時有備份失敗"

      - alert: VeleroBackupNotRun
        # 86400 s = 24 h since the last successful backup timestamp.
        expr: time() - velero_backup_last_successful_timestamp > 86400
        for: 10m
        labels:
          severity: critical
          layer: k8s
          team: ops
          component: velero
          auto_repair: "false"
        annotations:
          summary: "Velero 超過 24 小時未成功備份"
          description: "最後一次成功備份超過 24 小時"

      # Sprint C-2 host rsync backup alert (2026-04-11 Claude Sonnet 4.6)
      # backup-from-110.sh writes /var/run/backup-110.last_success on success;
      # the node-exporter textfile collector reads that file and exposes the metric.
      - alert: HostBackupFailed
        # 2026-05-02 ogt + Claude Sonnet 4.6: removed the backup_110 collector label condition.
        # Root cause: node_textfile_scrape_error no longer carries a collector field, so the
        # old condition always evaluated as absent and raised false alerts.
        # Fix: decide on whether backup_110_last_success_timestamp is missing or too old.
        # 90000 s = 25 h.
        expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000)
        for: 10m
        labels:
          severity: warning
          layer: docker-188
          team: ops
          component: backup
          host: "188"
          auto_repair: "true"
          alert_category: host_resource
        annotations:
          summary: "188 Host 備份超過 25 小時未成功"
          description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊"
          # The absent() branch produces a series with no labels, so a templated
          # `instance` would render empty; this rule is pinned to host 188, so the
          # SSH target is written out explicitly.
          auto_repair_action: "ssh 192.168.0.188 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"

      # ADR-075: CoreDNS resolution failures (2026-04-12 ogt)
      - alert: CoreDNSResolutionFailed
        # Ratio of SERVFAIL responses to all responses, so that the 0.05 threshold,
        # the "5%" in the description and humanizePercentage in the summary all
        # describe the same quantity. With no traffic the division yields NaN,
        # which does not satisfy `> 0.05`, so the alert stays quiet.
        expr: >-
          sum(rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]))
          /
          sum(rate(coredns_dns_responses_total[5m]))
          > 0.05
        for: 5m
        labels:
          severity: critical
          layer: k8s
          team: ops
          auto_repair: "true"
          alert_category: kubernetes
          notification_type: TYPE-3
        annotations:
          summary: "CoreDNS SERVFAIL 率過高 {{ $value | humanizePercentage }}"
          description: "CoreDNS 在 5 分鐘內 SERVFAIL 回應率超過 5%,K8s 服務間 DNS 解析可能失敗"
          runbook: "kubectl -n kube-system get pods -l k8s-app=kube-dns && kubectl -n kube-system logs -l k8s-app=kube-dns"

  # =========================================================================
  # Database alerts (database_alerts)
  # =========================================================================
  - name: database_alerts
    rules:
      - alert: PostgreSQLDown
        expr: up{job="postgres-exporter"} == 0 or pg_up == 0
        for: 1m
        labels:
          severity: critical
          layer: systemd-188
          component: postgres
          host: "188"
          team: ops
          auto_repair: "false"
        annotations:
          summary: "PostgreSQL 資料庫離線"
          description: "PostgreSQL Exporter 無法連接資料庫超過 1 分鐘"

      - alert: RedisDown
        expr: up{job="redis-exporter"} == 0 or redis_up == 0
        for: 1m
        labels:
          severity: critical
          layer: systemd-188
          component: redis
          host: "188"
          team: ops
          auto_repair: "false"
        annotations:
          summary: "Redis 快取服務離線"
          description: "Redis Exporter 無法連接 Redis 超過 1 分鐘"

      - alert: PostgreSQLHighConnections
        expr: pg_stat_activity_count > 80
        for: 5m
        labels:
          severity: warning
          layer: systemd-188
          component: postgres
          team: ops
          auto_repair: "false"
        annotations:
          summary: "PostgreSQL 連接數過高"
          description: "當前連接數 {{ $value }} 超過 80"

      - alert: RedisMemoryHigh
        # The `redis_memory_max_bytes > 0` guard skips instances without a configured
        # maxmemory, avoiding a division by zero.
        expr: redis_memory_max_bytes > 0 and redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
        for: 5m
        labels:
          severity: warning
          layer: systemd-188
          component: redis
          team: ops
          auto_repair: "false"
        annotations:
          summary: "Redis 記憶體使用過高"
          description: "Redis 記憶體使用率超過 80%"

  # =========================================================================
  # Sprint 5.2 Plan B: detailed database metric alerts (database_detail_alerts)
  # Prerequisite: postgres-exporter:9187 + redis-exporter:9121 on 192.168.0.188
  # 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
  # =========================================================================
  - name: database_detail_alerts
    rules:
      # ---- PostgreSQL detailed metrics ----
      - alert: PostgreSQLSlowQueries
        expr: pg_stat_activity_max_tx_duration{datname="awoooi_prod"} > 60
        for: 5m
        labels:
          severity: warning
          layer: systemd-188
          component: postgres
          host: "188"
          team: ops
          auto_repair: "false"
        annotations:
          summary: "PostgreSQL 有慢查詢 (>60s)"
          description: "awoooi_prod 資料庫最長事務超過 60 秒"

      - alert: PostgreSQLDeadlocks
        expr: increase(pg_stat_database_deadlocks{datname="awoooi_prod"}[5m]) > 0
        for: 1m
        labels:
          severity: warning
          layer: systemd-188
          component: postgres
          host: "188"
          team: ops
          auto_repair: "false"
        annotations:
          summary: "PostgreSQL 死鎖發生"
          description: "過去 5 分鐘 awoooi_prod 資料庫有死鎖"

      - alert: PostgreSQLTooManyConnections
        expr: pg_stat_activity_count{datname="awoooi_prod"} > 50
        for: 5m
        labels:
          severity: warning
          layer: systemd-188
          component: postgres
          host: "188"
          team: ops
          auto_repair: "false"
        annotations:
          summary: "PostgreSQL 連接數過高 ({{ $value }})"
          description: "awoooi_prod 連接數超過 50"

      # ---- Redis detailed metrics ----
      - alert: RedisKeyEviction
        expr: increase(redis_evicted_keys_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
          layer: systemd-188
          component: redis
          host: "188"
          team: ops
          auto_repair: "false"
        annotations:
          summary: "Redis 發生 Key 驅逐"
          description: "過去 5 分鐘有 Key 被驅逐,可能記憶體不足"

      - alert: RedisConnectionsHigh
        expr: redis_connected_clients > 100
        for: 5m
        labels:
          severity: warning
          layer: systemd-188
          component: redis
          host: "188"
          team: ops
          auto_repair: "false"
        annotations:
          summary: "Redis 連接數過高 ({{ $value }})"
          description: "Redis 連接數超過 100"

      - alert: RedisCommandLatencyHigh
        # Average command latency over the last 5 minutes: time spent in commands
        # divided by commands processed, both as rates so the value reflects current
        # behaviour rather than the lifetime average of two ever-growing counters.
        # Both sides are summed per instance so their label sets line up for the
        # division. NOTE(review): redis_exporter is believed to export the duration
        # counter per command (`cmd` label) while the processed counter has no such
        # label, which would have prevented the un-aggregated division from ever
        # matching - confirm against the deployed exporter's /metrics output.
        expr: >-
          sum by (instance) (rate(redis_commands_duration_seconds_total[5m]))
          /
          sum by (instance) (rate(redis_commands_processed_total[5m]))
          > 0.01
        for: 5m
        labels:
          severity: warning
          layer: systemd-188
          component: redis
          host: "188"
          team: ops
          auto_repair: "false"
        annotations:
          summary: "Redis 命令平均延遲過高 (>10ms)"
          description: "Redis 命令平均延遲超過 10ms"

  # =========================================================================
  # Service availability alerts (service_alerts) - the layer label decides the repair path
  # =========================================================================
  - name: service_alerts
    rules:
      # ---- 188 Docker layer ----
      - alert: OpenClawDown
        # 2026-04-05 Claude Code: renamed the legacy ClawBotDown alert to OpenClawDown
        expr: up{job="clawbot"} == 0
        for: 2m
        labels:
          severity: critical
          layer: docker-188
          component: openclaw
          host: "188"
          team: ops
          auto_repair: "true"
        annotations:
          summary: "OpenClaw 服務離線"
          description: "OpenClaw (192.168.0.188:8088) 已離線超過 2 分鐘"

      - alert: SignOzDown
        expr: probe_success{job="blackbox-http", instance=~".*3301.*"} == 0
        for: 2m
        labels:
          severity: warning
          layer: docker-188
          component: signoz
          host: "188"
          team: ops
          auto_repair: "true"
          mcp_provider: "ssh_host"
          target_host: "192.168.0.188"
          alert_category: "devops_tool"
        annotations:
          summary: "SignOz 服務離線"
          description: "SignOz (192.168.0.188:3301) 已離線超過 2 分鐘"

      # ---- 110 Docker layer ----
      - alert: SentryDown
        expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9000"} == 0
        for: 2m
        labels:
          severity: warning
          layer: docker-110
          component: sentry
          host: "110"
          team: ops
          auto_repair: "true"
          mcp_provider: "ssh_host"
          target_host: "192.168.0.110"
          alert_category: "devops_tool"
        annotations:
          summary: "Sentry 服務離線"
          description: "Sentry (192.168.0.110:9000) 已離線超過 2 分鐘"

      - alert: HarborDown
        expr: probe_success{job="blackbox-http", instance=~".*5000.*"} == 0
        for: 2m
        labels:
          severity: critical
          layer: docker-110
          component: harbor
          host: "110"
          team: ops
          auto_repair: "true"
          mcp_provider: "ssh_host"
          target_host: "192.168.0.110"
          alert_category: "devops_tool"
        annotations:
          summary: "Harbor Registry 離線"
          description: "Harbor (192.168.0.110:5000) 已離線超過 2 分鐘,CD pipeline 將無法拉取映像"

      - alert: GiteaDown
        expr: probe_success{job="blackbox-http", instance="http://192.168.0.110:3001"} == 0
        for: 2m
        labels:
          severity: critical
          layer: docker-110
          component: gitea
          host: "110"
          team: ops
          auto_repair: "true"
          mcp_provider: "ssh_host"
          target_host: "192.168.0.110"
          alert_category: "devops_tool"
        annotations:
          summary: "Gitea Git 服務離線"
          description: "Gitea (192.168.0.110:3001) 已離線超過 2 分鐘,CD pipeline 失效"

      - alert: AlertmanagerDown
        expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9093"} == 0
        for: 2m
        labels:
          severity: critical
          layer: docker-110
          component: alertmanager
          host: "110"
          team: ops
          auto_repair: "true"
        annotations:
          summary: "Alertmanager 離線"
          description: "Alertmanager (192.168.0.110:9093) 已離線,所有告警將靜默"

  # =========================================================================
  # Alert chain monitoring (alert_chain) - prevent a repeat of the 2026-03-26/04-05 incidents
  # =========================================================================
  - name: alert_chain
    rules:
      - alert: AlertChainBroken_Alertmanager
        # Share of non-success webhook requests from Alertmanager over 5 minutes.
        expr: |
          sum(rate(awoooi_webhook_requests_total{source="alertmanager",status!="success"}[5m]))
          /
          sum(rate(awoooi_webhook_requests_total{source="alertmanager"}[5m]))
          > 0.1
        for: 10m
        labels:
          severity: critical
          layer: k8s
          team: platform
          auto_repair: "false"
        annotations:
          summary: "Alertmanager Webhook 錯誤率 > 10%"
          description: "告警鏈路可能斷裂,請執行 E2E 驗證"

      - alert: AlertChainBroken_Sentry
        # Share of non-success webhook requests from Sentry over 5 minutes.
        expr: |
          sum(rate(awoooi_webhook_requests_total{source="sentry",status!="success"}[5m]))
          /
          sum(rate(awoooi_webhook_requests_total{source="sentry"}[5m]))
          > 0.1
        for: 10m
        labels:
          severity: warning
          layer: k8s
          team: platform
          auto_repair: "false"
        annotations:
          summary: "Sentry Webhook 錯誤率 > 10%"
          description: "Sentry 錯誤可能無法正確處理"

      - alert: NoAlertsReceived2Hours
        # 7200 s = 2 h since the last successful Alertmanager delivery.
        expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
        for: 5m
        labels:
          severity: warning
          layer: k8s
          team: platform
          auto_repair: "false"
        annotations:
          summary: "Alertmanager 主鏈路 2 小時內未收到告警"
          description: "Alertmanager 是固定主鏈路;Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test"

      - alert: SourceProviderIngestionStale
        # 86400 s = 24 h since the last successful event per secondary source.
        expr: |
          time() - max by (source) (
            awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
          ) > 86400
        for: 15m
        labels:
          severity: warning
          layer: k8s
          component: source-ingestion
          team: platform
          auto_repair: "false"
          alert_category: "alertchain_provider_freshness"
        annotations:
          summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
          description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
          runbook: "先查 /api/v1/webhooks/{{ $labels.source }}/health,再查 /api/v1/platform/events/dossier/coverage?provider={{ $labels.source }};若 endpoint OK 但 latest stale,檢查上游 Sentry/SignOz notification channel 或排程 smoke。"

      - alert: AlertChainUnhealthy
        expr: awoooi_alert_chain_healthy == 0
        for: 5m
        labels:
          severity: critical
          layer: k8s
          team: platform
          auto_repair: "false"
        annotations:
          summary: "告警鏈路不健康 ({{ $labels.source }})"
          description: "告警鏈路標記為不健康,最近處理失敗"

  # =========================================================================
  # Auto-repair monitoring (auto_repair)
  # =========================================================================
  - name: auto_repair
    rules:
      - alert: AutoRepairLowSuccessRate
        expr: awoooi_auto_repair_success_rate < 0.3
        for: 30m
        labels:
          severity: warning
          layer: k8s
          team: backend
          auto_repair: "false"
        annotations:
          summary: "自動修復成功率過低 ({{ $value | humanizePercentage }})"
          description: "動作 {{ $labels.action }} 的成功率低於 30%,建議檢查 Playbook"

      - alert: PermanentFixRequired
        expr: sum(rate(awoooi_anomaly_escalation_total{level="PERMANENT_FIX"}[1h])) > 0
        for: 1m
        labels:
          severity: critical
          layer: k8s
          team: backend
          auto_repair: "false"
        annotations:
          summary: "需要永久修復的異常升級"
          description: "有異常升級到 PERMANENT_FIX 級別,需要根本修復"

  # =========================================================================
  # Sprint 5.1: Docker container health monitoring (docker-health-monitor sensing layer)
  # Used when docker-health-monitor.sh sends Alertmanager-format webhooks,
  # or when a custom Prometheus exporter reports the metrics.
  # auto_repair: "true" means the AWOOOI Guardrail is allowed to decide (not a direct repair);
  # the actual repair action is graded by the Service Registry (ADR-062).
  # 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
  # =========================================================================
  - name: docker_health_alerts
    rules:
      - alert: DockerContainerUnhealthy
        expr: container_health_status{job="docker-health-monitor"} == 0
        for: 2m
        labels:
          severity: warning
          layer: docker
          team: ops
          auto_repair: "true"
          mcp_provider: "ssh_host"
          alert_category: "devops_tool"
        annotations:
          summary: "容器 {{ $labels.container }} 健康檢查失敗"
          description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 健康狀態異常,持續 2 分鐘"

      - alert: DockerContainerExited
        expr: container_running_status{job="docker-health-monitor"} == 0
        for: 1m
        labels:
          severity: critical
          layer: docker
          team: ops
          auto_repair: "true"
          mcp_provider: "ssh_host"
          alert_category: "devops_tool"
        annotations:
          summary: "容器 {{ $labels.container }} 已停止"
          description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead,持續 1 分鐘"

      - alert: DockerContainerCpuSustainedHigh
        # 2026-05-05 ogt + Codex: sustained-overload baseline for Docker Compose services.
        # Baseline: a single container above 2 cores for 10m is a warning; meant to catch
        # cadvisor, ClickHouse, momo-scheduler and Ollama-runner style problems early.
        expr: docker_container_cpu_cores > 2
        for: 10m
        labels:
          severity: warning
          layer: docker
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
        annotations:
          summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core"
          description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘,需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。"
          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
          runbook: "自動階段只做診斷;若是已知服務,交給 service-specific playbook:ClickHouse 降 merge/查 backlog、scheduler 限 concurrency、litellm/blackbox 查 liveliness。"

      - alert: DockerContainerCpuRunawayCritical
        expr: docker_container_cpu_cores > 4
        for: 15m
        labels:
          severity: critical
          layer: docker
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "true"
        annotations:
          summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core"
          description: "{{ $labels.container_name }} 已持續吃超過 4 core,會拖垮 110/188 主機;需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。"
          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== CONTAINER ===\"; docker stats --no-stream {{ $labels.container_name }}; echo \"=== TOP ===\"; docker top {{ $labels.container_name }} -eo pid,ppid,stat,pcpu,pmem,comm,args | head -30'"
          runbook: "禁止通用 docker restart;先抓根因,只有 health down 或 crash loop 才可走重啟。"

      - alert: DockerContainerMemoryLimitPressure
        # 2026-05-05 ogt + Codex: guard against carelessly set memory limits choking
        # Kafka/Gitea/Taskworker style services.
        # The `limit > 0` guard skips containers without a memory limit.
        expr: docker_container_memory_limit_bytes > 0 and docker_container_memory_usage_bytes / docker_container_memory_limit_bytes > 0.85
        for: 10m
        labels:
          severity: warning
          layer: docker
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
        annotations:
          summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%"
          description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker,需先判斷 workload,不可直接降 limit。"
          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
          runbook: "若服務已接近 limit:優先調整 retention/concurrency/cache,再評估提高 memory;禁止用更低 memory limit 當止血。"

      - alert: DockerContainerRestartSpike
        # 2026-05-05 ogt + Codex: cAdvisor v0.47 has no restart metric, so this consumes the
        # node-exporter textfile metric docker_container_restart_count.
        expr: increase(docker_container_restart_count[15m]) > 5 or increase(docker_container_inspect_restart_count[15m]) > 5
        for: 3m
        labels:
          severity: critical
          layer: docker
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "true"
        annotations:
          summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次"
          description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增,避免再次出現 litellm 24,464 次靜默崩潰。"
          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
          runbook: "先抓 crash signature;若是 config/DB/網路問題,修設定,不用無限 restart。"

      - alert: DockerContainerMissingResourceLimit
        # 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
        expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0)
        for: 30m
        labels:
          severity: warning
          layer: docker
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
        annotations:
          summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit"
          description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail,長時間尖峰可能拖垮 110/188。"
          # The `docker inspect --format` placeholders are themselves Go-template syntax.
          # Prometheus expands annotations with its own template engine first, so the
          # docker placeholders are wrapped in raw-string actions ({{`...`}}) to be
          # emitted literally instead of being evaluated (and failing) at alert time.
          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker inspect {{ $labels.container_name }} --format \"NanoCpus={{`{{.HostConfig.NanoCpus}}`}} Memory={{`{{.HostConfig.Memory}}`}}\"; docker stats --no-stream {{ $labels.container_name }}'"
          runbook: "先盤點 workload,再在 compose/service-specific playbook 補 cpus/mem_limit。不可盲目降 ClickHouse/Kafka;momo app/scheduler 可用 2 core/2GiB 起步。"

      - alert: DockerGiteaActionsJobStale
        # 2026-05-05 ogt + Codex: Gitea/act job containers can outlive workflow timeout and keep 110 hot.
        # 1200 s = 20 min since the container started.
        expr: time() - docker_container_started_seconds{container_name=~"GITEA-ACTIONS-.*"} > 1200
        for: 5m
        labels:
          severity: warning
          layer: docker-110
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
        annotations:
          summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
          description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'"
          runbook: "先執行 dry-run;清理腳本會依 workflow/job 名稱套停止門檻。若 logs 空白且超過該 job timeout buffer,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"

      - alert: SystemdRunnerRestartSpike
        # 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage.
        expr: increase(systemd_unit_restarts_total{unit=~"actions\\.runner\\..*"}[15m]) > 2
        for: 3m
        labels:
          severity: critical
          layer: systemd-110
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "true"
        annotations:
          summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次"
          description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增;110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。"
          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p NRestarts -p DropInPaths -p CPUQuotaPerSecUSec -p MemoryMax; journalctl -u {{ $labels.unit }} --since \"20 minutes ago\" --no-pager | tail -120'"
          runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設,需人工或 sudo playbook 執行 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"

      - alert: SystemdRunnerWatchdogEnabled
        expr: systemd_unit_watchdog_seconds{unit=~"actions\\.runner\\..*"} > 0
        for: 10m
        labels:
          severity: warning
          layer: systemd-110
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
        annotations:
          summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec"
          description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。"
          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p DropInPaths -p NRestarts'"
          runbook: "確認 drop-in 來源;需要 sudo 時由人工套用 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"

      - alert: SystemdRunnerMissingResourceQuota
        expr: systemd_unit_cpu_quota_cores{unit=~"actions\\.runner\\..*"} == 0 or systemd_unit_memory_max_bytes{unit=~"actions\\.runner\\..*"} == 0
        for: 30m
        labels:
          severity: warning
          layer: systemd-110
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
        annotations:
          summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota"
          description: "{{ $labels.unit }} 仍為 unlimited;CI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。"
          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState'"
          runbook: "建議 baseline:每個 runner CPUQuota=200%、MemoryMax=2G;由 /home/wooo/scripts/apply-runner-systemd-guardrails.sh 套用,若仍過載再限制並行度或分流。"

  # =========================================================================
  # MinIO / Kali alerts
  # =========================================================================
  - name: minio_kali_alerts
    rules:
      - alert: MinIODown
        expr: probe_success{job="blackbox-http", instance=~".*9001.*"} == 0
        for: 2m
        labels:
          severity: warning
          layer: docker-188
          component: minio
          host: "188"
          team: ops
          auto_repair: "true"
        annotations:
          summary: "MinIO (Velero 備份) 離線"
          description: "MinIO (192.168.0.188:9001) 已離線超過 2 分鐘,Velero 備份可能失敗"

      - alert: KaliScannerDown
        expr: probe_success{job="blackbox-http", instance=~".*192.168.0.112.*"} == 0
        for: 5m
        labels:
          severity: info
          layer: docker-188
          component: kali
          host: "112"
          team: ops
          auto_repair: "false"
        annotations:
          summary: "Kali Scanner 離線"
          description: "Kali (192.168.0.112:8080) 離線,安全掃描功能暫停"

  # =========================================================================
  # Plan C - external website monitoring (Sprint 5.2, 2026-04-09 Claude Sonnet 4.6 Asia/Taipei)
  # blackbox-http already covers the 4 external sites; this group provides structured alerts.
  # auto_repair: "true" - decided by the AWOOOI Guardrail (Service Registry grading)
  # =========================================================================
  - name: external_website_alerts
    rules:
      - alert: MoWoooWorkDown
        expr: probe_success{job="blackbox-http", instance="https://mo.wooo.work"} == 0
        for: 3m
        labels:
          severity: critical
          layer: external
          component: momo-app
          host: "188"
          team: ops
          auto_repair: "true"
        annotations:
          summary: "外部網站 mo.wooo.work 離線"
          description: "mo.wooo.work 探測失敗超過 3 分鐘,容器 momo-app (188) 可能需要重啟"

      - alert: TsenyangWebsiteDown
        expr: probe_success{job="blackbox-http", instance="https://www.tsenyang.com"} == 0
        for: 3m
        labels:
          severity: critical
          layer: external
          component: tsenyang-website
          host: "188"
          team: ops
          auto_repair: "true"
        annotations:
          summary: "外部網站 tsenyang.com 離線"
          description: "tsenyang.com 探測失敗超過 3 分鐘,容器 tsenyang-website (188) 可能需要重啟"

      - alert: StockWoooWorkDown
        expr: probe_success{job="blackbox-http", instance="http://stock.wooo.work"} == 0
        for: 3m
        labels:
          severity: critical
          layer: external
          component: stock-platform
          host: "110"
          team: ops
          auto_repair: "true"
        annotations:
          summary: "外部網站 stock.wooo.work 離線"
          description: "stock.wooo.work 探測失敗超過 3 分鐘,容器 stock-platform (110) 可能需要重啟"

      - alert: BitanWoooWorkDown
        expr: probe_success{job="blackbox-http", instance="https://bitan.wooo.work"} == 0
        for: 3m
        labels:
          severity: critical
          layer: external
          component: bitan-app
          host: "188"
          team: ops
          auto_repair: "true"
        annotations:
          summary: "外部網站 bitan.wooo.work 離線"
          description: "bitan.wooo.work 探測失敗超過 3 分鐘,容器 bitan-app (188) 可能需要重啟"

      - alert: ExternalSiteSSLExpiringSoon
        # Fires when the earliest certificate in the chain expires within 14 days.
        expr: probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time() < 14 * 24 * 3600
        for: 1h
        labels:
          severity: warning
          layer: external
          component: ssl
          team: ops
          auto_repair: "false"
        annotations:
          summary: "SSL 憑證即將到期: {{ $labels.instance }}"
          description: "{{ $labels.instance }} SSL 憑證將在 14 天內到期,請手動更新"

  # =============================================================================
  # Rule groups added by ADR-075 (2026-04-12 ogt)
  # =============================================================================
  - name: awoooi_secops_alerts
    interval: 60s
    rules:
      - alert: UnauthorizedSSHLogin
        expr: increase(node_failed_auth_attempts_total[5m]) > 10
        for: 1m
        labels:
          severity: critical
          layer: systemd-188
          team: security
          auto_repair: "false"
          alert_category: secops
        annotations:
          summary: "異常 SSH 登入嘗試: {{ $labels.instance }}"
          description: "5 分鐘內失敗登入 {{ $value }} 次,可能遭受暴力破解"

  - name: awoooi_business_alerts
    interval: 60s
    rules:
      - alert: AITokenCostSpike
        expr: increase(awoooi_ai_token_cost_usd_total[1h]) > 10
        for: 5m
        labels:
          severity: warning
          layer: k8s
          team: finops
          auto_repair: "false"
          alert_category: business
        annotations:
          summary: "AI Token 費用 1 小時內暴增 ${{ $value | humanize }}"
          description: "AI API 調用費用異常,請檢查是否有迴圈或濫用"

      - alert: GeminiAPIErrorRateHigh
        expr: rate(awoooi_ai_request_errors_total{provider="gemini"}[5m]) / rate(awoooi_ai_requests_total{provider="gemini"}[5m]) > 0.2
        for: 10m
        labels:
          severity: warning
          layer: k8s
          team: finops
          auto_repair: "false"
          alert_category: business
        annotations:
          summary: "Gemini API 錯誤率過高: {{ $value | humanizePercentage }}"
          description: "Gemini API 5 分鐘錯誤率超過 20%,AI 降級可能失效"

      # ADR-075: business scraper health (2026-04-12 ogt)
      - alert: MomoScraperSuccessLow
        # Both sides are aggregated with sum(). Without aggregation the division
        # matches series one-to-one on every label, including `status`, so the
        # success series would only ever be divided by itself (ratio 1) and the
        # `< 0.9` condition could never hold. Same shape as the AlertChainBroken_* rules.
        expr: |
          sum(rate(momo_scraper_requests_total{status="success"}[5m]))
          /
          sum(rate(momo_scraper_requests_total[5m]))
          < 0.9
        for: 10m
        labels:
          severity: warning
          layer: docker-110
          auto_repair: "false"
          alert_category: business
          notification_type: TYPE-3
        annotations:
          summary: "Momo 抓取成功率跌至 {{ $value | humanizePercentage }}"
          description: "Momo 爬蟲成功率低於 90%,業務資料可能缺失"

  - name: awoooi_flywheel_meta_alerts
    interval: 60s
    rules:
      - alert: FlywheelPlaybookZero
        expr: awoooi_flywheel_playbook_count == 0
        for: 1h
        labels:
          severity: critical
          layer: k8s
          team: aiops
          auto_repair: "false"
          alert_category: flywheel_health
          notification_type: TYPE-8M
        annotations:
          summary: "飛輪 Playbook 數量為零,AI 修復完全依賴 LLM"
          description: "Redis 中無任何已批准 Playbook,自動修復能力大幅降低"
          runbook: "執行 scripts/cold_start_playbooks.py 冷啟動"

      # 2026-05-03 ogt + Claude Opus 4.7 (APAC) - anti-silencing paired alert.
      # A NaN sentinel never satisfies `< 0.1`; FlywheelExecutionRateMissing below
      # covers the "no data" case as a separate alert.
      - alert: FlywheelExecutionSuccessLow
        expr: awoooi_flywheel_execution_success_rate < 0.1
        for: 2h
        labels:
          severity: warning
          layer: k8s
          team: aiops
          auto_repair: "false"
          alert_category: flywheel_health
          notification_type: TYPE-8M
        annotations:
          summary: "飛輪執行成功率 {{ $value | humanizePercentage }} 低於 10%"
          description: "連續 2 小時執行成功率不足 10%,Playbook 可能已過時"
          runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態"

      - alert: FlywheelExecutionRateMissing
        # `x != x` is true only for NaN samples, so the second operand selects NaN values.
        expr: absent(awoooi_flywheel_execution_success_rate) or (awoooi_flywheel_execution_success_rate != awoooi_flywheel_execution_success_rate)
        for: 30m
        labels:
          severity: warning
          layer: k8s
          team: aiops
          auto_repair: "false"
          alert_category: flywheel_health
          notification_type: TYPE-8M
        annotations:
          summary: "飛輪執行率指標 30 分鐘無資料"
          description: "execution_success_rate 連續 30 分鐘為 NaN 或不存在,Redis playbook 統計斷流(資料管線壞 / Redis flush / FlywheelStatsService 異常)"
          runbook: "1) 檢查 Redis playbook:* keys 2) 檢查 FlywheelStatsService 日誌 3) curl /metrics 直接拉看 NaN 來源"

      - alert: FlywheelKMVectorizationLow
        expr: awoooi_flywheel_km_unvectorized_count > 10
        for: 30m
        labels:
          severity: warning
          layer: k8s
          team: aiops
          auto_repair: "false"
          alert_category: flywheel_health
          notification_type: TYPE-8M
        annotations:
          summary: "{{ $value }} 筆 KM 未向量化,RAG 查詢命中率下降"
          description: "knowledge_entries 中 embedding IS NULL 超過 10 筆且持續 30 分鐘"
          runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態"

      - alert: FlywheelAlertnameNullHigh
        expr: awoooi_flywheel_alertname_null_rate > 0.05
        for: 30m
        labels:
          severity: warning
          layer: k8s
          team: aiops
          auto_repair: "false"
          alert_category: flywheel_health
          notification_type: TYPE-8M
        annotations:
          summary: "飛輪 alertname NULL 率超過 5%"
          description: "alertname NULL 率 {{ $value | humanizePercentage }},影響路由準確性。"
          runbook: "執行 scripts/backfill_alertname.py 回填"

      - alert: FlywheelIncidentsStuck
        expr: awoooi_flywheel_incidents_stuck > 5
        for: 10m
        labels:
          severity: warning
          layer: k8s
          team: aiops
          auto_repair: "false"
          alert_category: flywheel_health
          notification_type: TYPE-8M
        annotations:
          summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24h"
          description: "飛輪推理匹配節點可能堵塞,需人工清理或重新觸發診斷"

  # =========================================================================
  # Backup restore alerts (awoooi_backup_restore) - restored from the host 2026-04-12
  # =========================================================================
  - name: awoooi_backup_restore
    interval: 1h
    rules:
      - alert: BackupRestoreTestFailed
        expr: awoooi_backup_restore_test_success == 0
        for: 5m
        labels:
          severity: critical
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "備份還原 dry-run 測試失敗"
          description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。"
          runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"

      - alert: BackupRestoreTestStale
        # 691200 s = 8 days since the last restore test.
        expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
        for: 10m
        labels:
          severity: warning
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "備份還原測試超過 8 天未執行"
          description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
          runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"

  # =========================================================================
  # Detailed infrastructure alerts (awoooi_infrastructure_detailed) - restored from the host 2026-04-12
  # =========================================================================
  - name: awoooi_infrastructure_detailed
    interval: 60s
    rules:
      - alert: DockerContainerUnhealthyDetailed
        # NOTE(review): `count by (...) (...) == 0` cannot match - count() over an
        # empty selection returns no series rather than 0 - so in practice only the
        # container_last_seen branch (not seen for more than 120 s) can fire.
        # Confirm the intended "not running" condition before changing it.
        expr: |
          count by (name, instance) (
            container_tasks_state{state="running", instance=~"192.168.0.188.*"}
          ) == 0
          or
          container_last_seen{instance=~"192.168.0.188.*", name!=""} < (time() - 120)
        for: 5m
        labels:
          severity: warning
          layer: docker-188
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "188 主機容器 {{ $labels.name }} 異常"
          description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動或不在 running 狀態。"
          runbook: "SSH 到 192.168.0.188:docker inspect {{ $labels.name }} 確認健康狀態"

      - alert: RedisStreamBacklogHigh
        expr: awoooi_redis_stream_len > 500
        for: 10m
        labels:
          severity: warning
          layer: docker-188
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆"
          description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。"
          runbook: "檢查 consumer group lag:XINFO GROUPS "

      # 2026-04-19 Hermes E3 decision: PostgreSQLDiskGrowthRate deprecated.
      # Root cause: 500MB/h growth is normal PG WAL behaviour (commits/checkpoints) and should not alert.
      # It fired 7 times in the previous 30 days; every time the AI judged NO_ACTION or
      # wrongly attempted a kubectl rollout restart that failed.
      # Commander decision (2026-04-19 18:xx Taipei): option C - delete the old rule and
      # switch to absolute disk usage.
      # -----------------------------------------------------------------
      - alert: HostDiskUsageHigh
        # Used share of the root filesystem, in percent.
        expr: |
          (
            node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
            - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
          )
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
          * 100 > 80
        for: 10m
        labels:
          severity: warning
          layer: systemd-188
          alert_category: host_resource
          notification_type: TYPE-3
          # 2026-05-02 ogt + Claude Sonnet 4.6: ADR-068 flywheel - disk full SOP.
          # auto_repair: false -> true, routed to ssh_host MCP Group B `ssh_docker_prune`.
          # The tool has a built-in >=75% disk guard and is a no-op below that threshold,
          # which avoids accidental deletion.
          auto_repair: "true"
          mcp_provider: "ssh_host"
          host_type: "bare_metal"
          supersedes: PostgreSQLDiskGrowthRate
        annotations:
          summary: "主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>80%)"
          description: "磁碟使用率持續 10 分鐘超過 80%, 需清理或擴容. 常見原因: PG WAL, 日誌, container images, 舊 build cache."
auto_repair_action: "ssh {{ $labels.instance }} docker prune (image+volume+builder; gated by 75% disk usage)" runbook: "SSH 該主機: df -h / && du -sh /var/lib/postgresql/*/pg_wal /var/log /var/lib/docker" - alert: HostDiskUsageCritical expr: | ( node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"} - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"} ) / node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"} * 100 > 90 for: 5m labels: severity: critical layer: systemd-188 alert_category: infrastructure notification_type: TYPE-3 auto_repair: "true" supersedes: PostgreSQLDiskGrowthRate annotations: summary: "🔴 主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>90%, critical)" description: "磁碟即將滿, 需立即清理. 超過 95% 可能導致服務中斷." runbook: "立即 SSH 該主機: df -h / && du -sh /* 2>/dev/null | sort -h | tail -10" # ========================================================================= # 主機連通性告警 (awoooi_host_connectivity) — 從主機補回 2026-04-12 # ========================================================================= - name: awoooi_host_connectivity interval: 60s rules: - alert: HostNetworkPartition expr: probe_success{job="host-connectivity"} == 0 for: 5m labels: severity: critical layer: systemd-188 alert_category: infrastructure notification_type: TYPE-3 auto_repair: "true" annotations: summary: "主機 {{ $labels.instance }} 無法連通" description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。" runbook: "SSH 檢查路由和防火牆規則" # ========================================================================= # 監控工具自監控 (infra_self_monitoring) — ADR-090 Phase 7 # 2026-04-19 Claude Opus 4.7 / 鐵律:監控工具必須被監控 # 設計:不寫死 CPU% 或 MB 數,改用 (配額佔比) + (throttle 訊號) 動態判斷 # 配額由 docker-compose 宣告,告警條件 = 使用量 / 配額 > 0.8 # 比寫死 80% 更智能 — 配額改告警閾值自動跟著變 # ========================================================================= - name: infra_self_monitoring interval: 1m rules: # --- cadvisor 自監控 --- - alert: CadvisorDown expr: up{job=~".*cadvisor.*"} == 0 for: 5m 
labels: severity: critical layer: docker-110-188 component: cadvisor team: ops alert_category: infrastructure notification_type: TYPE-3 auto_repair: "true" annotations: summary: "cAdvisor ({{ $labels.instance }}) 停擺" description: "主機 {{ $labels.instance }} 的 cadvisor 已停擺 5 分鐘,容器監控中斷。" runbook: "SSH 主機 docker compose up -d cadvisor;檢查 OOMKill 訊號" - alert: CadvisorMemoryPressure expr: container_memory_usage_bytes{name="cadvisor"} / container_spec_memory_limit_bytes{name="cadvisor"} > 0.8 for: 10m labels: severity: warning component: cadvisor team: ops alert_category: infrastructure notification_type: TYPE-1 auto_repair: "false" annotations: summary: "cAdvisor 記憶體使用率 > 80% limit" description: "cadvisor 記憶體用量 / mem_limit = {{ $value | humanizePercentage }},接近 OOMKill。" runbook: "若頻繁觸發 → 檢查 cardinality 是否持續成長,考慮調整 --disable_metrics" - alert: CadvisorCPUThrottled expr: rate(container_cpu_cfs_throttled_seconds_total{name="cadvisor"}[5m]) > 0.5 for: 15m labels: severity: warning component: cadvisor team: ops alert_category: infrastructure notification_type: TYPE-1 auto_repair: "false" annotations: summary: "cAdvisor CPU 被 throttle(配額不足)" description: "cadvisor 每秒被 throttle {{ $value }} 秒,表示實際需求超過 cpus 配額。" runbook: "調高 docker-compose cpus 設定,或檢查 scrape interval / cardinality" # --- node-exporter 自監控 --- - alert: NodeExporterDown expr: up{job=~"node-exporter.*|node_exporter.*"} == 0 for: 5m labels: severity: critical component: node-exporter team: ops alert_category: infrastructure notification_type: TYPE-3 auto_repair: "true" annotations: summary: "node-exporter ({{ $labels.instance }}) 停擺" description: "主機 {{ $labels.instance }} node-exporter 已停擺 5 分鐘,主機 metrics 中斷。" runbook: "SSH 主機檢查 docker ps node-exporter;重啟 docker compose up -d node-exporter" - alert: NodeExporterCPUThrottled expr: rate(container_cpu_cfs_throttled_seconds_total{name="node-exporter"}[5m]) > 0.5 for: 15m labels: severity: warning component: node-exporter team: ops alert_category: infrastructure 
notification_type: TYPE-1 auto_repair: "false" annotations: summary: "node-exporter CPU 被 throttle(配額不足)" description: "node-exporter 每秒被 throttle {{ $value }} 秒。可能 collector 未適度 disable。" runbook: "檢查 node-exporter --collector.* flags 是否該關掉閒置硬體 probe" # --- Sentry self-hosted 自監控(110)--- # 2026-04-25 ogt + Claude Opus 4.7: 修正假告警根因 # 舊規則用 container_memory_usage_bytes(含 page cache),導致 ClickHouse # 執行大查詢時 OS 把 SSTable 緩存進 page cache,比例衝到 88.5% 觸發誤報 # (2026-04-23 23:13 鐵證:usage_bytes=88.5% / working_set=7.8%)。 # 改用 container_memory_working_set_bytes — 這才是 K8s/Docker OOM killer # 實際追蹤的「真實工作集」(RSS + active page cache),不含 inactive page cache。 # 參考: https://github.com/google/cadvisor/blob/master/info/v1/container.go - alert: SentryClickHouseMemoryPressure expr: container_memory_working_set_bytes{name=~".*sentry.*clickhouse.*"} / container_spec_memory_limit_bytes{name=~".*sentry.*clickhouse.*"} > 0.85 for: 10m labels: severity: warning component: sentry-clickhouse team: platform alert_category: infrastructure notification_type: TYPE-1 auto_repair: "false" annotations: summary: "Sentry ClickHouse 工作集記憶體 > 85% limit" description: "sentry clickhouse working_set / mem_limit = {{ $value | humanizePercentage }} (排除 page cache)。" runbook: "檢查 Sentry 查詢壓力;確認非 page cache 假象;必要時調整 /opt/sentry/docker-compose.override.yml clickhouse mem_limit" - alert: SentryClickHouseCPUThrottled expr: rate(container_cpu_cfs_throttled_seconds_total{name=~".*sentry.*clickhouse.*"}[5m]) > 1.0 for: 15m labels: severity: warning component: sentry-clickhouse team: platform alert_category: infrastructure notification_type: TYPE-1 auto_repair: "false" annotations: summary: "Sentry ClickHouse CPU 持續被 throttle" description: "每秒 throttle {{ $value }} 秒,配額 cpus=4.0 可能不足。" runbook: "檢查 Sentry retention / query pattern;必要時調高 override.yml cpus" # --- Gitea 自監控 --- - alert: GiteaMemoryPressure # 2026-04-25 ogt + Claude Sonnet 4.6 — 同 ClickHouse 假警報根因: # container_memory_usage_bytes 含 page cache(OS inactive,OOM 
killer 不管)→ 虛高假警報 # 改用 container_memory_working_set_bytes(RSS + active cache,真實壓力,cadvisor 適用 Docker + K8s) expr: container_memory_working_set_bytes{name="gitea"} / container_spec_memory_limit_bytes{name="gitea"} > 0.85 for: 10m labels: severity: warning component: gitea team: ops alert_category: infrastructure notification_type: TYPE-1 auto_repair: "false" annotations: summary: "Gitea 記憶體工作集 > 85% limit" description: "gitea working_set / mem_limit = {{ $value | humanizePercentage }}(真實記憶體壓力,非 page cache 干擾)。" runbook: "檢查 CI/CD 任務堆積;必要時調高 docker-compose mem_limit" - alert: GiteaCPUThrottled expr: rate(container_cpu_cfs_throttled_seconds_total{name=~"gitea|gitea-runner"}[5m]) > 1.0 for: 15m labels: severity: warning component: gitea team: ops alert_category: infrastructure notification_type: TYPE-1 auto_repair: "false" annotations: summary: "Gitea / Runner CPU 持續被 throttle" description: "{{ $labels.name }} 每秒 throttle {{ $value }} 秒,CD peak 可能卡關。" runbook: "檢查 job 並行度;考慮縮減並行或調高 cpus" # --- 監控自監控元層(Prometheus 本身)--- - alert: PrometheusDown expr: up{job="prometheus"} == 0 for: 2m labels: severity: critical component: prometheus team: ops alert_category: infrastructure notification_type: TYPE-3 auto_repair: "false" annotations: summary: "Prometheus ({{ $labels.instance }}) 停擺" description: "Prometheus 自己停擺 → 所有其他告警失效" runbook: "SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus"