diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index 2095e495..c56472ed 100644 --- a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -107,16 +107,36 @@ rules: command: "kubectl autoscale deployment {target} --memory-percent=80 --min=2 --max=5 -n {namespace}" reasoning: "[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建,但需同步修正資源配置防止復發。" + # 2026-04-12 ogt: Dedicated rule for host CPU alerts — node_exporter alerts carry no pod/deployment label + # Previously these were matched by the high_cpu rule, which left {target}="unknown" → blocked by the auto-repair safety check + # Host alerts can only notify; they cannot be remediated with kubectl scale + - id: host_cpu_high + priority: 45 + description: Host 主機 CPU 使用率過高 (node_exporter,非 K8s workload) + match: + alertname: + - HostHighCpuLoad + - NodeCPUUsageHigh + - NodeHighCpuLoad + response: + action_title: "Host {host} CPU 過高 — 需排查高 CPU 進程" + description: "⚠️ 主機 {host} CPU 使用率超標。此為主機層告警,需 SSH 登入排查 (top / ps aux)。常見原因: Ollama 推理、DB 查詢、K3s GC。" + suggested_action: NO_ACTION + kubectl_command: "" + estimated_downtime: "N/A" + risk: low + responsibility: INFRA + reasoning: "[規則匹配] 主機 CPU 告警無法自動修復,需人工確認高 CPU 進程後決策。" + - id: high_cpu priority: 40 - description: Pod/Node CPU 使用率過高 + description: K8s Pod/Deployment CPU 使用率過高 match: # 2026-04-10 Claude Sonnet 4.6: Phase 2 飛輪修復 — 補齊 Prometheus alertname 變體 + # 2026-04-12 ogt: Removed HostHighCpuLoad/NodeCPUUsageHigh → now handled by the separate host_cpu_high rule alertname: - HighCPUUsage - ContainerCpuUsageSecondsTotal - - HostHighCpuLoad - - NodeCPUUsageHigh - CPUThrottlingHigh - KubeCPUOvercommit alert_type: