diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index bfb8a4c8..02f97c17 100644 --- a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -108,25 +108,53 @@ rules: reasoning: "[規則匹配] Pod OOMKilled 後 ReplicaSet 將自動重建,但需同步修正資源配置防止復發。" # 2026-04-12 ogt: Host CPU 告警獨立規則 — node_exporter 告警無 pod/deployment label - # 原本放在 high_cpu 規則導致 {target}="unknown" → auto-repair 安全攔截 - # host 告警只能通知,不能 kubectl scale - - id: host_cpu_high + # 2026-04-16 ogt + Claude Sonnet 4.6: 補齊主機層所有常見 Prometheus alertname + # 原則:主機層告警 = 只能通知 + 建議 SSH 排查,絕對禁止 kubectl restart + - id: host_resource_alert priority: 45 - description: Host 主機 CPU 使用率過高 (node_exporter,非 K8s workload) + description: Host 主機資源告警 (node_exporter — CPU/記憶體/負載/磁碟增長,非 K8s workload) match: alertname: + # CPU 相關 - HostHighCpuLoad - NodeCPUUsageHigh - NodeHighCpuLoad + # 負載相關 + - HostHighLoadAverage + - NodeLoadAverageHigh + - HostLoadAverageHigh + # 記憶體相關 + - HostOutOfMemory + - HostMemoryUnderMemoryPressure + - HostMemoryUsageHigh + - NodeMemoryPressure + # 磁碟 I/O 相關 + - HostUnusualDiskReadLatency + - HostUnusualDiskWriteLatency + - HostUnusualDiskReadRate + - HostUnusualDiskWriteRate + - HostDiskWillFillIn24Hours + - HostOutOfDiskSpace + # 網路相關 + - HostUnusualNetworkThroughputIn + - HostUnusualNetworkThroughputOut + # 系統服務 + - HostSystemdServiceCrashed + - HostKernelVersionDeviations + - HostOomKillDetected + - HostEdacCorrectableErrors + - HostEdacUncorrectableErrors + - HostClockSkewDetected + - HostClockNotSynchronising response: - action_title: "Host {host} CPU 過高 — 需排查高 CPU 進程" - description: "⚠️ 主機 {host} CPU 使用率超標。此為主機層告警,需 SSH 登入排查 (top / ps aux)。常見原因: Ollama 推理、DB 查詢、K3s GC。" + action_title: "⚠️ 主機告警 — 需 SSH 人工排查" + description: "⚠️ 主機層告警(node_exporter)。此告警源自主機資源,無法透過 kubectl 自動修復。請 SSH 登入主機排查根因:top / htop / df -h / journalctl -xe。" suggested_action: NO_ACTION kubectl_command: "" estimated_downtime: "N/A" risk: low responsibility: INFRA - reasoning: "[規則匹配] 主機 CPU 告警無法自動修復,需人工確認高 CPU 進程後決策。" + reasoning: "[規則匹配] 主機層資源告警無法自動修復,需人工登入確認高負載/高記憶體/磁碟根因後決策。禁止 kubectl restart(node_exporter 不是 K8s 服務)。" - id: high_cpu priority: 40 @@ -219,6 +247,34 @@ rules: # ── 資料庫層 ───────────────────────────────────────────────── + # 2026-04-16 ogt + Claude Sonnet 4.6: PostgreSQL 監控告警 — 磁碟/資源類,絕對不能重啟 + # 根因:PostgreSQLDiskGrowthRate 落 generic_fallback → 輸出 kubectl rollout restart postgresql(錯誤!) + - id: postgresql_disk_monitoring + priority: 68 + description: PostgreSQL 磁碟/增長率/exporter 監控告警(不重啟資料庫) + match: + alertname: + - PostgreSQLDiskGrowthRate + - PostgreSQLDiskUsageHigh + - PostgreSQLDiskFull + - PostgresExporterDown + - PostgreSQLExporterDown + - PostgreSQLTableBloat + - PostgreSQLVacuumRequired + - PostgreSQLReplicationLag + - PostgreSQLTooManyConnections + response: + action_title: "⚠️ PostgreSQL 監控告警 — 需人工排查,禁止重啟" + description: "⚠️ PostgreSQL 資源/監控告警。磁碟增長過快或 exporter 異常,重啟資料庫會造成資料風險。請登入排查磁碟用量或 WAL 狀態。" + suggested_action: NO_ACTION + kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT pg_database_size(current_database()), pg_size_pretty(pg_database_size(current_database()));'" + estimated_downtime: "N/A" + risk: medium + responsibility: DB + responsibility_reasoning: "PostgreSQL 磁碟告警需 DBA 評估,自動重啟資料庫有資料丟失風險,必須人工確認" + secondary_teams: [INFRA] + reasoning: "[規則匹配] PostgreSQL 磁碟增長/監控告警,絕對禁止自動重啟資料庫。需 DBA 人工確認磁碟用量、WAL 清理、VACUUM 狀態。" + - id: postgresql_down priority: 70 description: PostgreSQL 服務下線 diff --git a/apps/api/pyproject.toml b/apps/api/pyproject.toml index 2c6c4d00..533fbf26 100644 --- a/apps/api/pyproject.toml +++ b/apps/api/pyproject.toml @@ -43,6 +43,9 @@ dependencies = [ "statsmodels>=0.14.0", "drain3>=0.9.11", "sse-starlette>=1.8.0", + # 2026-04-16 ogt + Claude Sonnet 4.6: SSH MCP sensor 修復 — asyncssh 缺失導致 sensors_succeeded=0 + # 根因: ssh_provider.py 中 import asyncssh 在 try/except 外,所有 15 個 SSH tool 直接 ImportError + "asyncssh>=2.14.0", ] # [tool.uv.sources] diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index b127ba18..1d1d9949 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1237,6 +1237,18 @@ class DecisionManager: # COMPLETED 狀態: 直接返回,避免重複建立 decision 導致 Telegram 轟炸 if existing_token.state == DecisionState.COMPLETED: return existing_token + # 2026-04-16 ogt + Claude Sonnet 4.6: 修復重複卡片根因 — ANALYZING 未早返回 + # 問題:多 pod 並發時 pod-A 在 ANALYZING,pod-B/C 發現 ANALYZING 不在返回條件 + # → 各自建新 token → 同一 incident 跑 3 次 agent_debate → 送出 3 張 TG 卡 + # 修復:ANALYZING 狀態也直接返回,避免重複處理 + if existing_token.state == DecisionState.ANALYZING: + logger.debug( + "decision_analyzing_in_progress", + incident_id=incident.incident_id, + token=existing_token.token, + reason="另一個 worker 正在分析中,跳過重複建立", + ) + return existing_token # ADR-073 Phase 3-1: TYPE-1 triage guard — 純資訊告警跳過 LLM 分析 # classify_alert_early() 已在 webhook 入口設定 notification_type