diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock index eee9a99d..1d77bd2e 100644 --- a/.claude/scheduled_tasks.lock +++ b/.claude/scheduled_tasks.lock @@ -1 +1 @@ -{"sessionId":"412c1507-44d4-4702-bb80-f37e97b804a7","pid":5408,"acquiredAt":1774326092203} \ No newline at end of file +{"sessionId":"8ae62d92-9033-4838-9fc2-d8649af5eb9f","pid":40214,"procStart":"Fri Apr 24 02:17:24 2026","acquiredAt":1777016137376} \ No newline at end of file diff --git a/.claude/settings.json b/.claude/settings.json index 8080d4e0..b65706fe 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -653,7 +653,46 @@ "Read(//Users/**)", "Read(//Users/ooo/.claude/**)", "Bash(mkdir -p /Users/ogt/awoooi/.claude/agents)", - "Bash(cp /Users/ogt/.claude/agents/*.md /Users/ogt/awoooi/.claude/agents/)" + "Bash(cp /Users/ogt/.claude/agents/*.md /Users/ogt/awoooi/.claude/agents/)", + "Bash(kubectl -n awoooi-prod logs --tail=400 -l app=awoooi-api --prefix=true)", + "Bash(kubectl -n awoooi-prod logs --tail=300 awoooi-api-65c69fd649-bxbwp)", + "Bash(kubectl -n awoooi-prod logs --tail=20000 -l app=awoooi-api --prefix=false --since=24h)", + "Bash(kubectl -n awoooi-prod logs --since=24h awoooi-api-65c69fd649-bxbwp)", + "Bash(kubectl -n awoooi-prod logs --since=24h -l app=awoooi-api --prefix=false)", + "Bash(kubectl -n awoooi-prod logs --since=24h awoooi-api-65c69fd649-fmbxd)", + "Bash(kubectl -n awoooi-prod logs --since=3h awoooi-api-65c69fd649-fmbxd)", + "Bash(kubectl -n awoooi-prod logs --since=3h awoooi-api-65c69fd649-bxbwp)", + "Bash(kubectl -n awoooi-prod logs -l app=awoooi-api --tail=30 --since=30m)", + "Bash(kubectl -n awoooi-prod get pods -o wide)", + "Bash(kubectl -n awoooi-prod get pods -l app=awoooi-api -o jsonpath='{.items[0].metadata.creationTimestamp}')", + "Bash(kubectl -n awoooi-prod logs -l app=awoooi-api --tail=5 --since=5m)", + "Bash(kubectl -n awoooi-prod describe pod -l app=awoooi-api)", + "Bash(kubectl -n awoooi-prod logs -l app=awoooi-api --tail=20 
--since=10m)", + "Bash(kubectl -n awoooi-prod exec deployment/awoooi-api -- python3 -c ' *)", + "Bash(PGPASSWORD=\"\" psql -h 188.188.188.188 -U aiops -d aiops -c \"\\\\d timeline_events\")", + "Bash(kubectl -n awoooi-prod get deploy awoooi-api -o yaml)", + "Bash(PGPASSWORD=\"\" psql --version)", + "Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- env)", + "Bash(kubectl -n awoooi-prod logs --tail=500 deploy/awoooi-api)", + "Bash(kubectl cp *)", + "Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'curl -sG \"$PROMETHEUS_URL/api/v1/query\" --data-urlencode \"query=up\" 2>&1 | head -c 400')", + "Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'for q in \"sum\\(rate\\(http_requests_total{status=~\\\\\"5..\\\\\"}[5m]\\)\\) / sum\\(rate\\(http_requests_total[5m]\\)\\)\" \"avg\\(rate\\(container_cpu_usage_seconds_total{namespace=\\\\\"awoooi-prod\\\\\",container=\\\\\"awoooi-api\\\\\"}[5m]\\)\\)\" \"pg_stat_activity_count{datname=\\\\\"awoooi\\\\\"}\" \"increase\\(kube_pod_container_status_restarts_total{namespace=\\\\\"awoooi-prod\\\\\"}[15m]\\)\"; do echo \"---- $q\"; curl -sG \"$PROMETHEUS_URL/api/v1/query\" --data-urlencode \"query=$q\" 2>&1 | head -c 250; echo; done')", + "Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'PGPASSWORD=as0V1mohktaFbGIx3R0iCatbMJ6XxFDL psql -h 192.168.0.188 -U awoooi -d awoooi_prod -c \"SELECT metric_name, count\\(*\\), max\\(trained_at\\) FROM dynamic_baseline_record GROUP BY metric_name;\" 2>&1 | head -20')", + "Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'PGPASSWORD=as0V1mohktaFbGIx3R0iCatbMJ6XxFDL psql -h 192.168.0.188 -U awoooi -d awoooi_prod -c \"SELECT count\\(*\\) as asset_count FROM asset_inventory; SELECT count\\(*\\) as coverage_count FROM asset_coverage_snapshot; SELECT count\\(*\\) as host_cap_count FROM host_capacity_snapshot; SELECT count\\(*\\) as compl_count FROM asset_compliance_snapshot; SELECT count\\(*\\) as rule_cat FROM alert_rule_catalog; SELECT count\\(*\\) 
as log_cluster FROM log_cluster_record;\" 2>&1')", + "Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'python3 -c \" *)", + "Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- python3 -c ' *)", + "Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'for q in \"http_requests_total\" \"container_cpu_usage_seconds_total\" \"container_memory_usage_bytes\" \"kube_pod_container_status_restarts_total\" \"pg_stat_activity_count\" \"node_cpu_seconds_total\" \"node_load1\"; do echo -n \"$q => \"; curl -sG \"$PROMETHEUS_URL/api/v1/query\" --data-urlencode \"query=count\\($q\\)\" 2>&1 | head -c 180; echo; done')", + "Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'curl -sG \"$PROMETHEUS_URL/api/v1/query\" --data-urlencode \"query=container_cpu_usage_seconds_total\" 2>&1 | python3 -c \"import json,sys; d=json.load\\(sys.stdin\\); rs=d[\\\\\"data\\\\\"][\\\\\"result\\\\\"][:3]; [print\\(r[\\\\\"metric\\\\\"]\\) for r in rs]; print\\(\\\\\"total series:\\\\\", len\\(d[\\\\\"data\\\\\"][\\\\\"result\\\\\"]\\)\\)\"')", + "Bash(kubectl -n awoooi-prod exec deploy/awoooi-api -- sh -c 'which kubectl 2>&1; kubectl version --client 2>&1 | head -3; kubectl -n awoooi-prod get deploy awoooi-api 2>&1 | head -3')", + "Bash(kubectl -n awoooi-prod logs --tail=2000 deploy/awoooi-api)", + "Bash(psql --version)", + "WebFetch(domain:core.telegram.org)", + "mcp__plugin_context7_context7__resolve-library-id", + "mcp__plugin_context7_context7__query-docs", + "WebFetch(domain:docs.claude.com)", + "Bash(git tag *)", + "Read(//usr/**)", + "Bash(psql -h 192.168.0.110 -U awoooi_user -d awoooi -c \"SELECT id, alertname, status, confidence, description, created_at FROM approval_records WHERE status='PENDING' AND DATE\\(created_at AT TIME ZONE 'Asia/Taipei'\\) = CURRENT_DATE AT TIME ZONE 'Asia/Taipei' ORDER BY created_at DESC LIMIT 10;\")" ], "deny": [ "Bash(rm -rf *)", diff --git a/apps/api/src/agents/solver_agent.py b/apps/api/src/agents/solver_agent.py index 
f8134a5c..29a562ed 100644 --- a/apps/api/src/agents/solver_agent.py +++ b/apps/api/src/agents/solver_agent.py @@ -210,6 +210,18 @@ class SolverAgent(BaseAgent): parsed = self._parse_response(sanitize(response_text, "solver_output")) candidates = _extract_candidates(parsed) + # 2026-04-25 ogt + Claude Sonnet 4.6: 非 K8s target 後置過濾(P0 修復) + # 根因:GiteaMemoryPressure 告警觸發 Solver → LLM 生成 "kubectl scale deployment gitea" + # Gitea 在主機 docker-compose,不在 awoooi-prod namespace → 執行必然失敗 + # 修復:用 inventory 清單對 candidates 過濾,scale/restart/delete 若 target 不在清單則丟棄 + if _k8s_inventory and candidates: + candidates = _filter_non_k8s_targets(candidates, _k8s_inventory) + logger.debug( + "solver_k8s_target_filter_applied", + remaining_candidates=len(candidates), + inventory_preview=_k8s_inventory[:100], + ) + if not candidates: return self._degraded_plan(diagnosis, 0, "no_candidates") @@ -235,12 +247,24 @@ class SolverAgent(BaseAgent): if _inventory else "\n⚠️ 無法取得叢集清單,請謹慎填寫資源名稱。\n" ) + # 2026-04-25 ogt + Claude Sonnet 4.6: 防止 Gitea 等非 K8s 服務被 kubectl scale + # 根因:Gitea 運行在主機 docker-compose,不在 K8s awoooi-prod namespace + # LLM 看到「Gitea 記憶體壓力」後自動推薦 kubectl scale deployment gitea + # 但 gitea 不在 K8s,執行必然失敗(kubectl not found error) + # 修復:加明確禁令 — 清單外資源禁止 kubectl scale/restart/delete + _non_k8s_warning = ( + "\n🚫 禁令:若 inventory 清單中無此服務(如 gitea、sentry、harbor、postgres、signoz)" + "→ 禁止使用 kubectl scale/restart/delete,必須輸出空 candidates 陣列。\n" + "這些服務運行在主機 docker-compose 環境,不在 K8s 叢集內,kubectl 無法操作它們。\n" + if _inventory + else "" + ) return f"""你是 AWOOOI SRE 系統的軍師 Agent,專職修復方案設計。 根因假設:{context.get("hypothesis", "")} 告警類別:{context.get("category", "")} 診斷信心:{context.get("confidence", 0.0):.0%} -{_inventory_section} +{_inventory_section}{_non_k8s_warning} 你的工作:為此根因提出 1-3 個修復候選方案。 每個方案必須評估: - blast_radius(0-100):影響範圍(越高 = 風險越大) @@ -306,6 +330,104 @@ blast_radius 參考: # Helpers # ───────────────────────────────────────────────────────────────────────────── +# 2026-04-25 ogt + Claude Sonnet 4.6: 高風險 
# Mutating kubectl verbs whose target must exist in the K8s inventory before a
# candidate is allowed through. Read-only verbs (get/top/describe/logs) are not
# listed: a failed read cannot damage anything, so they are never filtered.
# "rollout" is listed because `kubectl rollout restart|undo ...` carries the real
# operation in its sub-verb; see _KUBECTL_ROLLOUT_MUTATING_SUBVERBS.
_KUBECTL_MUTATING_VERBS: frozenset[str] = frozenset(
    {"scale", "rollout", "delete", "apply", "set", "patch", "exec"}
)
# rollout sub-verbs that change the workload. history/status only read state.
# pause/resume are included because they modify the workload's rollout state.
_KUBECTL_ROLLOUT_MUTATING_SUBVERBS: frozenset[str] = frozenset(
    {"restart", "undo", "pause", "resume"}
)
# Flags that consume the following token as their value when written without "=".
# Their values must not be mistaken for the verb or for a workload name
# (e.g. the namespace in `-n awoooi-prod`, or the path in `-f deploy/app.yaml`).
_KUBECTL_VALUE_FLAGS: frozenset[str] = frozenset(
    {
        "-n", "--namespace", "-f", "--filename", "-k", "--kustomize",
        "-l", "--selector", "--field-selector", "-c", "--container",
        "-o", "--output", "-p", "--patch", "--type", "--replicas",
        "--to-revision", "--timeout", "--grace-period", "--context",
        "--cluster", "--user", "--kubeconfig", "-s", "--server", "--as",
    }
)


def _kubectl_positionals(tokens: list[str]) -> list[str]:
    """Return the positional tokens of a kubectl command line, verb first.

    Args:
        tokens: Whitespace-split command with the leading ``kubectl`` removed.

    Returns:
        Tokens that are neither flags nor values of the flags listed in
        ``_KUBECTL_VALUE_FLAGS``. Scanning stops at a bare ``--`` so the
        in-container command of ``kubectl exec ... -- cmd`` is ignored.
    """
    positionals: list[str] = []
    skip_value = False
    for raw in tokens:
        token = raw.strip("'\"")
        if skip_value:
            skip_value = False
            continue
        if token == "--":
            break
        if token.startswith("-"):
            # "--flag=value" is self-contained; "--flag value" eats the next token.
            skip_value = "=" not in token and token in _KUBECTL_VALUE_FLAGS
            continue
        if token:
            positionals.append(token)
    return positionals


def _extract_workload_target(positionals: list[str]) -> str:
    """Return the lower-cased Deployment/StatefulSet name, or "" if none is found.

    Recognises both spellings kubectl accepts for a workload reference:
    ``deployment/<name>`` and ``deployment <name>`` (likewise deploy,
    statefulset, sts, plural forms and the ``.apps`` group suffix).

    Args:
        positionals: Output of ``_kubectl_positionals`` with the verb removed.
    """
    import re as _re

    kind = r"(?:deployments?|deploy|statefulsets?|sts)(?:\.apps)?"
    name = r"[A-Za-z0-9][\w.-]{0,62}"
    slash_form = _re.compile(kind + "/(" + name + ")", _re.IGNORECASE)
    bare_kind = _re.compile(kind, _re.IGNORECASE)
    name_form = _re.compile(name)

    for index, token in enumerate(positionals):
        slash_hit = slash_form.match(token)
        if slash_hit:
            return slash_hit.group(1).lower()
        if bare_kind.fullmatch(token):
            if index + 1 < len(positionals):
                name_hit = name_form.match(positionals[index + 1])
                if name_hit:
                    return name_hit.group(0).lower()
            # Kind without a usable name (e.g. `delete deployment --all`).
            return ""
    return ""


def _filter_non_k8s_targets(
    candidates: list["CandidateAction"],
    inventory: str,
) -> list["CandidateAction"]:
    """Drop candidates whose mutating kubectl command targets a workload outside the inventory.

    Background: a GiteaMemoryPressure alert led the LLM to propose
    ``kubectl scale deployment gitea --replicas=3``; Gitea is not in the K8s
    inventory, so the command can only fail.

    Rules, applied per candidate:
      1. Non-kubectl actions are kept untouched.
      2. The verb is the first positional token, so global flags placed before
         it (``kubectl -n awoooi-prod scale ...``) do not hide a mutating verb.
      3. Verbs outside ``_KUBECTL_MUTATING_VERBS`` are kept. For ``rollout``,
         only the sub-verbs in ``_KUBECTL_ROLLOUT_MUTATING_SUBVERBS`` are checked.
      4. For mutating verbs the workload name is extracted from either
         ``kind/name`` or ``kind name``. If no name can be extracted, the
         candidate is kept (conservative: never drop what cannot be parsed).
      5. A name that is not in the inventory drops the candidate and logs
         ``solver_non_k8s_target_rejected`` at warning level.

    Args:
        candidates: Candidate actions produced by the LLM; each exposes ``.action``.
        inventory: Comma- and/or whitespace-separated workload names,
            e.g. ``"awoooi-api, awoooi-web, postgres"``.

    Returns:
        The filtered list (possibly empty; the caller handles degradation).
        When ``inventory`` is empty or contains no names, ``candidates`` is
        returned unchanged, because an unavailable inventory must not cause
        every mutating candidate to be rejected.
    """
    if not inventory:
        return candidates

    import re as _re

    inventory_names: set[str] = {
        entry.lower() for entry in _re.split(r"[,\s]+", inventory) if entry
    }
    if not inventory_names:
        # Whitespace/comma-only inventory carries no information: do not filter.
        return candidates

    result: list[CandidateAction] = []
    for candidate in candidates:
        action = candidate.action.strip()
        tokens = action.split()

        if not tokens or tokens[0].lower() != "kubectl":
            result.append(candidate)
            continue

        positionals = _kubectl_positionals(tokens[1:])
        verb = positionals[0].lower() if positionals else ""

        if verb not in _KUBECTL_MUTATING_VERBS:
            result.append(candidate)
            continue

        if verb == "rollout":
            subverb = positionals[1].lower() if len(positionals) > 1 else ""
            if subverb not in _KUBECTL_ROLLOUT_MUTATING_SUBVERBS:
                result.append(candidate)
                continue

        target_name = _extract_workload_target(positionals[1:])
        if not target_name:
            # Target could not be determined -> keep, to avoid dropping valid commands.
            result.append(candidate)
            continue

        if target_name in inventory_names:
            result.append(candidate)
        else:
            logger.warning(
                "solver_non_k8s_target_rejected",
                action=action[:120],
                target=target_name,
                reason="target 不在 K8s awoooi-prod inventory,可能是 docker-compose 服務(如 gitea)",
                inventory_preview=inventory[:100],
            )

    return result
解析失敗視為 age=0,不影響主流程 + alert_category, notification_type = classify_alert_early( alertname=alertname, severity=alert.labels.get("severity", "warning"), labels=alert.labels, + age_hours=_alert_age_hours, ) severity_map = {"critical": "critical", "warning": "warning", "info": "info"} diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index 7f0364fb..fd4e0b6c 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -101,7 +101,12 @@ def extract_affected_services(labels: dict, target_resource: str) -> list[str]: return [] -def classify_alert_early(alertname: str, severity: str, labels: dict | None = None) -> tuple[str, str]: +def classify_alert_early( + alertname: str, + severity: str, + labels: dict | None = None, + age_hours: float = 0.0, +) -> tuple[str, str]: """ ADR-073 Phase 2-2: 早期分診,在 LLM 分析前決定 alert_category + notification_type。 防止 HostBackupFailed 等告警被誤路由到 K8s executor。 @@ -109,15 +114,26 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No 規則優先順序(由高到低): 1. ConfigurationDrift / KubeConfigDrift → TYPE-4D (Config Drift 卡片) 2. severity=info/none → TYPE-1 (純資訊,無按鈕) - 3. backup/heartbeat 關鍵字 → TYPE-1 + 3. backup/heartbeat 關鍵字 → TYPE-1(但 backup failure age > 24h → TYPE-3,見下) 4. Docker/Host 前綴 → infrastructure TYPE-3 5. Kube/Pod/Deploy/Node/Velero/ArgoCD 前綴 → kubernetes TYPE-3 6. Postgres/Redis 前綴 → database TYPE-3 7. 
預設 → general TYPE-3 + 2026-04-25 ogt + Claude Sonnet 4.6 (P0 備份告警升級修復): + - age_hours > 24:HostBackupFailed/HostBackupStale/HostBackupMissing 升級為 TYPE-3 + 原因:備份 25h 未成功是 P0 故障,不是「純資訊」 + 此時應觸發 LLM 分析 + 自動修復建議,而非靜默發純文字通知 + C3 修正 (首席架構師 CR 2026-04-13): 從 Router 層 (webhooks.py) 移入 Service 層 原違規: 業務邏輯函數定義在 api/v1/webhooks.py + Args: + alertname: Alertmanager alert name + severity: 告警嚴重度(critical/warning/info/none) + labels: Alertmanager labels dict + age_hours: 告警持續時數(由 startsAt 計算,0.0 = 未知) + Returns: tuple[str, str]: (alert_category, notification_type) """ @@ -174,6 +190,16 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No "HostBackupFailed", "HostBackupStale", "HostBackupMissing", "BackupRestoreTestFailed", "BackupRestoreTestStale", } + # 2026-04-25 ogt + Claude Sonnet 4.6 (P0 備份告警升級修復): + # 備份失敗 > 24h 不是「純資訊」,是 P0 故障,必須走 TYPE-3 觸發 LLM 分析 + 自動修復 + # BackupRestoreTestFailed 屬測試驗證類,不受 age 升級影響(仍 TYPE-1) + _BACKUP_AGE_UPGRADE_NAMES = { + "HostBackupFailed", "HostBackupStale", "HostBackupMissing", + } + _BACKUP_AGE_THRESHOLD_HOURS = 24.0 + if alertname in _BACKUP_AGE_UPGRADE_NAMES and age_hours > _BACKUP_AGE_THRESHOLD_HOURS: + return "backup_failure", "TYPE-3" + # 2026-04-12 ogt: 補入 DeadMansSwitch(HEARTBEAT_ALERT_NAMES 中但之前漏掉) if ( "watchdog" in alertname_lower