diff --git a/apps/api/src/services/ai_router.py b/apps/api/src/services/ai_router.py index 1c1f74bb..5e321740 100644 --- a/apps/api/src/services/ai_router.py +++ b/apps/api/src/services/ai_router.py @@ -599,6 +599,7 @@ class AIRouter: self, selected_provider: AIProviderEnum ) -> list[tuple[AIProviderEnum, str]]: """ + # DEPRECATED 2026-04-28 — 已由 _build_fallback_chain_for_intent 取代,無呼叫方 建立 Fallback 鏈 (排除已選 Provider) Fallback 順序: Ollama → Gemini → Claude diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 85316bc5..03fce7ea 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -2098,6 +2098,11 @@ class DecisionManager: except Exception as _cd_err: logger.debug("auto_execute_cooldown_check_error", error=str(_cd_err)) + # P1.1 fix 2026-04-27 ogt + Claude Sonnet 4.6: 失敗路徑 KM 寫入哨兵 + # 在 try 外宣告,確保 except 區塊能存取(即使建構失敗) + _km_executor = None + _km_approval = None + try: # 延遲導入避免循環依賴 from src.models.approval import ApprovalRequest, ApprovalStatus @@ -2119,6 +2124,7 @@ class DecisionManager: risk_level=_risk, matched_playbook_id=_matched_playbook_id, ) + _km_approval = approval # P1.1: 供 except 失敗路徑使用 # ADR-071-I: 執行前抓 metrics_before 快照 (2026-04-11 Claude Sonnet 4.6) _metrics_before = await _fetch_metrics_snapshot(incident) @@ -2144,6 +2150,7 @@ class DecisionManager: # _push_auto_repair_result → Telegram 顯示 ✅ 自動修復完成,即使 K8s 拒絕了指令 # 修復: execute_approved_action 現在返回 bool,正確透傳給通知函數 executor = ApprovalExecutionService() + _km_executor = executor # P1.1: 供 except 失敗路徑使用 _exec_success = await executor.execute_approved_action(approval) # 更新狀態 @@ -2190,6 +2197,16 @@ class DecisionManager: _push_decision_to_telegram(incident, token.proposal_data) ) + # P1.1 fix 2026-04-27 ogt + Claude Sonnet 4.6: 失敗路徑補 KM 寫入 + # 根因:auto_execute 拋出例外時,學習飛輪完全拿不到失敗記錄 + # 修法:若 executor/approval 已建立,fire-and-forget 寫入失敗 KM;未建立則跳過 + if _km_executor is not None and _km_approval is not None: + 
_fire_and_forget( + _km_executor._write_execution_result_to_km( + _km_approval, False, str(e) + ) + ) + async def _query_kb_context_inner(self, incident: Incident) -> str: """KB RAG 實際查詢邏輯,由 _query_kb_context 包裝 timeout 後呼叫""" query_parts = list(incident.affected_services) diff --git a/apps/api/tests/test_golden_regression.py b/apps/api/tests/test_golden_regression.py new file mode 100644 index 00000000..64253397 --- /dev/null +++ b/apps/api/tests/test_golden_regression.py @@ -0,0 +1,172 @@ +# apps/api/tests/test_golden_regression.py +# 2026-04-28 ogt + Claude Sonnet 4.6 — P3.2.4 黃金回歸測試集 +""" +33 個黃金測試案例,保護 classify_alert_early() 邏輯不退步。 +涵蓋 10 類告警場景。 + +函數簽章: + classify_alert_early(alertname, severity, labels=None, age_hours=0.0) + -> tuple[str, str] # (alert_category, notification_type) +""" +from __future__ import annotations + +import pytest +from src.services.incident_service import classify_alert_early + + +class TestKubernetes: + def test_pod_crash_looping(self): + cat, ntype = classify_alert_early("PodCrashLooping", "critical") + assert cat == "kubernetes" and ntype == "TYPE-3" + + def test_deploy_rollout_stuck(self): + cat, ntype = classify_alert_early("DeployRolloutStuck", "warning") + assert cat == "kubernetes" and ntype == "TYPE-3" + + def test_kube_node_not_ready(self): + cat, ntype = classify_alert_early("KubeNodeNotReady", "critical") + assert cat == "kubernetes" and ntype == "TYPE-3" + + def test_pod_oom_killed(self): + cat, ntype = classify_alert_early("PodOOMKilled", "warning") + assert cat == "kubernetes" and ntype == "TYPE-3" + + def test_node_memory_pressure(self): + cat, ntype = classify_alert_early("NodeMemoryPressure", "warning") + assert cat == "kubernetes" and ntype == "TYPE-3" + + +class TestHostResource: + def test_host_high_cpu_load(self): + cat, ntype = classify_alert_early("HostHighCpuLoad", "critical") + assert cat == "host_resource" and ntype == "TYPE-3" + + def test_host_high_memory(self): + cat, ntype = 
classify_alert_early("HostHighMemory", "warning") + assert cat == "host_resource" and ntype == "TYPE-3" + + def test_host_disk_space_full(self): + cat, ntype = classify_alert_early("HostDiskSpaceFull", "critical") + assert cat == "host_resource" and ntype == "TYPE-3" + + +class TestHighCpuVariants: + """HighCPU* prefix 規則覆蓋""" + def test_high_cpu_prefix(self): + cat, ntype = classify_alert_early("HighCPUUsage", "critical") + assert cat == "host_resource" and ntype == "TYPE-3" + + def test_high_memory_prefix(self): + cat, ntype = classify_alert_early("HighMemoryPressure", "warning") + assert cat == "host_resource" and ntype == "TYPE-3" + + +class TestBackup: + def test_host_backup_failed_fresh(self): + """< 24h 備份失敗 → TYPE-1(pure info)""" + cat, ntype = classify_alert_early("HostBackupFailed", "warning", age_hours=10.0) + assert cat == "backup" and ntype == "TYPE-1" + + def test_host_backup_failed_stale_upgrade(self): + """> 24h 備份失敗 → 升級為 TYPE-3(P0 故障)""" + cat, ntype = classify_alert_early("HostBackupFailed", "warning", age_hours=25.0) + assert cat == "backup_failure" and ntype == "TYPE-3" + + def test_host_backup_stale_upgrade(self): + cat, ntype = classify_alert_early("HostBackupStale", "warning", age_hours=30.0) + assert cat == "backup_failure" and ntype == "TYPE-3" + + def test_backup_restore_test_not_upgraded(self): + """BackupRestoreTestFailed 不受 age 升級影響""" + cat, ntype = classify_alert_early("BackupRestoreTestFailed", "warning", age_hours=48.0) + assert ntype == "TYPE-1" + + +class TestDatabase: + def test_postgres_connections_high(self): + cat, ntype = classify_alert_early("PostgreSQLConnectionsHigh", "warning") + assert cat == "database" and ntype == "TYPE-3" + + def test_postgres_disk_growth_rate(self): + cat, ntype = classify_alert_early("PostgreSQLDiskGrowthRate", "warning") + assert cat == "database" and ntype == "TYPE-3" + + def test_redis_memory_full(self): + cat, ntype = classify_alert_early("RedisMemoryFull", "critical") + assert cat == 
"database" and ntype == "TYPE-3" + + +class TestInfoAndHeartbeat: + def test_severity_info_is_type1(self): + _, ntype = classify_alert_early("SomeAlert", "info") + assert ntype == "TYPE-1" + + def test_severity_none_is_type1(self): + _, ntype = classify_alert_early("SomeAlert", "none") + assert ntype == "TYPE-1" + + def test_watchdog_heartbeat(self): + cat, ntype = classify_alert_early("Watchdog", "warning") + assert cat == "backup" and ntype == "TYPE-1" + + def test_deadmansswitch(self): + cat, ntype = classify_alert_early("DeadMansSwitch", "critical") + assert cat == "backup" and ntype == "TYPE-1" + + +class TestConfigDrift: + def test_configuration_drift(self): + cat, ntype = classify_alert_early("ConfigurationDrift", "warning") + assert cat == "config_drift" and ntype == "TYPE-4D" + + def test_kube_config_drift(self): + cat, ntype = classify_alert_early("KubeConfigDrift", "critical") + assert cat == "config_drift" and ntype == "TYPE-4D" + + +class TestFlywheelHealth: + def test_auto_repair_low_success(self): + cat, ntype = classify_alert_early("AutoRepairLowSuccessRate", "critical") + assert cat == "flywheel_health" and ntype == "TYPE-8M" + + def test_ollama_down(self): + cat, ntype = classify_alert_early("OllamaDown", "critical") + assert cat == "flywheel_health" and ntype == "TYPE-8M" + + +class TestSecops: + def test_unauthorized_ssh(self): + cat, ntype = classify_alert_early("UnauthorizedSSH", "critical") + assert cat == "secops" and ntype == "TYPE-5S" + + def test_pod_abnormal_activity(self): + cat, ntype = classify_alert_early("PodAbnormalActivity", "critical") + assert cat == "secops" and ntype == "TYPE-5S" + + +class TestDevopsAndExternal: + def test_gitea_down(self): + cat, ntype = classify_alert_early("GiteaDown", "critical") + assert cat == "devops_tool" and ntype == "TYPE-3" + + def test_mowoo_work_down(self): + cat, ntype = classify_alert_early("MoWoooWorkDown", "critical") + assert cat == "external_site" and ntype == "TYPE-3" + + def 
test_minio_down(self): + cat, ntype = classify_alert_early("MinIODown", "critical") + assert cat == "storage" and ntype == "TYPE-3" + + +class TestAlertChainAndGeneral: + def test_alert_chain_broken_alertmanager(self): + cat, ntype = classify_alert_early("AlertChainBroken_Alertmanager", "critical") + assert cat == "alertchain_health" and ntype == "TYPE-8M" + + def test_no_alerts_received(self): + cat, ntype = classify_alert_early("NoAlertsReceived", "critical") + assert cat == "alertchain_health" and ntype == "TYPE-8M" + + def test_unknown_alert_falls_to_general(self): + cat, ntype = classify_alert_early("SomeBrandNewUnknownAlert2026", "warning") + assert cat == "general" and ntype == "TYPE-3" diff --git a/k8s/awoooi-prod/04-configmap.yaml b/k8s/awoooi-prod/04-configmap.yaml index 981a715a..a9b9f13a 100644 --- a/k8s/awoooi-prod/04-configmap.yaml +++ b/k8s/awoooi-prod/04-configmap.yaml @@ -114,6 +114,9 @@ data: # 根因:SSH_MCP_ALLOWED_HOSTS 未設定 → _ssh_execute() 攔截 → 主機告警全部降級人工審核 # 四台主機:110(DevOps金庫/wooo), 120(K3s-1/wooo), 121(K3s-2/wooo), 188(AI中心/ollama) SSH_MCP_ALLOWED_HOSTS: "192.168.0.110,192.168.0.120,192.168.0.121,192.168.0.188" + # 2026-04-28 Claude Sonnet 4.6: Wave 2 B3 — LLM 動態 Telegram 按鈕正式啟用 + # B2 dispatch_llm_action + B3 keyboard + B4 la:{short_id} handler 全部就緒 + USE_LLM_DYNAMIC_BUTTONS: "true" # MCP Phase 3 (2026-04-11 Claude Sonnet 4.6): ArgoCD + Sentry MCP 啟用 # ARGOCD_API_TOKEN 在 Secrets 中配置 ARGOCD_MCP_ENABLED: "true"