From c50da9a2b381b3fc53b2755c30ec43979b4da503 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 31 May 2026 14:02:46 +0800 Subject: [PATCH] fix(alerts): preserve bare metal domain guard --- apps/api/src/services/decision_manager.py | 40 +++++++++++++------ docs/LOGBOOK.md | 3 ++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 4 +- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 920a53b1..24cfb607 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1899,6 +1899,7 @@ class DecisionManager: """ action = token.proposal_data.get("kubectl_command", "") or token.proposal_data.get("action", "") _alert_labels = incident.signals[0].labels if incident.signals else {} + _host_type_for_domain_guard = (_alert_labels.get("host_type") or "").lower() # 2026-05-02 ogt + Claude Sonnet 4.6: YAML 是權威,先覆蓋 LLM 生成的 action # 根因:LLM/Phase2 會先產出 node-exporter/kubectl 的錯域建議,導致 @@ -1948,17 +1949,32 @@ class DecisionManager: _yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip() if _yaml_cmd and not _yaml_cmd.startswith("kubectl"): - # YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 action - action = _yaml_cmd - token.proposal_data["action"] = action - token.proposal_data["kubectl_command"] = action - await self._save_token(token) - logger.info( - "auto_execute_yaml_cmd_override", - incident_id=incident.incident_id, - alertname=_alertname_for_yaml, - yaml_cmd=action[:80], - ) + if ( + _host_type_for_domain_guard == "bare_metal" + and action.lstrip().lower().startswith("kubectl") + ): + # 2026-05-31 ogt + Codex: 保留 bare_metal × kubectl + # wrong-domain guard 的明確 blocked_reason,不用 YAML SSH + # 診斷覆蓋掉 LLM 原始錯域提案。 + logger.info( + "auto_execute_yaml_override_skipped_bare_metal_kubectl_guard", + incident_id=incident.incident_id, + alertname=_alertname_for_yaml, + yaml_cmd=_yaml_cmd[:80], + original_action=action[:80], + ) + else: + # YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 action + action = _yaml_cmd + token.proposal_data["action"] = action + token.proposal_data["kubectl_command"] = action + await self._save_token(token) + logger.info( + "auto_execute_yaml_cmd_override", + incident_id=incident.incident_id, + alertname=_alertname_for_yaml, + yaml_cmd=action[:80], + ) except Exception as _yaml_err: logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err)) @@ -1969,7 +1985,7 @@ class DecisionManager: # 重啟 awoooi 服務根本解不了第三方 CPU 燒爆,只是拖累自己。 # 修法:偵測到 alert host_type=bare_metal 且 action 是 kubectl 類,立即降級人工, # Telegram 明示「跨 domain 動作被攔下」。auto_repair 走 SSH 診斷或人工。 - _host_type = (_alert_labels.get("host_type") or "").lower() + _host_type = _host_type_for_domain_guard _action_stripped = action.lstrip().lower() if _host_type == "bare_metal" and _action_stripped.startswith("kubectl"): logger.warning( diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 96a8f7fe..5b73fae3 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -15,6 +15,7 @@ - Postmortem 產出時同步 idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。 - Heartbeat 與日報修復統計排除 observe-only/no-action,避免污染 success rate。 - `alert_rule_engine._matches()` 收緊具名 alertname 規則,避免 Host storage 類告警靠 `storage` keyword 誤配 MinIO。 +- `_auto_execute` 保留 `bare_metal × kubectl` wrong-domain guard,不讓 YAML SSH 診斷覆蓋掉 LLM 原始錯域提案,確保 `blocked_reason` 仍可被前台/Telegram 看到。 **Verification**: @@ -27,6 +28,8 @@ pytest test_alert_rule_engine_validation.py test_report_generation_service.py -q -> 67 passed pytest test_heartbeat_ollama_endpoints.py test_heartbeat_pod_state_machine.py test_gap_a4_placeholder_resolution.py -q -> 49 passed +pytest test_decision_manager_bare_metal_kubectl_guard.py test_alert_rule_engine_validation.py test_gap_a4_placeholder_resolution.py -q + -> 75 passed ``` **判讀 / 下一步**: diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 5d8662c5..d5d1397e 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -2673,8 +2673,8 @@ Phase 6 完成後 **T154 Telegram approval truth + execution audit integrity(2026-05-31 台北)**: - 觸發:Telegram 上出現「此告警已處理」後仍接著顯示「已批准、執行中」,且 `INC-20260530-88D960` / `INC-20260531-88394F` 的 production 查核顯示 `approval_records.status=execution_success`,但前者實際只跑 MinIO SSH 診斷、後者只是 `OBSERVE`;`auto_repair_executions=0`,`alert_operation_log` 缺 execution start/end,Postmortem 只送 Telegram 未落 KM。這會讓 operator 誤以為修復已完成。 -- 修正:集中 `is_no_action_approval_action()`,讓 `OBSERVE` / `INVESTIGATE` / `NO_ACTION` terminal 時在 `approval_records.extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`,Telegram result 改為「已記錄觀察,未執行修復」,不再用「執行成功」。`ApprovalExecutionService` 現在同步寫 `alert_operation_log` 的 `EXECUTION_STARTED` / `EXECUTION_COMPLETED` / `TELEGRAM_RESULT_SENT`,並保留 `automation_operation_log`。Telegram webhook duplicate approval 不再 finalize/schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。`ReportGenerationService` 會把 Postmortem idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。`HeartbeatReportService` / 日報修復統計排除 observe-only/no-action,不再污染 execution success rate。`alert_rule_engine._matches()` 收緊具名 alertname 規則,避免主機 storage 告警靠 message keyword 誤配 `minio_disk_high`。 -- Verification:`py_compile` pass;`test_approval_execution_no_action.py` + `test_telegram_webhook_execution_handoff.py` 6 passed;`test_alert_rule_engine_validation.py` + `test_report_generation_service.py` 67 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` + `test_gap_a4_placeholder_resolution.py` 49 passed after aligning host SSH diagnostic assertion。 +- 修正:集中 `is_no_action_approval_action()`,讓 `OBSERVE` / `INVESTIGATE` / `NO_ACTION` terminal 時在 `approval_records.extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`,Telegram result 改為「已記錄觀察,未執行修復」,不再用「執行成功」。`ApprovalExecutionService` 現在同步寫 `alert_operation_log` 的 `EXECUTION_STARTED` / `EXECUTION_COMPLETED` / `TELEGRAM_RESULT_SENT`,並保留 `automation_operation_log`。Telegram webhook duplicate approval 不再 finalize/schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。`ReportGenerationService` 會把 Postmortem idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。`HeartbeatReportService` / 日報修復統計排除 observe-only/no-action,不再污染 execution success rate。`alert_rule_engine._matches()` 收緊具名 alertname 規則,避免主機 storage 告警靠 message keyword 誤配 `minio_disk_high`;`_auto_execute` 保留 `bare_metal × kubectl` wrong-domain guard,不讓 YAML SSH 診斷覆蓋掉 LLM 原始錯域提案。 +- Verification:`py_compile` pass;`test_approval_execution_no_action.py` + `test_telegram_webhook_execution_handoff.py` 6 passed;`test_alert_rule_engine_validation.py` + `test_report_generation_service.py` 67 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` + `test_gap_a4_placeholder_resolution.py` 49 passed;`test_decision_manager_bare_metal_kubectl_guard.py` + `test_alert_rule_engine_validation.py` + `test_gap_a4_placeholder_resolution.py` 75 passed。 - 判讀:T154 修的是「Telegram / DB / 前台統計的 truthfulness」,不是補跑舊 incident 的修復。舊資料中 status 已是 `execution_success` 的 OBSERVE 仍需靠新 metadata 才能精確分辨;部署後新 approval 會留下 immutable execution start/end 與 no-action 語意,operator 不應再把 OBSERVE 視為完成修復。 **T152 Ansible runtime readiness surfaced(2026-05-24 台北)**: