fix(alerts): preserve bare metal domain guard
This commit is contained in:
@@ -1899,6 +1899,7 @@ class DecisionManager:
|
||||
"""
|
||||
action = token.proposal_data.get("kubectl_command", "") or token.proposal_data.get("action", "")
|
||||
_alert_labels = incident.signals[0].labels if incident.signals else {}
|
||||
_host_type_for_domain_guard = (_alert_labels.get("host_type") or "").lower()
|
||||
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6: YAML 是權威,先覆蓋 LLM 生成的 action(例外:host_type=bare_metal 且原 action 為 kubectl 時不覆蓋,保留給下方 wrong-domain guard 產生 blocked_reason)
|
||||
# 根因:LLM/Phase2 會先產出 node-exporter/kubectl 的錯域建議,導致
|
||||
@@ -1948,17 +1949,32 @@ class DecisionManager:
|
||||
|
||||
_yaml_cmd = (_yaml_r.get("kubectl_command") or "").strip()
|
||||
if _yaml_cmd and not _yaml_cmd.startswith("kubectl"):
|
||||
# YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 action
|
||||
action = _yaml_cmd
|
||||
token.proposal_data["action"] = action
|
||||
token.proposal_data["kubectl_command"] = action
|
||||
await self._save_token(token)
|
||||
logger.info(
|
||||
"auto_execute_yaml_cmd_override",
|
||||
incident_id=incident.incident_id,
|
||||
alertname=_alertname_for_yaml,
|
||||
yaml_cmd=action[:80],
|
||||
)
|
||||
if (
|
||||
_host_type_for_domain_guard == "bare_metal"
|
||||
and action.lstrip().lower().startswith("kubectl")
|
||||
):
|
||||
# 2026-05-31 ogt + Codex: 保留 bare_metal × kubectl
|
||||
# wrong-domain guard 的明確 blocked_reason,不用 YAML SSH
|
||||
# 診斷覆蓋掉 LLM 原始錯域提案。
|
||||
logger.info(
|
||||
"auto_execute_yaml_override_skipped_bare_metal_kubectl_guard",
|
||||
incident_id=incident.incident_id,
|
||||
alertname=_alertname_for_yaml,
|
||||
yaml_cmd=_yaml_cmd[:80],
|
||||
original_action=action[:80],
|
||||
)
|
||||
else:
|
||||
# YAML 給出 SSH / docker 指令 → 覆蓋 LLM 生成的 action
|
||||
action = _yaml_cmd
|
||||
token.proposal_data["action"] = action
|
||||
token.proposal_data["kubectl_command"] = action
|
||||
await self._save_token(token)
|
||||
logger.info(
|
||||
"auto_execute_yaml_cmd_override",
|
||||
incident_id=incident.incident_id,
|
||||
alertname=_alertname_for_yaml,
|
||||
yaml_cmd=action[:80],
|
||||
)
|
||||
except Exception as _yaml_err:
|
||||
logger.debug("auto_execute_yaml_check_error", error=str(_yaml_err))
|
||||
|
||||
@@ -1969,7 +1985,7 @@ class DecisionManager:
|
||||
# 重啟 awoooi 服務根本解不了第三方 CPU 燒爆,只是拖累自己。
|
||||
# 修法:偵測到 alert host_type=bare_metal 且 action 是 kubectl 類,立即降級人工,
|
||||
# Telegram 明示「跨 domain 動作被攔下」。auto_repair 走 SSH 診斷或人工。
|
||||
_host_type = (_alert_labels.get("host_type") or "").lower()
|
||||
_host_type = _host_type_for_domain_guard
|
||||
_action_stripped = action.lstrip().lower()
|
||||
if _host_type == "bare_metal" and _action_stripped.startswith("kubectl"):
|
||||
logger.warning(
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
- Postmortem 產出時同步 idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。
|
||||
- Heartbeat 與日報修復統計排除 observe-only/no-action,避免污染 success rate。
|
||||
- `alert_rule_engine._matches()` 收緊具名 alertname 規則,避免 Host storage 類告警靠 `storage` keyword 誤配 MinIO。
|
||||
- `_auto_execute` 保留 `bare_metal × kubectl` wrong-domain guard,不讓 YAML SSH 診斷覆蓋掉 LLM 原始錯域提案,確保 `blocked_reason` 仍可被前台/Telegram 看到。
|
||||
|
||||
**Verification**:
|
||||
|
||||
@@ -27,6 +28,8 @@ pytest test_alert_rule_engine_validation.py test_report_generation_service.py -q
|
||||
-> 67 passed
|
||||
pytest test_heartbeat_ollama_endpoints.py test_heartbeat_pod_state_machine.py test_gap_a4_placeholder_resolution.py -q
|
||||
-> 49 passed
|
||||
pytest test_decision_manager_bare_metal_kubectl_guard.py test_alert_rule_engine_validation.py test_gap_a4_placeholder_resolution.py -q
|
||||
-> 75 passed
|
||||
```
|
||||
|
||||
**判讀 / 下一步**:
|
||||
|
||||
@@ -2673,8 +2673,8 @@ Phase 6 完成後
|
||||
|
||||
**T154 Telegram approval truth + execution audit integrity(2026-05-31 台北)**:
|
||||
- 觸發:Telegram 上出現「此告警已處理」後仍接著顯示「已批准、執行中」,且 `INC-20260530-88D960` / `INC-20260531-88394F` 的 production 查核顯示 `approval_records.status=execution_success`,但前者實際只跑 MinIO SSH 診斷、後者只是 `OBSERVE`;`auto_repair_executions=0`,`alert_operation_log` 缺 execution start/end,Postmortem 只送 Telegram 未落 KM。這會讓 operator 誤以為修復已完成。
|
||||
- 修正:集中 `is_no_action_approval_action()`,讓 `OBSERVE` / `INVESTIGATE` / `NO_ACTION` terminal 時在 `approval_records.extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`,Telegram result 改為「已記錄觀察,未執行修復」,不再用「執行成功」。`ApprovalExecutionService` 現在同步寫 `alert_operation_log` 的 `EXECUTION_STARTED` / `EXECUTION_COMPLETED` / `TELEGRAM_RESULT_SENT`,並保留 `automation_operation_log`。Telegram webhook duplicate approval 不再 finalize/schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。`ReportGenerationService` 會把 Postmortem idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。`HeartbeatReportService` / 日報修復統計排除 observe-only/no-action,不再污染 execution success rate。`alert_rule_engine._matches()` 收緊具名 alertname 規則,避免主機 storage 告警靠 message keyword 誤配 `minio_disk_high`。
|
||||
- Verification:`py_compile` pass;`test_approval_execution_no_action.py` + `test_telegram_webhook_execution_handoff.py` 6 passed;`test_alert_rule_engine_validation.py` + `test_report_generation_service.py` 67 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` + `test_gap_a4_placeholder_resolution.py` 49 passed after aligning host SSH diagnostic assertion。
|
||||
- 修正:集中 `is_no_action_approval_action()`,讓 `OBSERVE` / `INVESTIGATE` / `NO_ACTION` terminal 時在 `approval_records.extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`,Telegram result 改為「已記錄觀察,未執行修復」,不再用「執行成功」。`ApprovalExecutionService` 現在同步寫 `alert_operation_log` 的 `EXECUTION_STARTED` / `EXECUTION_COMPLETED` / `TELEGRAM_RESULT_SENT`,並保留 `automation_operation_log`。Telegram webhook duplicate approval 不再 finalize/schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。`ReportGenerationService` 會把 Postmortem idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。`HeartbeatReportService` / 日報修復統計排除 observe-only/no-action,不再污染 execution success rate。`alert_rule_engine._matches()` 收緊具名 alertname 規則,避免主機 storage 告警靠 message keyword 誤配 `minio_disk_high`;`_auto_execute` 保留 `bare_metal × kubectl` wrong-domain guard,不讓 YAML SSH 診斷覆蓋掉 LLM 原始錯域提案。
|
||||
- Verification:`py_compile` pass;`test_approval_execution_no_action.py` + `test_telegram_webhook_execution_handoff.py` 6 passed;`test_alert_rule_engine_validation.py` + `test_report_generation_service.py` 67 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` + `test_gap_a4_placeholder_resolution.py` 49 passed;`test_decision_manager_bare_metal_kubectl_guard.py` + `test_alert_rule_engine_validation.py` + `test_gap_a4_placeholder_resolution.py` 75 passed。
|
||||
- 判讀:T154 修的是「Telegram / DB / 前台統計的 truthfulness」,不是補跑舊 incident 的修復。舊資料中 status 已是 `execution_success` 的 OBSERVE 仍需靠新 metadata 才能精確分辨;部署後新 approval 會留下 immutable execution start/end 與 no-action 語意,operator 不應再把 OBSERVE 視為完成修復。
|
||||
|
||||
**T152 Ansible runtime readiness surfaced(2026-05-24 台北)**:
|
||||
|
||||
Reference in New Issue
Block a user