fix(aiops): persist emergency intervention traces

2026-05-01 20:34:33 +08:00
parent 8e49f2ea88
commit 7795f027d2
7 changed files with 297 additions and 9 deletions
--- a/.agents/skills/05-awoooi-sre-qa.md
+++ b/.agents/skills/05-awoooi-sre-qa.md
@@ -804,6 +804,7 @@ kubectl -n awoooi-prod logs -l app=awoooi-api --tail=50 | \
 | Drift TYPE-4D | view diff, adopt, rollback, ignore | 看 diff、採納變更、回滾、忽略 |
 | Backup / host diagnosis | restart only when rule allows, charts/logs/details, cleanup when safe | 不得提供 K8s-only repair button 當 host/backup 主動作 |
 | Post-verification degraded/failed | rollback proposal, investigate, details | 不自動 rollback，需人工或 emergency AI Agent 接手 |
+| SecOps authorize/isolate/block | record authorization, multi-sig gate | 不直接執行危險隔離；必須寫 Redis TTL、AOL、timeline |

 Regression test target: button callback names emitted by `telegram_gateway.py`
 must stay in sync with `callback_action_spec.yaml`; stale buttons are a
@@ -815,6 +816,12 @@ registered MCP providers (`kubernetes`, `ssh_host`) before `get_provider()`.
 `backup_failure` cards must expose read-only diagnostics before any write
 action: host disk, backup jobs, and Velero backup status.

+Emergency intervention is not complete until it is queryable later. Any
+auto-repair-unavailable, drift-auto-adopt-blocked, or SecOps authorization path
+must write both `alert_operation_log` and `timeline_events`, reusing the existing
+enum values (`APPROVAL_ESCALATED` / `USER_ACTION`) unless a migration that adds
+new enum values has already landed. Telegram-only escalation is a silent learning-loop failure.
+
 All Telegram alert lifecycle operations must use `TelegramGateway.alert_chat_id`:
 initial send, analyzing placeholder, delete, editMessageText,
 editMessageReplyMarkup, CI progress, and action-result updates. Sending the
--- a/apps/api/src/services/callback_dispatcher.py
+++ b/apps/api/src/services/callback_dispatcher.py
@@ -23,6 +23,7 @@ Phase 5 Sprint 5.0-5.1 — 2026-04-14 Claude Sonnet 4.6

 from __future__ import annotations

+import json
 import time
 from dataclasses import dataclass
 from functools import lru_cache
@@ -264,7 +265,12 @@ async def dispatch_action(
    try:
        # internal provider: 特殊 URL builder（無 MCP call）
        if spec.mcp_provider == "internal":
-            result_text = _handle_internal_action(spec, resolved_params)
+            result_text = await _handle_internal_action(
+                spec,
+                resolved_params,
+                incident_id=incident_id,
+                user_id=user_id,
+            )
            duration = (time.perf_counter() - start) * 1000
            logger.info("dispatch_action_internal", action=action_name, duration_ms=round(duration, 1))
            return DispatchResult(
@@ -361,7 +367,13 @@ async def dispatch_action(
        )


-def _handle_internal_action(spec: ActionSpec, params: dict) -> str:
+async def _handle_internal_action(
+    spec: ActionSpec,
+    params: dict,
+    *,
+    incident_id: str,
+    user_id: int | None,
+) -> str:
    """
    Internal actions — 不走 MCP，直接產生 URL/文字回覆

@@ -379,12 +391,20 @@ def _handle_internal_action(spec: ActionSpec, params: dict) -> str:
        return f"{spec.emoji} <b>{spec.label}</b>\nhttps://awoooi.wooo.work/flywheel"

    if tool == "record_authorization":
-        # Sprint 5.4 會實作真實授權記錄，這裡先返回確認
-        user_id = params.get("user_id", 0)
+        recorded = await _record_authorization_audit(
+            spec=spec,
+            params=params,
+            incident_id=incident_id,
+            user_id=user_id,
+        )
+        _user_id = params.get("user_id", user_id or 0)
        source = params.get("source", "unknown")
+        action = params.get("action", "authorize")
+        suffix = "已寫入審計與時間線" if recorded else "已受理；審計寫入將由後續補償"
        return (
            f"{spec.emoji} <b>{spec.label}</b>\n"
-            f"已記錄 user={user_id} 授權 source={source}（24h 內同源告警將靜默）"
+            f"已記錄 user={_user_id} 授權 source={source} action={action}（24h 內同源告警將靜默）\n"
+            f"{suffix}"
        )

    # 未知的 internal tool
@@ -394,6 +414,104 @@ def _handle_internal_action(spec: ActionSpec, params: dict) -> str:
    )


+async def _record_authorization_audit(
+    *,
+    spec: ActionSpec,
+    params: dict,
+    incident_id: str,
+    user_id: int | None,
+) -> bool:
+    """Best-effort persistence for internal authorization actions."""
+
+    source = str(params.get("source") or "unknown")
+    requested_action = str(params.get("action") or spec.name)
+    source_ip = str(params.get("source_ip") or "")
+    actor = f"telegram:{user_id or params.get('user_id') or 0}"
+    context = {
+        "action": spec.name,
+        "label": spec.label,
+        "risk": spec.risk,
+        "category": spec.category,
+        "requested_action": requested_action,
+        "source": source,
+        "source_ip": source_ip,
+        "user_id": user_id or params.get("user_id") or 0,
+        "requires_multi_sig": spec.requires_multi_sig,
+    }
+    wrote_any = False
+
+    try:
+        from src.core.redis_client import get_redis
+
+        redis = get_redis()
+        redis_key = f"secops:authorization:{source}"
+        await redis.set(redis_key, json.dumps(context, ensure_ascii=False), ex=86400)
+        wrote_any = True
+    except Exception as exc:
+        logger.warning(
+            "record_authorization_redis_failed",
+            incident_id=incident_id,
+            source=source,
+            error=str(exc),
+        )
+
+    try:
+        from src.repositories.alert_operation_log_repository import (
+            get_alert_operation_log_repository,
+        )
+
+        event_type = "APPROVAL_ESCALATED" if spec.requires_multi_sig or spec.risk == "critical" else "USER_ACTION"
+        record = await get_alert_operation_log_repository().append(
+            event_type,
+            incident_id=incident_id,
+            actor=actor,
+            action_detail=f"telegram_authorization:{requested_action}"[:200],
+            success=True,
+            context=context,
+        )
+        wrote_any = wrote_any or bool(record)
+    except Exception as exc:
+        logger.warning(
+            "record_authorization_aol_failed",
+            incident_id=incident_id,
+            source=source,
+            error=str(exc),
+        )
+
+    try:
+        from src.services.approval_db import get_timeline_service
+
+        await get_timeline_service().add_event(
+            event_type="security",
+            status="warning" if spec.requires_multi_sig or spec.risk == "critical" else "info",
+            title="Telegram authorization recorded",
+            description=(
+                f"action={requested_action} source={source} source_ip={source_ip or 'unknown'}"
+            )[:500],
+            actor=actor,
+            actor_role="secops_authorization",
+            risk_level=spec.risk,
+            incident_id=incident_id,
+        )
+        wrote_any = True
+    except Exception as exc:
+        logger.warning(
+            "record_authorization_timeline_failed",
+            incident_id=incident_id,
+            source=source,
+            error=str(exc),
+        )
+
+    logger.info(
+        "record_authorization_audit_complete",
+        incident_id=incident_id,
+        source=source,
+        action=requested_action,
+        wrote_any=wrote_any,
+    )
+    return wrote_any
+
+
 def _format_reply(
    mcp_result: Any, reply_format: str, label: str, emoji: str
 ) -> str:
--- a/apps/api/src/services/emergency_escalation_service.py
+++ b/apps/api/src/services/emergency_escalation_service.py
@@ -119,6 +119,10 @@ async def escalate_drift_auto_adopt_blocked(
        return

    try:
+        from src.repositories.alert_operation_log_repository import (
+            get_alert_operation_log_repository,
+        )
+        from src.services.approval_db import get_timeline_service
        from src.services.telegram_gateway import get_telegram_gateway

        actionable_count = sum(
@@ -143,6 +147,43 @@ async def escalate_drift_auto_adopt_blocked(
            ),
            group_chat_id=settings.SRE_GROUP_CHAT_ID or None,
        )
+        await get_alert_operation_log_repository().append(
+            "APPROVAL_ESCALATED",
+            incident_id=report.report_id,
+            actor="drift_auto_adopt",
+            action_detail="drift_auto_adopt_blocked_emergency_channel",
+            success=True,
+            context={
+                "namespace": report.namespace,
+                "reason": reason,
+                "high_count": report.high_count,
+                "medium_count": report.medium_count,
+                "actionable_count": actionable_count,
+                "intent": intent,
+                "confidence": confidence,
+                "risk": risk,
+            },
+        )
+        try:
+            await get_timeline_service().add_event(
+                event_type="agent",
+                status="warning",
+                title="Drift emergency intervention requested",
+                description=(
+                    f"{reason} | namespace={report.namespace} "
+                    f"high={report.high_count} medium={report.medium_count} "
+                    f"intent={intent} confidence={confidence:.0%}"
+                )[:500],
+                actor="drift_auto_adopt",
+                actor_role="emergency_intervention",
+                incident_id=report.report_id,
+            )
+        except Exception as timeline_exc:
+            logger.warning(
+                "drift_emergency_timeline_failed",
+                report_id=report.report_id,
+                error=str(timeline_exc),
+            )
        logger.warning(
            "drift_auto_adopt_emergency_escalated",
            report_id=report.report_id,
--- a/apps/api/tests/test_callback_dispatcher.py
+++ b/apps/api/tests/test_callback_dispatcher.py
@@ -15,6 +15,7 @@ Phase 5 Sprint 5.0-5.1 Callback Dispatcher 單元測試

 import pytest

+from src.services import callback_dispatcher as callback_dispatcher_module
 from src.services.callback_dispatcher import (
    dispatch_action,
    get_action_spec,
@@ -269,6 +270,36 @@ class TestInternalActions:
        assert result.success is True
        assert "12345" in result.result_text

+    async def test_record_authorization_persists_audit_intent(self, monkeypatch):
+        captured = {}
+
+        async def fake_record_authorization_audit(*, spec, params, incident_id, user_id):
+            captured["spec"] = spec
+            captured["params"] = params
+            captured["incident_id"] = incident_id
+            captured["user_id"] = user_id
+            return True
+
+        monkeypatch.setattr(
+            callback_dispatcher_module,
+            "_record_authorization_audit",
+            fake_record_authorization_audit,
+        )
+
+        result = await dispatch_action(
+            action_name="secops_isolate",
+            incident_id="INC-SEC-AUTH",
+            user_id=67890,
+            labels={"instance": "192.168.0.110"},
+        )
+
+        assert result.success is True
+        assert "已寫入審計與時間線" in result.result_text
+        assert captured["spec"].name == "secops_isolate"
+        assert captured["params"]["action"] == "request_network_isolation"
+        assert captured["incident_id"] == "INC-SEC-AUTH"
+        assert captured["user_id"] == 67890
+

 # =============================================================================
 # Sprint 5.2 — MCP 呼叫失敗路徑（Provider 未註冊）
--- a/apps/api/tests/test_emergency_escalation_service.py
+++ b/apps/api/tests/test_emergency_escalation_service.py
@@ -0,0 +1,71 @@
+from types import SimpleNamespace
+
+import pytest
+
+from src.services import emergency_escalation_service as service
+
+
+@pytest.mark.asyncio
+async def test_drift_emergency_escalation_writes_aol_and_timeline(monkeypatch):
+    sent_cards = []
+    aol_calls = []
+    timeline_calls = []
+
+    async def fake_dedup(*args, **kwargs):
+        return True
+
+    class FakeGateway:
+        async def send_escalation_card(self, **kwargs):
+            sent_cards.append(kwargs)
+
+    class FakeRepo:
+        async def append(self, *args, **kwargs):
+            aol_calls.append((args, kwargs))
+            return object()
+
+    class FakeTimeline:
+        async def add_event(self, **kwargs):
+            timeline_calls.append(kwargs)
+            return {"id": "timeline-1"}
+
+    monkeypatch.setattr(service, "_dedup_first_send", fake_dedup)
+    monkeypatch.setattr(
+        "src.services.telegram_gateway.get_telegram_gateway",
+        lambda: FakeGateway(),
+    )
+    monkeypatch.setattr(
+        "src.repositories.alert_operation_log_repository.get_alert_operation_log_repository",
+        lambda: FakeRepo(),
+    )
+    monkeypatch.setattr(
+        "src.services.approval_db.get_timeline_service",
+        lambda: FakeTimeline(),
+    )
+
+    report = SimpleNamespace(
+        report_id="drift-123",
+        namespace="awoooi-prod",
+        high_count=1,
+        medium_count=2,
+        items=[
+            SimpleNamespace(is_allowlisted=False),
+            SimpleNamespace(is_allowlisted=True),
+        ],
+    )
+    interpretation = SimpleNamespace(
+        intent=SimpleNamespace(value="emergency_hotfix"),
+        confidence=0.72,
+        risk="high",
+    )
+
+    await service.escalate_drift_auto_adopt_blocked(
+        report=report,
+        reason="unsafe drift",
+        interpretation=interpretation,
+    )
+
+    assert sent_cards and sent_cards[0]["incident_id"] == "drift-123"
+    assert aol_calls and aol_calls[0][0][0] == "APPROVAL_ESCALATED"
+    assert aol_calls[0][1]["actor"] == "drift_auto_adopt"
+    assert aol_calls[0][1]["context"]["intent"] == "emergency_hotfix"
+    assert timeline_calls and timeline_calls[0]["actor_role"] == "emergency_intervention"
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -6,6 +6,20 @@

 ---

+## 2026-05-01 | Emergency intervention 留痕閉環
+
+承接 SSH/backup 自動修復閉環與 Telegram ghost-button 補洞：SecOps 隔離/封鎖按鈕已降級成授權記錄，但若只回文字、不寫 Redis/AOL/timeline，就會形成「看似有人接手，系統沒有記憶」的新斷點；drift auto-adopt 被擋時也需要同樣進 WarRoom timeline。
+
+### 完成
+- `callback_dispatcher` 的 `internal.record_authorization` 改成 async 持久化：寫 `secops:authorization:{source}` 24h TTL、寫 `alert_operation_log`，並新增 `timeline_events` security warning/info。
+- 高風險或 multi-sig SecOps 授權統一用既有 `APPROVAL_ESCALATED` AOL event，避免新增 enum 造成 migration 漏洞；一般授權用 `USER_ACTION`。
+- `EmergencyEscalationService.escalate_drift_auto_adopt_blocked()` 補 AOL + timeline，config drift 無法 auto-adopt 時不再只有 Telegram 卡片。
+- 補 regression tests，鎖住「按鈕回覆文字」之外必須落到審計與處理歷程。
+
+### 驗證
+- `python3 -m py_compile apps/api/src/services/callback_dispatcher.py apps/api/src/services/emergency_escalation_service.py` 通過。
+- `cd apps/api && DATABASE_URL=postgresql://test:test@localhost:5432/test pytest tests/test_callback_dispatcher.py tests/test_emergency_escalation_service.py tests/test_alertmanager_rule_bypass.py -q` → 45 passed。
+
 ## 2026-05-01 | SSH 自動修復閉環 + Telegram ghost-button 補洞

 承接 HostBackupFailed / SSH MCP live 驗證：`backup_failure` 已進 SSH route，但實際 188 只接受 `ollama@192.168.0.188`，而 provider 仍用預設 `wooo`；同時 `DecisionManager._ssh_execute()` 使用未註冊的 `ssh_diagnose`、錯誤的 restart tool 名稱，且 SSH 失敗或只讀診斷成功時仍可能被標成自動修復完成。
--- a/docs/adr/ADR-075-telegram-notification-standard.md
+++ b/docs/adr/ADR-075-telegram-notification-standard.md
@@ -63,9 +63,9 @@ ADR-071（2026-04-11）設計了 TYPE-1/2/3/4/4D 五種通知類型，並實作
 | TYPE-3 | 需人工審核（預設）| 依 category 動態 ≤4 個 | SRE 群組 |
 | TYPE-4 | AI 無法判斷 | [手動記錄][查面板][忽略] | SRE 群組 |
 | TYPE-4D | Config Drift | [查Diff][採納][回滾][忽略] | SRE 群組 |
-| TYPE-5S | 資安防禦（未來）| [隔離][封鎖IP][驅逐Pod][確認授權] | SRE 群組 |
+| TYPE-5S | 資安防禦 | [隔離][封鎖IP][驅逐Pod][確認授權]；危險動作先記授權/多簽 | SRE 群組 |
 | TYPE-6B | 業務/FinOps（未來）| [暫停][查SignOz][忽略] | SRE 群組 |
-| TYPE-7E | 重大事故升級（未來）| [建立戰情室][Postmortem][DR手冊][確認接手] | SRE 群組 |
+| TYPE-7E | 重大事故升級 / auto-repair unavailable | 無 ghost callback；人工/AI 接手先靠卡片與 timeline/AOL 留痕，按鈕需有 dispatcher 後才可開 | SRE 群組 |
 | TYPE-8M | 飛輪/告警鏈路健康 | [觸發診斷][查看面板][靜默] | SRE 群組 |

 ### D4：雙頻道路由規則
@@ -87,13 +87,19 @@ NOTIFICATION_TYPE_RULES = {
    "TYPE-3":  "最多 4 個 Callback Button，依 alert_category 動態選擇",
    "TYPE-4":  "固定 3 個按鈕：[手動記錄][查看面板][忽略]",
    "TYPE-4D": "固定 4 個按鈕：[查看Diff][採納][回滾][忽略]",
-    "TYPE-5S": "固定 4 個按鈕：[隔離][封鎖IP][驅逐Pod][確認授權]",
+    "TYPE-5S": "固定 4 個按鈕：[隔離][封鎖IP][驅逐Pod][確認授權]，危險動作只記授權/多簽",
    "TYPE-6B": "最多 3 個按鈕：[暫停][查看SignOz][忽略]",
-    "TYPE-7E": "固定 4 個按鈕：[建立戰情室][Postmortem草稿][DR手冊][確認接手]",
+    "TYPE-7E": "無 ghost callback；未落地 dispatcher 前不顯示 callback button",
    "TYPE-8M": "固定 3 個按鈕：[觸發診斷][飛輪面板][靜默]",
 }
 ```

+2026-05-01 補充：TYPE-7E 已用於 `auto_repair_unavailable` 與
+`drift_auto_adopt_blocked` 緊急通道。Telegram 卡片本身不是閉環；每次升級
+必須寫入 `alert_operation_log` 與 `timeline_events`，讓 WarRoom、KM 與
+learning loop 能反查。TYPE-5S 的 `record_authorization` 也必須寫 Redis TTL
+和 AOL/timeline；不得只回 Telegram toast。
+
 ---

 ## 實施計畫