fix(alerts): correct telegram execution truth

2026-05-31 13:58:21 +08:00
parent 943a6feacf
commit e2ab879636
15 changed files with 624 additions and 49 deletions
--- a/apps/api/src/api/v1/telegram.py
+++ b/apps/api/src/api/v1/telegram.py
@@ -275,6 +275,29 @@ async def telegram_webhook(
            )

            if approval:
+                status_value = approval.status.value if hasattr(approval.status, "value") else str(approval.status)
+                if (
+                    "Cannot sign" in msg
+                    or "already signed" in msg
+                    or "Concurrent modification" in msg
+                ):
+                    logger.info(
+                        "telegram_approval_ignored_already_processed",
+                        approval_id=approval_id,
+                        user_id=user_id,
+                        status=status_value,
+                        message=msg,
+                    )
+                    await _log_user_action("approve_duplicate", False, getattr(approval, "incident_id", None))
+                    return {
+                        "ok": True,
+                        "message": "Already processed",
+                        "approval_id": approval_id,
+                        "status": status_value,
+                        "execution_triggered": False,
+                        "execution_scheduled": False,
+                    }
+
                execution_scheduled = await _finalize_telegram_approval(
                    approval=approval,
                    execution_triggered=execution_triggered,
@@ -283,7 +306,7 @@ async def telegram_webhook(
                    "telegram_approval_signed",
                    approval_id=approval_id,
                    user_id=user_id,
-                    status=approval.status.value,
+                    status=status_value,
                    execution_triggered=execution_triggered,
                    execution_scheduled=execution_scheduled,
                )
@@ -291,9 +314,9 @@ async def telegram_webhook(

                return {
                    "ok": True,
-                    "message": "Approved",
+                    "message": "Approved" if execution_triggered else "Signed",
                    "approval_id": approval_id,
-                    "status": approval.status.value,
+                    "status": status_value,
                    "execution_triggered": execution_triggered,
                    "execution_scheduled": execution_scheduled,
                }
--- a/apps/api/src/services/alert_rule_engine.py
+++ b/apps/api/src/services/alert_rule_engine.py
@@ -298,6 +298,12 @@ def _matches(rule: dict, alertname: str, alert_type: str, message: str, instance
    if alertnames and alertname in alertnames:
        return True

+    # 2026-05-31 ogt + Codex: 有明確 alertname 的規則不得只靠寬鬆 message
+    # keyword 命中，否則 HostPreviousBootStorageErrorsDetected 這類主機 storage
+    # 告警會誤配到 minio_disk_high。
+    if alertnames and alertname and alertname != "custom":
+        return False
+
    # alert_type 部分匹配
    for kw in match.get("alert_type", []):
        if kw.lower() in alert_type.lower():
--- a/apps/api/src/services/approval_action_classifier.py
+++ b/apps/api/src/services/approval_action_classifier.py
@@ -0,0 +1,26 @@
+"""
+Approval action classifier
+==========================
+
+2026-05-31 ogt + Codex: Telegram 告警鏈路一致性修復。
+將 OBSERVE / INVESTIGATE / NO_ACTION 這類「純觀察、未執行修復」的
+判斷集中，避免 execution、Telegram、統計各自用不同語意。
+"""
+
+from __future__ import annotations
+
+
+def is_no_action_approval_action(action: str | None) -> bool:
+    """Tell whether an approval action only observes and performs no repair.
+
+    Args:
+        action: Raw action text stored on the approval; may be ``None``.
+
+    Returns:
+        ``True`` for blank or missing text, for text containing a NO_ACTION
+        marker (``NO_ACTION``, ``NO-ACTION``, ``NOACTION``; case-insensitive)
+        or the literal ``(未設)``, and for text that starts with ``OBSERVE``
+        or ``INVESTIGATE`` (case-insensitive). ``False`` otherwise.
+    """
+    stripped = (action or "").strip()
+    if not stripped:
+        # No action text at all is treated as "nothing was repaired".
+        return True
+    if "(未設)" in stripped:
+        return True
+    normalized = stripped.upper()
+    if normalized.startswith(("OBSERVE", "INVESTIGATE")):
+        return True
+    return any(
+        marker in normalized
+        for marker in ("NO_ACTION", "NO-ACTION", "NOACTION")
+    )
--- a/apps/api/src/services/approval_db.py
+++ b/apps/api/src/services/approval_db.py
@@ -659,6 +659,7 @@ class ApprovalDBService:
        approval_id: UUID,
        success: bool,
        error_message: str | None = None,
+        execution_kind: str | None = None,
    ) -> None:
        """
        更新執行狀態
@@ -669,21 +670,36 @@ class ApprovalDBService:
        """
        async with get_db_context() as db:
            status = ApprovalStatus.EXECUTION_SUCCESS if success else ApprovalStatus.EXECUTION_FAILED
-            values: dict = {"status": status}
+            result = await db.execute(
+                select(ApprovalRecord).where(ApprovalRecord.id == str(approval_id))
+            )
+            record = result.scalar_one_or_none()
+            if record is None:
+                logger.warning(
+                    "approval_execution_status_update_missing",
+                    id=str(approval_id),
+                    success=success,
+                )
+                return
+
+            record.status = status
            if not success and error_message:
                # 截斷至合理長度,避免爆欄位
-                values["rejection_reason"] = str(error_message)[:2000]
-            await db.execute(
-                update(ApprovalRecord)
-                .where(ApprovalRecord.id == str(approval_id))
-                .values(**values)
-            )
+                record.rejection_reason = str(error_message)[:2000]
+            if execution_kind:
+                # 2026-05-31 ogt + Codex: OBSERVE/NO_ACTION 仍需 terminal 狀態，
+                # 但前台/報表必須能分辨「未執行修復」而非真正 execution success。
+                metadata = dict(record.extra_metadata or {})
+                metadata["execution_kind"] = execution_kind
+                metadata["repair_executed"] = execution_kind != "no_action"
+                record.extra_metadata = metadata

            logger.info(
                "approval_execution_status_updated",
                id=str(approval_id),
                success=success,
                has_error=bool(error_message),
+                execution_kind=execution_kind,
            )

    async def update_incident_id(self, approval_id: UUID, incident_id: str) -> None:
--- a/apps/api/src/services/approval_execution.py
+++ b/apps/api/src/services/approval_execution.py
@@ -36,6 +36,7 @@ from src.db.base import get_db_context
 from src.models.approval import ApprovalRequest
 from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError
 from src.plugins.mcp.interfaces import MCPToolResult
+from src.services.approval_action_classifier import is_no_action_approval_action
 from src.services.approval_db import get_approval_service, get_timeline_service
 from src.services.executor import ExecutionResult, OperationType, get_executor
 from src.services.operation_parser import parse_operation_from_action
@@ -165,6 +166,7 @@ class ApprovalExecutionService:
        # ADR-090 § 自動化動作回灌 (2026-04-19): 主流程開始即在 aol 留痕,
        # 結束時 update。不依賴 fire-and-forget,確保 33 件/7d approval 全部可觀測。
        _aol_op_id = await self._log_aol_started(approval)
+        await self._log_alert_execution_started(approval, aol_op_id=_aol_op_id)
        _aol_started_ms = time.time()

        service = get_approval_service()
@@ -228,15 +230,7 @@ class ApprovalExecutionService:
            # 2026-04-19 ogt + Claude Opus 4.7: 區分 NO_ACTION vs 真解析失敗
            # NO_ACTION 是 AI 刻意選的「純調查不破壞」,不該誤標 EXECUTION_FAILED
            # 污染 auto_execute 成功率 KPI (MASTER §7.1 #11)
-            _action_upper = (approval.action or "").upper()
-            _is_no_action = (
-                "NO_ACTION" in _action_upper
-                or "NO-ACTION" in _action_upper
-                or "NOACTION" in _action_upper
-                or "(未設)" in approval.action
-                or _action_upper.startswith("OBSERVE")
-                or _action_upper.startswith("INVESTIGATE")
-            )
+            _is_no_action = is_no_action_approval_action(approval.action)

            if _is_no_action:
                logger.info(
@@ -246,13 +240,17 @@ class ApprovalExecutionService:
                    reason="NO_ACTION - 純調查/觀察類,不執行破壞動作",
                    path="no_action",
                )
-                # 標為 SUCCESS (觀察/調查本身就是成功完成)
-                await service.update_execution_status(approval.id, success=True)
+                # 仍以 terminal success 關閉簽核，但 metadata 明確標記未執行修復。
+                await service.update_execution_status(
+                    approval.id,
+                    success=True,
+                    execution_kind="no_action",
+                )
                await timeline.add_event(
                    event_type="exec",
                    status="success",
-                    title="✅ 純觀察類動作完成 (NO_ACTION)",
-                    description=f"Action: {approval.action[:120]}",
+                    title="ℹ️ 純觀察類動作已記錄（未執行修復）",
+                    description=f"Action: {(approval.action or '')[:120]}",
                    actor="leWOOOgo",
                    actor_role="executor",
                    approval_id=str(approval.id),
@@ -269,7 +267,22 @@ class ApprovalExecutionService:
                    op_id=_aol_op_id,
                    status="success",
                    duration_ms=int((time.time() - _aol_started_ms) * 1000),
-                    output={"reason": "NO_ACTION", "action": approval.action[:200]},
+                    output={
+                        "reason": "NO_ACTION",
+                        "execution_kind": "no_action",
+                        "repair_executed": False,
+                        "action": (approval.action or "")[:200],
+                    },
+                )
+                await self._log_alert_execution_completed(
+                    approval,
+                    success=True,
+                    execution_kind="no_action",
+                    duration_ms=int((time.time() - _aol_started_ms) * 1000),
+                    output={
+                        "reason": "NO_ACTION",
+                        "repair_executed": False,
+                    },
                )
                # F2 (2026-05-07 ogt + Claude Sonnet 4.6 + Codex):
                # NO_ACTION 路徑要把 incident 推到 RESOLVED，否則 incident 永遠卡
@@ -336,6 +349,13 @@ class ApprovalExecutionService:
                duration_ms=int((time.time() - _aol_started_ms) * 1000),
                error=f"parse_fail: {approval.action[:300]}",
            )
+            await self._log_alert_execution_completed(
+                approval,
+                success=False,
+                execution_kind="parse_failed",
+                duration_ms=int((time.time() - _aol_started_ms) * 1000),
+                error_message=f"Could not parse operation type from action: {approval.action[:150]}",
+            )
            return False  # 解析失敗 → 執行未發生

        executor = get_executor()
@@ -553,6 +573,20 @@ class ApprovalExecutionService:
                    "total_attempts": total_attempts,
                },
            )
+            await self._log_alert_execution_completed(
+                approval,
+                success=True,
+                execution_kind=operation_type.value,
+                duration_ms=int((time.time() - _aol_started_ms) * 1000),
+                output={
+                    "operation_type": operation_type.value,
+                    "resource_name": resource_name,
+                    "namespace": namespace,
+                    "executor_duration_ms": result.duration_ms,
+                    "total_attempts": total_attempts,
+                    "repair_executed": True,
+                },
+            )
            return True  # K8s 執行成功

        else:
@@ -654,6 +688,22 @@ class ApprovalExecutionService:
                error=result.error,
                stderr=result.error,  # E6 stderr 回灌 — 給 retry/Playbook 負向強化用
            )
+            await self._log_alert_execution_completed(
+                approval,
+                success=False,
+                execution_kind=operation_type.value,
+                duration_ms=int((time.time() - _aol_started_ms) * 1000),
+                output={
+                    "operation_type": operation_type.value,
+                    "resource_name": resource_name,
+                    "namespace": namespace,
+                    "executor_duration_ms": result.duration_ms,
+                    "total_attempts": total_attempts,
+                    "repair_attempted": True,
+                    "repair_executed": False,
+                },
+                error_message=result.error,
+            )
            return False  # K8s 執行失敗

    async def _execute_ssh_host_action(
@@ -919,7 +969,14 @@ class ApprovalExecutionService:
            except Exception:
                pass

-            if success:
+            no_action = success and is_no_action_approval_action(approval.action)
+            if no_action:
+                text = (
+                    f"ℹ️ <b>已記錄觀察，未執行修復</b>\n"
+                    f"<code>{(approval.action or '')[:180]}</code>"
+                    f"{km_info}"
+                )
+            elif success:
                text = (
                    f"✅ <b>執行成功</b>\n"
                    f"<code>{(approval.action or '')[:180]}</code>"
@@ -948,8 +1005,34 @@ class ApprovalExecutionService:
                incident_id=approval.incident_id,
                approval_id=str(approval.id),
                success=success,
+                no_action=no_action,
                orig_msg_id=orig_msg_id,
            )
+            try:
+                from src.repositories.alert_operation_log_repository import (
+                    get_alert_operation_log_repository,
+                )
+
+                await get_alert_operation_log_repository().append(
+                    "TELEGRAM_RESULT_SENT",
+                    incident_id=approval.incident_id,
+                    approval_id=str(approval.id),
+                    actor="approval_execution",
+                    action_detail="telegram_execution_result_sent",
+                    success=success,
+                    error_message=error,
+                    context={
+                        "reply_to_message_id": orig_msg_id,
+                        "execution_kind": "no_action" if no_action else "execution",
+                        "repair_executed": not no_action and success,
+                    },
+                )
+            except Exception as _log_e:
+                logger.warning(
+                    "alert_op_telegram_result_write_failed",
+                    approval_id=str(approval.id),
+                    error=str(_log_e),
+                )
        except Exception as e:
            logger.warning(
                "push_execution_result_failed",
@@ -1592,6 +1675,85 @@ class ApprovalExecutionService:
    # 22 筆 notification_formatted。修復後每次執行都留痕。
    # =========================================================================

+    async def _log_alert_execution_started(
+        self,
+        approval: ApprovalRequest,
+        *,
+        aol_op_id: str | None,
+    ) -> None:
+        """Record an ``EXECUTION_STARTED`` event in the alert operation log.
+
+        Best effort: any failure while writing is logged as a warning and
+        swallowed, so the caller is never interrupted by this logging step.
+
+        Args:
+            approval: Approval whose action is about to be executed.
+            aol_op_id: Value stored in the event context under
+                ``automation_operation_id``; may be ``None``.
+        """
+        try:
+            from src.repositories.alert_operation_log_repository import (
+                get_alert_operation_log_repository,
+            )
+
+            repository = get_alert_operation_log_repository()
+            if is_no_action_approval_action(approval.action):
+                kind = "no_action"
+            else:
+                kind = "executable"
+            # Nothing has run yet at this point, so both repair flags are False.
+            start_context = {
+                "action": (approval.action or "")[:500],
+                "automation_operation_id": aol_op_id,
+                "execution_kind": kind,
+                "repair_attempted": False,
+                "repair_executed": False,
+            }
+            await repository.append(
+                "EXECUTION_STARTED",
+                incident_id=approval.incident_id,
+                approval_id=str(approval.id),
+                actor="approval_execution",
+                action_detail="approval_execution_started",
+                success=None,
+                context=start_context,
+            )
+        except Exception as exc:
+            logger.warning(
+                "alert_op_execution_started_write_failed",
+                approval_id=str(approval.id),
+                incident_id=approval.incident_id,
+                error=str(exc),
+            )
+
+    async def _log_alert_execution_completed(
+        self,
+        approval: ApprovalRequest,
+        *,
+        success: bool,
+        execution_kind: str,
+        duration_ms: int,
+        output: dict | None = None,
+        error_message: str | None = None,
+    ) -> None:
+        """Record an ``EXECUTION_COMPLETED`` event in the alert operation log.
+
+        Best effort: any failure while writing is logged as a warning and
+        swallowed, so the caller is never interrupted by this logging step.
+
+        Args:
+            approval: Approval whose action finished (or was skipped).
+            success: Outcome flag stored on the event.
+            execution_kind: Label stored in the context and appended to the
+                ``action_detail`` value (``approval_execution_<kind>``).
+            duration_ms: Elapsed time stored in the context.
+            output: Optional extra key/values merged into the context.
+            error_message: Optional error text; stored truncated to 2000
+                characters, or ``None`` when empty.
+        """
+        try:
+            from src.repositories.alert_operation_log_repository import (
+                get_alert_operation_log_repository,
+            )
+
+            # Caller-supplied details are merged first so they can never
+            # overwrite the canonical action/duration/kind fields below.
+            context = {
+                **(output or {}),
+                "action": (approval.action or "")[:500],
+                "duration_ms": duration_ms,
+                "execution_kind": execution_kind,
+            }
+            await get_alert_operation_log_repository().append(
+                "EXECUTION_COMPLETED",
+                incident_id=approval.incident_id,
+                approval_id=str(approval.id),
+                actor="approval_execution",
+                action_detail=f"approval_execution_{execution_kind}",
+                success=success,
+                # str() first: a non-string error (e.g. an exception object)
+                # cannot be sliced and would otherwise drop the whole event.
+                error_message=str(error_message)[:2000] if error_message else None,
+                context=context,
+            )
+        except Exception as e:
+            logger.warning(
+                "alert_op_execution_completed_write_failed",
+                approval_id=str(approval.id),
+                incident_id=approval.incident_id,
+                error=str(e),
+            )
+
    async def _log_aol_started(self, approval: ApprovalRequest) -> str | None:
        """
        在 automation_operation_log 寫一筆 'pending' 紀錄,回傳 op_id 供 _log_aol_completed 更新。
--- a/apps/api/src/services/heartbeat_report_service.py
+++ b/apps/api/src/services/heartbeat_report_service.py
@@ -531,7 +531,9 @@ class HeartbeatReportService:
                        SELECT
                            *,
                            (
-                                btrim(coalesce(action, '')) = ''
+                                COALESCE(extra_metadata->>'execution_kind', '') = 'no_action'
+                                OR COALESCE(extra_metadata->>'repair_executed', '') = 'false'
+                                OR btrim(coalesce(action, '')) = ''
                                OR UPPER(action) LIKE 'OBSERVE%'
                                OR UPPER(action) LIKE 'INVESTIGATE%'
                                OR UPPER(action) LIKE 'NO_ACTION%'
@@ -556,9 +558,18 @@ class HeartbeatReportService:
                            WHERE UPPER(status::text) = 'PENDING'
                              AND telegram_message_id IS NULL
                        ) AS pending_without_telegram,
-                        COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success,
-                        COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed,
-                        COUNT(*) FILTER (WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED')) AS auto_resolved
+                        COUNT(*) FILTER (
+                            WHERE UPPER(status::text) = 'EXECUTION_SUCCESS'
+                              AND NOT is_observe_only
+                        ) AS success,
+                        COUNT(*) FILTER (
+                            WHERE UPPER(status::text) = 'EXECUTION_FAILED'
+                              AND NOT is_observe_only
+                        ) AS failed,
+                        COUNT(*) FILTER (
+                            WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED')
+                              AND NOT is_observe_only
+                        ) AS auto_resolved
                    FROM scoped
                """))
                row = r.one()
--- a/apps/api/src/services/report_generation_service.py
+++ b/apps/api/src/services/report_generation_service.py
@@ -232,11 +232,32 @@ class ReportGenerationService:
        async with get_db_context() as db:
            row = await db.execute(
                text("""
+                    WITH scoped AS (
+                        SELECT
+                            *,
+                            (
+                                COALESCE(extra_metadata->>'execution_kind', '') = 'no_action'
+                                OR COALESCE(extra_metadata->>'repair_executed', '') = 'false'
+                                OR btrim(coalesce(action, '')) = ''
+                                OR UPPER(action) LIKE 'OBSERVE%'
+                                OR UPPER(action) LIKE 'INVESTIGATE%'
+                                OR UPPER(action) LIKE 'NO_ACTION%'
+                                OR UPPER(action) LIKE '% NO_ACTION%'
+                                OR UPPER(action) LIKE '%| NO_ACTION%'
+                            ) AS is_observe_only
+                        FROM approval_records
+                        WHERE created_at >= :since
+                    )
                    SELECT
-                        COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success,
-                        COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED')  AS failed
-                    FROM approval_records
-                    WHERE created_at >= :since
+                        COUNT(*) FILTER (
+                            WHERE UPPER(status::text) = 'EXECUTION_SUCCESS'
+                              AND NOT is_observe_only
+                        ) AS success,
+                        COUNT(*) FILTER (
+                            WHERE UPPER(status::text) = 'EXECUTION_FAILED'
+                              AND NOT is_observe_only
+                        ) AS failed
+                    FROM scoped
                """),
                {"since": since},
            )
@@ -460,6 +481,7 @@ class ReportGenerationService:
        # 失敗時發送告警到 SRE 群組，避免靜默吞掉錯誤
        import asyncio as _asyncio
        report_text = self.format_postmortem(data)
+        await self._persist_postmortem_km(data, report_text)
        from src.services.telegram_gateway import get_telegram_gateway
        gateway = get_telegram_gateway()

@@ -510,6 +532,72 @@ class ReportGenerationService:
                error=str(_fe),
            )

+    async def _persist_postmortem_km(
+        self,
+        data: PostmortemData,
+        report_text: str,
+    ) -> None:
+        """Persist the generated postmortem as a KM entry and log ``KM_CONVERTED``.
+
+        Best effort: any exception is logged as a warning and swallowed, so
+        the caller is never interrupted by a persistence failure.
+
+        NOTE(review): this method calls ``repo.create`` unconditionally; no
+        de-duplication by incident is visible here. Confirm whether the
+        repository makes repeated calls idempotent.
+
+        Args:
+            data: Postmortem data; ``incident_id``, ``title``,
+                ``auto_repaired`` and ``duration_minutes`` are read.
+            report_text: Formatted report stored as the entry content.
+        """
+        try:
+            from src.db.base import get_db_context
+            from src.models.knowledge import (
+                EntrySource,
+                EntryStatus,
+                EntryType,
+                KnowledgeEntryCreate,
+            )
+            from src.repositories.alert_operation_log_repository import (
+                get_alert_operation_log_repository,
+            )
+            from src.repositories.knowledge_repository import KnowledgeDBRepository
+
+            async with get_db_context() as db:
+                repo = KnowledgeDBRepository(db)
+                entry = await repo.create(
+                    KnowledgeEntryCreate(
+                        # The combined title is cut to 255 characters.
+                        title=f"Postmortem {data.incident_id}: {data.title}"[:255],
+                        content=report_text,
+                        entry_type=EntryType.POSTMORTEM,
+                        category="postmortem",
+                        tags=[
+                            "postmortem",
+                            "incident",
+                            "telegram",
+                            "auto_repaired" if data.auto_repaired else "human_intervention",
+                        ],
+                        source=EntrySource.AI_EXTRACTED,
+                        status=EntryStatus.REVIEW,
+                        related_incident_id=data.incident_id,
+                        path_type="postmortem",
+                        created_by="report_generation_service",
+                    )
+                )
+
+            # NOTE(review): ``entry`` is read after the DB context has exited;
+            # this assumes the returned object stays readable then (e.g. it is
+            # not an expired ORM instance) — confirm against the repository.
+            await get_alert_operation_log_repository().append(
+                "KM_CONVERTED",
+                incident_id=data.incident_id,
+                actor="report_generation_service",
+                action_detail="postmortem_persisted",
+                success=True,
+                context={
+                    "knowledge_entry_id": entry.id,
+                    "entry_type": EntryType.POSTMORTEM.value,
+                    "path_type": "postmortem",
+                    "duration_minutes": round(data.duration_minutes, 2),
+                },
+            )
+            logger.info(
+                "postmortem_km_persisted",
+                incident_id=data.incident_id,
+                knowledge_entry_id=entry.id,
+            )
+        except Exception as e:
+            # Covers both the KM write and the operation-log append above.
+            logger.warning(
+                "postmortem_km_persist_failed",
+                incident_id=data.incident_id,
+                error=str(e),
+            )
+

 # =============================================================================
 # 日度報告排程迴圈
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -8377,9 +8377,7 @@ class TelegramGateway:
        if action == "approve":
            status_emoji = "✅"
            status_text = f"<b>已批准</b> by {_html.escape(username)}"
-            # 2026-04-14 Claude Sonnet 4.6: 原「等待執行」誤導（實際沒有 gate 會卡住路徑）
-            # 批准後一律顯示「執行中」，真實結果由 _push_execution_result_to_alert reply 補上
-            suffix = "⚡ 執行中..."
+            suffix = "⚡ 執行中..." if execution_triggered else "已簽核，等待更多簽核"
        else:
            status_emoji = "❌"
            status_text = f"<b>已拒絕</b> by {_html.escape(username)}"
@@ -8495,7 +8493,7 @@ class TelegramGateway:
                    # 2026-04-22 Claude Sonnet 4.6: 只有真正轉為 APPROVED 才發「執行中...」
                    # 非 PENDING 狀態下 sign_approval early-return → approval 是舊 record
                    # 此時不應發「執行中...」，應告知用戶告警已處理過
-                    if approval.status == ApprovalStatus.APPROVED:
+                    if approval.status == ApprovalStatus.APPROVED and execution_triggered:
                        # 2026-04-09 Claude Sonnet 4.6: 回應 Telegram — 更新訊息狀態 + answer callback
                        await self._notify_approval_result(
                            message_id=message_id,
@@ -8520,7 +8518,7 @@ class TelegramGateway:
                    # 原本 gate 用 execution_triggered，race condition 時失效（樂觀鎖失敗）
                    # 改用 approval.status == APPROVED（與 REST API 路徑 approvals.py:360 對齊）
                    # 用 Redis lock exec:{approval_id} 防重入（REST + Telegram 同時簽核）
-                    if approval.status == ApprovalStatus.APPROVED:
+                    if approval.status == ApprovalStatus.APPROVED and execution_triggered:
                        import asyncio

                        from src.core.redis_client import get_redis
--- a/apps/api/tests/test_alert_rule_engine_validation.py
+++ b/apps/api/tests/test_alert_rule_engine_validation.py
@@ -18,7 +18,7 @@ Task 2.3: validate_kubectl_command() 白名單驗證

 import pytest

-from src.services.alert_rule_engine import validate_kubectl_command
+from src.services.alert_rule_engine import match_rule, validate_kubectl_command


 # =============================================================================
@@ -76,6 +76,49 @@ class TestValidKubectlCommands:
        assert validate_kubectl_command(cmd) is False


+class TestRuleMatchingSpecificity:
+    """Rules with explicit alertnames must not be hit via loose message keywords."""
+
+    def test_host_storage_alert_does_not_match_minio_disk_rule(self):
+        """A host storage alert must not resolve to the ``minio_disk_high`` rule."""
+        ctx = {
+            "alert_type": "host",
+            "severity": "critical",
+            "source": "prometheus",
+            "target_resource": "dirty-reboot-evidence",
+            "namespace": "awoooi-prod",
+            "message": "HostPreviousBootStorageErrorsDetected storage dirty reboot evidence",
+            "labels": {
+                "alertname": "HostPreviousBootStorageErrorsDetected",
+                "instance": "192.168.0.110:9100",
+            },
+        }
+
+        result = match_rule(ctx)
+
+        assert result is not None
+        assert result["rule_id"] != "minio_disk_high"
+        # ``.get(key, "")`` only covers a missing key; if the key is present
+        # with a None value the ``in`` check would raise TypeError instead of
+        # giving a clean assertion result, so fall back to "" explicitly.
+        assert "/data/minio" not in (result.get("kubectl_command") or "")
+
+    def test_exact_minio_disk_alert_still_matches_minio_rule(self):
+        """The exact MinIO alertname must still select ``minio_disk_high``."""
+        ctx = {
+            "alert_type": "storage",
+            "severity": "critical",
+            "source": "prometheus",
+            "target_resource": "minio",
+            "namespace": "awoooi-prod",
+            "message": "MinIO disk usage high",
+            "labels": {
+                "alertname": "MinioDiskUsageHigh",
+                "instance": "192.168.0.110:9000",
+            },
+        }
+
+        result = match_rule(ctx)
+
+        assert result is not None
+        assert result["rule_id"] == "minio_disk_high"
+
+
 # =============================================================================
 # 阻擋案例（應返回 False）
 # =============================================================================
--- a/apps/api/tests/test_approval_execution_no_action.py
+++ b/apps/api/tests/test_approval_execution_no_action.py
@@ -1,5 +1,4 @@
 from types import SimpleNamespace
-
 from unittest.mock import AsyncMock

 import pytest
@@ -16,14 +15,17 @@ async def test_no_action_execution_resolves_incident_once(monkeypatch):
        incident_id="INC-TEST-001",
    )
    incident_service = SimpleNamespace(resolve_incident=AsyncMock())
+    update_execution_status = AsyncMock()
+    timeline_add_event = AsyncMock()
+    alert_completed = AsyncMock(return_value=None)

    monkeypatch.setattr(
        "src.services.approval_execution.get_approval_service",
-        lambda: SimpleNamespace(update_execution_status=AsyncMock()),
+        lambda: SimpleNamespace(update_execution_status=update_execution_status),
    )
    monkeypatch.setattr(
        "src.services.approval_execution.get_timeline_service",
-        lambda: SimpleNamespace(add_event=AsyncMock()),
+        lambda: SimpleNamespace(add_event=timeline_add_event),
    )
    monkeypatch.setattr(
        "src.services.approval_execution.parse_operation_from_action",
@@ -43,12 +45,28 @@ async def test_no_action_execution_resolves_incident_once(monkeypatch):
        "src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
        AsyncMock(return_value=None),
    )
+    monkeypatch.setattr(
+        "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_started",
+        AsyncMock(return_value=None),
+    )
+    monkeypatch.setattr(
+        "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_completed",
+        alert_completed,
+    )

    # Act
    result = await ApprovalExecutionService().execute_approved_action(approval)

    # Assert
    assert result is True
+    update_execution_status.assert_awaited_once_with(
+        approval.id,
+        success=True,
+        execution_kind="no_action",
+    )
+    assert "未執行修復" in timeline_add_event.await_args.kwargs["title"]
+    assert alert_completed.await_args.kwargs["execution_kind"] == "no_action"
+    assert alert_completed.await_args.kwargs["output"]["repair_executed"] is False
    incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-001")


@@ -67,10 +85,11 @@ async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch)
    incident_service = SimpleNamespace(
        resolve_incident=AsyncMock(side_effect=RuntimeError("redis down"))
    )
+    update_execution_status = AsyncMock()

    monkeypatch.setattr(
        "src.services.approval_execution.get_approval_service",
-        lambda: SimpleNamespace(update_execution_status=AsyncMock()),
+        lambda: SimpleNamespace(update_execution_status=update_execution_status),
    )
    monkeypatch.setattr(
        "src.services.approval_execution.get_timeline_service",
@@ -94,8 +113,21 @@ async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch)
        "src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
        AsyncMock(return_value=None),
    )
+    monkeypatch.setattr(
+        "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_started",
+        AsyncMock(return_value=None),
+    )
+    monkeypatch.setattr(
+        "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_completed",
+        AsyncMock(return_value=None),
+    )

    result = await ApprovalExecutionService().execute_approved_action(approval)

    assert result is True
+    update_execution_status.assert_awaited_once_with(
+        approval.id,
+        success=True,
+        execution_kind="no_action",
+    )
    incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-002")
--- a/apps/api/tests/test_gap_a4_placeholder_resolution.py
+++ b/apps/api/tests/test_gap_a4_placeholder_resolution.py
@@ -181,7 +181,7 @@ class TestMatchRuleRejection:
    """垃圾 target 時 kubectl_command 必須被清空（降級 LLM）"""

    def test_bad_target_discards_kubectl_command(self):
-        """真實 bug：HostHighCpuLoad target=unknown → kubectl_command 應清空"""
+        """HostHighCpuLoad target=unknown → 不得組裝成壞 kubectl target。"""
        ctx = {
            "alert_type": "high_cpu",
            "severity": "warning",
@@ -192,10 +192,12 @@ class TestMatchRuleRejection:
            "labels": {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"},
        }
        result = match_rule(ctx)
-        # 規則可能匹配（host_high_cpu）但 kubectl_command 必為空
+        # 規則可能匹配 host SSH 診斷；但不能把 HostHighCpuLoad 當成 K8s target。
        if result is not None:
-            assert result["kubectl_command"] == "", \
-                f"bad target 應導致 kubectl_command 清空, got: {result['kubectl_command']!r}"
+            command = result["kubectl_command"]
+            assert command == "" or command.startswith("ssh "), \
+                f"bad target 不應組裝 kubectl 指令, got: {command!r}"
+            assert "deployment/HostHighCpuLoad" not in command

    def test_good_target_preserves_kubectl_command(self):
        """真實 deployment 名稱時，kubectl_command 正常組裝"""
--- a/apps/api/tests/test_report_generation_service.py
+++ b/apps/api/tests/test_report_generation_service.py
@@ -17,7 +17,9 @@ ADR-076 Task 4: 自動報告生成
 建立: 2026-04-14 (台北時區) Claude Haiku 4.5
 """

+from contextlib import asynccontextmanager
 from datetime import datetime, timedelta, timezone
+from types import SimpleNamespace

 import pytest

@@ -274,6 +276,71 @@ class TestFormatPostmortem:
        assert "台北時間" in report


+class TestTriggerPostmortemPersistence:  # covers ReportGenerationService.trigger_postmortem side effects
+    """Postmortem 產出必須同步沉澱到 KM。"""  # EN: producing a postmortem must also persist it to KM (knowledge entries)
+
+    @pytest.mark.asyncio
+    async def test_trigger_postmortem_persists_km_before_telegram_send(self, monkeypatch):  # NOTE(review): name says "before_telegram_send" but nothing below asserts ordering
+        now = datetime.now(_TZ_TAIPEI)
+        created = now - timedelta(minutes=16)  # passed as created_at; `now` is passed as resolved_at
+        sent_messages: list[str] = []  # texts received by FakeGateway.send_to_group
+        created_entries: list[object] = []  # payloads received by FakeKnowledgeRepo.create
+        op_logs: list[dict] = []  # rows received by FakeAlertOpRepo.append
+
+        class FakeGateway:  # replaces the Telegram gateway; records instead of sending
+            async def send_to_group(self, text: str, parse_mode: str = "HTML") -> None:
+                sent_messages.append(text)
+
+        class FakeKnowledgeRepo:  # replaces KnowledgeDBRepository
+            def __init__(self, _db) -> None:  # accepts one positional argument and ignores it
+                pass
+
+            async def create(self, data):
+                created_entries.append(data)
+                return SimpleNamespace(id="km-postmortem-1")  # fake created row exposing only `.id`
+
+        class FakeAlertOpRepo:  # replaces the alert operation log repository
+            async def append(self, event_type: str, **kwargs):
+                op_logs.append({"event_type": event_type, **kwargs})  # flatten so assertions can index by key
+
+        @asynccontextmanager
+        async def fake_db_context():  # yields an empty namespace in place of whatever get_db_context yields
+            yield SimpleNamespace()
+
+        monkeypatch.setattr(  # NOTE(review): patches the defining module; only effective if the service resolves the name at call time — confirm
+            "src.services.telegram_gateway.get_telegram_gateway",
+            lambda: FakeGateway(),
+        )
+        monkeypatch.setattr("src.db.base.get_db_context", fake_db_context)
+        monkeypatch.setattr(
+            "src.repositories.knowledge_repository.KnowledgeDBRepository",
+            FakeKnowledgeRepo,
+        )
+        monkeypatch.setattr(
+            "src.repositories.alert_operation_log_repository.get_alert_operation_log_repository",
+            lambda: FakeAlertOpRepo(),
+        )
+
+        await ReportGenerationService().trigger_postmortem(
+            incident_id="INC-20260531-POST",
+            title="DockerContainerUnhealthy bitan-pharmacy",
+            created_at=created,
+            resolved_at=now,
+            root_cause="容器健康檢查失敗",
+            resolution_action="OBSERVE",
+            auto_repaired=False,
+        )
+
+        assert sent_messages  # at least one message went to the fake gateway
+        assert created_entries  # at least one entry went to the fake knowledge repo
+        entry = created_entries[0]
+        assert entry.entry_type.value == "postmortem"  # `.value` access implies entry_type is enum-like
+        assert entry.related_incident_id == "INC-20260531-POST"
+        assert entry.path_type == "postmortem"
+        assert op_logs[0]["event_type"] == "KM_CONVERTED"  # raises IndexError (not AssertionError) if nothing was logged
+        assert op_logs[0]["action_detail"] == "postmortem_persisted"
+
+
 # =============================================================================
 # _seconds_until_next_report
 # =============================================================================
--- a/apps/api/tests/test_telegram_webhook_execution_handoff.py
+++ b/apps/api/tests/test_telegram_webhook_execution_handoff.py
@@ -15,12 +15,18 @@ class _FakeGateway:


 class _FakeApprovalService:
-    def __init__(self, approval, execution_triggered: bool) -> None:
+    def __init__(
+        self,
+        approval,
+        execution_triggered: bool,
+        sign_message: str = "Approval complete",
+    ) -> None:
        self.approval = approval
        self.execution_triggered = execution_triggered
+        self.sign_message = sign_message

    async def sign_approval(self, **_kwargs):
-        return self.approval, "Approval complete", self.execution_triggered
+        return self.approval, self.sign_message, self.execution_triggered

    async def reject_approval(self, **_kwargs):
        return self.approval, "Approval rejected"
@@ -100,6 +106,59 @@ async def test_telegram_approval_schedules_executor_after_required_signature(mon
    assert op_log_repo.rows[0]["kwargs"]["action_detail"] == "approve"


+@pytest.mark.asyncio
+async def test_telegram_approval_duplicate_does_not_schedule_executor(monkeypatch):  # duplicate approve must return early without finalizing
+    approval_id = "33333333-3333-3333-3333-333333333333"
+    approval = SimpleNamespace(
+        id=UUID(approval_id),
+        status=SimpleNamespace(value="execution_success"),  # mimics an enum-like status exposing `.value`
+        incident_id="INC-20260531-DUPE",
+    )
+    finalizer_calls: list[dict] = []  # stays empty unless the patched finalizer is invoked
+    op_log_repo = _FakeAlertOperationLogRepository()
+
+    async def fake_finalize(*, approval, execution_triggered: bool) -> bool:  # spy replacing _finalize_telegram_approval
+        finalizer_calls.append({
+            "approval_id": str(approval.id),
+            "execution_triggered": execution_triggered,
+        })
+        return True
+
+    monkeypatch.setattr(
+        telegram_api,
+        "get_telegram_gateway",
+        lambda: _FakeGateway({  # presumably the parsed callback result the gateway returns — _FakeGateway is defined outside this hunk
+            "success": True,
+            "action": "approve",
+            "approval_id": approval_id,
+            "user": {"id": 42, "username": "ops"},
+        }),
+    )
+    monkeypatch.setattr(
+        telegram_api,
+        "get_approval_service",
+        lambda: _FakeApprovalService(
+            approval,
+            execution_triggered=False,
+            sign_message="Cannot sign: status is execution_success",  # contains "Cannot sign", which the webhook treats as already processed
+        ),
+    )
+    monkeypatch.setattr(telegram_api, "_finalize_telegram_approval", fake_finalize)
+    monkeypatch.setattr(
+        "src.repositories.alert_operation_log_repository.get_alert_operation_log_repository",
+        lambda: op_log_repo,
+    )
+
+    result = await telegram_api.telegram_webhook(_callback_update(f"approve:{approval_id}:ts:nonce"))
+
+    assert result["ok"] is True
+    assert result["message"] == "Already processed"  # early-return payload of the duplicate branch
+    assert result["execution_triggered"] is False
+    assert result["execution_scheduled"] is False
+    assert finalizer_calls == []  # finalizer was never reached, so nothing was scheduled through it
+    assert op_log_repo.rows[0]["kwargs"]["action_detail"] == "approve_duplicate"  # first op-log row is tagged as a duplicate approve
+
+
 @pytest.mark.asyncio
 async def test_telegram_rejection_syncs_incident_state(monkeypatch):
    approval_id = "22222222-2222-2222-2222-222222222222"
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,39 @@
+## 2026-05-31｜Telegram 告警執行語意與 DB 稽核完整性修復
+
+**背景**：
+
+- Production 查核 `INC-20260530-88D960` / `INC-20260531-88394F` 發現 Telegram 顯示「已批准、執行中、執行成功」，但實際分別是 MinIO SSH 診斷與 `OBSERVE`，不是建議中的修復動作。
+- `approval_records.status=execution_success` 無法區分「真的執行修復」與「純觀察/NO_ACTION terminal」；`alert_operation_log` 缺人工 approval execution 的 start/end，Postmortem 只送 Telegram 未沉澱 KM。
+- `alert_rule_engine` 允許具名規則只靠 message keyword 命中，導致主機 storage 類告警可能誤配到 `minio_disk_high`。
+
+**本次調整**：
+
+- 新增 `approval_action_classifier.is_no_action_approval_action()`，集中判斷 `OBSERVE` / `INVESTIGATE` / `NO_ACTION`。
+- NO_ACTION terminal 仍會關閉 approval，但 `extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`；Telegram result 改為「已記錄觀察，未執行修復」。
+- `ApprovalExecutionService` 同步寫 `alert_operation_log`：`EXECUTION_STARTED`、`EXECUTION_COMPLETED`、`TELEGRAM_RESULT_SENT`。
+- Telegram webhook duplicate approval 不再 finalize / schedule executor；long polling 只有真正 `execution_triggered` 才顯示「執行中」。
+- Postmortem 產出時同步 idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。
+- Heartbeat 與日報修復統計排除 observe-only/no-action，避免污染 success rate。
+- `alert_rule_engine._matches()` 收緊具名 alertname 規則，避免 Host storage 類告警靠 `storage` keyword 誤配 MinIO。
+
+**Verification**：
+
+```text
+python3 -m py_compile approval_action_classifier.py approval_execution.py approval_db.py telegram.py telegram_gateway.py alert_rule_engine.py report_generation_service.py heartbeat_report_service.py
+  -> pass
+pytest test_approval_execution_no_action.py test_telegram_webhook_execution_handoff.py -q
+  -> 6 passed
+pytest test_alert_rule_engine_validation.py test_report_generation_service.py -q
+  -> 67 passed
+pytest test_heartbeat_ollama_endpoints.py test_heartbeat_pod_state_machine.py test_gap_a4_placeholder_resolution.py -q
+  -> 49 passed
+```
+
+**判讀 / 下一步**：
+
+- 本輪修復新流量的語意與稽核完整性，不補跑舊 incident 的修復動作。
+- 舊 incident 若已是 `execution_success` 但沒有 `extra_metadata.execution_kind`，仍需透過 `automation_operation_log` / `alert_operation_log` 交叉判讀。
+
 ## 2026-05-31｜Legacy HITL PENDING 前台可見性與心跳拆分

 **背景**：
--- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
+++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
@@ -2671,6 +2671,12 @@ Phase 6 完成後
 - Verification：API py_compile pass；targeted ruff for new test pass；`pnpm --filter @awoooi/shared-types generate` pass；`test_approval_pending_visibility.py` 4 passed；`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` 15 passed；`git diff --check` pass。
 - 判讀：T153 不批次 approve/reject 生產 PENDING，也不把觀察卡刪掉；它把「前台看得到 legacy HITL 事實」與「告警只針對真正人工 actionable backlog」補齊。舊 fallback kubectl / SSH action 仍需 operator 在 `/awooop/approvals` 逐筆決策；OBSERVE / NO_ACTION 類不再偽裝成 emergency manual backlog。下一段可追 LLM failure fallback 為何大量產生 `OBSERVE / medium` 卡片，但需避免破壞 agent 後續把 PENDING 更新成可執行 action 的路徑。

+**T154 Telegram approval truth + execution audit integrity（2026-05-31 台北）**：
+- 觸發：Telegram 上出現「此告警已處理」後仍接著顯示「已批准、執行中」，且 `INC-20260530-88D960` / `INC-20260531-88394F` 的 production 查核顯示 `approval_records.status=execution_success`，但前者實際只跑 MinIO SSH 診斷、後者只是 `OBSERVE`；`auto_repair_executions=0`，`alert_operation_log` 缺 execution start/end，Postmortem 只送 Telegram 未落 KM。這會讓 operator 誤以為修復已完成。
+- 修正：集中 `is_no_action_approval_action()`，讓 `OBSERVE` / `INVESTIGATE` / `NO_ACTION` terminal 時在 `approval_records.extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`，Telegram result 改為「已記錄觀察，未執行修復」，不再用「執行成功」。`ApprovalExecutionService` 現在同步寫 `alert_operation_log` 的 `EXECUTION_STARTED` / `EXECUTION_COMPLETED` / `TELEGRAM_RESULT_SENT`，並保留 `automation_operation_log`。Telegram webhook duplicate approval 不再 finalize/schedule executor；long polling 只有真正 `execution_triggered` 才顯示「執行中」。`ReportGenerationService` 會把 Postmortem idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。`HeartbeatReportService` / 日報修復統計排除 observe-only/no-action，不再污染 execution success rate。`alert_rule_engine._matches()` 收緊具名 alertname 規則，避免主機 storage 告警靠 message keyword 誤配 `minio_disk_high`。
+- Verification：`py_compile` pass；`test_approval_execution_no_action.py` + `test_telegram_webhook_execution_handoff.py` 6 passed；`test_alert_rule_engine_validation.py` + `test_report_generation_service.py` 67 passed；`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` + `test_gap_a4_placeholder_resolution.py` 49 passed after aligning host SSH diagnostic assertion。
+- 判讀：T154 修的是「Telegram / DB / 前台統計的 truthfulness」，不是補跑舊 incident 的修復。舊資料中 status 已是 `execution_success` 的 OBSERVE 沒有新 metadata（`extra_metadata.execution_kind`），無法單靠 status 精確分辨，仍需透過 `automation_operation_log` / `alert_operation_log` 交叉判讀；部署後新 approval 會留下 immutable execution start/end 與 no-action 語意，operator 不應再把 OBSERVE 視為完成修復。
+
 **T152 Ansible runtime readiness surfaced（2026-05-24 台北）**：
 - 觸發：T151 已讓首頁看到 execution backend / Ansible attribution，但 operator 仍看不到 runtime 端缺什麼，容易把「Ansible 有候選」誤解成「Ansible 已能自動修復」。
 - 修正：API image 複製 `infra/ansible/` 作 read-only catalog；`truth-chain/quality/summary` 新增 `ansible_runtime`，回報 playbook binary、catalog、inventory、playbook_count、can_run_check_mode、blockers。首頁 execution evidence 同步顯示 runtime 狀態；目前 production 顯示 `runtime 未就緒：ansible_playbook_binary_missing`。未安裝 `ansible-core`、未啟用 check-mode / apply。