diff --git a/apps/api/src/api/v1/telegram.py b/apps/api/src/api/v1/telegram.py
index 83236fbe..0b0d3a27 100644
--- a/apps/api/src/api/v1/telegram.py
+++ b/apps/api/src/api/v1/telegram.py
@@ -275,6 +275,29 @@ async def telegram_webhook(
)
if approval:
+ status_value = approval.status.value if hasattr(approval.status, "value") else str(approval.status)
+ if (
+ "Cannot sign" in msg
+ or "already signed" in msg
+ or "Concurrent modification" in msg
+ ):
+ logger.info(
+ "telegram_approval_ignored_already_processed",
+ approval_id=approval_id,
+ user_id=user_id,
+ status=status_value,
+ message=msg,
+ )
+ await _log_user_action("approve_duplicate", False, getattr(approval, "incident_id", None))
+ return {
+ "ok": True,
+ "message": "Already processed",
+ "approval_id": approval_id,
+ "status": status_value,
+ "execution_triggered": False,
+ "execution_scheduled": False,
+ }
+
execution_scheduled = await _finalize_telegram_approval(
approval=approval,
execution_triggered=execution_triggered,
@@ -283,7 +306,7 @@ async def telegram_webhook(
"telegram_approval_signed",
approval_id=approval_id,
user_id=user_id,
- status=approval.status.value,
+ status=status_value,
execution_triggered=execution_triggered,
execution_scheduled=execution_scheduled,
)
@@ -291,9 +314,9 @@ async def telegram_webhook(
return {
"ok": True,
- "message": "Approved",
+ "message": "Approved" if execution_triggered else "Signed",
"approval_id": approval_id,
- "status": approval.status.value,
+ "status": status_value,
"execution_triggered": execution_triggered,
"execution_scheduled": execution_scheduled,
}
diff --git a/apps/api/src/services/alert_rule_engine.py b/apps/api/src/services/alert_rule_engine.py
index b96bb645..4ee6f2a0 100644
--- a/apps/api/src/services/alert_rule_engine.py
+++ b/apps/api/src/services/alert_rule_engine.py
@@ -298,6 +298,12 @@ def _matches(rule: dict, alertname: str, alert_type: str, message: str, instance
if alertnames and alertname in alertnames:
return True
+ # 2026-05-31 ogt + Codex: 有明確 alertname 的規則不得只靠寬鬆 message
+ # keyword 命中,否則 HostPreviousBootStorageErrorsDetected 這類主機 storage
+ # 告警會誤配到 minio_disk_high。
+ if alertnames and alertname and alertname != "custom":
+ return False
+
# alert_type 部分匹配
for kw in match.get("alert_type", []):
if kw.lower() in alert_type.lower():
diff --git a/apps/api/src/services/approval_action_classifier.py b/apps/api/src/services/approval_action_classifier.py
new file mode 100644
index 00000000..27b97335
--- /dev/null
+++ b/apps/api/src/services/approval_action_classifier.py
@@ -0,0 +1,26 @@
+"""
+Approval action classifier
+==========================
+
+2026-05-31 ogt + Codex: Telegram 告警鏈路一致性修復。
+將 OBSERVE / INVESTIGATE / NO_ACTION 這類「純觀察、未執行修復」的
+判斷集中,避免 execution、Telegram、統計各自用不同語意。
+"""
+
+from __future__ import annotations
+
+
+def is_no_action_approval_action(action: str | None) -> bool:
+ """Return True when the action is empty/None or only records observation (a NO_ACTION variant, the (未設) placeholder, or an OBSERVE/INVESTIGATE prefix) instead of a repair."""
+ text = (action or "").strip()
+ upper = text.upper()
+ if not text:
+ return True
+ return (
+ "NO_ACTION" in upper
+ or "NO-ACTION" in upper
+ or "NOACTION" in upper
+ or "(未設)" in text
+ or upper.startswith("OBSERVE")
+ or upper.startswith("INVESTIGATE")
+ )
diff --git a/apps/api/src/services/approval_db.py b/apps/api/src/services/approval_db.py
index 608d1121..46323f62 100644
--- a/apps/api/src/services/approval_db.py
+++ b/apps/api/src/services/approval_db.py
@@ -659,6 +659,7 @@ class ApprovalDBService:
approval_id: UUID,
success: bool,
error_message: str | None = None,
+ execution_kind: str | None = None,
) -> None:
"""
更新執行狀態
@@ -669,21 +670,36 @@ class ApprovalDBService:
"""
async with get_db_context() as db:
status = ApprovalStatus.EXECUTION_SUCCESS if success else ApprovalStatus.EXECUTION_FAILED
- values: dict = {"status": status}
+ result = await db.execute(
+ select(ApprovalRecord).where(ApprovalRecord.id == str(approval_id))
+ )
+ record = result.scalar_one_or_none()
+ if record is None:
+ logger.warning(
+ "approval_execution_status_update_missing",
+ id=str(approval_id),
+ success=success,
+ )
+ return
+
+ record.status = status
if not success and error_message:
# 截斷至合理長度,避免爆欄位
- values["rejection_reason"] = str(error_message)[:2000]
- await db.execute(
- update(ApprovalRecord)
- .where(ApprovalRecord.id == str(approval_id))
- .values(**values)
- )
+ record.rejection_reason = str(error_message)[:2000]
+ if execution_kind:
+ # 2026-05-31 ogt + Codex: OBSERVE/NO_ACTION 仍需 terminal 狀態,
+ # 但前台/報表必須能分辨「未執行修復」而非真正 execution success。
+ metadata = dict(record.extra_metadata or {})
+ metadata["execution_kind"] = execution_kind
+ metadata["repair_executed"] = execution_kind != "no_action"
+ record.extra_metadata = metadata
logger.info(
"approval_execution_status_updated",
id=str(approval_id),
success=success,
has_error=bool(error_message),
+ execution_kind=execution_kind,
)
async def update_incident_id(self, approval_id: UUID, incident_id: str) -> None:
diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py
index 9ca8c664..79a6b8ef 100644
--- a/apps/api/src/services/approval_execution.py
+++ b/apps/api/src/services/approval_execution.py
@@ -36,6 +36,7 @@ from src.db.base import get_db_context
from src.models.approval import ApprovalRequest
from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError
from src.plugins.mcp.interfaces import MCPToolResult
+from src.services.approval_action_classifier import is_no_action_approval_action
from src.services.approval_db import get_approval_service, get_timeline_service
from src.services.executor import ExecutionResult, OperationType, get_executor
from src.services.operation_parser import parse_operation_from_action
@@ -165,6 +166,7 @@ class ApprovalExecutionService:
# ADR-090 § 自動化動作回灌 (2026-04-19): 主流程開始即在 aol 留痕,
# 結束時 update。不依賴 fire-and-forget,確保 33 件/7d approval 全部可觀測。
_aol_op_id = await self._log_aol_started(approval)
+ await self._log_alert_execution_started(approval, aol_op_id=_aol_op_id)
_aol_started_ms = time.time()
service = get_approval_service()
@@ -228,15 +230,7 @@ class ApprovalExecutionService:
# 2026-04-19 ogt + Claude Opus 4.7: 區分 NO_ACTION vs 真解析失敗
# NO_ACTION 是 AI 刻意選的「純調查不破壞」,不該誤標 EXECUTION_FAILED
# 污染 auto_execute 成功率 KPI (MASTER §7.1 #11)
- _action_upper = (approval.action or "").upper()
- _is_no_action = (
- "NO_ACTION" in _action_upper
- or "NO-ACTION" in _action_upper
- or "NOACTION" in _action_upper
- or "(未設)" in approval.action
- or _action_upper.startswith("OBSERVE")
- or _action_upper.startswith("INVESTIGATE")
- )
+ _is_no_action = is_no_action_approval_action(approval.action)
if _is_no_action:
logger.info(
@@ -246,13 +240,17 @@ class ApprovalExecutionService:
reason="NO_ACTION - 純調查/觀察類,不執行破壞動作",
path="no_action",
)
- # 標為 SUCCESS (觀察/調查本身就是成功完成)
- await service.update_execution_status(approval.id, success=True)
+ # 仍以 terminal success 關閉簽核,但 metadata 明確標記未執行修復。
+ await service.update_execution_status(
+ approval.id,
+ success=True,
+ execution_kind="no_action",
+ )
await timeline.add_event(
event_type="exec",
status="success",
- title="✅ 純觀察類動作完成 (NO_ACTION)",
- description=f"Action: {approval.action[:120]}",
+ title="ℹ️ 純觀察類動作已記錄(未執行修復)",
+ description=f"Action: {(approval.action or '')[:120]}",
actor="leWOOOgo",
actor_role="executor",
approval_id=str(approval.id),
@@ -269,7 +267,22 @@ class ApprovalExecutionService:
op_id=_aol_op_id,
status="success",
duration_ms=int((time.time() - _aol_started_ms) * 1000),
- output={"reason": "NO_ACTION", "action": approval.action[:200]},
+ output={
+ "reason": "NO_ACTION",
+ "execution_kind": "no_action",
+ "repair_executed": False,
+ "action": (approval.action or "")[:200],
+ },
+ )
+ await self._log_alert_execution_completed(
+ approval,
+ success=True,
+ execution_kind="no_action",
+ duration_ms=int((time.time() - _aol_started_ms) * 1000),
+ output={
+ "reason": "NO_ACTION",
+ "repair_executed": False,
+ },
)
# F2 (2026-05-07 ogt + Claude Sonnet 4.6 + Codex):
# NO_ACTION 路徑要把 incident 推到 RESOLVED,否則 incident 永遠卡
@@ -336,6 +349,13 @@ class ApprovalExecutionService:
duration_ms=int((time.time() - _aol_started_ms) * 1000),
error=f"parse_fail: {approval.action[:300]}",
)
+ await self._log_alert_execution_completed(
+ approval,
+ success=False,
+ execution_kind="parse_failed",
+ duration_ms=int((time.time() - _aol_started_ms) * 1000),
+ error_message=f"Could not parse operation type from action: {approval.action[:150]}",
+ )
return False # 解析失敗 → 執行未發生
executor = get_executor()
@@ -553,6 +573,20 @@ class ApprovalExecutionService:
"total_attempts": total_attempts,
},
)
+ await self._log_alert_execution_completed(
+ approval,
+ success=True,
+ execution_kind=operation_type.value,
+ duration_ms=int((time.time() - _aol_started_ms) * 1000),
+ output={
+ "operation_type": operation_type.value,
+ "resource_name": resource_name,
+ "namespace": namespace,
+ "executor_duration_ms": result.duration_ms,
+ "total_attempts": total_attempts,
+ "repair_executed": True,
+ },
+ )
return True # K8s 執行成功
else:
@@ -654,6 +688,22 @@ class ApprovalExecutionService:
error=result.error,
stderr=result.error, # E6 stderr 回灌 — 給 retry/Playbook 負向強化用
)
+ await self._log_alert_execution_completed(
+ approval,
+ success=False,
+ execution_kind=operation_type.value,
+ duration_ms=int((time.time() - _aol_started_ms) * 1000),
+ output={
+ "operation_type": operation_type.value,
+ "resource_name": resource_name,
+ "namespace": namespace,
+ "executor_duration_ms": result.duration_ms,
+ "total_attempts": total_attempts,
+ "repair_attempted": True,
+ "repair_executed": False,
+ },
+ error_message=result.error,
+ )
return False # K8s 執行失敗
async def _execute_ssh_host_action(
@@ -919,7 +969,14 @@ class ApprovalExecutionService:
except Exception:
pass
- if success:
+ no_action = success and is_no_action_approval_action(approval.action)
+ if no_action:
+ text = (
+ f"ℹ️ 已記錄觀察,未執行修復\n"
+ f"{(approval.action or '')[:180]}"
+ f"{km_info}"
+ )
+ elif success:
text = (
f"✅ 執行成功\n"
f"{(approval.action or '')[:180]}"
@@ -948,8 +1005,34 @@ class ApprovalExecutionService:
incident_id=approval.incident_id,
approval_id=str(approval.id),
success=success,
+ no_action=no_action,
orig_msg_id=orig_msg_id,
)
+ try:
+ from src.repositories.alert_operation_log_repository import (
+ get_alert_operation_log_repository,
+ )
+
+ await get_alert_operation_log_repository().append(
+ "TELEGRAM_RESULT_SENT",
+ incident_id=approval.incident_id,
+ approval_id=str(approval.id),
+ actor="approval_execution",
+ action_detail="telegram_execution_result_sent",
+ success=success,
+ error_message=error,
+ context={
+ "reply_to_message_id": orig_msg_id,
+ "execution_kind": "no_action" if no_action else "execution",
+ "repair_executed": not no_action and success,
+ },
+ )
+ except Exception as _log_e:
+ logger.warning(
+ "alert_op_telegram_result_write_failed",
+ approval_id=str(approval.id),
+ error=str(_log_e),
+ )
except Exception as e:
logger.warning(
"push_execution_result_failed",
@@ -1592,6 +1675,85 @@ class ApprovalExecutionService:
# 22 筆 notification_formatted。修復後每次執行都留痕。
# =========================================================================
+ async def _log_alert_execution_started(
+ self,
+ approval: ApprovalRequest,
+ *,
+ aol_op_id: str | None,
+ ) -> None:
+ """Append immutable alert_operation_log start event (EXECUTION_STARTED) for manual execution; best-effort, a failed write is logged as a warning and never raised."""
+ try:
+ from src.repositories.alert_operation_log_repository import (
+ get_alert_operation_log_repository,
+ )
+
+ await get_alert_operation_log_repository().append(
+ "EXECUTION_STARTED",
+ incident_id=approval.incident_id,
+ approval_id=str(approval.id),
+ actor="approval_execution",
+ action_detail="approval_execution_started",
+ success=None,
+ context={
+ "action": (approval.action or "")[:500],
+ "automation_operation_id": aol_op_id,
+ "execution_kind": (
+ "no_action"
+ if is_no_action_approval_action(approval.action)
+ else "executable"
+ ),
+ "repair_attempted": False,
+ "repair_executed": False,
+ },
+ )
+ except Exception as e:
+ logger.warning(
+ "alert_op_execution_started_write_failed",
+ approval_id=str(approval.id),
+ incident_id=approval.incident_id,
+ error=str(e),
+ )
+
+ async def _log_alert_execution_completed(
+ self,
+ approval: ApprovalRequest,
+ *,
+ success: bool,
+ execution_kind: str,
+ duration_ms: int,
+ output: dict | None = None,
+ error_message: str | None = None,
+ ) -> None:
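+ """Append immutable alert_operation_log completion event (EXECUTION_COMPLETED) for manual execution; keys in output override the base context, and a failed write is logged as a warning and never raised."""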
+ try:
+ from src.repositories.alert_operation_log_repository import (
+ get_alert_operation_log_repository,
+ )
+
+ context = {
+ "action": (approval.action or "")[:500],
+ "duration_ms": duration_ms,
+ "execution_kind": execution_kind,
+ **(output or {}),
+ }
+ await get_alert_operation_log_repository().append(
+ "EXECUTION_COMPLETED",
+ incident_id=approval.incident_id,
+ approval_id=str(approval.id),
+ actor="approval_execution",
+ action_detail=f"approval_execution_{execution_kind}",
+ success=success,
+ error_message=(error_message or "")[:2000] if error_message else None,
+ context=context,
+ )
+ except Exception as e:
+ logger.warning(
+ "alert_op_execution_completed_write_failed",
+ approval_id=str(approval.id),
+ incident_id=approval.incident_id,
+ error=str(e),
+ )
+
async def _log_aol_started(self, approval: ApprovalRequest) -> str | None:
"""
在 automation_operation_log 寫一筆 'pending' 紀錄,回傳 op_id 供 _log_aol_completed 更新。
diff --git a/apps/api/src/services/heartbeat_report_service.py b/apps/api/src/services/heartbeat_report_service.py
index 5768b709..b7424a95 100644
--- a/apps/api/src/services/heartbeat_report_service.py
+++ b/apps/api/src/services/heartbeat_report_service.py
@@ -531,7 +531,9 @@ class HeartbeatReportService:
SELECT
*,
(
- btrim(coalesce(action, '')) = ''
+ COALESCE(extra_metadata->>'execution_kind', '') = 'no_action'
+ OR COALESCE(extra_metadata->>'repair_executed', '') = 'false'
+ OR btrim(coalesce(action, '')) = ''
OR UPPER(action) LIKE 'OBSERVE%'
OR UPPER(action) LIKE 'INVESTIGATE%'
OR UPPER(action) LIKE 'NO_ACTION%'
@@ -556,9 +558,18 @@ class HeartbeatReportService:
WHERE UPPER(status::text) = 'PENDING'
AND telegram_message_id IS NULL
) AS pending_without_telegram,
- COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success,
- COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed,
- COUNT(*) FILTER (WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED')) AS auto_resolved
+ COUNT(*) FILTER (
+ WHERE UPPER(status::text) = 'EXECUTION_SUCCESS'
+ AND NOT is_observe_only
+ ) AS success,
+ COUNT(*) FILTER (
+ WHERE UPPER(status::text) = 'EXECUTION_FAILED'
+ AND NOT is_observe_only
+ ) AS failed,
+ COUNT(*) FILTER (
+ WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED')
+ AND NOT is_observe_only
+ ) AS auto_resolved
FROM scoped
"""))
row = r.one()
diff --git a/apps/api/src/services/report_generation_service.py b/apps/api/src/services/report_generation_service.py
index a94ead05..2c1f296c 100644
--- a/apps/api/src/services/report_generation_service.py
+++ b/apps/api/src/services/report_generation_service.py
@@ -232,11 +232,32 @@ class ReportGenerationService:
async with get_db_context() as db:
row = await db.execute(
text("""
+ WITH scoped AS (
+ SELECT
+ *,
+ (
+ COALESCE(extra_metadata->>'execution_kind', '') = 'no_action'
+ OR COALESCE(extra_metadata->>'repair_executed', '') = 'false'
+ OR btrim(coalesce(action, '')) = ''
+ OR UPPER(action) LIKE 'OBSERVE%'
+ OR UPPER(action) LIKE 'INVESTIGATE%'
+ OR UPPER(action) LIKE 'NO_ACTION%'
+ OR UPPER(action) LIKE '% NO_ACTION%'
+ OR UPPER(action) LIKE '%| NO_ACTION%'
+ ) AS is_observe_only
+ FROM approval_records
+ WHERE created_at >= :since
+ )
SELECT
- COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success,
- COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed
- FROM approval_records
- WHERE created_at >= :since
+ COUNT(*) FILTER (
+ WHERE UPPER(status::text) = 'EXECUTION_SUCCESS'
+ AND NOT is_observe_only
+ ) AS success,
+ COUNT(*) FILTER (
+ WHERE UPPER(status::text) = 'EXECUTION_FAILED'
+ AND NOT is_observe_only
+ ) AS failed
+ FROM scoped
"""),
{"since": since},
)
@@ -460,6 +481,7 @@ class ReportGenerationService:
# 失敗時發送告警到 SRE 群組,避免靜默吞掉錯誤
import asyncio as _asyncio
report_text = self.format_postmortem(data)
+ await self._persist_postmortem_km(data, report_text)
from src.services.telegram_gateway import get_telegram_gateway
gateway = get_telegram_gateway()
@@ -510,6 +532,72 @@ class ReportGenerationService:
error=str(_fe),
)
+ async def _persist_postmortem_km(
+ self,
+ data: PostmortemData,
+ report_text: str,
+ ) -> None:
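+ """Persist generated postmortem as a KM entry before Telegram send; best-effort (failures are logged, not raised). NOTE(review): no duplicate check is visible here, so idempotency depends on KnowledgeDBRepository.create - confirm."""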
+ try:
+ from src.db.base import get_db_context
+ from src.models.knowledge import (
+ EntrySource,
+ EntryStatus,
+ EntryType,
+ KnowledgeEntryCreate,
+ )
+ from src.repositories.alert_operation_log_repository import (
+ get_alert_operation_log_repository,
+ )
+ from src.repositories.knowledge_repository import KnowledgeDBRepository
+
+ async with get_db_context() as db:
+ repo = KnowledgeDBRepository(db)
+ entry = await repo.create(
+ KnowledgeEntryCreate(
+ title=f"Postmortem {data.incident_id}: {data.title}"[:255],
+ content=report_text,
+ entry_type=EntryType.POSTMORTEM,
+ category="postmortem",
+ tags=[
+ "postmortem",
+ "incident",
+ "telegram",
+ "auto_repaired" if data.auto_repaired else "human_intervention",
+ ],
+ source=EntrySource.AI_EXTRACTED,
+ status=EntryStatus.REVIEW,
+ related_incident_id=data.incident_id,
+ path_type="postmortem",
+ created_by="report_generation_service",
+ )
+ )
+
+ await get_alert_operation_log_repository().append(
+ "KM_CONVERTED",
+ incident_id=data.incident_id,
+ actor="report_generation_service",
+ action_detail="postmortem_persisted",
+ success=True,
+ context={
+ "knowledge_entry_id": entry.id,
+ "entry_type": EntryType.POSTMORTEM.value,
+ "path_type": "postmortem",
+ "duration_minutes": round(data.duration_minutes, 2),
+ },
+ )
+ logger.info(
+ "postmortem_km_persisted",
+ incident_id=data.incident_id,
+ knowledge_entry_id=entry.id,
+ )
+ except Exception as e:
+ logger.warning(
+ "postmortem_km_persist_failed",
+ incident_id=data.incident_id,
+ error=str(e),
+ )
+
# =============================================================================
# 日度報告排程迴圈
diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py
index 89f0d3d9..b5516ab3 100644
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -8377,9 +8377,7 @@ class TelegramGateway:
if action == "approve":
status_emoji = "✅"
status_text = f"已批准 by {_html.escape(username)}"
- # 2026-04-14 Claude Sonnet 4.6: 原「等待執行」誤導(實際沒有 gate 會卡住路徑)
- # 批准後一律顯示「執行中」,真實結果由 _push_execution_result_to_alert reply 補上
- suffix = "⚡ 執行中..."
+ suffix = "⚡ 執行中..." if execution_triggered else "已簽核,等待更多簽核"
else:
status_emoji = "❌"
status_text = f"已拒絕 by {_html.escape(username)}"
@@ -8495,7 +8493,7 @@ class TelegramGateway:
# 2026-04-22 Claude Sonnet 4.6: 只有真正轉為 APPROVED 才發「執行中...」
# 非 PENDING 狀態下 sign_approval early-return → approval 是舊 record
# 此時不應發「執行中...」,應告知用戶告警已處理過
- if approval.status == ApprovalStatus.APPROVED:
+ if approval.status == ApprovalStatus.APPROVED and execution_triggered:
# 2026-04-09 Claude Sonnet 4.6: 回應 Telegram — 更新訊息狀態 + answer callback
await self._notify_approval_result(
message_id=message_id,
@@ -8520,7 +8518,7 @@ class TelegramGateway:
# 原本 gate 用 execution_triggered,race condition 時失效(樂觀鎖失敗)
# 改用 approval.status == APPROVED(與 REST API 路徑 approvals.py:360 對齊)
# 用 Redis lock exec:{approval_id} 防重入(REST + Telegram 同時簽核)
- if approval.status == ApprovalStatus.APPROVED:
+ if approval.status == ApprovalStatus.APPROVED and execution_triggered:
import asyncio
from src.core.redis_client import get_redis
diff --git a/apps/api/tests/test_alert_rule_engine_validation.py b/apps/api/tests/test_alert_rule_engine_validation.py
index a05d523d..93aed3af 100644
--- a/apps/api/tests/test_alert_rule_engine_validation.py
+++ b/apps/api/tests/test_alert_rule_engine_validation.py
@@ -18,7 +18,7 @@ Task 2.3: validate_kubectl_command() 白名單驗證
import pytest
-from src.services.alert_rule_engine import validate_kubectl_command
+from src.services.alert_rule_engine import match_rule, validate_kubectl_command
# =============================================================================
@@ -76,6 +76,49 @@ class TestValidKubectlCommands:
assert validate_kubectl_command(cmd) is False
+class TestRuleMatchingSpecificity:
+ """具名 alertname 規則不得被寬鬆 message keyword 誤命中。"""
+
+ def test_host_storage_alert_does_not_match_minio_disk_rule(self):
+ ctx = {
+ "alert_type": "host",
+ "severity": "critical",
+ "source": "prometheus",
+ "target_resource": "dirty-reboot-evidence",
+ "namespace": "awoooi-prod",
+ "message": "HostPreviousBootStorageErrorsDetected storage dirty reboot evidence",
+ "labels": {
+ "alertname": "HostPreviousBootStorageErrorsDetected",
+ "instance": "192.168.0.110:9100",
+ },
+ }
+
+ result = match_rule(ctx)
+
+ assert result is not None
+ assert result["rule_id"] != "minio_disk_high"
+ assert "/data/minio" not in result.get("kubectl_command", "")
+
+ def test_exact_minio_disk_alert_still_matches_minio_rule(self):
+ ctx = {
+ "alert_type": "storage",
+ "severity": "critical",
+ "source": "prometheus",
+ "target_resource": "minio",
+ "namespace": "awoooi-prod",
+ "message": "MinIO disk usage high",
+ "labels": {
+ "alertname": "MinioDiskUsageHigh",
+ "instance": "192.168.0.110:9000",
+ },
+ }
+
+ result = match_rule(ctx)
+
+ assert result is not None
+ assert result["rule_id"] == "minio_disk_high"
+
+
# =============================================================================
# 阻擋案例(應返回 False)
# =============================================================================
diff --git a/apps/api/tests/test_approval_execution_no_action.py b/apps/api/tests/test_approval_execution_no_action.py
index 13ae42bf..1b7fcedd 100644
--- a/apps/api/tests/test_approval_execution_no_action.py
+++ b/apps/api/tests/test_approval_execution_no_action.py
@@ -1,5 +1,4 @@
from types import SimpleNamespace
-
from unittest.mock import AsyncMock
import pytest
@@ -16,14 +15,17 @@ async def test_no_action_execution_resolves_incident_once(monkeypatch):
incident_id="INC-TEST-001",
)
incident_service = SimpleNamespace(resolve_incident=AsyncMock())
+ update_execution_status = AsyncMock()
+ timeline_add_event = AsyncMock()
+ alert_completed = AsyncMock(return_value=None)
monkeypatch.setattr(
"src.services.approval_execution.get_approval_service",
- lambda: SimpleNamespace(update_execution_status=AsyncMock()),
+ lambda: SimpleNamespace(update_execution_status=update_execution_status),
)
monkeypatch.setattr(
"src.services.approval_execution.get_timeline_service",
- lambda: SimpleNamespace(add_event=AsyncMock()),
+ lambda: SimpleNamespace(add_event=timeline_add_event),
)
monkeypatch.setattr(
"src.services.approval_execution.parse_operation_from_action",
@@ -43,12 +45,28 @@ async def test_no_action_execution_resolves_incident_once(monkeypatch):
"src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
AsyncMock(return_value=None),
)
+ monkeypatch.setattr(
+ "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_started",
+ AsyncMock(return_value=None),
+ )
+ monkeypatch.setattr(
+ "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_completed",
+ alert_completed,
+ )
# Act
result = await ApprovalExecutionService().execute_approved_action(approval)
# Assert
assert result is True
+ update_execution_status.assert_awaited_once_with(
+ approval.id,
+ success=True,
+ execution_kind="no_action",
+ )
+ assert "未執行修復" in timeline_add_event.await_args.kwargs["title"]
+ assert alert_completed.await_args.kwargs["execution_kind"] == "no_action"
+ assert alert_completed.await_args.kwargs["output"]["repair_executed"] is False
incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-001")
@@ -67,10 +85,11 @@ async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch)
incident_service = SimpleNamespace(
resolve_incident=AsyncMock(side_effect=RuntimeError("redis down"))
)
+ update_execution_status = AsyncMock()
monkeypatch.setattr(
"src.services.approval_execution.get_approval_service",
- lambda: SimpleNamespace(update_execution_status=AsyncMock()),
+ lambda: SimpleNamespace(update_execution_status=update_execution_status),
)
monkeypatch.setattr(
"src.services.approval_execution.get_timeline_service",
@@ -94,8 +113,21 @@ async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch)
"src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
AsyncMock(return_value=None),
)
+ monkeypatch.setattr(
+ "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_started",
+ AsyncMock(return_value=None),
+ )
+ monkeypatch.setattr(
+ "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_completed",
+ AsyncMock(return_value=None),
+ )
result = await ApprovalExecutionService().execute_approved_action(approval)
assert result is True
+ update_execution_status.assert_awaited_once_with(
+ approval.id,
+ success=True,
+ execution_kind="no_action",
+ )
incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-002")
diff --git a/apps/api/tests/test_gap_a4_placeholder_resolution.py b/apps/api/tests/test_gap_a4_placeholder_resolution.py
index a468e35c..2f1b4ebd 100644
--- a/apps/api/tests/test_gap_a4_placeholder_resolution.py
+++ b/apps/api/tests/test_gap_a4_placeholder_resolution.py
@@ -181,7 +181,7 @@ class TestMatchRuleRejection:
"""垃圾 target 時 kubectl_command 必須被清空(降級 LLM)"""
def test_bad_target_discards_kubectl_command(self):
- """真實 bug:HostHighCpuLoad target=unknown → kubectl_command 應清空"""
+ """HostHighCpuLoad target=unknown → 不得組裝成壞 kubectl target。"""
ctx = {
"alert_type": "high_cpu",
"severity": "warning",
@@ -192,10 +192,12 @@ class TestMatchRuleRejection:
"labels": {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"},
}
result = match_rule(ctx)
- # 規則可能匹配(host_high_cpu)但 kubectl_command 必為空
+ # 規則可能匹配 host SSH 診斷;但不能把 HostHighCpuLoad 當成 K8s target。
if result is not None:
- assert result["kubectl_command"] == "", \
- f"bad target 應導致 kubectl_command 清空, got: {result['kubectl_command']!r}"
+ command = result["kubectl_command"]
+ assert command == "" or command.startswith("ssh "), \
+ f"bad target 不應組裝 kubectl 指令, got: {command!r}"
+ assert "deployment/HostHighCpuLoad" not in command
def test_good_target_preserves_kubectl_command(self):
"""真實 deployment 名稱時,kubectl_command 正常組裝"""
diff --git a/apps/api/tests/test_report_generation_service.py b/apps/api/tests/test_report_generation_service.py
index b3a8845d..532a9707 100644
--- a/apps/api/tests/test_report_generation_service.py
+++ b/apps/api/tests/test_report_generation_service.py
@@ -17,7 +17,9 @@ ADR-076 Task 4: 自動報告生成
建立: 2026-04-14 (台北時區) Claude Haiku 4.5
"""
+from contextlib import asynccontextmanager
from datetime import datetime, timedelta, timezone
+from types import SimpleNamespace
import pytest
@@ -274,6 +276,71 @@ class TestFormatPostmortem:
assert "台北時間" in report
+class TestTriggerPostmortemPersistence:
+ """Postmortem 產出必須同步沉澱到 KM。"""
+
+ @pytest.mark.asyncio
+ async def test_trigger_postmortem_persists_km_before_telegram_send(self, monkeypatch):
+ now = datetime.now(_TZ_TAIPEI)
+ created = now - timedelta(minutes=16)
+ sent_messages: list[str] = []
+ created_entries: list[object] = []
+ op_logs: list[dict] = []
+
+ class FakeGateway:
+ async def send_to_group(self, text: str, parse_mode: str = "HTML") -> None:
+ sent_messages.append(text)
+
+ class FakeKnowledgeRepo:
+ def __init__(self, _db) -> None:
+ pass
+
+ async def create(self, data):
+ created_entries.append(data)
+ return SimpleNamespace(id="km-postmortem-1")
+
+ class FakeAlertOpRepo:
+ async def append(self, event_type: str, **kwargs):
+ op_logs.append({"event_type": event_type, **kwargs})
+
+ @asynccontextmanager
+ async def fake_db_context():
+ yield SimpleNamespace()
+
+ monkeypatch.setattr(
+ "src.services.telegram_gateway.get_telegram_gateway",
+ lambda: FakeGateway(),
+ )
+ monkeypatch.setattr("src.db.base.get_db_context", fake_db_context)
+ monkeypatch.setattr(
+ "src.repositories.knowledge_repository.KnowledgeDBRepository",
+ FakeKnowledgeRepo,
+ )
+ monkeypatch.setattr(
+ "src.repositories.alert_operation_log_repository.get_alert_operation_log_repository",
+ lambda: FakeAlertOpRepo(),
+ )
+
+ await ReportGenerationService().trigger_postmortem(
+ incident_id="INC-20260531-POST",
+ title="DockerContainerUnhealthy bitan-pharmacy",
+ created_at=created,
+ resolved_at=now,
+ root_cause="容器健康檢查失敗",
+ resolution_action="OBSERVE",
+ auto_repaired=False,
+ )
+
+ assert sent_messages
+ assert created_entries
+ entry = created_entries[0]
+ assert entry.entry_type.value == "postmortem"
+ assert entry.related_incident_id == "INC-20260531-POST"
+ assert entry.path_type == "postmortem"
+ assert op_logs[0]["event_type"] == "KM_CONVERTED"
+ assert op_logs[0]["action_detail"] == "postmortem_persisted"
+
+
# =============================================================================
# _seconds_until_next_report
# =============================================================================
diff --git a/apps/api/tests/test_telegram_webhook_execution_handoff.py b/apps/api/tests/test_telegram_webhook_execution_handoff.py
index 697a5c79..207e2d86 100644
--- a/apps/api/tests/test_telegram_webhook_execution_handoff.py
+++ b/apps/api/tests/test_telegram_webhook_execution_handoff.py
@@ -15,12 +15,18 @@ class _FakeGateway:
class _FakeApprovalService:
- def __init__(self, approval, execution_triggered: bool) -> None:
+ def __init__(
+ self,
+ approval,
+ execution_triggered: bool,
+ sign_message: str = "Approval complete",
+ ) -> None:
self.approval = approval
self.execution_triggered = execution_triggered
+ self.sign_message = sign_message
async def sign_approval(self, **_kwargs):
- return self.approval, "Approval complete", self.execution_triggered
+ return self.approval, self.sign_message, self.execution_triggered
async def reject_approval(self, **_kwargs):
return self.approval, "Approval rejected"
@@ -100,6 +106,59 @@ async def test_telegram_approval_schedules_executor_after_required_signature(mon
assert op_log_repo.rows[0]["kwargs"]["action_detail"] == "approve"
+@pytest.mark.asyncio
+async def test_telegram_approval_duplicate_does_not_schedule_executor(monkeypatch):
+ approval_id = "33333333-3333-3333-3333-333333333333"
+ approval = SimpleNamespace(
+ id=UUID(approval_id),
+ status=SimpleNamespace(value="execution_success"),
+ incident_id="INC-20260531-DUPE",
+ )
+ finalizer_calls: list[dict] = []
+ op_log_repo = _FakeAlertOperationLogRepository()
+
+ async def fake_finalize(*, approval, execution_triggered: bool) -> bool:
+ finalizer_calls.append({
+ "approval_id": str(approval.id),
+ "execution_triggered": execution_triggered,
+ })
+ return True
+
+ monkeypatch.setattr(
+ telegram_api,
+ "get_telegram_gateway",
+ lambda: _FakeGateway({
+ "success": True,
+ "action": "approve",
+ "approval_id": approval_id,
+ "user": {"id": 42, "username": "ops"},
+ }),
+ )
+ monkeypatch.setattr(
+ telegram_api,
+ "get_approval_service",
+ lambda: _FakeApprovalService(
+ approval,
+ execution_triggered=False,
+ sign_message="Cannot sign: status is execution_success",
+ ),
+ )
+ monkeypatch.setattr(telegram_api, "_finalize_telegram_approval", fake_finalize)
+ monkeypatch.setattr(
+ "src.repositories.alert_operation_log_repository.get_alert_operation_log_repository",
+ lambda: op_log_repo,
+ )
+
+ result = await telegram_api.telegram_webhook(_callback_update(f"approve:{approval_id}:ts:nonce"))
+
+ assert result["ok"] is True
+ assert result["message"] == "Already processed"
+ assert result["execution_triggered"] is False
+ assert result["execution_scheduled"] is False
+ assert finalizer_calls == []
+ assert op_log_repo.rows[0]["kwargs"]["action_detail"] == "approve_duplicate"
+
+
@pytest.mark.asyncio
async def test_telegram_rejection_syncs_incident_state(monkeypatch):
approval_id = "22222222-2222-2222-2222-222222222222"
diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index fd9ace5b..96a8f7fe 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,39 @@
+## 2026-05-31|Telegram 告警執行語意與 DB 稽核完整性修復
+
+**背景**:
+
+- Production 查核 `INC-20260530-88D960` / `INC-20260531-88394F` 發現 Telegram 顯示「已批准、執行中、執行成功」,但實際分別是 MinIO SSH 診斷與 `OBSERVE`,不是建議中的修復動作。
+- `approval_records.status=execution_success` 無法區分「真的執行修復」與「純觀察/NO_ACTION terminal」;`alert_operation_log` 缺人工 approval execution 的 start/end;Postmortem 只送 Telegram,未沉澱 KM。
+- `alert_rule_engine` 允許具名規則只靠 message keyword 命中,導致主機 storage 類告警可能誤配到 `minio_disk_high`。
+
+**本次調整**:
+
+- 新增 `approval_action_classifier.is_no_action_approval_action()`,集中判斷 `OBSERVE` / `INVESTIGATE` / `NO_ACTION`。
+- NO_ACTION terminal 仍會關閉 approval,但 `extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`;Telegram result 改為「已記錄觀察,未執行修復」。
+- `ApprovalExecutionService` 同步寫 `alert_operation_log`:`EXECUTION_STARTED`、`EXECUTION_COMPLETED`、`TELEGRAM_RESULT_SENT`。
+- Telegram webhook duplicate approval 不再 finalize / schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。
+- Postmortem 產出時同步 idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。
+- Heartbeat 與日報修復統計排除 observe-only/no-action,避免污染 success rate。
+- `alert_rule_engine._matches()` 收緊具名 alertname 規則,避免 Host storage 類告警靠 `storage` keyword 誤配 MinIO。
+
+**Verification**:
+
+```text
+python3 -m py_compile approval_action_classifier.py approval_execution.py approval_db.py telegram.py telegram_gateway.py alert_rule_engine.py report_generation_service.py heartbeat_report_service.py
+ -> pass
+pytest test_approval_execution_no_action.py test_telegram_webhook_execution_handoff.py -q
+ -> 6 passed
+pytest test_alert_rule_engine_validation.py test_report_generation_service.py -q
+ -> 67 passed
+pytest test_heartbeat_ollama_endpoints.py test_heartbeat_pod_state_machine.py test_gap_a4_placeholder_resolution.py -q
+ -> 49 passed
+```
+
+**判讀 / 下一步**:
+
+- 本輪修復新流量的語意與稽核完整性,不補跑舊 incident 的修復動作。
+- 舊 incident 若已是 `execution_success` 但沒有 `extra_metadata.execution_kind`,仍需透過 `automation_operation_log` / `alert_operation_log` 交叉判讀。
+
## 2026-05-31|Legacy HITL PENDING 前台可見性與心跳拆分
**背景**:
diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
index 09e9af0a..5d8662c5 100644
--- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
+++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
@@ -2671,6 +2671,12 @@ Phase 6 完成後
- Verification:API py_compile pass;targeted ruff for new test pass;`pnpm --filter @awoooi/shared-types generate` pass;`test_approval_pending_visibility.py` 4 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` 15 passed;`git diff --check` pass。
- 判讀:T153 不批次 approve/reject 生產 PENDING,也不把觀察卡刪掉;它把「前台看得到 legacy HITL 事實」與「告警只針對真正人工 actionable backlog」補齊。舊 fallback kubectl / SSH action 仍需 operator 在 `/awooop/approvals` 逐筆決策;OBSERVE / NO_ACTION 類不再偽裝成 emergency manual backlog。下一段可追 LLM failure fallback 為何大量產生 `OBSERVE / medium` 卡片,但需避免破壞 agent 後續把 PENDING 更新成可執行 action 的路徑。
+**T154 Telegram approval truth + execution audit integrity(2026-05-31 台北)**:
+- 觸發:Telegram 上出現「此告警已處理」後仍接著顯示「已批准、執行中」,且 `INC-20260530-88D960` / `INC-20260531-88394F` 的 production 查核顯示 `approval_records.status=execution_success`,但前者實際只跑 MinIO SSH 診斷、後者只是 `OBSERVE`;`auto_repair_executions=0`,`alert_operation_log` 缺 execution start/end,Postmortem 只送 Telegram 未落 KM。這會讓 operator 誤以為修復已完成。
+- 修正:集中 `is_no_action_approval_action()`,讓 `OBSERVE` / `INVESTIGATE` / `NO_ACTION` terminal 時在 `approval_records.extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`,Telegram result 改為「已記錄觀察,未執行修復」,不再用「執行成功」。`ApprovalExecutionService` 現在同步寫 `alert_operation_log` 的 `EXECUTION_STARTED` / `EXECUTION_COMPLETED` / `TELEGRAM_RESULT_SENT`,並保留 `automation_operation_log`。Telegram webhook duplicate approval 不再 finalize/schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。`ReportGenerationService` 會把 Postmortem idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。`HeartbeatReportService` / 日報修復統計排除 observe-only/no-action,不再污染 execution success rate。`alert_rule_engine._matches()` 收緊具名 alertname 規則,避免主機 storage 告警靠 message keyword 誤配 `minio_disk_high`。
+- Verification:`py_compile` pass;`test_approval_execution_no_action.py` + `test_telegram_webhook_execution_handoff.py` 6 passed;`test_alert_rule_engine_validation.py` + `test_report_generation_service.py` 67 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` + `test_gap_a4_placeholder_resolution.py` 49 passed after aligning host SSH diagnostic assertion。
+- 判讀:T154 修的是「Telegram / DB / 前台統計的 truthfulness」,不是補跑舊 incident 的修復。舊資料中 status 已是 `execution_success` 的 OBSERVE 沒有新的 `extra_metadata.execution_kind`,無法單靠 status 精確分辨,仍需透過 `automation_operation_log` / `alert_operation_log` 交叉判讀;部署後新 approval 會留下 immutable execution start/end 與 no-action 語意,operator 不應再把 OBSERVE 視為完成修復。
+
**T152 Ansible runtime readiness surfaced(2026-05-24 台北)**:
- 觸發:T151 已讓首頁看到 execution backend / Ansible attribution,但 operator 仍看不到 runtime 端缺什麼,容易把「Ansible 有候選」誤解成「Ansible 已能自動修復」。
- 修正:API image 複製 `infra/ansible/` 作 read-only catalog;`truth-chain/quality/summary` 新增 `ansible_runtime`,回報 playbook binary、catalog、inventory、playbook_count、can_run_check_mode、blockers。首頁 execution evidence 同步顯示 runtime 狀態;目前 production 顯示 `runtime 未就緒:ansible_playbook_binary_missing`。未安裝 `ansible-core`、未啟用 check-mode / apply。