From e2ab8796361dd041b5b950ff08561bf2f2bd2701 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 31 May 2026 13:58:21 +0800 Subject: [PATCH] fix(alerts): correct telegram execution truth --- apps/api/src/api/v1/telegram.py | 29 ++- apps/api/src/services/alert_rule_engine.py | 6 + .../services/approval_action_classifier.py | 26 +++ apps/api/src/services/approval_db.py | 30 ++- apps/api/src/services/approval_execution.py | 192 ++++++++++++++++-- .../src/services/heartbeat_report_service.py | 19 +- .../src/services/report_generation_service.py | 96 ++++++++- apps/api/src/services/telegram_gateway.py | 8 +- .../test_alert_rule_engine_validation.py | 45 +++- .../test_approval_execution_no_action.py | 40 +++- .../test_gap_a4_placeholder_resolution.py | 10 +- .../tests/test_report_generation_service.py | 67 ++++++ ...test_telegram_webhook_execution_handoff.py | 63 +++++- docs/LOGBOOK.md | 36 ++++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 6 + 15 files changed, 624 insertions(+), 49 deletions(-) create mode 100644 apps/api/src/services/approval_action_classifier.py diff --git a/apps/api/src/api/v1/telegram.py b/apps/api/src/api/v1/telegram.py index 83236fbe..0b0d3a27 100644 --- a/apps/api/src/api/v1/telegram.py +++ b/apps/api/src/api/v1/telegram.py @@ -275,6 +275,29 @@ async def telegram_webhook( ) if approval: + status_value = approval.status.value if hasattr(approval.status, "value") else str(approval.status) + if ( + "Cannot sign" in msg + or "already signed" in msg + or "Concurrent modification" in msg + ): + logger.info( + "telegram_approval_ignored_already_processed", + approval_id=approval_id, + user_id=user_id, + status=status_value, + message=msg, + ) + await _log_user_action("approve_duplicate", False, getattr(approval, "incident_id", None)) + return { + "ok": True, + "message": "Already processed", + "approval_id": approval_id, + "status": status_value, + "execution_triggered": False, + "execution_scheduled": False, + } + execution_scheduled = await 
_finalize_telegram_approval( approval=approval, execution_triggered=execution_triggered, @@ -283,7 +306,7 @@ async def telegram_webhook( "telegram_approval_signed", approval_id=approval_id, user_id=user_id, - status=approval.status.value, + status=status_value, execution_triggered=execution_triggered, execution_scheduled=execution_scheduled, ) @@ -291,9 +314,9 @@ async def telegram_webhook( return { "ok": True, - "message": "Approved", + "message": "Approved" if execution_triggered else "Signed", "approval_id": approval_id, - "status": approval.status.value, + "status": status_value, "execution_triggered": execution_triggered, "execution_scheduled": execution_scheduled, } diff --git a/apps/api/src/services/alert_rule_engine.py b/apps/api/src/services/alert_rule_engine.py index b96bb645..4ee6f2a0 100644 --- a/apps/api/src/services/alert_rule_engine.py +++ b/apps/api/src/services/alert_rule_engine.py @@ -298,6 +298,12 @@ def _matches(rule: dict, alertname: str, alert_type: str, message: str, instance if alertnames and alertname in alertnames: return True + # 2026-05-31 ogt + Codex: 有明確 alertname 的規則不得只靠寬鬆 message + # keyword 命中,否則 HostPreviousBootStorageErrorsDetected 這類主機 storage + # 告警會誤配到 minio_disk_high。 + if alertnames and alertname and alertname != "custom": + return False + # alert_type 部分匹配 for kw in match.get("alert_type", []): if kw.lower() in alert_type.lower(): diff --git a/apps/api/src/services/approval_action_classifier.py b/apps/api/src/services/approval_action_classifier.py new file mode 100644 index 00000000..27b97335 --- /dev/null +++ b/apps/api/src/services/approval_action_classifier.py @@ -0,0 +1,26 @@ +""" +Approval action classifier +========================== + +2026-05-31 ogt + Codex: Telegram 告警鏈路一致性修復。 +將 OBSERVE / INVESTIGATE / NO_ACTION 這類「純觀察、未執行修復」的 +判斷集中,避免 execution、Telegram、統計各自用不同語意。 +""" + +from __future__ import annotations + + +def is_no_action_approval_action(action: str | None) -> bool: + """Return True when an approval action 
records observation instead of repair.""" + text = (action or "").strip() + upper = text.upper() + if not text: + return True + return ( + "NO_ACTION" in upper + or "NO-ACTION" in upper + or "NOACTION" in upper + or "(未設)" in text + or upper.startswith("OBSERVE") + or upper.startswith("INVESTIGATE") + ) diff --git a/apps/api/src/services/approval_db.py b/apps/api/src/services/approval_db.py index 608d1121..46323f62 100644 --- a/apps/api/src/services/approval_db.py +++ b/apps/api/src/services/approval_db.py @@ -659,6 +659,7 @@ class ApprovalDBService: approval_id: UUID, success: bool, error_message: str | None = None, + execution_kind: str | None = None, ) -> None: """ 更新執行狀態 @@ -669,21 +670,36 @@ class ApprovalDBService: """ async with get_db_context() as db: status = ApprovalStatus.EXECUTION_SUCCESS if success else ApprovalStatus.EXECUTION_FAILED - values: dict = {"status": status} + result = await db.execute( + select(ApprovalRecord).where(ApprovalRecord.id == str(approval_id)) + ) + record = result.scalar_one_or_none() + if record is None: + logger.warning( + "approval_execution_status_update_missing", + id=str(approval_id), + success=success, + ) + return + + record.status = status if not success and error_message: # 截斷至合理長度,避免爆欄位 - values["rejection_reason"] = str(error_message)[:2000] - await db.execute( - update(ApprovalRecord) - .where(ApprovalRecord.id == str(approval_id)) - .values(**values) - ) + record.rejection_reason = str(error_message)[:2000] + if execution_kind: + # 2026-05-31 ogt + Codex: OBSERVE/NO_ACTION 仍需 terminal 狀態, + # 但前台/報表必須能分辨「未執行修復」而非真正 execution success。 + metadata = dict(record.extra_metadata or {}) + metadata["execution_kind"] = execution_kind + metadata["repair_executed"] = execution_kind != "no_action" + record.extra_metadata = metadata logger.info( "approval_execution_status_updated", id=str(approval_id), success=success, has_error=bool(error_message), + execution_kind=execution_kind, ) async def update_incident_id(self, 
approval_id: UUID, incident_id: str) -> None: diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index 9ca8c664..79a6b8ef 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -36,6 +36,7 @@ from src.db.base import get_db_context from src.models.approval import ApprovalRequest from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError from src.plugins.mcp.interfaces import MCPToolResult +from src.services.approval_action_classifier import is_no_action_approval_action from src.services.approval_db import get_approval_service, get_timeline_service from src.services.executor import ExecutionResult, OperationType, get_executor from src.services.operation_parser import parse_operation_from_action @@ -165,6 +166,7 @@ class ApprovalExecutionService: # ADR-090 § 自動化動作回灌 (2026-04-19): 主流程開始即在 aol 留痕, # 結束時 update。不依賴 fire-and-forget,確保 33 件/7d approval 全部可觀測。 _aol_op_id = await self._log_aol_started(approval) + await self._log_alert_execution_started(approval, aol_op_id=_aol_op_id) _aol_started_ms = time.time() service = get_approval_service() @@ -228,15 +230,7 @@ class ApprovalExecutionService: # 2026-04-19 ogt + Claude Opus 4.7: 區分 NO_ACTION vs 真解析失敗 # NO_ACTION 是 AI 刻意選的「純調查不破壞」,不該誤標 EXECUTION_FAILED # 污染 auto_execute 成功率 KPI (MASTER §7.1 #11) - _action_upper = (approval.action or "").upper() - _is_no_action = ( - "NO_ACTION" in _action_upper - or "NO-ACTION" in _action_upper - or "NOACTION" in _action_upper - or "(未設)" in approval.action - or _action_upper.startswith("OBSERVE") - or _action_upper.startswith("INVESTIGATE") - ) + _is_no_action = is_no_action_approval_action(approval.action) if _is_no_action: logger.info( @@ -246,13 +240,17 @@ class ApprovalExecutionService: reason="NO_ACTION - 純調查/觀察類,不執行破壞動作", path="no_action", ) - # 標為 SUCCESS (觀察/調查本身就是成功完成) - await service.update_execution_status(approval.id, success=True) + # 仍以 terminal 
success 關閉簽核,但 metadata 明確標記未執行修復。 + await service.update_execution_status( + approval.id, + success=True, + execution_kind="no_action", + ) await timeline.add_event( event_type="exec", status="success", - title="✅ 純觀察類動作完成 (NO_ACTION)", - description=f"Action: {approval.action[:120]}", + title="ℹ️ 純觀察類動作已記錄(未執行修復)", + description=f"Action: {(approval.action or '')[:120]}", actor="leWOOOgo", actor_role="executor", approval_id=str(approval.id), @@ -269,7 +267,22 @@ class ApprovalExecutionService: op_id=_aol_op_id, status="success", duration_ms=int((time.time() - _aol_started_ms) * 1000), - output={"reason": "NO_ACTION", "action": approval.action[:200]}, + output={ + "reason": "NO_ACTION", + "execution_kind": "no_action", + "repair_executed": False, + "action": (approval.action or "")[:200], + }, + ) + await self._log_alert_execution_completed( + approval, + success=True, + execution_kind="no_action", + duration_ms=int((time.time() - _aol_started_ms) * 1000), + output={ + "reason": "NO_ACTION", + "repair_executed": False, + }, ) # F2 (2026-05-07 ogt + Claude Sonnet 4.6 + Codex): # NO_ACTION 路徑要把 incident 推到 RESOLVED,否則 incident 永遠卡 @@ -336,6 +349,13 @@ class ApprovalExecutionService: duration_ms=int((time.time() - _aol_started_ms) * 1000), error=f"parse_fail: {approval.action[:300]}", ) + await self._log_alert_execution_completed( + approval, + success=False, + execution_kind="parse_failed", + duration_ms=int((time.time() - _aol_started_ms) * 1000), + error_message=f"Could not parse operation type from action: {approval.action[:150]}", + ) return False # 解析失敗 → 執行未發生 executor = get_executor() @@ -553,6 +573,20 @@ class ApprovalExecutionService: "total_attempts": total_attempts, }, ) + await self._log_alert_execution_completed( + approval, + success=True, + execution_kind=operation_type.value, + duration_ms=int((time.time() - _aol_started_ms) * 1000), + output={ + "operation_type": operation_type.value, + "resource_name": resource_name, + "namespace": namespace, + 
"executor_duration_ms": result.duration_ms, + "total_attempts": total_attempts, + "repair_executed": True, + }, + ) return True # K8s 執行成功 else: @@ -654,6 +688,22 @@ class ApprovalExecutionService: error=result.error, stderr=result.error, # E6 stderr 回灌 — 給 retry/Playbook 負向強化用 ) + await self._log_alert_execution_completed( + approval, + success=False, + execution_kind=operation_type.value, + duration_ms=int((time.time() - _aol_started_ms) * 1000), + output={ + "operation_type": operation_type.value, + "resource_name": resource_name, + "namespace": namespace, + "executor_duration_ms": result.duration_ms, + "total_attempts": total_attempts, + "repair_attempted": True, + "repair_executed": False, + }, + error_message=result.error, + ) return False # K8s 執行失敗 async def _execute_ssh_host_action( @@ -919,7 +969,14 @@ class ApprovalExecutionService: except Exception: pass - if success: + no_action = success and is_no_action_approval_action(approval.action) + if no_action: + text = ( + f"ℹ️ 已記錄觀察,未執行修復\n" + f"{(approval.action or '')[:180]}" + f"{km_info}" + ) + elif success: text = ( f"✅ 執行成功\n" f"{(approval.action or '')[:180]}" @@ -948,8 +1005,34 @@ class ApprovalExecutionService: incident_id=approval.incident_id, approval_id=str(approval.id), success=success, + no_action=no_action, orig_msg_id=orig_msg_id, ) + try: + from src.repositories.alert_operation_log_repository import ( + get_alert_operation_log_repository, + ) + + await get_alert_operation_log_repository().append( + "TELEGRAM_RESULT_SENT", + incident_id=approval.incident_id, + approval_id=str(approval.id), + actor="approval_execution", + action_detail="telegram_execution_result_sent", + success=success, + error_message=error, + context={ + "reply_to_message_id": orig_msg_id, + "execution_kind": "no_action" if no_action else "execution", + "repair_executed": not no_action and success, + }, + ) + except Exception as _log_e: + logger.warning( + "alert_op_telegram_result_write_failed", + 
approval_id=str(approval.id), + error=str(_log_e), + ) except Exception as e: logger.warning( "push_execution_result_failed", @@ -1592,6 +1675,85 @@ class ApprovalExecutionService: # 22 筆 notification_formatted。修復後每次執行都留痕。 # ========================================================================= + async def _log_alert_execution_started( + self, + approval: ApprovalRequest, + *, + aol_op_id: str | None, + ) -> None: + """Append immutable alert_operation_log start event for manual execution.""" + try: + from src.repositories.alert_operation_log_repository import ( + get_alert_operation_log_repository, + ) + + await get_alert_operation_log_repository().append( + "EXECUTION_STARTED", + incident_id=approval.incident_id, + approval_id=str(approval.id), + actor="approval_execution", + action_detail="approval_execution_started", + success=None, + context={ + "action": (approval.action or "")[:500], + "automation_operation_id": aol_op_id, + "execution_kind": ( + "no_action" + if is_no_action_approval_action(approval.action) + else "executable" + ), + "repair_attempted": False, + "repair_executed": False, + }, + ) + except Exception as e: + logger.warning( + "alert_op_execution_started_write_failed", + approval_id=str(approval.id), + incident_id=approval.incident_id, + error=str(e), + ) + + async def _log_alert_execution_completed( + self, + approval: ApprovalRequest, + *, + success: bool, + execution_kind: str, + duration_ms: int, + output: dict | None = None, + error_message: str | None = None, + ) -> None: + """Append immutable alert_operation_log completion event for manual execution.""" + try: + from src.repositories.alert_operation_log_repository import ( + get_alert_operation_log_repository, + ) + + context = { + "action": (approval.action or "")[:500], + "duration_ms": duration_ms, + "execution_kind": execution_kind, + **(output or {}), + } + await get_alert_operation_log_repository().append( + "EXECUTION_COMPLETED", + incident_id=approval.incident_id, + 
approval_id=str(approval.id), + actor="approval_execution", + action_detail=f"approval_execution_{execution_kind}", + success=success, + error_message=(error_message or "")[:2000] if error_message else None, + context=context, + ) + except Exception as e: + logger.warning( + "alert_op_execution_completed_write_failed", + approval_id=str(approval.id), + incident_id=approval.incident_id, + error=str(e), + ) + async def _log_aol_started(self, approval: ApprovalRequest) -> str | None: """ 在 automation_operation_log 寫一筆 'pending' 紀錄,回傳 op_id 供 _log_aol_completed 更新。 diff --git a/apps/api/src/services/heartbeat_report_service.py b/apps/api/src/services/heartbeat_report_service.py index 5768b709..b7424a95 100644 --- a/apps/api/src/services/heartbeat_report_service.py +++ b/apps/api/src/services/heartbeat_report_service.py @@ -531,7 +531,9 @@ class HeartbeatReportService: SELECT *, ( - btrim(coalesce(action, '')) = '' + COALESCE(extra_metadata->>'execution_kind', '') = 'no_action' + OR COALESCE(extra_metadata->>'repair_executed', '') = 'false' + OR btrim(coalesce(action, '')) = '' OR UPPER(action) LIKE 'OBSERVE%' OR UPPER(action) LIKE 'INVESTIGATE%' OR UPPER(action) LIKE 'NO_ACTION%' @@ -556,9 +558,18 @@ class HeartbeatReportService: WHERE UPPER(status::text) = 'PENDING' AND telegram_message_id IS NULL ) AS pending_without_telegram, - COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success, - COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed, - COUNT(*) FILTER (WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED')) AS auto_resolved + COUNT(*) FILTER ( + WHERE UPPER(status::text) = 'EXECUTION_SUCCESS' + AND NOT is_observe_only + ) AS success, + COUNT(*) FILTER ( + WHERE UPPER(status::text) = 'EXECUTION_FAILED' + AND NOT is_observe_only + ) AS failed, + COUNT(*) FILTER ( + WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED') + AND NOT is_observe_only + ) AS auto_resolved FROM 
scoped """)) row = r.one() diff --git a/apps/api/src/services/report_generation_service.py b/apps/api/src/services/report_generation_service.py index a94ead05..2c1f296c 100644 --- a/apps/api/src/services/report_generation_service.py +++ b/apps/api/src/services/report_generation_service.py @@ -232,11 +232,32 @@ class ReportGenerationService: async with get_db_context() as db: row = await db.execute( text(""" + WITH scoped AS ( + SELECT + *, + ( + COALESCE(extra_metadata->>'execution_kind', '') = 'no_action' + OR COALESCE(extra_metadata->>'repair_executed', '') = 'false' + OR btrim(coalesce(action, '')) = '' + OR UPPER(action) LIKE 'OBSERVE%' + OR UPPER(action) LIKE 'INVESTIGATE%' + OR UPPER(action) LIKE 'NO_ACTION%' + OR UPPER(action) LIKE '% NO_ACTION%' + OR UPPER(action) LIKE '%| NO_ACTION%' + ) AS is_observe_only + FROM approval_records + WHERE created_at >= :since + ) SELECT - COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success, - COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed - FROM approval_records - WHERE created_at >= :since + COUNT(*) FILTER ( + WHERE UPPER(status::text) = 'EXECUTION_SUCCESS' + AND NOT is_observe_only + ) AS success, + COUNT(*) FILTER ( + WHERE UPPER(status::text) = 'EXECUTION_FAILED' + AND NOT is_observe_only + ) AS failed + FROM scoped """), {"since": since}, ) @@ -460,6 +481,7 @@ class ReportGenerationService: # 失敗時發送告警到 SRE 群組,避免靜默吞掉錯誤 import asyncio as _asyncio report_text = self.format_postmortem(data) + await self._persist_postmortem_km(data, report_text) from src.services.telegram_gateway import get_telegram_gateway gateway = get_telegram_gateway() @@ -510,6 +532,72 @@ class ReportGenerationService: error=str(_fe), ) + async def _persist_postmortem_km( + self, + data: PostmortemData, + report_text: str, + ) -> None: + """Persist generated postmortem as an idempotent KM entry before Telegram send.""" + try: + from src.db.base import get_db_context + from src.models.knowledge import 
( + EntrySource, + EntryStatus, + EntryType, + KnowledgeEntryCreate, + ) + from src.repositories.alert_operation_log_repository import ( + get_alert_operation_log_repository, + ) + from src.repositories.knowledge_repository import KnowledgeDBRepository + + async with get_db_context() as db: + repo = KnowledgeDBRepository(db) + entry = await repo.create( + KnowledgeEntryCreate( + title=f"Postmortem {data.incident_id}: {data.title}"[:255], + content=report_text, + entry_type=EntryType.POSTMORTEM, + category="postmortem", + tags=[ + "postmortem", + "incident", + "telegram", + "auto_repaired" if data.auto_repaired else "human_intervention", + ], + source=EntrySource.AI_EXTRACTED, + status=EntryStatus.REVIEW, + related_incident_id=data.incident_id, + path_type="postmortem", + created_by="report_generation_service", + ) + ) + + await get_alert_operation_log_repository().append( + "KM_CONVERTED", + incident_id=data.incident_id, + actor="report_generation_service", + action_detail="postmortem_persisted", + success=True, + context={ + "knowledge_entry_id": entry.id, + "entry_type": EntryType.POSTMORTEM.value, + "path_type": "postmortem", + "duration_minutes": round(data.duration_minutes, 2), + }, + ) + logger.info( + "postmortem_km_persisted", + incident_id=data.incident_id, + knowledge_entry_id=entry.id, + ) + except Exception as e: + logger.warning( + "postmortem_km_persist_failed", + incident_id=data.incident_id, + error=str(e), + ) + # ============================================================================= # 日度報告排程迴圈 diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 89f0d3d9..b5516ab3 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -8377,9 +8377,7 @@ class TelegramGateway: if action == "approve": status_emoji = "✅" status_text = f"已批准 by {_html.escape(username)}" - # 2026-04-14 Claude Sonnet 4.6: 原「等待執行」誤導(實際沒有 gate 會卡住路徑) - # 批准後一律顯示「執行中」,真實結果由 
_push_execution_result_to_alert reply 補上 - suffix = "⚡ 執行中..." + suffix = "⚡ 執行中..." if execution_triggered else "已簽核,等待更多簽核" else: status_emoji = "❌" status_text = f"已拒絕 by {_html.escape(username)}" @@ -8495,7 +8493,7 @@ class TelegramGateway: # 2026-04-22 Claude Sonnet 4.6: 只有真正轉為 APPROVED 才發「執行中...」 # 非 PENDING 狀態下 sign_approval early-return → approval 是舊 record # 此時不應發「執行中...」,應告知用戶告警已處理過 - if approval.status == ApprovalStatus.APPROVED: + if approval.status == ApprovalStatus.APPROVED and execution_triggered: # 2026-04-09 Claude Sonnet 4.6: 回應 Telegram — 更新訊息狀態 + answer callback await self._notify_approval_result( message_id=message_id, @@ -8520,7 +8518,7 @@ class TelegramGateway: # 原本 gate 用 execution_triggered,race condition 時失效(樂觀鎖失敗) # 改用 approval.status == APPROVED(與 REST API 路徑 approvals.py:360 對齊) # 用 Redis lock exec:{approval_id} 防重入(REST + Telegram 同時簽核) - if approval.status == ApprovalStatus.APPROVED: + if approval.status == ApprovalStatus.APPROVED and execution_triggered: import asyncio from src.core.redis_client import get_redis diff --git a/apps/api/tests/test_alert_rule_engine_validation.py b/apps/api/tests/test_alert_rule_engine_validation.py index a05d523d..93aed3af 100644 --- a/apps/api/tests/test_alert_rule_engine_validation.py +++ b/apps/api/tests/test_alert_rule_engine_validation.py @@ -18,7 +18,7 @@ Task 2.3: validate_kubectl_command() 白名單驗證 import pytest -from src.services.alert_rule_engine import validate_kubectl_command +from src.services.alert_rule_engine import match_rule, validate_kubectl_command # ============================================================================= @@ -76,6 +76,49 @@ class TestValidKubectlCommands: assert validate_kubectl_command(cmd) is False +class TestRuleMatchingSpecificity: + """具名 alertname 規則不得被寬鬆 message keyword 誤命中。""" + + def test_host_storage_alert_does_not_match_minio_disk_rule(self): + ctx = { + "alert_type": "host", + "severity": "critical", + "source": "prometheus", + "target_resource": 
"dirty-reboot-evidence", + "namespace": "awoooi-prod", + "message": "HostPreviousBootStorageErrorsDetected storage dirty reboot evidence", + "labels": { + "alertname": "HostPreviousBootStorageErrorsDetected", + "instance": "192.168.0.110:9100", + }, + } + + result = match_rule(ctx) + + assert result is not None + assert result["rule_id"] != "minio_disk_high" + assert "/data/minio" not in result.get("kubectl_command", "") + + def test_exact_minio_disk_alert_still_matches_minio_rule(self): + ctx = { + "alert_type": "storage", + "severity": "critical", + "source": "prometheus", + "target_resource": "minio", + "namespace": "awoooi-prod", + "message": "MinIO disk usage high", + "labels": { + "alertname": "MinioDiskUsageHigh", + "instance": "192.168.0.110:9000", + }, + } + + result = match_rule(ctx) + + assert result is not None + assert result["rule_id"] == "minio_disk_high" + + # ============================================================================= # 阻擋案例(應返回 False) # ============================================================================= diff --git a/apps/api/tests/test_approval_execution_no_action.py b/apps/api/tests/test_approval_execution_no_action.py index 13ae42bf..1b7fcedd 100644 --- a/apps/api/tests/test_approval_execution_no_action.py +++ b/apps/api/tests/test_approval_execution_no_action.py @@ -1,5 +1,4 @@ from types import SimpleNamespace - from unittest.mock import AsyncMock import pytest @@ -16,14 +15,17 @@ async def test_no_action_execution_resolves_incident_once(monkeypatch): incident_id="INC-TEST-001", ) incident_service = SimpleNamespace(resolve_incident=AsyncMock()) + update_execution_status = AsyncMock() + timeline_add_event = AsyncMock() + alert_completed = AsyncMock(return_value=None) monkeypatch.setattr( "src.services.approval_execution.get_approval_service", - lambda: SimpleNamespace(update_execution_status=AsyncMock()), + lambda: SimpleNamespace(update_execution_status=update_execution_status), ) monkeypatch.setattr( 
"src.services.approval_execution.get_timeline_service", - lambda: SimpleNamespace(add_event=AsyncMock()), + lambda: SimpleNamespace(add_event=timeline_add_event), ) monkeypatch.setattr( "src.services.approval_execution.parse_operation_from_action", @@ -43,12 +45,28 @@ async def test_no_action_execution_resolves_incident_once(monkeypatch): "src.services.approval_execution.ApprovalExecutionService._log_aol_completed", AsyncMock(return_value=None), ) + monkeypatch.setattr( + "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_started", + AsyncMock(return_value=None), + ) + monkeypatch.setattr( + "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_completed", + alert_completed, + ) # Act result = await ApprovalExecutionService().execute_approved_action(approval) # Assert assert result is True + update_execution_status.assert_awaited_once_with( + approval.id, + success=True, + execution_kind="no_action", + ) + assert "未執行修復" in timeline_add_event.await_args.kwargs["title"] + assert alert_completed.await_args.kwargs["execution_kind"] == "no_action" + assert alert_completed.await_args.kwargs["output"]["repair_executed"] is False incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-001") @@ -67,10 +85,11 @@ async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch) incident_service = SimpleNamespace( resolve_incident=AsyncMock(side_effect=RuntimeError("redis down")) ) + update_execution_status = AsyncMock() monkeypatch.setattr( "src.services.approval_execution.get_approval_service", - lambda: SimpleNamespace(update_execution_status=AsyncMock()), + lambda: SimpleNamespace(update_execution_status=update_execution_status), ) monkeypatch.setattr( "src.services.approval_execution.get_timeline_service", @@ -94,8 +113,21 @@ async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch) "src.services.approval_execution.ApprovalExecutionService._log_aol_completed", 
AsyncMock(return_value=None), ) + monkeypatch.setattr( + "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_started", + AsyncMock(return_value=None), + ) + monkeypatch.setattr( + "src.services.approval_execution.ApprovalExecutionService._log_alert_execution_completed", + AsyncMock(return_value=None), + ) result = await ApprovalExecutionService().execute_approved_action(approval) assert result is True + update_execution_status.assert_awaited_once_with( + approval.id, + success=True, + execution_kind="no_action", + ) incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-002") diff --git a/apps/api/tests/test_gap_a4_placeholder_resolution.py b/apps/api/tests/test_gap_a4_placeholder_resolution.py index a468e35c..2f1b4ebd 100644 --- a/apps/api/tests/test_gap_a4_placeholder_resolution.py +++ b/apps/api/tests/test_gap_a4_placeholder_resolution.py @@ -181,7 +181,7 @@ class TestMatchRuleRejection: """垃圾 target 時 kubectl_command 必須被清空(降級 LLM)""" def test_bad_target_discards_kubectl_command(self): - """真實 bug:HostHighCpuLoad target=unknown → kubectl_command 應清空""" + """HostHighCpuLoad target=unknown → 不得組裝成壞 kubectl target。""" ctx = { "alert_type": "high_cpu", "severity": "warning", @@ -192,10 +192,12 @@ class TestMatchRuleRejection: "labels": {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"}, } result = match_rule(ctx) - # 規則可能匹配(host_high_cpu)但 kubectl_command 必為空 + # 規則可能匹配 host SSH 診斷;但不能把 HostHighCpuLoad 當成 K8s target。 if result is not None: - assert result["kubectl_command"] == "", \ - f"bad target 應導致 kubectl_command 清空, got: {result['kubectl_command']!r}" + command = result["kubectl_command"] + assert command == "" or command.startswith("ssh "), \ + f"bad target 不應組裝 kubectl 指令, got: {command!r}" + assert "deployment/HostHighCpuLoad" not in command def test_good_target_preserves_kubectl_command(self): """真實 deployment 名稱時,kubectl_command 正常組裝""" diff --git a/apps/api/tests/test_report_generation_service.py 
b/apps/api/tests/test_report_generation_service.py index b3a8845d..532a9707 100644 --- a/apps/api/tests/test_report_generation_service.py +++ b/apps/api/tests/test_report_generation_service.py @@ -17,7 +17,9 @@ ADR-076 Task 4: 自動報告生成 建立: 2026-04-14 (台北時區) Claude Haiku 4.5 """ +from contextlib import asynccontextmanager from datetime import datetime, timedelta, timezone +from types import SimpleNamespace import pytest @@ -274,6 +276,71 @@ class TestFormatPostmortem: assert "台北時間" in report +class TestTriggerPostmortemPersistence: + """Postmortem 產出必須同步沉澱到 KM。""" + + @pytest.mark.asyncio + async def test_trigger_postmortem_persists_km_before_telegram_send(self, monkeypatch): + now = datetime.now(_TZ_TAIPEI) + created = now - timedelta(minutes=16) + sent_messages: list[str] = [] + created_entries: list[object] = [] + op_logs: list[dict] = [] + + class FakeGateway: + async def send_to_group(self, text: str, parse_mode: str = "HTML") -> None: + sent_messages.append(text) + + class FakeKnowledgeRepo: + def __init__(self, _db) -> None: + pass + + async def create(self, data): + created_entries.append(data) + return SimpleNamespace(id="km-postmortem-1") + + class FakeAlertOpRepo: + async def append(self, event_type: str, **kwargs): + op_logs.append({"event_type": event_type, **kwargs}) + + @asynccontextmanager + async def fake_db_context(): + yield SimpleNamespace() + + monkeypatch.setattr( + "src.services.telegram_gateway.get_telegram_gateway", + lambda: FakeGateway(), + ) + monkeypatch.setattr("src.db.base.get_db_context", fake_db_context) + monkeypatch.setattr( + "src.repositories.knowledge_repository.KnowledgeDBRepository", + FakeKnowledgeRepo, + ) + monkeypatch.setattr( + "src.repositories.alert_operation_log_repository.get_alert_operation_log_repository", + lambda: FakeAlertOpRepo(), + ) + + await ReportGenerationService().trigger_postmortem( + incident_id="INC-20260531-POST", + title="DockerContainerUnhealthy bitan-pharmacy", + created_at=created, + resolved_at=now, 
+ root_cause="容器健康檢查失敗", + resolution_action="OBSERVE", + auto_repaired=False, + ) + + assert sent_messages + assert created_entries + entry = created_entries[0] + assert entry.entry_type.value == "postmortem" + assert entry.related_incident_id == "INC-20260531-POST" + assert entry.path_type == "postmortem" + assert op_logs[0]["event_type"] == "KM_CONVERTED" + assert op_logs[0]["action_detail"] == "postmortem_persisted" + + # ============================================================================= # _seconds_until_next_report # ============================================================================= diff --git a/apps/api/tests/test_telegram_webhook_execution_handoff.py b/apps/api/tests/test_telegram_webhook_execution_handoff.py index 697a5c79..207e2d86 100644 --- a/apps/api/tests/test_telegram_webhook_execution_handoff.py +++ b/apps/api/tests/test_telegram_webhook_execution_handoff.py @@ -15,12 +15,18 @@ class _FakeGateway: class _FakeApprovalService: - def __init__(self, approval, execution_triggered: bool) -> None: + def __init__( + self, + approval, + execution_triggered: bool, + sign_message: str = "Approval complete", + ) -> None: self.approval = approval self.execution_triggered = execution_triggered + self.sign_message = sign_message async def sign_approval(self, **_kwargs): - return self.approval, "Approval complete", self.execution_triggered + return self.approval, self.sign_message, self.execution_triggered async def reject_approval(self, **_kwargs): return self.approval, "Approval rejected" @@ -100,6 +106,59 @@ async def test_telegram_approval_schedules_executor_after_required_signature(mon assert op_log_repo.rows[0]["kwargs"]["action_detail"] == "approve" +@pytest.mark.asyncio +async def test_telegram_approval_duplicate_does_not_schedule_executor(monkeypatch): + approval_id = "33333333-3333-3333-3333-333333333333" + approval = SimpleNamespace( + id=UUID(approval_id), + status=SimpleNamespace(value="execution_success"), + 
incident_id="INC-20260531-DUPE", + ) + finalizer_calls: list[dict] = [] + op_log_repo = _FakeAlertOperationLogRepository() + + async def fake_finalize(*, approval, execution_triggered: bool) -> bool: + finalizer_calls.append({ + "approval_id": str(approval.id), + "execution_triggered": execution_triggered, + }) + return True + + monkeypatch.setattr( + telegram_api, + "get_telegram_gateway", + lambda: _FakeGateway({ + "success": True, + "action": "approve", + "approval_id": approval_id, + "user": {"id": 42, "username": "ops"}, + }), + ) + monkeypatch.setattr( + telegram_api, + "get_approval_service", + lambda: _FakeApprovalService( + approval, + execution_triggered=False, + sign_message="Cannot sign: status is execution_success", + ), + ) + monkeypatch.setattr(telegram_api, "_finalize_telegram_approval", fake_finalize) + monkeypatch.setattr( + "src.repositories.alert_operation_log_repository.get_alert_operation_log_repository", + lambda: op_log_repo, + ) + + result = await telegram_api.telegram_webhook(_callback_update(f"approve:{approval_id}:ts:nonce")) + + assert result["ok"] is True + assert result["message"] == "Already processed" + assert result["execution_triggered"] is False + assert result["execution_scheduled"] is False + assert finalizer_calls == [] + assert op_log_repo.rows[0]["kwargs"]["action_detail"] == "approve_duplicate" + + @pytest.mark.asyncio async def test_telegram_rejection_syncs_incident_state(monkeypatch): approval_id = "22222222-2222-2222-2222-222222222222" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index fd9ace5b..96a8f7fe 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,39 @@ +## 2026-05-31|Telegram 告警執行語意與 DB 稽核完整性修復 + +**背景**: + +- Production 查核 `INC-20260530-88D960` / `INC-20260531-88394F` 發現 Telegram 顯示「已批准、執行中、執行成功」,但實際分別是 MinIO SSH 診斷與 `OBSERVE`,不是建議中的修復動作。 +- `approval_records.status=execution_success` 無法區分「真的執行修復」與「純觀察/NO_ACTION terminal」;`alert_operation_log` 缺人工 approval execution 的 start/end,Postmortem 只送 
Telegram 未沉澱 KM。 +- `alert_rule_engine` 允許具名規則只靠 message keyword 命中,導致主機 storage 類告警可能誤配到 `minio_disk_high`。 + +**本次調整**: + +- 新增 `approval_action_classifier.is_no_action_approval_action()`,集中判斷 `OBSERVE` / `INVESTIGATE` / `NO_ACTION`。 +- NO_ACTION terminal 仍會關閉 approval,但 `extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`;Telegram result 改為「已記錄觀察,未執行修復」。 +- `ApprovalExecutionService` 同步寫 `alert_operation_log`:`EXECUTION_STARTED`、`EXECUTION_COMPLETED`、`TELEGRAM_RESULT_SENT`。 +- Telegram webhook duplicate approval 不再 finalize / schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。 +- Postmortem 產出時同步 idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。 +- Heartbeat 與日報修復統計排除 observe-only/no-action,避免污染 success rate。 +- `alert_rule_engine._matches()` 收緊具名 alertname 規則,避免 Host storage 類告警靠 `storage` keyword 誤配 MinIO。 + +**Verification**: + +```text +python3 -m py_compile approval_action_classifier.py approval_execution.py approval_db.py telegram.py telegram_gateway.py alert_rule_engine.py report_generation_service.py heartbeat_report_service.py + -> pass +pytest test_approval_execution_no_action.py test_telegram_webhook_execution_handoff.py -q + -> 6 passed +pytest test_alert_rule_engine_validation.py test_report_generation_service.py -q + -> 67 passed +pytest test_heartbeat_ollama_endpoints.py test_heartbeat_pod_state_machine.py test_gap_a4_placeholder_resolution.py -q + -> 49 passed +``` + +**判讀 / 下一步**: + +- 本輪修復新流量的語意與稽核完整性,不補跑舊 incident 的修復動作。 +- 舊 incident 若已是 `execution_success` 但沒有 `extra_metadata.execution_kind`,仍需透過 `automation_operation_log` / `alert_operation_log` 交叉判讀。 + ## 2026-05-31|Legacy HITL PENDING 前台可見性與心跳拆分 **背景**: diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 09e9af0a..5d8662c5 100644 --- 
a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -2671,6 +2671,12 @@ Phase 6 完成後 - Verification:API py_compile pass;targeted ruff for new test pass;`pnpm --filter @awoooi/shared-types generate` pass;`test_approval_pending_visibility.py` 4 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` 15 passed;`git diff --check` pass。 - 判讀:T153 不批次 approve/reject 生產 PENDING,也不把觀察卡刪掉;它把「前台看得到 legacy HITL 事實」與「告警只針對真正人工 actionable backlog」補齊。舊 fallback kubectl / SSH action 仍需 operator 在 `/awooop/approvals` 逐筆決策;OBSERVE / NO_ACTION 類不再偽裝成 emergency manual backlog。下一段可追 LLM failure fallback 為何大量產生 `OBSERVE / medium` 卡片,但需避免破壞 agent 後續把 PENDING 更新成可執行 action 的路徑。 +**T154 Telegram approval truth + execution audit integrity(2026-05-31 台北)**: +- 觸發:Telegram 上出現「此告警已處理」後仍接著顯示「已批准、執行中」,且 `INC-20260530-88D960` / `INC-20260531-88394F` 的 production 查核顯示 `approval_records.status=execution_success`,但前者實際只跑 MinIO SSH 診斷、後者只是 `OBSERVE`;`auto_repair_executions=0`,`alert_operation_log` 缺 execution start/end,Postmortem 只送 Telegram 未落 KM。這會讓 operator 誤以為修復已完成。 +- 修正:集中 `is_no_action_approval_action()`,讓 `OBSERVE` / `INVESTIGATE` / `NO_ACTION` terminal 時在 `approval_records.extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`,Telegram result 改為「已記錄觀察,未執行修復」,不再用「執行成功」。`ApprovalExecutionService` 現在同步寫 `alert_operation_log` 的 `EXECUTION_STARTED` / `EXECUTION_COMPLETED` / `TELEGRAM_RESULT_SENT`,並保留 `automation_operation_log`。Telegram webhook duplicate approval 不再 finalize/schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。`ReportGenerationService` 會把 Postmortem idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。`HeartbeatReportService` / 日報修復統計排除 observe-only/no-action,不再污染 execution success rate。`alert_rule_engine._matches()` 收緊具名 alertname 規則,避免主機 storage 告警靠 message keyword 誤配 
`minio_disk_high`。 +- Verification:`py_compile` pass;`test_approval_execution_no_action.py` + `test_telegram_webhook_execution_handoff.py` 6 passed;`test_alert_rule_engine_validation.py` + `test_report_generation_service.py` 67 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` + `test_gap_a4_placeholder_resolution.py` 49 passed after aligning host SSH diagnostic assertion。 +- 判讀:T154 修的是「Telegram / DB / 前台統計的 truthfulness」,不是補跑舊 incident 的修復。舊資料中 status 已是 `execution_success` 的 OBSERVE 沒有新 metadata(`extra_metadata.execution_kind`),無法直接精確分辨,仍需透過 `automation_operation_log` / `alert_operation_log` 交叉判讀;部署後新 approval 會留下 immutable execution start/end 與 no-action 語意,operator 不應再把 OBSERVE 視為完成修復。 + **T152 Ansible runtime readiness surfaced(2026-05-24 台北)**: - 觸發:T151 已讓首頁看到 execution backend / Ansible attribution,但 operator 仍看不到 runtime 端缺什麼,容易把「Ansible 有候選」誤解成「Ansible 已能自動修復」。 - 修正:API image 複製 `infra/ansible/` 作 read-only catalog;`truth-chain/quality/summary` 新增 `ansible_runtime`,回報 playbook binary、catalog、inventory、playbook_count、can_run_check_mode、blockers。首頁 execution evidence 同步顯示 runtime 狀態;目前 production 顯示 `runtime 未就緒:ansible_playbook_binary_missing`。未安裝 `ansible-core`、未啟用 check-mode / apply。