fix(alerts): correct telegram execution truth
Some checks failed
CD Pipeline / tests (push) Failing after 52s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 11s

This commit is contained in:
Your Name
2026-05-31 13:58:21 +08:00
parent 943a6feacf
commit e2ab879636
15 changed files with 624 additions and 49 deletions

View File

@@ -275,6 +275,29 @@ async def telegram_webhook(
)
if approval:
status_value = approval.status.value if hasattr(approval.status, "value") else str(approval.status)
if (
"Cannot sign" in msg
or "already signed" in msg
or "Concurrent modification" in msg
):
logger.info(
"telegram_approval_ignored_already_processed",
approval_id=approval_id,
user_id=user_id,
status=status_value,
message=msg,
)
await _log_user_action("approve_duplicate", False, getattr(approval, "incident_id", None))
return {
"ok": True,
"message": "Already processed",
"approval_id": approval_id,
"status": status_value,
"execution_triggered": False,
"execution_scheduled": False,
}
execution_scheduled = await _finalize_telegram_approval(
approval=approval,
execution_triggered=execution_triggered,
@@ -283,7 +306,7 @@ async def telegram_webhook(
"telegram_approval_signed",
approval_id=approval_id,
user_id=user_id,
status=approval.status.value,
status=status_value,
execution_triggered=execution_triggered,
execution_scheduled=execution_scheduled,
)
@@ -291,9 +314,9 @@ async def telegram_webhook(
return {
"ok": True,
"message": "Approved",
"message": "Approved" if execution_triggered else "Signed",
"approval_id": approval_id,
"status": approval.status.value,
"status": status_value,
"execution_triggered": execution_triggered,
"execution_scheduled": execution_scheduled,
}

View File

@@ -298,6 +298,12 @@ def _matches(rule: dict, alertname: str, alert_type: str, message: str, instance
if alertnames and alertname in alertnames:
return True
# 2026-05-31 ogt + Codex: 有明確 alertname 的規則不得只靠寬鬆 message
# keyword 命中,否則 HostPreviousBootStorageErrorsDetected 這類主機 storage
# 告警會誤配到 minio_disk_high。
if alertnames and alertname and alertname != "custom":
return False
# alert_type 部分匹配
for kw in match.get("alert_type", []):
if kw.lower() in alert_type.lower():

View File

@@ -0,0 +1,26 @@
"""
Approval action classifier
==========================
2026-05-31 ogt + Codex: Telegram 告警鏈路一致性修復。
將 OBSERVE / INVESTIGATE / NO_ACTION 這類「純觀察、未執行修復」的
判斷集中,避免 execution、Telegram、統計各自用不同語意。
"""
from __future__ import annotations
def is_no_action_approval_action(action: str | None) -> bool:
    """Tell whether an approval action only records an observation.

    A missing or blank action counts as no-action. So does any action that
    contains a NO_ACTION spelling or the "(未設)" placeholder, or whose
    stripped text starts with OBSERVE / INVESTIGATE (case-insensitive).
    """
    cleaned = (action or "").strip()
    folded = cleaned.upper()

    # Guard clause: nothing to execute at all.
    if cleaned == "":
        return True

    # Prefix verbs that mark a pure observation / investigation.
    if folded.startswith(("OBSERVE", "INVESTIGATE")):
        return True

    # Placeholder is matched on the original text, not the upper-cased one.
    if "(未設)" in cleaned:
        return True

    return any(
        marker in folded for marker in ("NO_ACTION", "NO-ACTION", "NOACTION")
    )

View File

@@ -659,6 +659,7 @@ class ApprovalDBService:
approval_id: UUID,
success: bool,
error_message: str | None = None,
execution_kind: str | None = None,
) -> None:
"""
更新執行狀態
@@ -669,21 +670,36 @@ class ApprovalDBService:
"""
async with get_db_context() as db:
status = ApprovalStatus.EXECUTION_SUCCESS if success else ApprovalStatus.EXECUTION_FAILED
values: dict = {"status": status}
result = await db.execute(
select(ApprovalRecord).where(ApprovalRecord.id == str(approval_id))
)
record = result.scalar_one_or_none()
if record is None:
logger.warning(
"approval_execution_status_update_missing",
id=str(approval_id),
success=success,
)
return
record.status = status
if not success and error_message:
# 截斷至合理長度,避免爆欄位
values["rejection_reason"] = str(error_message)[:2000]
await db.execute(
update(ApprovalRecord)
.where(ApprovalRecord.id == str(approval_id))
.values(**values)
)
record.rejection_reason = str(error_message)[:2000]
if execution_kind:
# 2026-05-31 ogt + Codex: OBSERVE/NO_ACTION 仍需 terminal 狀態,
# 但前台/報表必須能分辨「未執行修復」而非真正 execution success。
metadata = dict(record.extra_metadata or {})
metadata["execution_kind"] = execution_kind
metadata["repair_executed"] = execution_kind != "no_action"
record.extra_metadata = metadata
logger.info(
"approval_execution_status_updated",
id=str(approval_id),
success=success,
has_error=bool(error_message),
execution_kind=execution_kind,
)
async def update_incident_id(self, approval_id: UUID, incident_id: str) -> None:

View File

@@ -36,6 +36,7 @@ from src.db.base import get_db_context
from src.models.approval import ApprovalRequest
from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError
from src.plugins.mcp.interfaces import MCPToolResult
from src.services.approval_action_classifier import is_no_action_approval_action
from src.services.approval_db import get_approval_service, get_timeline_service
from src.services.executor import ExecutionResult, OperationType, get_executor
from src.services.operation_parser import parse_operation_from_action
@@ -165,6 +166,7 @@ class ApprovalExecutionService:
# ADR-090 § 自動化動作回灌 (2026-04-19): 主流程開始即在 aol 留痕,
# 結束時 update。不依賴 fire-and-forget,確保 33 件/7d approval 全部可觀測。
_aol_op_id = await self._log_aol_started(approval)
await self._log_alert_execution_started(approval, aol_op_id=_aol_op_id)
_aol_started_ms = time.time()
service = get_approval_service()
@@ -228,15 +230,7 @@ class ApprovalExecutionService:
# 2026-04-19 ogt + Claude Opus 4.7: 區分 NO_ACTION vs 真解析失敗
# NO_ACTION 是 AI 刻意選的「純調查不破壞」,不該誤標 EXECUTION_FAILED
# 污染 auto_execute 成功率 KPI (MASTER §7.1 #11)
_action_upper = (approval.action or "").upper()
_is_no_action = (
"NO_ACTION" in _action_upper
or "NO-ACTION" in _action_upper
or "NOACTION" in _action_upper
or "(未設)" in approval.action
or _action_upper.startswith("OBSERVE")
or _action_upper.startswith("INVESTIGATE")
)
_is_no_action = is_no_action_approval_action(approval.action)
if _is_no_action:
logger.info(
@@ -246,13 +240,17 @@ class ApprovalExecutionService:
reason="NO_ACTION - 純調查/觀察類,不執行破壞動作",
path="no_action",
)
# 標為 SUCCESS (觀察/調查本身就是成功完成)
await service.update_execution_status(approval.id, success=True)
# 仍以 terminal success 關閉簽核,但 metadata 明確標記未執行修復。
await service.update_execution_status(
approval.id,
success=True,
execution_kind="no_action",
)
await timeline.add_event(
event_type="exec",
status="success",
title=" 純觀察類動作完成 (NO_ACTION)",
description=f"Action: {approval.action[:120]}",
title=" 純觀察類動作已記錄(未執行修復)",
description=f"Action: {(approval.action or '')[:120]}",
actor="leWOOOgo",
actor_role="executor",
approval_id=str(approval.id),
@@ -269,7 +267,22 @@ class ApprovalExecutionService:
op_id=_aol_op_id,
status="success",
duration_ms=int((time.time() - _aol_started_ms) * 1000),
output={"reason": "NO_ACTION", "action": approval.action[:200]},
output={
"reason": "NO_ACTION",
"execution_kind": "no_action",
"repair_executed": False,
"action": (approval.action or "")[:200],
},
)
await self._log_alert_execution_completed(
approval,
success=True,
execution_kind="no_action",
duration_ms=int((time.time() - _aol_started_ms) * 1000),
output={
"reason": "NO_ACTION",
"repair_executed": False,
},
)
# F2 (2026-05-07 ogt + Claude Sonnet 4.6 + Codex):
# NO_ACTION 路徑要把 incident 推到 RESOLVED否則 incident 永遠卡
@@ -336,6 +349,13 @@ class ApprovalExecutionService:
duration_ms=int((time.time() - _aol_started_ms) * 1000),
error=f"parse_fail: {approval.action[:300]}",
)
await self._log_alert_execution_completed(
approval,
success=False,
execution_kind="parse_failed",
duration_ms=int((time.time() - _aol_started_ms) * 1000),
error_message=f"Could not parse operation type from action: {approval.action[:150]}",
)
return False # 解析失敗 → 執行未發生
executor = get_executor()
@@ -553,6 +573,20 @@ class ApprovalExecutionService:
"total_attempts": total_attempts,
},
)
await self._log_alert_execution_completed(
approval,
success=True,
execution_kind=operation_type.value,
duration_ms=int((time.time() - _aol_started_ms) * 1000),
output={
"operation_type": operation_type.value,
"resource_name": resource_name,
"namespace": namespace,
"executor_duration_ms": result.duration_ms,
"total_attempts": total_attempts,
"repair_executed": True,
},
)
return True # K8s 執行成功
else:
@@ -654,6 +688,22 @@ class ApprovalExecutionService:
error=result.error,
stderr=result.error, # E6 stderr 回灌 — 給 retry/Playbook 負向強化用
)
await self._log_alert_execution_completed(
approval,
success=False,
execution_kind=operation_type.value,
duration_ms=int((time.time() - _aol_started_ms) * 1000),
output={
"operation_type": operation_type.value,
"resource_name": resource_name,
"namespace": namespace,
"executor_duration_ms": result.duration_ms,
"total_attempts": total_attempts,
"repair_attempted": True,
"repair_executed": False,
},
error_message=result.error,
)
return False # K8s 執行失敗
async def _execute_ssh_host_action(
@@ -919,7 +969,14 @@ class ApprovalExecutionService:
except Exception:
pass
if success:
no_action = success and is_no_action_approval_action(approval.action)
if no_action:
text = (
f" <b>已記錄觀察,未執行修復</b>\n"
f"<code>{(approval.action or '')[:180]}</code>"
f"{km_info}"
)
elif success:
text = (
f"✅ <b>執行成功</b>\n"
f"<code>{(approval.action or '')[:180]}</code>"
@@ -948,8 +1005,34 @@ class ApprovalExecutionService:
incident_id=approval.incident_id,
approval_id=str(approval.id),
success=success,
no_action=no_action,
orig_msg_id=orig_msg_id,
)
try:
from src.repositories.alert_operation_log_repository import (
get_alert_operation_log_repository,
)
await get_alert_operation_log_repository().append(
"TELEGRAM_RESULT_SENT",
incident_id=approval.incident_id,
approval_id=str(approval.id),
actor="approval_execution",
action_detail="telegram_execution_result_sent",
success=success,
error_message=error,
context={
"reply_to_message_id": orig_msg_id,
"execution_kind": "no_action" if no_action else "execution",
"repair_executed": not no_action and success,
},
)
except Exception as _log_e:
logger.warning(
"alert_op_telegram_result_write_failed",
approval_id=str(approval.id),
error=str(_log_e),
)
except Exception as e:
logger.warning(
"push_execution_result_failed",
@@ -1592,6 +1675,85 @@ class ApprovalExecutionService:
# 22 筆 notification_formatted。修復後每次執行都留痕。
# =========================================================================
async def _log_alert_execution_started(
    self,
    approval: ApprovalRequest,
    *,
    aol_op_id: str | None,
) -> None:
    """Best-effort write of an EXECUTION_STARTED row to alert_operation_log.

    The row carries the action text (capped at 500 characters), the
    automation operation id handed in by the caller, and whether the action
    classifies as no-action or executable. Any failure is logged as a
    warning and swallowed, so the caller is never interrupted by this write.
    """
    try:
        # Lazy import inside the guarded block: an import failure is
        # downgraded to the warning below as well.
        from src.repositories.alert_operation_log_repository import (
            get_alert_operation_log_repository,
        )

        op_log = get_alert_operation_log_repository()

        if is_no_action_approval_action(approval.action):
            kind = "no_action"
        else:
            kind = "executable"

        start_context = {
            "action": (approval.action or "")[:500],
            "automation_operation_id": aol_op_id,
            "execution_kind": kind,
            # Nothing has been attempted yet at the start event.
            "repair_attempted": False,
            "repair_executed": False,
        }

        await op_log.append(
            "EXECUTION_STARTED",
            incident_id=approval.incident_id,
            approval_id=str(approval.id),
            actor="approval_execution",
            action_detail="approval_execution_started",
            success=None,
            context=start_context,
        )
    except Exception as exc:
        logger.warning(
            "alert_op_execution_started_write_failed",
            approval_id=str(approval.id),
            incident_id=approval.incident_id,
            error=str(exc),
        )
async def _log_alert_execution_completed(
    self,
    approval: ApprovalRequest,
    *,
    success: bool,
    execution_kind: str,
    duration_ms: int,
    output: dict | None = None,
    error_message: str | None = None,
) -> None:
    """Best-effort write of an EXECUTION_COMPLETED row to alert_operation_log.

    Args:
        approval: Approval whose id, incident_id and action are recorded.
        success: Outcome flag stored on the row.
        execution_kind: Stored in the context and appended to action_detail
            as ``approval_execution_<execution_kind>``.
        duration_ms: Elapsed time supplied by the caller.
        output: Optional extra context keys. They are merged after the base
            keys, so a clashing key in ``output`` wins.
        error_message: Optional failure text; stringified and capped at
            2000 characters before it is stored.

    Failures (including the lazy import) are logged as a warning and
    swallowed, so the caller is never interrupted by this write.
    """
    try:
        from src.repositories.alert_operation_log_repository import (
            get_alert_operation_log_repository,
        )

        context = {
            "action": (approval.action or "")[:500],
            "duration_ms": duration_ms,
            "execution_kind": execution_kind,
            **(output or {}),
        }
        # Coerce before slicing: a truthy non-str error (e.g. an exception
        # object) would raise TypeError on [:2000], and the broad except
        # below would then drop the whole completion row.
        truncated_error = str(error_message)[:2000] if error_message else None

        await get_alert_operation_log_repository().append(
            "EXECUTION_COMPLETED",
            incident_id=approval.incident_id,
            approval_id=str(approval.id),
            actor="approval_execution",
            action_detail=f"approval_execution_{execution_kind}",
            success=success,
            error_message=truncated_error,
            context=context,
        )
    except Exception as e:
        logger.warning(
            "alert_op_execution_completed_write_failed",
            approval_id=str(approval.id),
            incident_id=approval.incident_id,
            error=str(e),
        )
async def _log_aol_started(self, approval: ApprovalRequest) -> str | None:
"""
在 automation_operation_log 寫一筆 'pending' 紀錄,回傳 op_id 供 _log_aol_completed 更新。

View File

@@ -531,7 +531,9 @@ class HeartbeatReportService:
SELECT
*,
(
btrim(coalesce(action, '')) = ''
COALESCE(extra_metadata->>'execution_kind', '') = 'no_action'
OR COALESCE(extra_metadata->>'repair_executed', '') = 'false'
OR btrim(coalesce(action, '')) = ''
OR UPPER(action) LIKE 'OBSERVE%'
OR UPPER(action) LIKE 'INVESTIGATE%'
OR UPPER(action) LIKE 'NO_ACTION%'
@@ -556,9 +558,18 @@ class HeartbeatReportService:
WHERE UPPER(status::text) = 'PENDING'
AND telegram_message_id IS NULL
) AS pending_without_telegram,
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success,
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed,
COUNT(*) FILTER (WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED')) AS auto_resolved
COUNT(*) FILTER (
WHERE UPPER(status::text) = 'EXECUTION_SUCCESS'
AND NOT is_observe_only
) AS success,
COUNT(*) FILTER (
WHERE UPPER(status::text) = 'EXECUTION_FAILED'
AND NOT is_observe_only
) AS failed,
COUNT(*) FILTER (
WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED')
AND NOT is_observe_only
) AS auto_resolved
FROM scoped
"""))
row = r.one()

View File

@@ -232,11 +232,32 @@ class ReportGenerationService:
async with get_db_context() as db:
row = await db.execute(
text("""
WITH scoped AS (
SELECT
*,
(
COALESCE(extra_metadata->>'execution_kind', '') = 'no_action'
OR COALESCE(extra_metadata->>'repair_executed', '') = 'false'
OR btrim(coalesce(action, '')) = ''
OR UPPER(action) LIKE 'OBSERVE%'
OR UPPER(action) LIKE 'INVESTIGATE%'
OR UPPER(action) LIKE 'NO_ACTION%'
OR UPPER(action) LIKE '% NO_ACTION%'
OR UPPER(action) LIKE '%| NO_ACTION%'
) AS is_observe_only
FROM approval_records
WHERE created_at >= :since
)
SELECT
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success,
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed
FROM approval_records
WHERE created_at >= :since
COUNT(*) FILTER (
WHERE UPPER(status::text) = 'EXECUTION_SUCCESS'
AND NOT is_observe_only
) AS success,
COUNT(*) FILTER (
WHERE UPPER(status::text) = 'EXECUTION_FAILED'
AND NOT is_observe_only
) AS failed
FROM scoped
"""),
{"since": since},
)
@@ -460,6 +481,7 @@ class ReportGenerationService:
# 失敗時發送告警到 SRE 群組,避免靜默吞掉錯誤
import asyncio as _asyncio
report_text = self.format_postmortem(data)
await self._persist_postmortem_km(data, report_text)
from src.services.telegram_gateway import get_telegram_gateway
gateway = get_telegram_gateway()
@@ -510,6 +532,72 @@ class ReportGenerationService:
error=str(_fe),
)
async def _persist_postmortem_km(
self,
data: PostmortemData,
report_text: str,
) -> None:
"""Persist generated postmortem as an idempotent KM entry before Telegram send."""
# NOTE(review): the summary says "idempotent", but this body calls
# repo.create() unconditionally with no lookup by related_incident_id.
# Confirm KnowledgeDBRepository.create de-duplicates; otherwise a re-run
# for the same incident creates a second entry.
# Best-effort: every failure below (including the lazy imports) is caught,
# logged as "postmortem_km_persist_failed" and not re-raised.
try:
from src.db.base import get_db_context
from src.models.knowledge import (
EntrySource,
EntryStatus,
EntryType,
KnowledgeEntryCreate,
)
from src.repositories.alert_operation_log_repository import (
get_alert_operation_log_repository,
)
from src.repositories.knowledge_repository import KnowledgeDBRepository
async with get_db_context() as db:
repo = KnowledgeDBRepository(db)
entry = await repo.create(
KnowledgeEntryCreate(
# Title is capped at 255 characters.
title=f"Postmortem {data.incident_id}: {data.title}"[:255],
content=report_text,
entry_type=EntryType.POSTMORTEM,
category="postmortem",
tags=[
"postmortem",
"incident",
"telegram",
# Last tag records whether the incident was auto-repaired.
"auto_repaired" if data.auto_repaired else "human_intervention",
],
source=EntrySource.AI_EXTRACTED,
# The entry is created in REVIEW status.
status=EntryStatus.REVIEW,
related_incident_id=data.incident_id,
path_type="postmortem",
created_by="report_generation_service",
)
)
# Record the conversion in alert_operation_log, linking the new entry id.
await get_alert_operation_log_repository().append(
"KM_CONVERTED",
incident_id=data.incident_id,
actor="report_generation_service",
action_detail="postmortem_persisted",
success=True,
context={
"knowledge_entry_id": entry.id,
"entry_type": EntryType.POSTMORTEM.value,
"path_type": "postmortem",
"duration_minutes": round(data.duration_minutes, 2),
},
)
logger.info(
"postmortem_km_persisted",
incident_id=data.incident_id,
knowledge_entry_id=entry.id,
)
except Exception as e:
logger.warning(
"postmortem_km_persist_failed",
incident_id=data.incident_id,
error=str(e),
)
# =============================================================================
# 日度報告排程迴圈

View File

@@ -8377,9 +8377,7 @@ class TelegramGateway:
if action == "approve":
status_emoji = ""
status_text = f"<b>已批准</b> by {_html.escape(username)}"
# 2026-04-14 Claude Sonnet 4.6: 原「等待執行」誤導(實際沒有 gate 會卡住路徑)
# 批准後一律顯示「執行中」,真實結果由 _push_execution_result_to_alert reply 補上
suffix = "⚡ 執行中..."
suffix = "⚡ 執行中..." if execution_triggered else "已簽核,等待更多簽核"
else:
status_emoji = ""
status_text = f"<b>已拒絕</b> by {_html.escape(username)}"
@@ -8495,7 +8493,7 @@ class TelegramGateway:
# 2026-04-22 Claude Sonnet 4.6: 只有真正轉為 APPROVED 才發「執行中...」
# 非 PENDING 狀態下 sign_approval early-return → approval 是舊 record
# 此時不應發「執行中...」,應告知用戶告警已處理過
if approval.status == ApprovalStatus.APPROVED:
if approval.status == ApprovalStatus.APPROVED and execution_triggered:
# 2026-04-09 Claude Sonnet 4.6: 回應 Telegram — 更新訊息狀態 + answer callback
await self._notify_approval_result(
message_id=message_id,
@@ -8520,7 +8518,7 @@ class TelegramGateway:
# 原本 gate 用 execution_triggeredrace condition 時失效(樂觀鎖失敗)
# 改用 approval.status == APPROVED與 REST API 路徑 approvals.py:360 對齊)
# 用 Redis lock exec:{approval_id} 防重入REST + Telegram 同時簽核)
if approval.status == ApprovalStatus.APPROVED:
if approval.status == ApprovalStatus.APPROVED and execution_triggered:
import asyncio
from src.core.redis_client import get_redis

View File

@@ -18,7 +18,7 @@ Task 2.3: validate_kubectl_command() 白名單驗證
import pytest
from src.services.alert_rule_engine import validate_kubectl_command
from src.services.alert_rule_engine import match_rule, validate_kubectl_command
# =============================================================================
@@ -76,6 +76,49 @@ class TestValidKubectlCommands:
assert validate_kubectl_command(cmd) is False
class TestRuleMatchingSpecificity:
    """Rules that name an alertname must not be hit via a loose message keyword."""

    def test_host_storage_alert_does_not_match_minio_disk_rule(self):
        # Host-level storage alert: the message contains "storage", but the
        # alertname is not a MinIO one.
        host_labels = {
            "alertname": "HostPreviousBootStorageErrorsDetected",
            "instance": "192.168.0.110:9100",
        }
        alert_context = {
            "alert_type": "host",
            "severity": "critical",
            "source": "prometheus",
            "target_resource": "dirty-reboot-evidence",
            "namespace": "awoooi-prod",
            "message": "HostPreviousBootStorageErrorsDetected storage dirty reboot evidence",
            "labels": host_labels,
        }

        matched = match_rule(alert_context)

        assert matched is not None
        assert matched["rule_id"] != "minio_disk_high"
        command = matched.get("kubectl_command", "")
        assert "/data/minio" not in command

    def test_exact_minio_disk_alert_still_matches_minio_rule(self):
        # The genuine MinIO alertname must keep matching its own rule.
        minio_labels = {
            "alertname": "MinioDiskUsageHigh",
            "instance": "192.168.0.110:9000",
        }
        alert_context = {
            "alert_type": "storage",
            "severity": "critical",
            "source": "prometheus",
            "target_resource": "minio",
            "namespace": "awoooi-prod",
            "message": "MinIO disk usage high",
            "labels": minio_labels,
        }

        matched = match_rule(alert_context)

        assert matched is not None
        assert matched["rule_id"] == "minio_disk_high"
# =============================================================================
# 阻擋案例(應返回 False
# =============================================================================

View File

@@ -1,5 +1,4 @@
from types import SimpleNamespace
from unittest.mock import AsyncMock
import pytest
@@ -16,14 +15,17 @@ async def test_no_action_execution_resolves_incident_once(monkeypatch):
incident_id="INC-TEST-001",
)
incident_service = SimpleNamespace(resolve_incident=AsyncMock())
update_execution_status = AsyncMock()
timeline_add_event = AsyncMock()
alert_completed = AsyncMock(return_value=None)
monkeypatch.setattr(
"src.services.approval_execution.get_approval_service",
lambda: SimpleNamespace(update_execution_status=AsyncMock()),
lambda: SimpleNamespace(update_execution_status=update_execution_status),
)
monkeypatch.setattr(
"src.services.approval_execution.get_timeline_service",
lambda: SimpleNamespace(add_event=AsyncMock()),
lambda: SimpleNamespace(add_event=timeline_add_event),
)
monkeypatch.setattr(
"src.services.approval_execution.parse_operation_from_action",
@@ -43,12 +45,28 @@ async def test_no_action_execution_resolves_incident_once(monkeypatch):
"src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
AsyncMock(return_value=None),
)
monkeypatch.setattr(
"src.services.approval_execution.ApprovalExecutionService._log_alert_execution_started",
AsyncMock(return_value=None),
)
monkeypatch.setattr(
"src.services.approval_execution.ApprovalExecutionService._log_alert_execution_completed",
alert_completed,
)
# Act
result = await ApprovalExecutionService().execute_approved_action(approval)
# Assert
assert result is True
update_execution_status.assert_awaited_once_with(
approval.id,
success=True,
execution_kind="no_action",
)
assert "未執行修復" in timeline_add_event.await_args.kwargs["title"]
assert alert_completed.await_args.kwargs["execution_kind"] == "no_action"
assert alert_completed.await_args.kwargs["output"]["repair_executed"] is False
incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-001")
@@ -67,10 +85,11 @@ async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch)
incident_service = SimpleNamespace(
resolve_incident=AsyncMock(side_effect=RuntimeError("redis down"))
)
update_execution_status = AsyncMock()
monkeypatch.setattr(
"src.services.approval_execution.get_approval_service",
lambda: SimpleNamespace(update_execution_status=AsyncMock()),
lambda: SimpleNamespace(update_execution_status=update_execution_status),
)
monkeypatch.setattr(
"src.services.approval_execution.get_timeline_service",
@@ -94,8 +113,21 @@ async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch)
"src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
AsyncMock(return_value=None),
)
monkeypatch.setattr(
"src.services.approval_execution.ApprovalExecutionService._log_alert_execution_started",
AsyncMock(return_value=None),
)
monkeypatch.setattr(
"src.services.approval_execution.ApprovalExecutionService._log_alert_execution_completed",
AsyncMock(return_value=None),
)
result = await ApprovalExecutionService().execute_approved_action(approval)
assert result is True
update_execution_status.assert_awaited_once_with(
approval.id,
success=True,
execution_kind="no_action",
)
incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-002")

View File

@@ -181,7 +181,7 @@ class TestMatchRuleRejection:
"""垃圾 target 時 kubectl_command 必須被清空(降級 LLM"""
def test_bad_target_discards_kubectl_command(self):
"""真實 bugHostHighCpuLoad target=unknown → kubectl_command 應清空"""
"""HostHighCpuLoad target=unknown → 不得組裝成壞 kubectl target。"""
ctx = {
"alert_type": "high_cpu",
"severity": "warning",
@@ -192,10 +192,12 @@ class TestMatchRuleRejection:
"labels": {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"},
}
result = match_rule(ctx)
# 規則可能匹配host_high_cpu但 kubectl_command 必為空
# 規則可能匹配 host SSH 診斷;但不能把 HostHighCpuLoad 當成 K8s target。
if result is not None:
assert result["kubectl_command"] == "", \
f"bad target 應導致 kubectl_command 清空, got: {result['kubectl_command']!r}"
command = result["kubectl_command"]
assert command == "" or command.startswith("ssh "), \
f"bad target 不應組裝 kubectl 指令, got: {command!r}"
assert "deployment/HostHighCpuLoad" not in command
def test_good_target_preserves_kubectl_command(self):
"""真實 deployment 名稱時kubectl_command 正常組裝"""

View File

@@ -17,7 +17,9 @@ ADR-076 Task 4: 自動報告生成
建立: 2026-04-14 (台北時區) Claude Haiku 4.5
"""
from contextlib import asynccontextmanager
from datetime import datetime, timedelta, timezone
from types import SimpleNamespace
import pytest
@@ -274,6 +276,71 @@ class TestFormatPostmortem:
assert "台北時間" in report
class TestTriggerPostmortemPersistence:
"""Postmortem output must be persisted to KM at the same time."""
@pytest.mark.asyncio
async def test_trigger_postmortem_persists_km_before_telegram_send(self, monkeypatch):
# Incident window: created 16 minutes before "now".
now = datetime.now(_TZ_TAIPEI)
created = now - timedelta(minutes=16)
# Recorders filled in by the fakes below.
sent_messages: list[str] = []
created_entries: list[object] = []
op_logs: list[dict] = []
# Fake Telegram gateway: records the text instead of sending it.
class FakeGateway:
async def send_to_group(self, text: str, parse_mode: str = "HTML") -> None:
sent_messages.append(text)
# Fake KM repository: records the create payload and returns a stub entry id.
class FakeKnowledgeRepo:
def __init__(self, _db) -> None:
pass
async def create(self, data):
created_entries.append(data)
return SimpleNamespace(id="km-postmortem-1")
# Fake alert_operation_log repository: records each appended event.
class FakeAlertOpRepo:
async def append(self, event_type: str, **kwargs):
op_logs.append({"event_type": event_type, **kwargs})
# Stand-in for the DB context manager; yields a dummy session object.
@asynccontextmanager
async def fake_db_context():
yield SimpleNamespace()
# Patch at the source modules, since the service imports these lazily.
monkeypatch.setattr(
"src.services.telegram_gateway.get_telegram_gateway",
lambda: FakeGateway(),
)
monkeypatch.setattr("src.db.base.get_db_context", fake_db_context)
monkeypatch.setattr(
"src.repositories.knowledge_repository.KnowledgeDBRepository",
FakeKnowledgeRepo,
)
monkeypatch.setattr(
"src.repositories.alert_operation_log_repository.get_alert_operation_log_repository",
lambda: FakeAlertOpRepo(),
)
await ReportGenerationService().trigger_postmortem(
incident_id="INC-20260531-POST",
title="DockerContainerUnhealthy bitan-pharmacy",
created_at=created,
resolved_at=now,
root_cause="容器健康檢查失敗",
resolution_action="OBSERVE",
auto_repaired=False,
)
# NOTE(review): the test name says "before telegram send", but the
# assertions below only check that both the send and the KM write
# happened, not their relative order.
assert sent_messages
assert created_entries
entry = created_entries[0]
assert entry.entry_type.value == "postmortem"
assert entry.related_incident_id == "INC-20260531-POST"
assert entry.path_type == "postmortem"
assert op_logs[0]["event_type"] == "KM_CONVERTED"
assert op_logs[0]["action_detail"] == "postmortem_persisted"
# =============================================================================
# _seconds_until_next_report
# =============================================================================

View File

@@ -15,12 +15,18 @@ class _FakeGateway:
class _FakeApprovalService:
def __init__(self, approval, execution_triggered: bool) -> None:
def __init__(
self,
approval,
execution_triggered: bool,
sign_message: str = "Approval complete",
) -> None:
self.approval = approval
self.execution_triggered = execution_triggered
self.sign_message = sign_message
async def sign_approval(self, **_kwargs):
return self.approval, "Approval complete", self.execution_triggered
return self.approval, self.sign_message, self.execution_triggered
async def reject_approval(self, **_kwargs):
return self.approval, "Approval rejected"
@@ -100,6 +106,59 @@ async def test_telegram_approval_schedules_executor_after_required_signature(mon
assert op_log_repo.rows[0]["kwargs"]["action_detail"] == "approve"
@pytest.mark.asyncio
async def test_telegram_approval_duplicate_does_not_schedule_executor(monkeypatch):
# Scenario: an approve callback arrives for an approval that is already in
# a terminal state, so sign_approval answers with a "Cannot sign" message.
approval_id = "33333333-3333-3333-3333-333333333333"
approval = SimpleNamespace(
id=UUID(approval_id),
status=SimpleNamespace(value="execution_success"),
incident_id="INC-20260531-DUPE",
)
finalizer_calls: list[dict] = []
op_log_repo = _FakeAlertOperationLogRepository()
# Spy finalizer: records any call so the test can assert it was never reached.
async def fake_finalize(*, approval, execution_triggered: bool) -> bool:
finalizer_calls.append({
"approval_id": str(approval.id),
"execution_triggered": execution_triggered,
})
return True
# Gateway fake returns a parsed "approve" callback for this approval id.
monkeypatch.setattr(
telegram_api,
"get_telegram_gateway",
lambda: _FakeGateway({
"success": True,
"action": "approve",
"approval_id": approval_id,
"user": {"id": 42, "username": "ops"},
}),
)
# Approval service fake: no execution triggered, duplicate-style sign message.
monkeypatch.setattr(
telegram_api,
"get_approval_service",
lambda: _FakeApprovalService(
approval,
execution_triggered=False,
sign_message="Cannot sign: status is execution_success",
),
)
monkeypatch.setattr(telegram_api, "_finalize_telegram_approval", fake_finalize)
monkeypatch.setattr(
"src.repositories.alert_operation_log_repository.get_alert_operation_log_repository",
lambda: op_log_repo,
)
result = await telegram_api.telegram_webhook(_callback_update(f"approve:{approval_id}:ts:nonce"))
# The webhook must acknowledge the callback without finalizing or scheduling.
assert result["ok"] is True
assert result["message"] == "Already processed"
assert result["execution_triggered"] is False
assert result["execution_scheduled"] is False
assert finalizer_calls == []
# The user action is logged as a duplicate approval.
assert op_log_repo.rows[0]["kwargs"]["action_detail"] == "approve_duplicate"
@pytest.mark.asyncio
async def test_telegram_rejection_syncs_incident_state(monkeypatch):
approval_id = "22222222-2222-2222-2222-222222222222"

View File

@@ -1,3 +1,39 @@
## 2026-05-31:Telegram 告警執行語意與 DB 稽核完整性修復
**背景**
- Production 查核 `INC-20260530-88D960` / `INC-20260531-88394F` 發現 Telegram 顯示「已批准、執行中、執行成功」,但實際分別是 MinIO SSH 診斷與 `OBSERVE`,不是建議中的修復動作。
- `approval_records.status=execution_success` 無法區分「真的執行修復」與「純觀察/NO_ACTION terminal」;`alert_operation_log` 缺人工 approval execution 的 start/end;Postmortem 只送 Telegram 未沉澱 KM。
- `alert_rule_engine` 允許具名規則只靠 message keyword 命中,導致主機 storage 類告警可能誤配到 `minio_disk_high`
**本次調整**
- 新增 `approval_action_classifier.is_no_action_approval_action()`,集中判斷 `OBSERVE` / `INVESTIGATE` / `NO_ACTION`
- NO_ACTION terminal 仍會關閉 approval,`extra_metadata` 標記 `execution_kind=no_action`,`repair_executed=false`;Telegram result 改為「已記錄觀察,未執行修復」。
- `ApprovalExecutionService` 同步寫 `alert_operation_log`:`EXECUTION_STARTED`,`EXECUTION_COMPLETED`,`TELEGRAM_RESULT_SENT`
- Telegram webhook duplicate approval 不再 finalize / schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。
- Postmortem 產出時同步 idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`
- Heartbeat 與日報修復統計排除 observe-only/no-action,避免污染 success rate。
- `alert_rule_engine._matches()` 收緊具名 alertname 規則,避免 Host storage 類告警靠 `storage` keyword 誤配 MinIO。
**Verification**
```text
python3 -m py_compile approval_action_classifier.py approval_execution.py approval_db.py telegram.py telegram_gateway.py alert_rule_engine.py report_generation_service.py heartbeat_report_service.py
-> pass
pytest test_approval_execution_no_action.py test_telegram_webhook_execution_handoff.py -q
-> 6 passed
pytest test_alert_rule_engine_validation.py test_report_generation_service.py -q
-> 67 passed
pytest test_heartbeat_ollama_endpoints.py test_heartbeat_pod_state_machine.py test_gap_a4_placeholder_resolution.py -q
-> 49 passed
```
**判讀 / 下一步**
- 本輪修復新流量的語意與稽核完整性,不補跑舊 incident 的修復動作。
- 舊 incident 若已是 `execution_success` 但沒有 `extra_metadata.execution_kind`,仍需透過 `automation_operation_log` / `alert_operation_log` 交叉判讀。
## 2026-05-31:Legacy HITL PENDING 前台可見性與心跳拆分
**背景**

View File

@@ -2671,6 +2671,12 @@ Phase 6 完成後
- Verification:API py_compile pass;targeted ruff for new test pass;`pnpm --filter @awoooi/shared-types generate` pass;`test_approval_pending_visibility.py` 4 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` 15 passed;`git diff --check` pass。
- 判讀:T153 不批次 approve/reject 生產 PENDING,也不把觀察卡刪掉;它把「前台看得到 legacy HITL 事實」與「告警只針對真正人工 actionable backlog」補齊。舊 fallback kubectl / SSH action 仍需 operator 在 `/awooop/approvals` 逐筆決策;OBSERVE / NO_ACTION 類不再偽裝成 emergency manual backlog。下一段可追 LLM failure fallback 為何大量產生 `OBSERVE / medium` 卡片,但需避免破壞 agent 後續把 PENDING 更新成可執行 action 的路徑。
**T154 Telegram approval truth + execution audit integrity(2026-05-31 台北)**
- 觸發:Telegram 上出現「此告警已處理」後仍接著顯示「已批准、執行中」,且 `INC-20260530-88D960` / `INC-20260531-88394F` 的 production 查核顯示 `approval_records.status=execution_success`,但前者實際只跑 MinIO SSH 診斷、後者只是 `OBSERVE`;`auto_repair_executions=0`;`alert_operation_log` 缺 execution start/end;Postmortem 只送 Telegram 未落 KM。這會讓 operator 誤以為修復已完成。
- 修正:集中 `is_no_action_approval_action()`,讓 `OBSERVE` / `INVESTIGATE` / `NO_ACTION` terminal 時在 `approval_records.extra_metadata` 標記 `execution_kind=no_action`,`repair_executed=false`;Telegram result 改為「已記錄觀察,未執行修復」,不再用「執行成功」。`ApprovalExecutionService` 現在同步寫 `alert_operation_log`:`EXECUTION_STARTED` / `EXECUTION_COMPLETED` / `TELEGRAM_RESULT_SENT`,並保留 `automation_operation_log`。Telegram webhook duplicate approval 不再 finalize/schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。`ReportGenerationService` 會把 Postmortem idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`;`HeartbeatReportService` / 日報修復統計排除 observe-only/no-action,不再污染 execution success rate。`alert_rule_engine._matches()` 收緊具名 alertname 規則,避免主機 storage 告警靠 message keyword 誤配 `minio_disk_high`。
- Verification:`py_compile` pass;`test_approval_execution_no_action.py` + `test_telegram_webhook_execution_handoff.py` 6 passed;`test_alert_rule_engine_validation.py` + `test_report_generation_service.py` 67 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` + `test_gap_a4_placeholder_resolution.py` 49 passed after aligning host SSH diagnostic assertion。
- 判讀:T154 修的是「Telegram / DB / 前台統計的 truthfulness」不是補跑舊 incident 的修復。舊資料中 status 已是 `execution_success` 的 OBSERVE 仍需靠新 metadata 才能精確分辨;部署後新 approval 會留下 immutable execution start/end 與 no-action 語意,operator 不應再把 OBSERVE 視為完成修復。
**T152 Ansible runtime readiness surfaced(2026-05-24 台北)**
- 觸發:T151 已讓首頁看到 execution backend / Ansible attribution,但 operator 仍看不到 runtime 端缺什麼,容易把「Ansible 有候選」誤解成「Ansible 已能自動修復」。
- 修正:API image 複製 `infra/ansible/` 作 read-only catalog;`truth-chain/quality/summary` 新增 `ansible_runtime`,回報 playbook binary、catalog、inventory、playbook_count、can_run_check_mode、blockers。首頁 execution evidence 同步顯示 runtime 狀態;目前 production 顯示 `runtime 未就緒ansible_playbook_binary_missing`。未安裝 `ansible-core`、未啟用 check-mode / apply。