fix(alerts): correct telegram execution truth
This commit is contained in:
@@ -275,6 +275,29 @@ async def telegram_webhook(
|
||||
)
|
||||
|
||||
if approval:
|
||||
status_value = approval.status.value if hasattr(approval.status, "value") else str(approval.status)
|
||||
if (
|
||||
"Cannot sign" in msg
|
||||
or "already signed" in msg
|
||||
or "Concurrent modification" in msg
|
||||
):
|
||||
logger.info(
|
||||
"telegram_approval_ignored_already_processed",
|
||||
approval_id=approval_id,
|
||||
user_id=user_id,
|
||||
status=status_value,
|
||||
message=msg,
|
||||
)
|
||||
await _log_user_action("approve_duplicate", False, getattr(approval, "incident_id", None))
|
||||
return {
|
||||
"ok": True,
|
||||
"message": "Already processed",
|
||||
"approval_id": approval_id,
|
||||
"status": status_value,
|
||||
"execution_triggered": False,
|
||||
"execution_scheduled": False,
|
||||
}
|
||||
|
||||
execution_scheduled = await _finalize_telegram_approval(
|
||||
approval=approval,
|
||||
execution_triggered=execution_triggered,
|
||||
@@ -283,7 +306,7 @@ async def telegram_webhook(
|
||||
"telegram_approval_signed",
|
||||
approval_id=approval_id,
|
||||
user_id=user_id,
|
||||
status=approval.status.value,
|
||||
status=status_value,
|
||||
execution_triggered=execution_triggered,
|
||||
execution_scheduled=execution_scheduled,
|
||||
)
|
||||
@@ -291,9 +314,9 @@ async def telegram_webhook(
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
"message": "Approved",
|
||||
"message": "Approved" if execution_triggered else "Signed",
|
||||
"approval_id": approval_id,
|
||||
"status": approval.status.value,
|
||||
"status": status_value,
|
||||
"execution_triggered": execution_triggered,
|
||||
"execution_scheduled": execution_scheduled,
|
||||
}
|
||||
|
||||
@@ -298,6 +298,12 @@ def _matches(rule: dict, alertname: str, alert_type: str, message: str, instance
|
||||
if alertnames and alertname in alertnames:
|
||||
return True
|
||||
|
||||
# 2026-05-31 ogt + Codex: 有明確 alertname 的規則不得只靠寬鬆 message
|
||||
# keyword 命中,否則 HostPreviousBootStorageErrorsDetected 這類主機 storage
|
||||
# 告警會誤配到 minio_disk_high。
|
||||
if alertnames and alertname and alertname != "custom":
|
||||
return False
|
||||
|
||||
# alert_type 部分匹配
|
||||
for kw in match.get("alert_type", []):
|
||||
if kw.lower() in alert_type.lower():
|
||||
|
||||
26
apps/api/src/services/approval_action_classifier.py
Normal file
26
apps/api/src/services/approval_action_classifier.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""
|
||||
Approval action classifier
|
||||
==========================
|
||||
|
||||
2026-05-31 ogt + Codex: Telegram 告警鏈路一致性修復。
|
||||
將 OBSERVE / INVESTIGATE / NO_ACTION 這類「純觀察、未執行修復」的
|
||||
判斷集中,避免 execution、Telegram、統計各自用不同語意。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def is_no_action_approval_action(action: str | None) -> bool:
    """Return True when an approval action records observation instead of repair.

    The action text is stripped of surrounding whitespace before inspection.
    It is classified as "no action" when any of the following holds:

    * it is ``None``, empty, or whitespace only;
    * it starts with ``OBSERVE`` or ``INVESTIGATE`` (case-insensitive);
    * it contains the literal ``(未設)`` placeholder anywhere;
    * it contains ``NO_ACTION``, ``NO-ACTION`` or ``NOACTION`` anywhere
      (case-insensitive).

    Args:
        action: Raw approval action text; ``None`` is treated like ``""``.

    Returns:
        ``True`` for observation-only actions, ``False`` otherwise.
    """
    text = (action or "").strip()
    if not text:
        # Nothing was recorded as an action at all.
        return True

    upper = text.upper()
    # Prefix markers: only the start of the action text is considered.
    if upper.startswith(("OBSERVE", "INVESTIGATE")):
        return True
    # The placeholder is matched against the original (non-upper-cased) text.
    if "(未設)" in text:
        return True
    # Substring markers: any spelling of NO_ACTION anywhere in the text.
    return any(marker in upper for marker in ("NO_ACTION", "NO-ACTION", "NOACTION"))
|
||||
@@ -659,6 +659,7 @@ class ApprovalDBService:
|
||||
approval_id: UUID,
|
||||
success: bool,
|
||||
error_message: str | None = None,
|
||||
execution_kind: str | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
更新執行狀態
|
||||
@@ -669,21 +670,36 @@ class ApprovalDBService:
|
||||
"""
|
||||
async with get_db_context() as db:
|
||||
status = ApprovalStatus.EXECUTION_SUCCESS if success else ApprovalStatus.EXECUTION_FAILED
|
||||
values: dict = {"status": status}
|
||||
result = await db.execute(
|
||||
select(ApprovalRecord).where(ApprovalRecord.id == str(approval_id))
|
||||
)
|
||||
record = result.scalar_one_or_none()
|
||||
if record is None:
|
||||
logger.warning(
|
||||
"approval_execution_status_update_missing",
|
||||
id=str(approval_id),
|
||||
success=success,
|
||||
)
|
||||
return
|
||||
|
||||
record.status = status
|
||||
if not success and error_message:
|
||||
# 截斷至合理長度,避免爆欄位
|
||||
values["rejection_reason"] = str(error_message)[:2000]
|
||||
await db.execute(
|
||||
update(ApprovalRecord)
|
||||
.where(ApprovalRecord.id == str(approval_id))
|
||||
.values(**values)
|
||||
)
|
||||
record.rejection_reason = str(error_message)[:2000]
|
||||
if execution_kind:
|
||||
# 2026-05-31 ogt + Codex: OBSERVE/NO_ACTION 仍需 terminal 狀態,
|
||||
# 但前台/報表必須能分辨「未執行修復」而非真正 execution success。
|
||||
metadata = dict(record.extra_metadata or {})
|
||||
metadata["execution_kind"] = execution_kind
|
||||
metadata["repair_executed"] = execution_kind != "no_action"
|
||||
record.extra_metadata = metadata
|
||||
|
||||
logger.info(
|
||||
"approval_execution_status_updated",
|
||||
id=str(approval_id),
|
||||
success=success,
|
||||
has_error=bool(error_message),
|
||||
execution_kind=execution_kind,
|
||||
)
|
||||
|
||||
async def update_incident_id(self, approval_id: UUID, incident_id: str) -> None:
|
||||
|
||||
@@ -36,6 +36,7 @@ from src.db.base import get_db_context
|
||||
from src.models.approval import ApprovalRequest
|
||||
from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError
|
||||
from src.plugins.mcp.interfaces import MCPToolResult
|
||||
from src.services.approval_action_classifier import is_no_action_approval_action
|
||||
from src.services.approval_db import get_approval_service, get_timeline_service
|
||||
from src.services.executor import ExecutionResult, OperationType, get_executor
|
||||
from src.services.operation_parser import parse_operation_from_action
|
||||
@@ -165,6 +166,7 @@ class ApprovalExecutionService:
|
||||
# ADR-090 § 自動化動作回灌 (2026-04-19): 主流程開始即在 aol 留痕,
|
||||
# 結束時 update。不依賴 fire-and-forget,確保 33 件/7d approval 全部可觀測。
|
||||
_aol_op_id = await self._log_aol_started(approval)
|
||||
await self._log_alert_execution_started(approval, aol_op_id=_aol_op_id)
|
||||
_aol_started_ms = time.time()
|
||||
|
||||
service = get_approval_service()
|
||||
@@ -228,15 +230,7 @@ class ApprovalExecutionService:
|
||||
# 2026-04-19 ogt + Claude Opus 4.7: 區分 NO_ACTION vs 真解析失敗
|
||||
# NO_ACTION 是 AI 刻意選的「純調查不破壞」,不該誤標 EXECUTION_FAILED
|
||||
# 污染 auto_execute 成功率 KPI (MASTER §7.1 #11)
|
||||
_action_upper = (approval.action or "").upper()
|
||||
_is_no_action = (
|
||||
"NO_ACTION" in _action_upper
|
||||
or "NO-ACTION" in _action_upper
|
||||
or "NOACTION" in _action_upper
|
||||
or "(未設)" in approval.action
|
||||
or _action_upper.startswith("OBSERVE")
|
||||
or _action_upper.startswith("INVESTIGATE")
|
||||
)
|
||||
_is_no_action = is_no_action_approval_action(approval.action)
|
||||
|
||||
if _is_no_action:
|
||||
logger.info(
|
||||
@@ -246,13 +240,17 @@ class ApprovalExecutionService:
|
||||
reason="NO_ACTION - 純調查/觀察類,不執行破壞動作",
|
||||
path="no_action",
|
||||
)
|
||||
# 標為 SUCCESS (觀察/調查本身就是成功完成)
|
||||
await service.update_execution_status(approval.id, success=True)
|
||||
# 仍以 terminal success 關閉簽核,但 metadata 明確標記未執行修復。
|
||||
await service.update_execution_status(
|
||||
approval.id,
|
||||
success=True,
|
||||
execution_kind="no_action",
|
||||
)
|
||||
await timeline.add_event(
|
||||
event_type="exec",
|
||||
status="success",
|
||||
title="✅ 純觀察類動作完成 (NO_ACTION)",
|
||||
description=f"Action: {approval.action[:120]}",
|
||||
title="ℹ️ 純觀察類動作已記錄(未執行修復)",
|
||||
description=f"Action: {(approval.action or '')[:120]}",
|
||||
actor="leWOOOgo",
|
||||
actor_role="executor",
|
||||
approval_id=str(approval.id),
|
||||
@@ -269,7 +267,22 @@ class ApprovalExecutionService:
|
||||
op_id=_aol_op_id,
|
||||
status="success",
|
||||
duration_ms=int((time.time() - _aol_started_ms) * 1000),
|
||||
output={"reason": "NO_ACTION", "action": approval.action[:200]},
|
||||
output={
|
||||
"reason": "NO_ACTION",
|
||||
"execution_kind": "no_action",
|
||||
"repair_executed": False,
|
||||
"action": (approval.action or "")[:200],
|
||||
},
|
||||
)
|
||||
await self._log_alert_execution_completed(
|
||||
approval,
|
||||
success=True,
|
||||
execution_kind="no_action",
|
||||
duration_ms=int((time.time() - _aol_started_ms) * 1000),
|
||||
output={
|
||||
"reason": "NO_ACTION",
|
||||
"repair_executed": False,
|
||||
},
|
||||
)
|
||||
# F2 (2026-05-07 ogt + Claude Sonnet 4.6 + Codex):
|
||||
# NO_ACTION 路徑要把 incident 推到 RESOLVED,否則 incident 永遠卡
|
||||
@@ -336,6 +349,13 @@ class ApprovalExecutionService:
|
||||
duration_ms=int((time.time() - _aol_started_ms) * 1000),
|
||||
error=f"parse_fail: {approval.action[:300]}",
|
||||
)
|
||||
await self._log_alert_execution_completed(
|
||||
approval,
|
||||
success=False,
|
||||
execution_kind="parse_failed",
|
||||
duration_ms=int((time.time() - _aol_started_ms) * 1000),
|
||||
error_message=f"Could not parse operation type from action: {approval.action[:150]}",
|
||||
)
|
||||
return False # 解析失敗 → 執行未發生
|
||||
|
||||
executor = get_executor()
|
||||
@@ -553,6 +573,20 @@ class ApprovalExecutionService:
|
||||
"total_attempts": total_attempts,
|
||||
},
|
||||
)
|
||||
await self._log_alert_execution_completed(
|
||||
approval,
|
||||
success=True,
|
||||
execution_kind=operation_type.value,
|
||||
duration_ms=int((time.time() - _aol_started_ms) * 1000),
|
||||
output={
|
||||
"operation_type": operation_type.value,
|
||||
"resource_name": resource_name,
|
||||
"namespace": namespace,
|
||||
"executor_duration_ms": result.duration_ms,
|
||||
"total_attempts": total_attempts,
|
||||
"repair_executed": True,
|
||||
},
|
||||
)
|
||||
return True # K8s 執行成功
|
||||
|
||||
else:
|
||||
@@ -654,6 +688,22 @@ class ApprovalExecutionService:
|
||||
error=result.error,
|
||||
stderr=result.error, # E6 stderr 回灌 — 給 retry/Playbook 負向強化用
|
||||
)
|
||||
await self._log_alert_execution_completed(
|
||||
approval,
|
||||
success=False,
|
||||
execution_kind=operation_type.value,
|
||||
duration_ms=int((time.time() - _aol_started_ms) * 1000),
|
||||
output={
|
||||
"operation_type": operation_type.value,
|
||||
"resource_name": resource_name,
|
||||
"namespace": namespace,
|
||||
"executor_duration_ms": result.duration_ms,
|
||||
"total_attempts": total_attempts,
|
||||
"repair_attempted": True,
|
||||
"repair_executed": False,
|
||||
},
|
||||
error_message=result.error,
|
||||
)
|
||||
return False # K8s 執行失敗
|
||||
|
||||
async def _execute_ssh_host_action(
|
||||
@@ -919,7 +969,14 @@ class ApprovalExecutionService:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if success:
|
||||
no_action = success and is_no_action_approval_action(approval.action)
|
||||
if no_action:
|
||||
text = (
|
||||
f"ℹ️ <b>已記錄觀察,未執行修復</b>\n"
|
||||
f"<code>{(approval.action or '')[:180]}</code>"
|
||||
f"{km_info}"
|
||||
)
|
||||
elif success:
|
||||
text = (
|
||||
f"✅ <b>執行成功</b>\n"
|
||||
f"<code>{(approval.action or '')[:180]}</code>"
|
||||
@@ -948,8 +1005,34 @@ class ApprovalExecutionService:
|
||||
incident_id=approval.incident_id,
|
||||
approval_id=str(approval.id),
|
||||
success=success,
|
||||
no_action=no_action,
|
||||
orig_msg_id=orig_msg_id,
|
||||
)
|
||||
try:
|
||||
from src.repositories.alert_operation_log_repository import (
|
||||
get_alert_operation_log_repository,
|
||||
)
|
||||
|
||||
await get_alert_operation_log_repository().append(
|
||||
"TELEGRAM_RESULT_SENT",
|
||||
incident_id=approval.incident_id,
|
||||
approval_id=str(approval.id),
|
||||
actor="approval_execution",
|
||||
action_detail="telegram_execution_result_sent",
|
||||
success=success,
|
||||
error_message=error,
|
||||
context={
|
||||
"reply_to_message_id": orig_msg_id,
|
||||
"execution_kind": "no_action" if no_action else "execution",
|
||||
"repair_executed": not no_action and success,
|
||||
},
|
||||
)
|
||||
except Exception as _log_e:
|
||||
logger.warning(
|
||||
"alert_op_telegram_result_write_failed",
|
||||
approval_id=str(approval.id),
|
||||
error=str(_log_e),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"push_execution_result_failed",
|
||||
@@ -1592,6 +1675,85 @@ class ApprovalExecutionService:
|
||||
# 22 筆 notification_formatted。修復後每次執行都留痕。
|
||||
# =========================================================================
|
||||
|
||||
async def _log_alert_execution_started(
    self,
    approval: ApprovalRequest,
    *,
    aol_op_id: str | None,
) -> None:
    """Append an ``EXECUTION_STARTED`` event to the alert operation log.

    The event context carries the (truncated) action text, the id passed in
    as ``aol_op_id`` and whether the action is classified as ``no_action``
    or ``executable``.  The write is best effort: any exception is logged as
    a warning and not re-raised.

    Args:
        approval: Approval whose execution is starting.
        aol_op_id: Identifier stored as ``automation_operation_id``; may be None.
    """
    try:
        from src.repositories.alert_operation_log_repository import (
            get_alert_operation_log_repository,
        )

        op_log = get_alert_operation_log_repository()
        if is_no_action_approval_action(approval.action):
            kind = "no_action"
        else:
            kind = "executable"
        start_context = {
            "action": (approval.action or "")[:500],
            "automation_operation_id": aol_op_id,
            "execution_kind": kind,
            # Nothing has been attempted yet when the start event is written.
            "repair_attempted": False,
            "repair_executed": False,
        }
        await op_log.append(
            "EXECUTION_STARTED",
            incident_id=approval.incident_id,
            approval_id=str(approval.id),
            actor="approval_execution",
            action_detail="approval_execution_started",
            success=None,
            context=start_context,
        )
    except Exception as exc:
        # Audit logging must never break the execution flow itself.
        logger.warning(
            "alert_op_execution_started_write_failed",
            approval_id=str(approval.id),
            incident_id=approval.incident_id,
            error=str(exc),
        )
|
||||
|
||||
async def _log_alert_execution_completed(
    self,
    approval: ApprovalRequest,
    *,
    success: bool,
    execution_kind: str,
    duration_ms: int,
    output: dict | None = None,
    error_message: str | None = None,
) -> None:
    """Append an ``EXECUTION_COMPLETED`` event to the alert operation log.

    The write is best effort: any exception is logged as a warning and not
    re-raised.

    Args:
        approval: Approval whose execution finished.
        success: Outcome flag stored on the event.
        execution_kind: Stored in the context and used as the suffix of
            ``action_detail`` (``approval_execution_<execution_kind>``).
        duration_ms: Stored in the context as ``duration_ms``.
        output: Extra context entries; merged last, so on a key collision
            these values replace the base ones.
        error_message: Optional error text, truncated to 2000 characters;
            an empty or missing message is stored as ``None``.
    """
    try:
        from src.repositories.alert_operation_log_repository import (
            get_alert_operation_log_repository,
        )

        completion_context = {
            "action": (approval.action or "")[:500],
            "duration_ms": duration_ms,
            "execution_kind": execution_kind,
        }
        # Caller-supplied output is applied after the base keys.
        completion_context.update(output or {})

        truncated_error = error_message[:2000] if error_message else None
        op_log = get_alert_operation_log_repository()
        await op_log.append(
            "EXECUTION_COMPLETED",
            incident_id=approval.incident_id,
            approval_id=str(approval.id),
            actor="approval_execution",
            action_detail=f"approval_execution_{execution_kind}",
            success=success,
            error_message=truncated_error,
            context=completion_context,
        )
    except Exception as exc:
        # Audit logging must never break the execution flow itself.
        logger.warning(
            "alert_op_execution_completed_write_failed",
            approval_id=str(approval.id),
            incident_id=approval.incident_id,
            error=str(exc),
        )
|
||||
|
||||
async def _log_aol_started(self, approval: ApprovalRequest) -> str | None:
|
||||
"""
|
||||
在 automation_operation_log 寫一筆 'pending' 紀錄,回傳 op_id 供 _log_aol_completed 更新。
|
||||
|
||||
@@ -531,7 +531,9 @@ class HeartbeatReportService:
|
||||
SELECT
|
||||
*,
|
||||
(
|
||||
btrim(coalesce(action, '')) = ''
|
||||
COALESCE(extra_metadata->>'execution_kind', '') = 'no_action'
|
||||
OR COALESCE(extra_metadata->>'repair_executed', '') = 'false'
|
||||
OR btrim(coalesce(action, '')) = ''
|
||||
OR UPPER(action) LIKE 'OBSERVE%'
|
||||
OR UPPER(action) LIKE 'INVESTIGATE%'
|
||||
OR UPPER(action) LIKE 'NO_ACTION%'
|
||||
@@ -556,9 +558,18 @@ class HeartbeatReportService:
|
||||
WHERE UPPER(status::text) = 'PENDING'
|
||||
AND telegram_message_id IS NULL
|
||||
) AS pending_without_telegram,
|
||||
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success,
|
||||
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed,
|
||||
COUNT(*) FILTER (WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED')) AS auto_resolved
|
||||
COUNT(*) FILTER (
|
||||
WHERE UPPER(status::text) = 'EXECUTION_SUCCESS'
|
||||
AND NOT is_observe_only
|
||||
) AS success,
|
||||
COUNT(*) FILTER (
|
||||
WHERE UPPER(status::text) = 'EXECUTION_FAILED'
|
||||
AND NOT is_observe_only
|
||||
) AS failed,
|
||||
COUNT(*) FILTER (
|
||||
WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED')
|
||||
AND NOT is_observe_only
|
||||
) AS auto_resolved
|
||||
FROM scoped
|
||||
"""))
|
||||
row = r.one()
|
||||
|
||||
@@ -232,11 +232,32 @@ class ReportGenerationService:
|
||||
async with get_db_context() as db:
|
||||
row = await db.execute(
|
||||
text("""
|
||||
WITH scoped AS (
|
||||
SELECT
|
||||
*,
|
||||
(
|
||||
COALESCE(extra_metadata->>'execution_kind', '') = 'no_action'
|
||||
OR COALESCE(extra_metadata->>'repair_executed', '') = 'false'
|
||||
OR btrim(coalesce(action, '')) = ''
|
||||
OR UPPER(action) LIKE 'OBSERVE%'
|
||||
OR UPPER(action) LIKE 'INVESTIGATE%'
|
||||
OR UPPER(action) LIKE 'NO_ACTION%'
|
||||
OR UPPER(action) LIKE '% NO_ACTION%'
|
||||
OR UPPER(action) LIKE '%| NO_ACTION%'
|
||||
) AS is_observe_only
|
||||
FROM approval_records
|
||||
WHERE created_at >= :since
|
||||
)
|
||||
SELECT
|
||||
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success,
|
||||
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed
|
||||
FROM approval_records
|
||||
WHERE created_at >= :since
|
||||
COUNT(*) FILTER (
|
||||
WHERE UPPER(status::text) = 'EXECUTION_SUCCESS'
|
||||
AND NOT is_observe_only
|
||||
) AS success,
|
||||
COUNT(*) FILTER (
|
||||
WHERE UPPER(status::text) = 'EXECUTION_FAILED'
|
||||
AND NOT is_observe_only
|
||||
) AS failed
|
||||
FROM scoped
|
||||
"""),
|
||||
{"since": since},
|
||||
)
|
||||
@@ -460,6 +481,7 @@ class ReportGenerationService:
|
||||
# 失敗時發送告警到 SRE 群組,避免靜默吞掉錯誤
|
||||
import asyncio as _asyncio
|
||||
report_text = self.format_postmortem(data)
|
||||
await self._persist_postmortem_km(data, report_text)
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
gateway = get_telegram_gateway()
|
||||
|
||||
@@ -510,6 +532,72 @@ class ReportGenerationService:
|
||||
error=str(_fe),
|
||||
)
|
||||
|
||||
async def _persist_postmortem_km(
|
||||
self,
|
||||
data: PostmortemData,
|
||||
report_text: str,
|
||||
) -> None:
|
||||
"""Persist the generated postmortem as a knowledge (KM) entry.

Creates a POSTMORTEM entry linked to ``data.incident_id`` through
``KnowledgeDBRepository.create``, appends a ``KM_CONVERTED`` event to the
alert operation log, and logs ``postmortem_km_persisted``.  Any exception
is caught and logged as ``postmortem_km_persist_failed``; nothing is raised
to the caller.

NOTE(review): the previous summary described the entry as "idempotent",
but no duplicate check is visible in this method; presumably the repository
de-duplicates by incident -- confirm.

Args:
    data: Postmortem data; ``incident_id``, ``title``, ``auto_repaired`` and
        ``duration_minutes`` are read here.
    report_text: Formatted report stored as the entry content.
"""
|
||||
try:
|
||||
from src.db.base import get_db_context
|
||||
from src.models.knowledge import (
|
||||
EntrySource,
|
||||
EntryStatus,
|
||||
EntryType,
|
||||
KnowledgeEntryCreate,
|
||||
)
|
||||
from src.repositories.alert_operation_log_repository import (
|
||||
get_alert_operation_log_repository,
|
||||
)
|
||||
from src.repositories.knowledge_repository import KnowledgeDBRepository
|
||||
|
||||
async with get_db_context() as db:
|
||||
repo = KnowledgeDBRepository(db)
|
||||
entry = await repo.create(
|
||||
KnowledgeEntryCreate(
|
||||
# The title is truncated to 255 characters.
title=f"Postmortem {data.incident_id}: {data.title}"[:255],
|
||||
content=report_text,
|
||||
entry_type=EntryType.POSTMORTEM,
|
||||
category="postmortem",
|
||||
tags=[
|
||||
"postmortem",
|
||||
"incident",
|
||||
"telegram",
|
||||
"auto_repaired" if data.auto_repaired else "human_intervention",
|
||||
],
|
||||
source=EntrySource.AI_EXTRACTED,
|
||||
status=EntryStatus.REVIEW,
|
||||
related_incident_id=data.incident_id,
|
||||
path_type="postmortem",
|
||||
created_by="report_generation_service",
|
||||
)
|
||||
)
|
||||
|
||||
# Record the persistence in the alert operation log, referencing the new entry id.
await get_alert_operation_log_repository().append(
|
||||
"KM_CONVERTED",
|
||||
incident_id=data.incident_id,
|
||||
actor="report_generation_service",
|
||||
action_detail="postmortem_persisted",
|
||||
success=True,
|
||||
context={
|
||||
"knowledge_entry_id": entry.id,
|
||||
"entry_type": EntryType.POSTMORTEM.value,
|
||||
"path_type": "postmortem",
|
||||
"duration_minutes": round(data.duration_minutes, 2),
|
||||
},
|
||||
)
|
||||
logger.info(
|
||||
"postmortem_km_persisted",
|
||||
incident_id=data.incident_id,
|
||||
knowledge_entry_id=entry.id,
|
||||
)
|
||||
# Best effort: a persistence failure is logged and never propagated.
except Exception as e:
|
||||
logger.warning(
|
||||
"postmortem_km_persist_failed",
|
||||
incident_id=data.incident_id,
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 日度報告排程迴圈
|
||||
|
||||
@@ -8377,9 +8377,7 @@ class TelegramGateway:
|
||||
if action == "approve":
|
||||
status_emoji = "✅"
|
||||
status_text = f"<b>已批准</b> by {_html.escape(username)}"
|
||||
# 2026-04-14 Claude Sonnet 4.6: 原「等待執行」誤導(實際沒有 gate 會卡住路徑)
|
||||
# 批准後一律顯示「執行中」,真實結果由 _push_execution_result_to_alert reply 補上
|
||||
suffix = "⚡ 執行中..."
|
||||
suffix = "⚡ 執行中..." if execution_triggered else "已簽核,等待更多簽核"
|
||||
else:
|
||||
status_emoji = "❌"
|
||||
status_text = f"<b>已拒絕</b> by {_html.escape(username)}"
|
||||
@@ -8495,7 +8493,7 @@ class TelegramGateway:
|
||||
# 2026-04-22 Claude Sonnet 4.6: 只有真正轉為 APPROVED 才發「執行中...」
|
||||
# 非 PENDING 狀態下 sign_approval early-return → approval 是舊 record
|
||||
# 此時不應發「執行中...」,應告知用戶告警已處理過
|
||||
if approval.status == ApprovalStatus.APPROVED:
|
||||
if approval.status == ApprovalStatus.APPROVED and execution_triggered:
|
||||
# 2026-04-09 Claude Sonnet 4.6: 回應 Telegram — 更新訊息狀態 + answer callback
|
||||
await self._notify_approval_result(
|
||||
message_id=message_id,
|
||||
@@ -8520,7 +8518,7 @@ class TelegramGateway:
|
||||
# 原本 gate 用 execution_triggered,race condition 時失效(樂觀鎖失敗)
|
||||
# 改用 approval.status == APPROVED(與 REST API 路徑 approvals.py:360 對齊)
|
||||
# 用 Redis lock exec:{approval_id} 防重入(REST + Telegram 同時簽核)
|
||||
if approval.status == ApprovalStatus.APPROVED:
|
||||
if approval.status == ApprovalStatus.APPROVED and execution_triggered:
|
||||
import asyncio
|
||||
|
||||
from src.core.redis_client import get_redis
|
||||
|
||||
@@ -18,7 +18,7 @@ Task 2.3: validate_kubectl_command() 白名單驗證
|
||||
|
||||
import pytest
|
||||
|
||||
from src.services.alert_rule_engine import validate_kubectl_command
|
||||
from src.services.alert_rule_engine import match_rule, validate_kubectl_command
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -76,6 +76,49 @@ class TestValidKubectlCommands:
|
||||
assert validate_kubectl_command(cmd) is False
|
||||
|
||||
|
||||
class TestRuleMatchingSpecificity:
    """Rules that name explicit alertnames must not be hit by loose message keywords."""

    def test_host_storage_alert_does_not_match_minio_disk_rule(self):
        """A host storage alert must not be routed to the ``minio_disk_high`` rule."""
        labels = {
            "alertname": "HostPreviousBootStorageErrorsDetected",
            "instance": "192.168.0.110:9100",
        }
        alert_context = dict(
            alert_type="host",
            severity="critical",
            source="prometheus",
            target_resource="dirty-reboot-evidence",
            namespace="awoooi-prod",
            message="HostPreviousBootStorageErrorsDetected storage dirty reboot evidence",
            labels=labels,
        )

        matched = match_rule(alert_context)

        assert matched is not None
        assert matched["rule_id"] != "minio_disk_high"
        assert "/data/minio" not in matched.get("kubectl_command", "")

    def test_exact_minio_disk_alert_still_matches_minio_rule(self):
        """An alert carrying the MinIO alertname still resolves to ``minio_disk_high``."""
        labels = {
            "alertname": "MinioDiskUsageHigh",
            "instance": "192.168.0.110:9000",
        }
        alert_context = dict(
            alert_type="storage",
            severity="critical",
            source="prometheus",
            target_resource="minio",
            namespace="awoooi-prod",
            message="MinIO disk usage high",
            labels=labels,
        )

        matched = match_rule(alert_context)

        assert matched is not None
        assert matched["rule_id"] == "minio_disk_high"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 阻擋案例(應返回 False)
|
||||
# =============================================================================
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from types import SimpleNamespace
|
||||
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
@@ -16,14 +15,17 @@ async def test_no_action_execution_resolves_incident_once(monkeypatch):
|
||||
incident_id="INC-TEST-001",
|
||||
)
|
||||
incident_service = SimpleNamespace(resolve_incident=AsyncMock())
|
||||
update_execution_status = AsyncMock()
|
||||
timeline_add_event = AsyncMock()
|
||||
alert_completed = AsyncMock(return_value=None)
|
||||
|
||||
monkeypatch.setattr(
|
||||
"src.services.approval_execution.get_approval_service",
|
||||
lambda: SimpleNamespace(update_execution_status=AsyncMock()),
|
||||
lambda: SimpleNamespace(update_execution_status=update_execution_status),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"src.services.approval_execution.get_timeline_service",
|
||||
lambda: SimpleNamespace(add_event=AsyncMock()),
|
||||
lambda: SimpleNamespace(add_event=timeline_add_event),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"src.services.approval_execution.parse_operation_from_action",
|
||||
@@ -43,12 +45,28 @@ async def test_no_action_execution_resolves_incident_once(monkeypatch):
|
||||
"src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
|
||||
AsyncMock(return_value=None),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"src.services.approval_execution.ApprovalExecutionService._log_alert_execution_started",
|
||||
AsyncMock(return_value=None),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"src.services.approval_execution.ApprovalExecutionService._log_alert_execution_completed",
|
||||
alert_completed,
|
||||
)
|
||||
|
||||
# Act
|
||||
result = await ApprovalExecutionService().execute_approved_action(approval)
|
||||
|
||||
# Assert
|
||||
assert result is True
|
||||
update_execution_status.assert_awaited_once_with(
|
||||
approval.id,
|
||||
success=True,
|
||||
execution_kind="no_action",
|
||||
)
|
||||
assert "未執行修復" in timeline_add_event.await_args.kwargs["title"]
|
||||
assert alert_completed.await_args.kwargs["execution_kind"] == "no_action"
|
||||
assert alert_completed.await_args.kwargs["output"]["repair_executed"] is False
|
||||
incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-001")
|
||||
|
||||
|
||||
@@ -67,10 +85,11 @@ async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch)
|
||||
incident_service = SimpleNamespace(
|
||||
resolve_incident=AsyncMock(side_effect=RuntimeError("redis down"))
|
||||
)
|
||||
update_execution_status = AsyncMock()
|
||||
|
||||
monkeypatch.setattr(
|
||||
"src.services.approval_execution.get_approval_service",
|
||||
lambda: SimpleNamespace(update_execution_status=AsyncMock()),
|
||||
lambda: SimpleNamespace(update_execution_status=update_execution_status),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"src.services.approval_execution.get_timeline_service",
|
||||
@@ -94,8 +113,21 @@ async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch)
|
||||
"src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
|
||||
AsyncMock(return_value=None),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"src.services.approval_execution.ApprovalExecutionService._log_alert_execution_started",
|
||||
AsyncMock(return_value=None),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"src.services.approval_execution.ApprovalExecutionService._log_alert_execution_completed",
|
||||
AsyncMock(return_value=None),
|
||||
)
|
||||
|
||||
result = await ApprovalExecutionService().execute_approved_action(approval)
|
||||
|
||||
assert result is True
|
||||
update_execution_status.assert_awaited_once_with(
|
||||
approval.id,
|
||||
success=True,
|
||||
execution_kind="no_action",
|
||||
)
|
||||
incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-002")
|
||||
|
||||
@@ -181,7 +181,7 @@ class TestMatchRuleRejection:
|
||||
"""垃圾 target 時 kubectl_command 必須被清空(降級 LLM)"""
|
||||
|
||||
def test_bad_target_discards_kubectl_command(self):
|
||||
"""真實 bug:HostHighCpuLoad target=unknown → kubectl_command 應清空"""
|
||||
"""HostHighCpuLoad target=unknown → 不得組裝成壞 kubectl target。"""
|
||||
ctx = {
|
||||
"alert_type": "high_cpu",
|
||||
"severity": "warning",
|
||||
@@ -192,10 +192,12 @@ class TestMatchRuleRejection:
|
||||
"labels": {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"},
|
||||
}
|
||||
result = match_rule(ctx)
|
||||
# 規則可能匹配(host_high_cpu)但 kubectl_command 必為空
|
||||
# 規則可能匹配 host SSH 診斷;但不能把 HostHighCpuLoad 當成 K8s target。
|
||||
if result is not None:
|
||||
assert result["kubectl_command"] == "", \
|
||||
f"bad target 應導致 kubectl_command 清空, got: {result['kubectl_command']!r}"
|
||||
command = result["kubectl_command"]
|
||||
assert command == "" or command.startswith("ssh "), \
|
||||
f"bad target 不應組裝 kubectl 指令, got: {command!r}"
|
||||
assert "deployment/HostHighCpuLoad" not in command
|
||||
|
||||
def test_good_target_preserves_kubectl_command(self):
|
||||
"""真實 deployment 名稱時,kubectl_command 正常組裝"""
|
||||
|
||||
@@ -17,7 +17,9 @@ ADR-076 Task 4: 自動報告生成
|
||||
建立: 2026-04-14 (台北時區) Claude Haiku 4.5
|
||||
"""
|
||||
|
||||
from contextlib import asynccontextmanager
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
@@ -274,6 +276,71 @@ class TestFormatPostmortem:
|
||||
assert "台北時間" in report
|
||||
|
||||
|
||||
class TestTriggerPostmortemPersistence:
    """A generated postmortem must also be persisted to the knowledge base (KM)."""

    @pytest.mark.asyncio
    async def test_trigger_postmortem_persists_km_before_telegram_send(self, monkeypatch):
        """``trigger_postmortem`` sends to Telegram and records a KM entry plus an op-log row."""
        resolved_at = datetime.now(_TZ_TAIPEI)
        opened_at = resolved_at - timedelta(minutes=16)
        telegram_texts: list[str] = []
        km_payloads: list[object] = []
        op_log_rows: list[dict] = []

        class FakeGateway:
            async def send_to_group(self, text: str, parse_mode: str = "HTML") -> None:
                telegram_texts.append(text)

        class FakeKnowledgeRepo:
            def __init__(self, _db) -> None:
                pass

            async def create(self, data):
                km_payloads.append(data)
                return SimpleNamespace(id="km-postmortem-1")

        class FakeAlertOpRepo:
            async def append(self, event_type: str, **kwargs):
                op_log_rows.append({"event_type": event_type, **kwargs})

        @asynccontextmanager
        async def fake_db_context():
            yield SimpleNamespace()

        # Dotted patch targets paired with their in-memory replacements.
        replacements = (
            ("src.services.telegram_gateway.get_telegram_gateway", lambda: FakeGateway()),
            ("src.db.base.get_db_context", fake_db_context),
            ("src.repositories.knowledge_repository.KnowledgeDBRepository", FakeKnowledgeRepo),
            (
                "src.repositories.alert_operation_log_repository.get_alert_operation_log_repository",
                lambda: FakeAlertOpRepo(),
            ),
        )
        for target, replacement in replacements:
            monkeypatch.setattr(target, replacement)

        service = ReportGenerationService()
        await service.trigger_postmortem(
            incident_id="INC-20260531-POST",
            title="DockerContainerUnhealthy bitan-pharmacy",
            created_at=opened_at,
            resolved_at=resolved_at,
            root_cause="容器健康檢查失敗",
            resolution_action="OBSERVE",
            auto_repaired=False,
        )

        assert telegram_texts
        assert km_payloads
        km_entry = km_payloads[0]
        assert km_entry.entry_type.value == "postmortem"
        assert km_entry.related_incident_id == "INC-20260531-POST"
        assert km_entry.path_type == "postmortem"
        first_log = op_log_rows[0]
        assert first_log["event_type"] == "KM_CONVERTED"
        assert first_log["action_detail"] == "postmortem_persisted"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# _seconds_until_next_report
|
||||
# =============================================================================
|
||||
|
||||
@@ -15,12 +15,18 @@ class _FakeGateway:
|
||||
|
||||
|
||||
class _FakeApprovalService:
    """Test double for the approval service used by the Telegram webhook tests.

    Every method ignores its keyword arguments and hands back the canned
    values supplied at construction time.
    """

    def __init__(
        self,
        approval,
        execution_triggered: bool,
        sign_message: str = "Approval complete",
    ) -> None:
        """Store the canned values returned by the fake methods.

        Args:
            approval: Object returned as the approval record by both methods.
            execution_triggered: Flag returned as the third element of the
                ``sign_approval`` result.
            sign_message: Message returned as the second element of the
                ``sign_approval`` result.
        """
        self.approval = approval
        self.execution_triggered = execution_triggered
        self.sign_message = sign_message

    async def sign_approval(self, **_kwargs):
        """Return ``(approval, sign_message, execution_triggered)``."""
        # Return the configured message, not a hard-coded "Approval complete",
        # so a test can pass a message such as "Cannot sign: ...".
        return self.approval, self.sign_message, self.execution_triggered

    async def reject_approval(self, **_kwargs):
        """Return ``(approval, "Approval rejected")``."""
        return self.approval, "Approval rejected"
|
||||
@@ -100,6 +106,59 @@ async def test_telegram_approval_schedules_executor_after_required_signature(mon
|
||||
assert op_log_repo.rows[0]["kwargs"]["action_detail"] == "approve"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_telegram_approval_duplicate_does_not_schedule_executor(monkeypatch):
    """An approve callback that cannot be signed must not reach the finalizer.

    The fake approval service reports "Cannot sign: ..." with
    ``execution_triggered=False``. The webhook response is expected to be
    "Already processed" with both execution flags false, the finalizer must
    never be called, and the first operation-log row must carry the
    ``approve_duplicate`` action detail.
    """
    approval_id = "33333333-3333-3333-3333-333333333333"
    already_done = SimpleNamespace(
        id=UUID(approval_id),
        status=SimpleNamespace(value="execution_success"),
        incident_id="INC-20260531-DUPE",
    )
    finalize_invocations: list[dict] = []
    operation_log = _FakeAlertOperationLogRepository()

    async def fake_finalize(*, approval, execution_triggered: bool) -> bool:
        # Record every call so the test can prove none happened.
        recorded = {
            "approval_id": str(approval.id),
            "execution_triggered": execution_triggered,
        }
        finalize_invocations.append(recorded)
        return True

    def make_gateway():
        # Build a fresh gateway (and payload) per call, like the original lambda.
        return _FakeGateway({
            "success": True,
            "action": "approve",
            "approval_id": approval_id,
            "user": {"id": 42, "username": "ops"},
        })

    def make_approval_service():
        return _FakeApprovalService(
            already_done,
            execution_triggered=False,
            sign_message="Cannot sign: status is execution_success",
        )

    def make_operation_log():
        return operation_log

    monkeypatch.setattr(telegram_api, "get_telegram_gateway", make_gateway)
    monkeypatch.setattr(telegram_api, "get_approval_service", make_approval_service)
    monkeypatch.setattr(telegram_api, "_finalize_telegram_approval", fake_finalize)
    monkeypatch.setattr(
        "src.repositories.alert_operation_log_repository.get_alert_operation_log_repository",
        make_operation_log,
    )

    update = _callback_update(f"approve:{approval_id}:ts:nonce")
    response = await telegram_api.telegram_webhook(update)

    assert response["ok"] is True
    assert response["message"] == "Already processed"
    assert response["execution_triggered"] is False
    assert response["execution_scheduled"] is False
    assert finalize_invocations == []
    first_row = operation_log.rows[0]
    assert first_row["kwargs"]["action_detail"] == "approve_duplicate"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_telegram_rejection_syncs_incident_state(monkeypatch):
|
||||
approval_id = "22222222-2222-2222-2222-222222222222"
|
||||
|
||||
@@ -1,3 +1,39 @@
|
||||
## 2026-05-31|Telegram 告警執行語意與 DB 稽核完整性修復
|
||||
|
||||
**背景**:
|
||||
|
||||
- Production 查核 `INC-20260530-88D960` / `INC-20260531-88394F` 發現 Telegram 顯示「已批准、執行中、執行成功」,但實際分別是 MinIO SSH 診斷與 `OBSERVE`,不是建議中的修復動作。
|
||||
- `approval_records.status=execution_success` 無法區分「真的執行修復」與「純觀察/NO_ACTION terminal」;`alert_operation_log` 缺人工 approval execution 的 start/end,Postmortem 只送 Telegram 未沉澱 KM。
|
||||
- `alert_rule_engine` 允許具名規則只靠 message keyword 命中,導致主機 storage 類告警可能誤配到 `minio_disk_high`。
|
||||
|
||||
**本次調整**:
|
||||
|
||||
- 新增 `approval_action_classifier.is_no_action_approval_action()`,集中判斷 `OBSERVE` / `INVESTIGATE` / `NO_ACTION`。
|
||||
- NO_ACTION terminal 仍會關閉 approval,但 `extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`;Telegram result 改為「已記錄觀察,未執行修復」。
|
||||
- `ApprovalExecutionService` 同步寫 `alert_operation_log`:`EXECUTION_STARTED`、`EXECUTION_COMPLETED`、`TELEGRAM_RESULT_SENT`。
|
||||
- Telegram webhook duplicate approval 不再 finalize / schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。
|
||||
- Postmortem 產出時同步 idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。
|
||||
- Heartbeat 與日報修復統計排除 observe-only/no-action,避免污染 success rate。
|
||||
- `alert_rule_engine._matches()` 收緊具名 alertname 規則,避免 Host storage 類告警靠 `storage` keyword 誤配 MinIO。
|
||||
|
||||
**Verification**:
|
||||
|
||||
```text
|
||||
python3 -m py_compile approval_action_classifier.py approval_execution.py approval_db.py telegram.py telegram_gateway.py alert_rule_engine.py report_generation_service.py heartbeat_report_service.py
|
||||
-> pass
|
||||
pytest test_approval_execution_no_action.py test_telegram_webhook_execution_handoff.py -q
|
||||
-> 6 passed
|
||||
pytest test_alert_rule_engine_validation.py test_report_generation_service.py -q
|
||||
-> 67 passed
|
||||
pytest test_heartbeat_ollama_endpoints.py test_heartbeat_pod_state_machine.py test_gap_a4_placeholder_resolution.py -q
|
||||
-> 49 passed
|
||||
```
|
||||
|
||||
**判讀 / 下一步**:
|
||||
|
||||
- 本輪修復新流量的語意與稽核完整性,不補跑舊 incident 的修復動作。
|
||||
- 舊 incident 若已是 `execution_success` 但沒有 `extra_metadata.execution_kind`,仍需透過 `automation_operation_log` / `alert_operation_log` 交叉判讀。
|
||||
|
||||
## 2026-05-31|Legacy HITL PENDING 前台可見性與心跳拆分
|
||||
|
||||
**背景**:
|
||||
|
||||
@@ -2671,6 +2671,12 @@ Phase 6 完成後
|
||||
- Verification:API py_compile pass;targeted ruff for new test pass;`pnpm --filter @awoooi/shared-types generate` pass;`test_approval_pending_visibility.py` 4 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` 15 passed;`git diff --check` pass。
|
||||
- 判讀:T153 不批次 approve/reject 生產 PENDING,也不把觀察卡刪掉;它把「前台看得到 legacy HITL 事實」與「告警只針對真正人工 actionable backlog」補齊。舊 fallback kubectl / SSH action 仍需 operator 在 `/awooop/approvals` 逐筆決策;OBSERVE / NO_ACTION 類不再偽裝成 emergency manual backlog。下一段可追 LLM failure fallback 為何大量產生 `OBSERVE / medium` 卡片,但需避免破壞 agent 後續把 PENDING 更新成可執行 action 的路徑。
|
||||
|
||||
**T154 Telegram approval truth + execution audit integrity(2026-05-31 台北)**:
|
||||
- 觸發:Telegram 上出現「此告警已處理」後仍接著顯示「已批准、執行中」,且 `INC-20260530-88D960` / `INC-20260531-88394F` 的 production 查核顯示 `approval_records.status=execution_success`,但前者實際只跑 MinIO SSH 診斷、後者只是 `OBSERVE`;`auto_repair_executions=0`,`alert_operation_log` 缺 execution start/end,Postmortem 只送 Telegram 未落 KM。這會讓 operator 誤以為修復已完成。
|
||||
- 修正:集中 `is_no_action_approval_action()`,讓 `OBSERVE` / `INVESTIGATE` / `NO_ACTION` terminal 時在 `approval_records.extra_metadata` 標記 `execution_kind=no_action`、`repair_executed=false`,Telegram result 改為「已記錄觀察,未執行修復」,不再用「執行成功」。`ApprovalExecutionService` 現在同步寫 `alert_operation_log` 的 `EXECUTION_STARTED` / `EXECUTION_COMPLETED` / `TELEGRAM_RESULT_SENT`,並保留 `automation_operation_log`。Telegram webhook duplicate approval 不再 finalize/schedule executor;long polling 只有真正 `execution_triggered` 才顯示「執行中」。`ReportGenerationService` 會把 Postmortem idempotent 寫入 `knowledge_entries(entry_type=postmortem,path_type=postmortem)` 並補 `KM_CONVERTED`。`HeartbeatReportService` / 日報修復統計排除 observe-only/no-action,不再污染 execution success rate。`alert_rule_engine._matches()` 收緊具名 alertname 規則,避免主機 storage 告警靠 message keyword 誤配 `minio_disk_high`。
|
||||
- Verification:`py_compile` pass;`test_approval_execution_no_action.py` + `test_telegram_webhook_execution_handoff.py` 6 passed;`test_alert_rule_engine_validation.py` + `test_report_generation_service.py` 67 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` + `test_gap_a4_placeholder_resolution.py` 49 passed after aligning host SSH diagnostic assertion。
|
||||
- 判讀:T154 修的是「Telegram / DB / 前台統計的 truthfulness」,不是補跑舊 incident 的修復。舊資料中 status 已是 `execution_success` 的 OBSERVE 若缺少 `extra_metadata.execution_kind`,無法單靠 status 精確分辨,仍需透過 `automation_operation_log` / `alert_operation_log` 交叉判讀;部署後新 approval 會留下 immutable execution start/end 與 no-action 語意,operator 不應再把 OBSERVE 視為完成修復。
|
||||
|
||||
**T152 Ansible runtime readiness surfaced(2026-05-24 台北)**:
|
||||
- 觸發:T151 已讓首頁看到 execution backend / Ansible attribution,但 operator 仍看不到 runtime 端缺什麼,容易把「Ansible 有候選」誤解成「Ansible 已能自動修復」。
|
||||
- 修正:API image 複製 `infra/ansible/` 作 read-only catalog;`truth-chain/quality/summary` 新增 `ansible_runtime`,回報 playbook binary、catalog、inventory、playbook_count、can_run_check_mode、blockers。首頁 execution evidence 同步顯示 runtime 狀態;目前 production 顯示 `runtime 未就緒:ansible_playbook_binary_missing`。未安裝 `ansible-core`、未啟用 check-mode / apply。
|
||||
|
||||
Reference in New Issue
Block a user