feat(auto-repair): 所有操作強制寫入 DB — auto_repair_executions 表
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m32s

統帥指令: 所有自動修復操作(成功/失敗)必須持久化

變更:
- migrations/phase10_auto_repair_executions.sql: 新增表 + 4 個索引
- db/models.py: 新增 AutoRepairExecution SQLAlchemy model
- repositories/audit_log_repository.py: 新增 AutoRepairExecutionRepository (create/list_by_incident/get_stats)
- auto_repair_service.py: execute_auto_repair 成功/失敗分支都寫入 DB
  - 新增 similarity_score 參數傳遞
  - AutoRepairDecision 新增 similarity_score 欄位
- webhooks.py: 傳入 similarity_score 到 execute_auto_repair

已執行 migration: awoooi_prod@192.168.0.188:5432 

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-08 11:16:37 +08:00
parent 68a2fff746
commit eee6f06215
5 changed files with 218 additions and 3 deletions

View File

@@ -0,0 +1,38 @@
-- Phase 10: auto_repair_executions operation-log table
-- Created: 2026-04-08 (Taipei time)
-- Author: Claude Code — per the directive "every operation must be recorded and written to the database"
--
-- Design notes:
-- Every automatic-repair execution (success or failure) is written to this table.
-- There is no approval_id column: automatic repair does not require manual approval.
-- Supported lookups: by incident / playbook / time range / success rate.
CREATE TABLE IF NOT EXISTS auto_repair_executions (
-- Primary key (UUID stored as text)
id VARCHAR(36) PRIMARY KEY DEFAULT gen_random_uuid()::text,
-- Associations
incident_id VARCHAR(30) NOT NULL,
playbook_id VARCHAR(36) NOT NULL,
playbook_name VARCHAR(200) NOT NULL,
-- Execution result
success BOOLEAN NOT NULL DEFAULT FALSE,
executed_steps JSONB NOT NULL DEFAULT '[]', -- list of step result strings
error_message TEXT,
-- Execution context
triggered_by VARCHAR(50) NOT NULL DEFAULT 'auto_repair', -- auto_repair / cold_start_trust
similarity_score NUMERIC(5,4), -- playbook match similarity
risk_level VARCHAR(20), -- LOW / MEDIUM / HIGH
execution_time_ms INTEGER,
-- Timestamp (original note: Taipei timezone; the column itself is TIMESTAMPTZ defaulting to NOW())
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Indexes (one per supported lookup listed in the header)
CREATE INDEX IF NOT EXISTS ix_are_incident_id ON auto_repair_executions (incident_id);
CREATE INDEX IF NOT EXISTS ix_are_playbook_id ON auto_repair_executions (playbook_id);
CREATE INDEX IF NOT EXISTS ix_are_created_at ON auto_repair_executions (created_at DESC);
CREATE INDEX IF NOT EXISTS ix_are_success ON auto_repair_executions (success);

View File

@@ -209,6 +209,7 @@ async def _try_auto_repair_background(
incident=incident,
playbook=decision.playbook,
is_cold_start=decision.is_cold_start,
similarity_score=decision.similarity_score,
)
logger.info(

View File

@@ -352,6 +352,50 @@ class AuditLog(Base):
)
# =============================================================================
# AutoRepairExecution - Phase 10 操作記錄
# 2026-04-08 Claude Code: 統帥指令「所有操作都必須被記錄,寫入資料庫」
# =============================================================================
class AutoRepairExecution(Base):
    """Persistent record of one automatic-repair execution.

    A row is written by ``AutoRepairService.execute_auto_repair`` on both its
    success and its failure path. Rows carry no ``approval_id`` because
    automatic repair runs without manual approval.

    All indexes are declared explicitly in ``__table_args__`` using the
    ``ix_are_*`` names created by ``migrations/phase10_auto_repair_executions.sql``.
    Column-level ``index=True`` would (under SQLAlchemy's default naming
    convention) generate ``ix_auto_repair_executions_<column>`` instead, so the
    ORM metadata and the migrated schema would disagree on index names.
    """

    __tablename__ = "auto_repair_executions"

    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=generate_uuid)

    # Associations (indexed via __table_args__ below)
    incident_id: Mapped[str] = mapped_column(String(30), nullable=False)
    playbook_id: Mapped[str] = mapped_column(String(36), nullable=False)
    playbook_name: Mapped[str] = mapped_column(String(200), nullable=False)

    # Execution result
    success: Mapped[bool] = mapped_column(default=False, nullable=False)
    # NOTE(review): the migration declares this column as JSONB while the ORM
    # maps it as generic JSON — confirm whether JSONB operators are needed.
    executed_steps: Mapped[list] = mapped_column(JSON, default=list, nullable=False)
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Execution context
    triggered_by: Mapped[str] = mapped_column(
        String(50),
        default="auto_repair",
        nullable=False,
        comment="auto_repair / cold_start_trust",
    )
    # NOTE(review): the migration declares NUMERIC(5,4); the ORM type is
    # derived from the ``float`` annotation — confirm the intended precision.
    similarity_score: Mapped[float | None] = mapped_column(nullable=True)
    risk_level: Mapped[str | None] = mapped_column(String(20), nullable=True)
    execution_time_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)

    # Creation timestamp, filled client-side by taipei_now
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=taipei_now)

    __table_args__ = (
        Index("ix_are_incident_id", "incident_id"),
        Index("ix_are_playbook_id", "playbook_id"),
        Index("ix_are_created_at", "created_at"),
        Index("ix_are_success", "success"),
    )
# =============================================================================
# IncidentRecord - Phase 6.2 Episodic Memory (PostgreSQL)
# =============================================================================

View File

@@ -15,7 +15,7 @@ import structlog
from sqlalchemy import func, select
from src.db.base import get_db_context
from src.db.models import AuditLog
from src.db.models import AuditLog, AutoRepairExecution
logger = structlog.get_logger(__name__)
@@ -157,6 +157,97 @@ class AuditLogRepository:
}
# =============================================================================
# AutoRepairExecutionRepository
# 2026-04-08 Claude Code: 統帥指令「所有操作都必須被記錄,寫入資料庫」
# =============================================================================
class AutoRepairExecutionRepository:
    """Data-access layer for ``AutoRepairExecution`` rows.

    Every method opens its own session through ``get_db_context()``.
    """

    async def create(
        self,
        incident_id: str,
        playbook_id: str,
        playbook_name: str,
        success: bool,
        executed_steps: list[str],
        error_message: str | None = None,
        triggered_by: str = "auto_repair",
        similarity_score: float | None = None,
        risk_level: str | None = None,
        execution_time_ms: int | None = None,
    ) -> AutoRepairExecution:
        """Insert one automatic-repair execution record and return it.

        Args:
            incident_id: Identifier of the incident that was repaired.
            playbook_id: Identifier of the playbook that was executed.
            playbook_name: Playbook name stored alongside the id.
            success: Whether the execution succeeded.
            executed_steps: Step results collected during the run.
            error_message: Failure description, if any.
            triggered_by: Trigger label; callers in this change pass
                "auto_repair" or "cold_start_trust".
            similarity_score: Playbook match similarity, if known.
            risk_level: Risk label of the playbook, if known.
            execution_time_ms: Execution duration in milliseconds.

        Returns:
            The ORM instance after it has been flushed and refreshed.
        """
        async with get_db_context() as db:
            record = AutoRepairExecution(
                incident_id=incident_id,
                playbook_id=playbook_id,
                playbook_name=playbook_name,
                success=success,
                executed_steps=executed_steps,
                error_message=error_message,
                triggered_by=triggered_by,
                similarity_score=similarity_score,
                risk_level=risk_level,
                execution_time_ms=execution_time_ms,
            )
            db.add(record)
            # flush emits the INSERT; refresh reloads the row's attributes.
            # Committing is presumably handled by get_db_context — TODO confirm.
            await db.flush()
            await db.refresh(record)
            return record

    async def list_by_incident(self, incident_id: str) -> list[AutoRepairExecution]:
        """Return every execution record for ``incident_id``, newest first."""
        async with get_db_context() as db:
            result = await db.execute(
                select(AutoRepairExecution)
                .where(AutoRepairExecution.incident_id == incident_id)
                .order_by(AutoRepairExecution.created_at.desc())
            )
            return list(result.scalars().all())

    async def get_stats(self, since_hours: int = 24) -> dict[str, Any]:
        """Summarise automatic-repair executions of the last ``since_hours`` hours.

        Both counters come from a single SELECT so they are computed over the
        same set of rows; two separate COUNT statements could observe rows
        inserted in between and report ``success_count > total``.

        Returns:
            A dict with ``total``, ``success_count``, ``failure_count``,
            ``success_rate`` (percentage rounded to one decimal, ``0`` when
            there are no rows) and the echoed ``since_hours``.
        """
        from datetime import timedelta

        from src.utils.timezone import now_taipei

        since = now_taipei() - timedelta(hours=since_hours)
        stmt = select(
            func.count(AutoRepairExecution.id),
            # Aggregate FILTER: counts only the successful rows of the same scan.
            func.count(AutoRepairExecution.id).filter(
                AutoRepairExecution.success.is_(True)
            ),
        ).where(AutoRepairExecution.created_at >= since)

        async with get_db_context() as db:
            row = (await db.execute(stmt)).one()
            total = row[0] or 0
            success_count = row[1] or 0

        return {
            "total": total,
            "success_count": success_count,
            "failure_count": total - success_count,
            "success_rate": round(success_count / total * 100, 1) if total > 0 else 0,
            "since_hours": since_hours,
        }
# Module-level cache for the lazily created repository instance.
_auto_repair_execution_repo: AutoRepairExecutionRepository | None = None


def get_auto_repair_execution_repository() -> AutoRepairExecutionRepository:
    """Return the shared AutoRepairExecutionRepository, creating it on first use."""
    global _auto_repair_execution_repo
    repo = _auto_repair_execution_repo
    if repo is None:
        repo = AutoRepairExecutionRepository()
        _auto_repair_execution_repo = repo
    return repo
# =============================================================================
# Singleton
# =============================================================================

View File

@@ -63,6 +63,8 @@ class AutoRepairDecision:
blocked_by: str | None = None # 阻擋原因 (如 HIGH_RISK, P1_SEVERITY)
# 2026-04-07 Claude Code: Sprint 4 B2 — 追蹤首次信任
is_cold_start: bool = False
# 2026-04-08 Claude Code: 傳入 execute_auto_repair 供 DB 記錄
similarity_score: float | None = None
@dataclass
@@ -280,9 +282,10 @@ class AutoRepairService:
return AutoRepairDecision(
can_auto_repair=True,
playbook=best_match.playbook,
reason=f"匹配高品質 Playbook: {best_match.playbook.name} (成功率 {best_match.playbook.success_rate:.0%})",
reason=f"匹配 Playbook: {best_match.playbook.name} (相似度 {best_match.similarity_score:.0%})",
risk_level=max_risk,
is_cold_start=_is_cold_start,
similarity_score=best_match.similarity_score,
)
async def execute_auto_repair(
@@ -290,13 +293,14 @@ class AutoRepairService:
incident: Incident,
playbook: Playbook,
is_cold_start: bool = False,
similarity_score: float | None = None,
) -> AutoRepairResult:
"""
執行自動修復
流程:
1. 依序執行 Playbook 中的 repair_steps
2. 記錄執行結果
2. 記錄執行結果到 DB (auto_repair_executions)
3. 更新 Playbook 統計
4. 記錄處置類型 (Sprint 4 B1/B2)
"""
@@ -351,6 +355,24 @@ class AutoRepairService:
execution_time_ms=execution_time,
)
# 2026-04-08 Claude Code: 統帥指令「所有操作都必須被記錄,寫入資料庫」
try:
from src.repositories.audit_log_repository import get_auto_repair_execution_repository
max_risk = self._get_max_risk_level(playbook)
await get_auto_repair_execution_repository().create(
incident_id=incident.incident_id,
playbook_id=playbook.playbook_id,
playbook_name=playbook.name,
success=True,
executed_steps=executed_steps,
triggered_by="cold_start_trust" if is_cold_start else "auto_repair",
similarity_score=similarity_score,
risk_level=max_risk.value if max_risk else None,
execution_time_ms=execution_time,
)
except Exception as _db_e:
logger.error("auto_repair_db_write_failed", error=str(_db_e))
# 2026-04-07 Claude Code: Sprint 4 B1/B2 — 記錄處置類型
# P0-1 Fix: 統一使用 AnomalyCounter.hash_signature()
try:
@@ -407,6 +429,25 @@ class AutoRepairService:
execution_time_ms=execution_time,
)
# 2026-04-08 Claude Code: 失敗也必須寫入 DB
try:
from src.repositories.audit_log_repository import get_auto_repair_execution_repository
max_risk = self._get_max_risk_level(playbook)
await get_auto_repair_execution_repository().create(
incident_id=incident.incident_id,
playbook_id=playbook.playbook_id,
playbook_name=playbook.name,
success=False,
executed_steps=executed_steps,
error_message=str(e),
triggered_by="cold_start_trust" if is_cold_start else "auto_repair",
similarity_score=similarity_score,
risk_level=max_risk.value if max_risk else None,
execution_time_ms=execution_time,
)
except Exception as _db_e:
logger.error("auto_repair_db_write_failed", error=str(_db_e))
# 2026-04-04 Claude Code: Phase 25 P1 — 失敗修復後 fire-and-forget 生成 ANTI_PATTERN
# 2026-04-05 Claude Code: I1 修正 — 補齊 _pending_tasks GC 防護(對稱化)
try: