feat(auto-repair): 所有操作強制寫入 DB — auto_repair_executions 表
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m32s
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m32s
統帥指令: 所有自動修復操作(成功/失敗)必須持久化 變更: - migrations/phase10_auto_repair_executions.sql: 新增表 + 4 個索引 - db/models.py: 新增 AutoRepairExecution SQLAlchemy model - repositories/audit_log_repository.py: 新增 AutoRepairExecutionRepository (create/list_by_incident/get_stats) - auto_repair_service.py: execute_auto_repair 成功/失敗分支都寫入 DB - 新增 similarity_score 參數傳遞 - AutoRepairDecision 新增 similarity_score 欄位 - webhooks.py: 傳入 similarity_score 到 execute_auto_repair 已執行 migration: awoooi_prod@192.168.0.188:5432 ✅ Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
38
apps/api/migrations/phase10_auto_repair_executions.sql
Normal file
38
apps/api/migrations/phase10_auto_repair_executions.sql
Normal file
@@ -0,0 +1,38 @@
|
||||
-- Phase 10: Auto Repair Executions 操作記錄表
|
||||
-- 建立時間: 2026-04-08 (台北時區)
|
||||
-- 建立者: Claude Code — 統帥指令「所有操作都必須被記錄,寫入資料庫」
|
||||
--
|
||||
-- 設計說明:
|
||||
-- 自動修復每次執行(成功或失敗)都寫入此表
|
||||
-- 不依賴 approval_id(自動修復不需要人工批准)
|
||||
-- 支援查詢: 按 incident / playbook / 時間範圍 / 成功率
|
||||
|
||||
-- Execution log for automatic repairs: one row per executed playbook run,
-- whether it succeeded or failed. Idempotent thanks to IF NOT EXISTS.
CREATE TABLE IF NOT EXISTS auto_repair_executions (
|
||||
-- Primary key: server-generated UUID stored as text
|
||||
id VARCHAR(36) PRIMARY KEY DEFAULT gen_random_uuid()::text,
|
||||
|
||||
-- Related incident / playbook identifiers (no foreign-key constraints are declared)
|
||||
incident_id VARCHAR(30) NOT NULL,
|
||||
playbook_id VARCHAR(36) NOT NULL,
|
||||
playbook_name VARCHAR(200) NOT NULL,
|
||||
|
||||
-- Execution result
|
||||
success BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
executed_steps JSONB NOT NULL DEFAULT '[]', -- list of step result strings
|
||||
error_message TEXT,
|
||||
|
||||
-- Execution context
|
||||
triggered_by VARCHAR(50) NOT NULL DEFAULT 'auto_repair', -- auto_repair / cold_start_trust
|
||||
similarity_score NUMERIC(5,4), -- playbook match similarity
|
||||
risk_level VARCHAR(20), -- LOW / MEDIUM / HIGH
|
||||
execution_time_ms INTEGER,
|
||||
|
||||
-- Timestamp (original note: Taipei timezone); defaults to the insert time via NOW()
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
-- Indexes supporting lookups by incident, playbook, recency (newest first) and outcome
|
||||
CREATE INDEX IF NOT EXISTS ix_are_incident_id ON auto_repair_executions (incident_id);
|
||||
CREATE INDEX IF NOT EXISTS ix_are_playbook_id ON auto_repair_executions (playbook_id);
|
||||
CREATE INDEX IF NOT EXISTS ix_are_created_at ON auto_repair_executions (created_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS ix_are_success ON auto_repair_executions (success);
|
||||
@@ -209,6 +209,7 @@ async def _try_auto_repair_background(
|
||||
incident=incident,
|
||||
playbook=decision.playbook,
|
||||
is_cold_start=decision.is_cold_start,
|
||||
similarity_score=decision.similarity_score,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
|
||||
@@ -352,6 +352,50 @@ class AuditLog(Base):
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# AutoRepairExecution - Phase 10 操作記錄
|
||||
# 2026-04-08 Claude Code: 統帥指令「所有操作都必須被記錄,寫入資料庫」
|
||||
# =============================================================================
|
||||
|
||||
class AutoRepairExecution(Base):
    """
    Auto-repair execution record.

    One row is stored for every auto-repair execution, whether it succeeded
    or failed. The record does not reference an approval_id because automatic
    repairs do not go through manual approval.
    """
    __tablename__ = "auto_repair_executions"

    # Primary key: UUID string generated on the Python side.
    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=generate_uuid)

    # Related incident / playbook identifiers. Their indexes are declared in
    # __table_args__ (not via index=True) so the index names stay explicit.
    incident_id: Mapped[str] = mapped_column(String(30), nullable=False)
    playbook_id: Mapped[str] = mapped_column(String(36), nullable=False)
    playbook_name: Mapped[str] = mapped_column(String(200), nullable=False)

    # Execution result
    success: Mapped[bool] = mapped_column(default=False, nullable=False)
    executed_steps: Mapped[list] = mapped_column(JSON, default=list, nullable=False)
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Execution context
    triggered_by: Mapped[str] = mapped_column(
        String(50), default="auto_repair", nullable=False,
        comment="auto_repair / cold_start_trust",
    )
    similarity_score: Mapped[float | None] = mapped_column(nullable=True)
    risk_level: Mapped[str | None] = mapped_column(String(20), nullable=True)
    execution_time_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)

    # Creation timestamp, timezone-aware, defaulting to taipei_now()
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=taipei_now)

    # All four index names follow the "ix_are_*" scheme used by
    # migrations/phase10_auto_repair_executions.sql. Using index=True on the
    # columns would instead produce auto-generated "ix_auto_repair_executions_*"
    # names that do not match the migrated schema.
    __table_args__ = (
        Index("ix_are_incident_id", "incident_id"),
        Index("ix_are_playbook_id", "playbook_id"),
        # NOTE(review): the migration creates this index on (created_at DESC).
        Index("ix_are_created_at", "created_at"),
        Index("ix_are_success", "success"),
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# IncidentRecord - Phase 6.2 Episodic Memory (PostgreSQL)
|
||||
# =============================================================================
|
||||
|
||||
@@ -15,7 +15,7 @@ import structlog
|
||||
from sqlalchemy import func, select
|
||||
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import AuditLog
|
||||
from src.db.models import AuditLog, AutoRepairExecution
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
@@ -157,6 +157,97 @@ class AuditLogRepository:
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# AutoRepairExecutionRepository
|
||||
# 2026-04-08 Claude Code: 統帥指令「所有操作都必須被記錄,寫入資料庫」
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class AutoRepairExecutionRepository:
    """Repository for auto-repair execution records (AutoRepairExecution rows)."""

    async def create(
        self,
        incident_id: str,
        playbook_id: str,
        playbook_name: str,
        success: bool,
        executed_steps: list[str],
        error_message: str | None = None,
        triggered_by: str = "auto_repair",
        similarity_score: float | None = None,
        risk_level: str | None = None,
        execution_time_ms: int | None = None,
    ) -> AutoRepairExecution:
        """
        Insert one auto-repair execution record.

        Args:
            incident_id: Identifier of the incident that was repaired.
            playbook_id: Identifier of the playbook that was executed.
            playbook_name: Name of the playbook.
            success: Whether the execution succeeded.
            executed_steps: Step result strings collected during execution.
            error_message: Error text for failed executions, if any.
            triggered_by: Trigger label (the callers in this change use
                "auto_repair" or "cold_start_trust").
            similarity_score: Playbook match similarity, if known.
            risk_level: Risk level label, if known.
            execution_time_ms: Execution duration in milliseconds, if known.

        Returns:
            The AutoRepairExecution instance after flush and refresh.
        """
        async with get_db_context() as db:
            record = AutoRepairExecution(
                incident_id=incident_id,
                playbook_id=playbook_id,
                playbook_name=playbook_name,
                success=success,
                executed_steps=executed_steps,
                error_message=error_message,
                triggered_by=triggered_by,
                similarity_score=similarity_score,
                risk_level=risk_level,
                execution_time_ms=execution_time_ms,
            )
            db.add(record)
            # Flush to emit the INSERT, then refresh so column defaults are
            # loaded onto the instance before it is returned.
            # NOTE(review): commit is presumably handled by get_db_context() - confirm.
            await db.flush()
            await db.refresh(record)
            return record

    async def list_by_incident(self, incident_id: str) -> list[AutoRepairExecution]:
        """Return every execution record of one incident, newest first."""
        async with get_db_context() as db:
            result = await db.execute(
                select(AutoRepairExecution)
                .where(AutoRepairExecution.incident_id == incident_id)
                .order_by(AutoRepairExecution.created_at.desc())
            )
            return list(result.scalars().all())

    async def get_stats(self, since_hours: int = 24) -> dict[str, Any]:
        """
        Summarize auto-repair executions created within the last N hours.

        Args:
            since_hours: Size of the look-back window in hours.

        Returns:
            A dict with the keys "total", "success_count", "failure_count",
            "success_rate" (percentage rounded to one decimal, 0 when there
            are no rows) and "since_hours".
        """
        from datetime import timedelta

        from src.utils.timezone import now_taipei

        since = now_taipei() - timedelta(hours=since_hours)
        async with get_db_context() as db:
            # Both counts come from ONE statement so they describe the same
            # snapshot. Two separate COUNT queries could disagree when a row
            # is inserted in between (success_count > total).
            stats_r = await db.execute(
                select(
                    func.count(AutoRepairExecution.id),
                    func.count(AutoRepairExecution.id).filter(
                        AutoRepairExecution.success.is_(True)
                    ),
                ).where(AutoRepairExecution.created_at >= since)
            )
            # An aggregate without GROUP BY always yields exactly one row.
            row = stats_r.one()
            total = row[0] or 0
            success_count = row[1] or 0

            return {
                "total": total,
                "success_count": success_count,
                "failure_count": total - success_count,
                "success_rate": round(success_count / total * 100, 1) if total > 0 else 0,
                "since_hours": since_hours,
            }
|
||||
|
||||
|
||||
# Module-level cache for the lazily created repository instance.
_auto_repair_execution_repo: AutoRepairExecutionRepository | None = None


def get_auto_repair_execution_repository() -> AutoRepairExecutionRepository:
    """Return the shared AutoRepairExecutionRepository, creating it on first call."""
    global _auto_repair_execution_repo
    repo = _auto_repair_execution_repo
    if repo is None:
        # First call: build the instance and remember it for later callers.
        repo = AutoRepairExecutionRepository()
        _auto_repair_execution_repo = repo
    return repo
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
# =============================================================================
|
||||
|
||||
@@ -63,6 +63,8 @@ class AutoRepairDecision:
|
||||
blocked_by: str | None = None # 阻擋原因 (如 HIGH_RISK, P1_SEVERITY)
|
||||
# 2026-04-07 Claude Code: Sprint 4 B2 — 追蹤首次信任
|
||||
is_cold_start: bool = False
|
||||
# 2026-04-08 Claude Code: 傳入 execute_auto_repair 供 DB 記錄
|
||||
similarity_score: float | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -280,9 +282,10 @@ class AutoRepairService:
|
||||
return AutoRepairDecision(
|
||||
can_auto_repair=True,
|
||||
playbook=best_match.playbook,
|
||||
reason=f"匹配高品質 Playbook: {best_match.playbook.name} (成功率 {best_match.playbook.success_rate:.0%})",
|
||||
reason=f"匹配 Playbook: {best_match.playbook.name} (相似度 {best_match.similarity_score:.0%})",
|
||||
risk_level=max_risk,
|
||||
is_cold_start=_is_cold_start,
|
||||
similarity_score=best_match.similarity_score,
|
||||
)
|
||||
|
||||
async def execute_auto_repair(
|
||||
@@ -290,13 +293,14 @@ class AutoRepairService:
|
||||
incident: Incident,
|
||||
playbook: Playbook,
|
||||
is_cold_start: bool = False,
|
||||
similarity_score: float | None = None,
|
||||
) -> AutoRepairResult:
|
||||
"""
|
||||
執行自動修復
|
||||
|
||||
流程:
|
||||
1. 依序執行 Playbook 中的 repair_steps
|
||||
2. 記錄執行結果
|
||||
2. 記錄執行結果到 DB (auto_repair_executions)
|
||||
3. 更新 Playbook 統計
|
||||
4. 記錄處置類型 (Sprint 4 B1/B2)
|
||||
"""
|
||||
@@ -351,6 +355,24 @@ class AutoRepairService:
|
||||
execution_time_ms=execution_time,
|
||||
)
|
||||
|
||||
# 2026-04-08 Claude Code: 統帥指令「所有操作都必須被記錄,寫入資料庫」
|
||||
try:
|
||||
from src.repositories.audit_log_repository import get_auto_repair_execution_repository
|
||||
max_risk = self._get_max_risk_level(playbook)
|
||||
await get_auto_repair_execution_repository().create(
|
||||
incident_id=incident.incident_id,
|
||||
playbook_id=playbook.playbook_id,
|
||||
playbook_name=playbook.name,
|
||||
success=True,
|
||||
executed_steps=executed_steps,
|
||||
triggered_by="cold_start_trust" if is_cold_start else "auto_repair",
|
||||
similarity_score=similarity_score,
|
||||
risk_level=max_risk.value if max_risk else None,
|
||||
execution_time_ms=execution_time,
|
||||
)
|
||||
except Exception as _db_e:
|
||||
logger.error("auto_repair_db_write_failed", error=str(_db_e))
|
||||
|
||||
# 2026-04-07 Claude Code: Sprint 4 B1/B2 — 記錄處置類型
|
||||
# P0-1 Fix: 統一使用 AnomalyCounter.hash_signature()
|
||||
try:
|
||||
@@ -407,6 +429,25 @@ class AutoRepairService:
|
||||
execution_time_ms=execution_time,
|
||||
)
|
||||
|
||||
# 2026-04-08 Claude Code: 失敗也必須寫入 DB
|
||||
try:
|
||||
from src.repositories.audit_log_repository import get_auto_repair_execution_repository
|
||||
max_risk = self._get_max_risk_level(playbook)
|
||||
await get_auto_repair_execution_repository().create(
|
||||
incident_id=incident.incident_id,
|
||||
playbook_id=playbook.playbook_id,
|
||||
playbook_name=playbook.name,
|
||||
success=False,
|
||||
executed_steps=executed_steps,
|
||||
error_message=str(e),
|
||||
triggered_by="cold_start_trust" if is_cold_start else "auto_repair",
|
||||
similarity_score=similarity_score,
|
||||
risk_level=max_risk.value if max_risk else None,
|
||||
execution_time_ms=execution_time,
|
||||
)
|
||||
except Exception as _db_e:
|
||||
logger.error("auto_repair_db_write_failed", error=str(_db_e))
|
||||
|
||||
# 2026-04-04 Claude Code: Phase 25 P1 — 失敗修復後 fire-and-forget 生成 ANTI_PATTERN
|
||||
# 2026-04-05 Claude Code: I1 修正 — 補齊 _pending_tasks GC 防護(對稱化)
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user