-- Phase 10: Auto Repair Executions operation log table
-- Created: 2026-04-08 (Taipei time zone)
-- Author: Claude Code — directive: "every operation must be recorded and written to the database"
--
-- Design notes:
--   Every auto-repair execution (success or failure) is written to this table.
--   It does not depend on approval_id (auto repair needs no manual approval).
--   Supported queries: by incident / playbook / time range / success rate.

CREATE TABLE IF NOT EXISTS auto_repair_executions (
    -- Primary key (UUID rendered as text, generated by the database when omitted)
    id              VARCHAR(36) PRIMARY KEY DEFAULT gen_random_uuid()::text,

    -- Associations
    incident_id     VARCHAR(30) NOT NULL,
    playbook_id     VARCHAR(36) NOT NULL,
    playbook_name   VARCHAR(200) NOT NULL,

    -- Execution result
    success         BOOLEAN NOT NULL DEFAULT FALSE,
    executed_steps  JSONB NOT NULL DEFAULT '[]',  -- list of step result strings
    error_message   TEXT,

    -- Execution context
    triggered_by    VARCHAR(50) NOT NULL DEFAULT 'auto_repair',  -- auto_repair / cold_start_trust
    similarity_score NUMERIC(5,4),                -- match similarity score
    risk_level      VARCHAR(20),                  -- LOW / MEDIUM / HIGH
    execution_time_ms INTEGER,

    -- Timestamp (original note: Taipei time zone; TIMESTAMPTZ stores an absolute instant)
    created_at      TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Indexes
CREATE INDEX IF NOT EXISTS ix_are_incident_id ON auto_repair_executions (incident_id);
CREATE INDEX IF NOT EXISTS ix_are_playbook_id ON auto_repair_executions (playbook_id);
CREATE INDEX IF NOT EXISTS ix_are_created_at ON auto_repair_executions (created_at DESC);
CREATE INDEX IF NOT EXISTS ix_are_success ON auto_repair_executions (success);
# =============================================================================
# AutoRepairExecution - Phase 10 operation log
# 2026-04-08 Claude Code: directive "every operation must be recorded and
# written to the database"
# =============================================================================

class AutoRepairExecution(Base):
    """
    Auto-repair execution record.

    One row is intended to be written for every auto-repair execution,
    whether it succeeded or failed. The table carries no ``approval_id``
    because auto repair does not go through manual approval.

    Index names are declared explicitly in ``__table_args__`` so that they
    match the names created by ``phase10_auto_repair_executions.sql``
    (``ix_are_*``). Using ``index=True`` on the columns would instead make
    SQLAlchemy emit ``ix_auto_repair_executions_<column>``, producing either
    duplicate indexes or schema drift depending on which path created the
    table first.
    """
    __tablename__ = "auto_repair_executions"

    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=generate_uuid)

    # Associations (indexed via __table_args__, not index=True — see class docstring)
    incident_id: Mapped[str] = mapped_column(String(30), nullable=False)
    playbook_id: Mapped[str] = mapped_column(String(36), nullable=False)
    playbook_name: Mapped[str] = mapped_column(String(200), nullable=False)

    # Execution result
    success: Mapped[bool] = mapped_column(default=False, nullable=False)
    # List of per-step result strings, stored as JSON.
    executed_steps: Mapped[list] = mapped_column(JSON, default=list, nullable=False)
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Execution context
    triggered_by: Mapped[str] = mapped_column(
        String(50), default="auto_repair", nullable=False,
        comment="auto_repair / cold_start_trust",
    )
    similarity_score: Mapped[float | None] = mapped_column(nullable=True)
    risk_level: Mapped[str | None] = mapped_column(String(20), nullable=True)
    execution_time_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)

    # Timestamp (timezone-aware; default produced by taipei_now)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=taipei_now, nullable=False,
    )

    __table_args__ = (
        # Names mirror the CREATE INDEX statements in the Phase 10 migration.
        Index("ix_are_incident_id", "incident_id"),
        Index("ix_are_playbook_id", "playbook_id"),
        Index("ix_are_created_at", "created_at"),
        Index("ix_are_success", "success"),
    )
# =============================================================================
# AutoRepairExecutionRepository
# 2026-04-08 Claude Code: directive "every operation must be recorded and
# written to the database"
# =============================================================================


class AutoRepairExecutionRepository:
    """Repository for rows of the ``auto_repair_executions`` table."""

    async def create(
        self,
        incident_id: str,
        playbook_id: str,
        playbook_name: str,
        success: bool,
        executed_steps: list[str],
        error_message: str | None = None,
        triggered_by: str = "auto_repair",
        similarity_score: float | None = None,
        risk_level: str | None = None,
        execution_time_ms: int | None = None,
    ) -> AutoRepairExecution:
        """
        Persist one auto-repair execution record.

        Args:
            incident_id: Identifier of the incident that was repaired.
            playbook_id: Identifier of the playbook that was executed.
            playbook_name: Name of the playbook at execution time.
            success: Whether the execution succeeded.
            executed_steps: Per-step result strings collected during the run.
            error_message: Failure description, if any.
            triggered_by: Trigger label ("auto_repair" or "cold_start_trust").
            similarity_score: Playbook match similarity, if known.
            risk_level: Risk level label, if known.
            execution_time_ms: Execution duration in milliseconds, if measured.

        Returns:
            The newly added ``AutoRepairExecution`` instance.
        """
        async with get_db_context() as db:
            record = AutoRepairExecution(
                incident_id=incident_id,
                playbook_id=playbook_id,
                playbook_name=playbook_name,
                success=success,
                executed_steps=executed_steps,
                error_message=error_message,
                triggered_by=triggered_by,
                similarity_score=similarity_score,
                risk_level=risk_level,
                execution_time_ms=execution_time_ms,
            )
            db.add(record)
            # Emit the INSERT, then reload the row so defaults are populated.
            await db.flush()
            await db.refresh(record)
            # NOTE(review): the instance is handed back after the session
            # context exits; whether its attributes stay loaded depends on how
            # get_db_context handles commit/expiry — confirm before reading
            # fields from the returned object.
            return record

    async def list_by_incident(self, incident_id: str) -> list[AutoRepairExecution]:
        """Return every execution record for ``incident_id``, newest first."""
        async with get_db_context() as db:
            result = await db.execute(
                select(AutoRepairExecution)
                .where(AutoRepairExecution.incident_id == incident_id)
                .order_by(AutoRepairExecution.created_at.desc())
            )
            return list(result.scalars().all())

    async def get_stats(self, since_hours: int = 24) -> dict[str, Any]:
        """
        Summarise auto-repair executions from the last ``since_hours`` hours.

        Both counts come from a single SELECT so they describe the same
        snapshot; issuing two separate COUNT statements lets a row inserted in
        between make ``success_count`` exceed ``total``.

        Args:
            since_hours: Size of the look-back window in hours.

        Returns:
            A dict with ``total``, ``success_count``, ``failure_count``,
            ``success_rate`` (percentage rounded to one decimal, 0 when there
            are no rows) and ``since_hours``.
        """
        from datetime import timedelta
        from src.utils.timezone import now_taipei

        since = now_taipei() - timedelta(hours=since_hours)
        succeeded = AutoRepairExecution.success.is_(True)
        async with get_db_context() as db:
            counts = await db.execute(
                select(
                    func.count(AutoRepairExecution.id),
                    # Aggregate FILTER: count only the successful rows.
                    func.count(AutoRepairExecution.id).filter(succeeded),
                ).where(AutoRepairExecution.created_at >= since)
            )
            total_raw, success_raw = counts.one()
            total = total_raw or 0
            success_count = success_raw or 0

            return {
                "total": total,
                "success_count": success_count,
                "failure_count": total - success_count,
                "success_rate": round(success_count / total * 100, 1) if total > 0 else 0,
                "since_hours": since_hours,
            }


# Module-level cache for the lazily created repository instance.
_auto_repair_execution_repo: AutoRepairExecutionRepository | None = None


def get_auto_repair_execution_repository() -> AutoRepairExecutionRepository:
    """Return the shared AutoRepairExecutionRepository, creating it on first use."""
    global _auto_repair_execution_repo
    if _auto_repair_execution_repo is None:
        _auto_repair_execution_repo = AutoRepairExecutionRepository()
    return _auto_repair_execution_repo
bool = False + # 2026-04-08 Claude Code: 傳入 execute_auto_repair 供 DB 記錄 + similarity_score: float | None = None @dataclass @@ -280,9 +282,10 @@ class AutoRepairService: return AutoRepairDecision( can_auto_repair=True, playbook=best_match.playbook, - reason=f"匹配高品質 Playbook: {best_match.playbook.name} (成功率 {best_match.playbook.success_rate:.0%})", + reason=f"匹配 Playbook: {best_match.playbook.name} (相似度 {best_match.similarity_score:.0%})", risk_level=max_risk, is_cold_start=_is_cold_start, + similarity_score=best_match.similarity_score, ) async def execute_auto_repair( @@ -290,13 +293,14 @@ class AutoRepairService: incident: Incident, playbook: Playbook, is_cold_start: bool = False, + similarity_score: float | None = None, ) -> AutoRepairResult: """ 執行自動修復 流程: 1. 依序執行 Playbook 中的 repair_steps - 2. 記錄執行結果 + 2. 記錄執行結果到 DB (auto_repair_executions) 3. 更新 Playbook 統計 4. 記錄處置類型 (Sprint 4 B1/B2) """ @@ -351,6 +355,24 @@ class AutoRepairService: execution_time_ms=execution_time, ) + # 2026-04-08 Claude Code: 統帥指令「所有操作都必須被記錄,寫入資料庫」 + try: + from src.repositories.audit_log_repository import get_auto_repair_execution_repository + max_risk = self._get_max_risk_level(playbook) + await get_auto_repair_execution_repository().create( + incident_id=incident.incident_id, + playbook_id=playbook.playbook_id, + playbook_name=playbook.name, + success=True, + executed_steps=executed_steps, + triggered_by="cold_start_trust" if is_cold_start else "auto_repair", + similarity_score=similarity_score, + risk_level=max_risk.value if max_risk else None, + execution_time_ms=execution_time, + ) + except Exception as _db_e: + logger.error("auto_repair_db_write_failed", error=str(_db_e)) + # 2026-04-07 Claude Code: Sprint 4 B1/B2 — 記錄處置類型 # P0-1 Fix: 統一使用 AnomalyCounter.hash_signature() try: @@ -407,6 +429,25 @@ class AutoRepairService: execution_time_ms=execution_time, ) + # 2026-04-08 Claude Code: 失敗也必須寫入 DB + try: + from src.repositories.audit_log_repository import 
get_auto_repair_execution_repository + max_risk = self._get_max_risk_level(playbook) + await get_auto_repair_execution_repository().create( + incident_id=incident.incident_id, + playbook_id=playbook.playbook_id, + playbook_name=playbook.name, + success=False, + executed_steps=executed_steps, + error_message=str(e), + triggered_by="cold_start_trust" if is_cold_start else "auto_repair", + similarity_score=similarity_score, + risk_level=max_risk.value if max_risk else None, + execution_time_ms=execution_time, + ) + except Exception as _db_e: + logger.error("auto_repair_db_write_failed", error=str(_db_e)) + # 2026-04-04 Claude Code: Phase 25 P1 — 失敗修復後 fire-and-forget 生成 ANTI_PATTERN # 2026-04-05 Claude Code: I1 修正 — 補齊 _pending_tasks GC 防護(對稱化) try: