# awoooi/apps/api/src/services/evidence_snapshot.py

"""
AWOOOI AIOps Phase 1 — 不可變事件證據快照
==========================================
EvidenceSnapshot：PreDecisionInvestigator 的輸出契約。

設計原則：
1. 不可變（Immutable）— 建立後只讀；執行後補填 post_execution_state
2. 版本化（Versioned）— schema_version 確保 fine-tune pipeline 可過濾
3. 安全（Sanitized）— 所有感官文字必須過 SanitizationService
4. 降級友好（Graceful Degradation）— 部分感官失敗不阻塞決策

資料流：
  PreDecisionInvestigator
    → EvidenceSnapshot（dataclass）
    → save() 寫入 incident_evidence 表
    → 傳給 decision_manager._dual_engine_analyze()

  PostExecutionVerifier
    → update_post_execution() 補填 post_execution_state

ADR-081: PreDecisionInvestigator + EvidenceSnapshot
2026-04-15 ogt + Claude Sonnet 4.6 (亞太): Phase 1 初始建立
"""

from __future__ import annotations

import uuid
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any

import structlog
from sqlalchemy import update

from src.db.base import get_db_context
from src.db.models import IncidentEvidence
from src.utils.timezone import now_taipei

logger = structlog.get_logger(__name__)

# Schema version stamped on each EvidenceSnapshot (used as the default of its
# ``schema_version`` field).
SCHEMA_VERSION = "v1"

# Maximum length of the evidence summary, to stay inside the LLM token budget.
# NOTE(review): build_summary() compares this against len(summary), i.e. a
# character count; the "≈ 8K tokens" figure is the original author's estimate
# (stated as roughly 4 chars per CJK character in UTF-8) — confirm.
MAX_SUMMARY_CHARS = 32_000  # ≈ 8K tokens per the original author's estimate


@dataclass
class EvidenceSnapshot:
    """
    Evidence snapshot collected before the AI makes a decision.

    The snapshot is treated as immutable by convention once it has been
    created; the dataclass is not frozen because a few fields are back-filled
    after execution (see ``update_post_execution`` / ``update_self_healing``).

    The 8D sensor dimensions (descriptions are the original author's):
      D1 k8s_state           — kubectl describe pod + events
      D2 recent_logs         — container stderr tail-50 (already sanitized)
      D3 metrics_snapshot    — Prometheus 5min vs 1h baseline
      D4 recent_deployments  — ArgoCD/Gitea deployment diff for the last hour
      D5 business_metrics    — order volume / login success rate / P0 SLI
      D6 historical_context  — handling history of the same alertname (30 days)
      D7 peer_health         — health of the other replicas of the Deployment
      D8 dependency_topology — Istio/service-mesh upstream/downstream latency

    Quality indicators:
      mcp_health                            — per-tool success {tool_name: bool}
      sensors_attempted / sensors_succeeded — sensor coverage

    Usage:
        snapshot = EvidenceSnapshot(incident_id="INC-001")
        snapshot.k8s_state = {"phase": "CrashLoopBackOff", ...}
        snapshot_id = await snapshot.save()
    """

    incident_id: str

    # Identifiers
    snapshot_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    schema_version: str = SCHEMA_VERSION
    collected_at: datetime = field(default_factory=now_taipei)

    # Basic alert information: the minimum context available even when no
    # sensor succeeded (2026-04-16 ogt + Claude Sonnet 4.6).
    alert_info: dict[str, Any] | None = None

    # 8D sensor data
    k8s_state: dict[str, Any] | None = None          # D1
    recent_logs: str | None = None                   # D2 (sanitized)
    metrics_snapshot: dict[str, Any] | None = None   # D3
    recent_deployments: list[dict] | None = None     # D4
    business_metrics: dict[str, Any] | None = None   # D5
    historical_context: str | None = None            # D6
    peer_health: dict[str, Any] | None = None        # D7
    dependency_topology: dict[str, Any] | None = None  # D8
    # Phase 4 ADR-084: dynamic anomaly sensors (DynamicBaseline + LogAnomaly +
    # TrendPredictor). 2026-04-15 ogt + Claude Sonnet 4.6: Phase 4 8D upgrade.
    anomaly_context: dict[str, Any] | None = None
    # 2026-04-27 P3.1-T2-PathA by Claude — DiagAggregator signal classification
    # layer (in-memory only, never persisted by save()).
    # {"signal_count": int, "signals": [{"source", "signal_type", "severity", "message", ...}]}
    extra_diagnosis: dict | None = None

    # Sensor quality
    mcp_health: dict[str, bool] = field(default_factory=dict)
    collection_duration_ms: int | None = None
    sensors_attempted: int = 0
    sensors_succeeded: int = 0

    # LLM input summary (assembled by the Investigator, or lazily by save())
    evidence_summary: str | None = None

    # State before / after execution
    pre_execution_state: dict[str, Any] | None = None
    post_execution_state: dict[str, Any] | None = None
    verification_result: str | None = None

    # W2 PR-V1: SelfHealingValidator quality assessment
    # (2026-04-28 ogt + Claude Sonnet 4.6). Per the original note these stay
    # None while ENABLE_SELF_HEALING_VALIDATOR is false.
    self_healing_score: float | None = None
    self_healing_detail: dict[str, Any] | None = None

    # Filled in Phase 3 (per the original note: always null for now)
    matched_playbook_id: str | None = None

    # ─────────────────────────────────────────────────────────────
    # Derived helpers
    # ─────────────────────────────────────────────────────────────

    @property
    def sensor_coverage_ratio(self) -> float:
        """Sensor coverage, clamped to the range 0.0 ~ 1.0."""
        if self.sensors_attempted <= 0:
            return 0.0
        ratio = self.sensors_succeeded / self.sensors_attempted
        # Clamp so inconsistent counters can never leave the documented range.
        return min(1.0, max(0.0, ratio))

    @property
    def has_k8s_context(self) -> bool:
        """True when a D1 ``k8s_state`` value has been set (even an empty one)."""
        return self.k8s_state is not None

    @property
    def has_log_context(self) -> bool:
        """True when ``recent_logs`` holds a non-empty string."""
        return bool(self.recent_logs)

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return *value* untouched when it is already a str, else ``str(value)``."""
        return value if isinstance(value, str) else str(value)

    def build_summary(self) -> str:
        """
        Assemble the LLM-ready evidence summary.

        The evidence is wrapped in a ``<raw_evidence>`` block to isolate it
        from the rest of the prompt. Any ``</raw_evidence>`` closing tag that
        appears inside the evidence itself is HTML-escaped so sensor text
        cannot terminate the block early.

        Returns:
            The wrapped summary, at most ``MAX_SUMMARY_CHARS`` characters long
            (truncation notice included).
        """
        import re  # local import: only needed for closing-tag neutralisation

        sections: list[str] = []

        # Alert info always comes first so the model knows which alert fired
        # even when no sensor produced data.
        if self.alert_info:
            sections.append(f"[告警資訊] {self.alert_info}")

        if self.k8s_state:
            sections.append(f"[K8s狀態] {self.k8s_state}")
        if self.recent_logs:
            sections.append(f"[近期日誌]\n{self.recent_logs[:2000]}")
        if self.metrics_snapshot:
            sections.append(f"[指標快照] {self.metrics_snapshot}")
        if self.recent_deployments:
            # Only the first three deployments; tolerate non-dict entries and
            # non-string summaries instead of raising inside str.join().
            dep_str = "; ".join(
                self._as_text(d.get("summary", d) if isinstance(d, dict) else d)
                for d in self.recent_deployments[:3]
            )
            sections.append(f"[近期部署] {dep_str}")
        if self.business_metrics:
            sections.append(f"[業務指標] {self.business_metrics}")
        if self.historical_context:
            sections.append(f"[歷史脈絡] {self.historical_context[:500]}")
        if self.peer_health:
            sections.append(f"[同級副本健康度] {self.peer_health}")
        if self.dependency_topology:
            sections.append(f"[依賴拓撲] {self.dependency_topology}")
        if self.anomaly_context:
            sections.append(f"[動態異常偵測]\n{self.anomaly_context}")
        # 2026-04-27 P3.1-T2-PathA by Claude — DiagAggregator signal
        # classification layer (structured dict); first five signals only.
        if self.extra_diagnosis and self.extra_diagnosis.get("signals"):
            signals_str = ", ".join(
                self._as_text(s.get("signal_type", "?") if isinstance(s, dict) else s)
                for s in self.extra_diagnosis["signals"][:5]
            )
            sections.append(f"[Signal Classification] {signals_str}")

        # Sensor quality report: list every tool whose call failed.
        failed_tools = [tool for tool, ok in self.mcp_health.items() if not ok]
        if failed_tools:
            sections.append(f"[感官警告] 以下工具呼叫失敗，情報可能不完整: {failed_tools}")

        raw = "\n\n".join(sections)
        # Neutralise closing tags embedded in the evidence (case-insensitive,
        # tolerant of inner whitespace) so the wrapper stays the only one.
        raw = re.sub(
            r"<\s*/\s*raw_evidence\s*>",
            "&lt;/raw_evidence&gt;",
            raw,
            flags=re.IGNORECASE,
        )
        summary = f"<raw_evidence>\n{raw}\n</raw_evidence>"

        # Token budget protection: reserve room for the notice so the final
        # string never exceeds MAX_SUMMARY_CHARS.
        if len(summary) > MAX_SUMMARY_CHARS:
            notice = "\n[...已截斷，超出 token budget]</raw_evidence>"
            keep = max(0, MAX_SUMMARY_CHARS - len(notice))
            summary = summary[:keep] + notice

        return summary

    # ─────────────────────────────────────────────────────────────
    # Persistence
    # ─────────────────────────────────────────────────────────────

    async def _update_columns(self, **values: Any) -> int:
        """Run an UPDATE on this snapshot's row and return the reported rowcount."""
        async with get_db_context() as db:
            result = await db.execute(
                update(IncidentEvidence)
                .where(IncidentEvidence.id == self.snapshot_id)
                .values(**values)
            )
        return result.rowcount

    async def save(self) -> str:
        """
        Persist the snapshot as an ``IncidentEvidence`` record.

        ``evidence_summary`` is built with ``build_summary()`` first when it
        has not been set. ``alert_info``, ``extra_diagnosis`` and the
        self-healing fields are not written here.

        Returns:
            str: snapshot_id (UUID)

        Raises:
            Exception: any error from the database layer is logged and
                re-raised unchanged.
        """
        if self.evidence_summary is None:
            self.evidence_summary = self.build_summary()

        try:
            async with get_db_context() as db:
                record = IncidentEvidence(
                    id=self.snapshot_id,
                    incident_id=self.incident_id,
                    matched_playbook_id=self.matched_playbook_id,
                    schema_version=self.schema_version,
                    k8s_state=self.k8s_state,
                    recent_logs=self.recent_logs,
                    metrics_snapshot=self.metrics_snapshot,
                    recent_deployments=self.recent_deployments,
                    business_metrics=self.business_metrics,
                    historical_context=self.historical_context,
                    peer_health=self.peer_health,
                    dependency_topology=self.dependency_topology,
                    anomaly_context=self.anomaly_context,
                    mcp_health=self.mcp_health,
                    collection_duration_ms=self.collection_duration_ms,
                    sensors_attempted=self.sensors_attempted,
                    sensors_succeeded=self.sensors_succeeded,
                    evidence_summary=self.evidence_summary,
                    pre_execution_state=self.pre_execution_state,
                    post_execution_state=self.post_execution_state,
                    verification_result=self.verification_result,
                    collected_at=self.collected_at,
                )
                db.add(record)
                # NOTE(review): only flush() is called here; the commit is
                # presumably performed by get_db_context() on exit — confirm.
                await db.flush()

            logger.info(
                "evidence_snapshot_saved",
                snapshot_id=self.snapshot_id,
                incident_id=self.incident_id,
                sensors_succeeded=self.sensors_succeeded,
                collection_ms=self.collection_duration_ms,
            )
            return self.snapshot_id

        except Exception:
            logger.exception(
                "evidence_snapshot_save_error",
                snapshot_id=self.snapshot_id,
                incident_id=self.incident_id,
            )
            raise

    async def update_post_execution(
        self,
        post_state: dict[str, Any],
        verification_result: str,
    ) -> None:
        """
        Back-fill ``post_execution_state`` after execution (PostExecutionVerifier).

        The in-memory fields are set first, then the persisted row is updated.

        Args:
            post_state: environment state after execution
            verification_result: "success" / "degraded" / "failed" / "timeout"

        Raises:
            Exception: any database error is logged and re-raised unchanged.
        """
        self.post_execution_state = post_state
        self.verification_result = verification_result

        try:
            updated_rows = await self._update_columns(
                post_execution_state=post_state,
                verification_result=verification_result,
            )

            # Gate 1 fix: zero updated rows means the snapshot was never
            # persisted (save() failed) — learning data would be lost
            # silently, so surface it as a warning.
            if updated_rows < 1:
                logger.warning(
                    "evidence_snapshot_post_update_no_rows",
                    snapshot_id=self.snapshot_id,
                    verification_result=verification_result,
                )
            else:
                logger.info(
                    "evidence_snapshot_post_execution_updated",
                    snapshot_id=self.snapshot_id,
                    verification_result=verification_result,
                )
        except Exception:
            logger.exception(
                "evidence_snapshot_post_update_error",
                snapshot_id=self.snapshot_id,
            )
            raise

    async def update_self_healing(
        self,
        score: float,
        detail: dict[str, Any],
    ) -> None:
        """
        W2 PR-V1: back-fill the SelfHealingValidator assessment.

        Per the original note this is called after
        PostExecutionVerifier.verify() has finished update_post_execution().
        When no persisted row matches ``snapshot_id`` only the in-memory
        fields change and a warning is logged.

        Args:
            score:  self-healing quality score (0.0-1.0)
            detail: detail dict returned by
                    SelfHealingValidator.assess_self_healing()

        Raises:
            Exception: any database error is logged and re-raised unchanged.

        2026-04-28 ogt + Claude Sonnet 4.6: W2 PR-V1 initial version
        """
        self.self_healing_score = score
        self.self_healing_detail = detail

        try:
            updated_rows = await self._update_columns(
                self_healing_score=score,
                self_healing_detail=detail,
            )

            if updated_rows < 1:
                logger.warning(
                    "evidence_snapshot_self_healing_update_no_rows",
                    snapshot_id=self.snapshot_id,
                    score=score,
                )
            else:
                logger.info(
                    "evidence_snapshot_self_healing_updated",
                    snapshot_id=self.snapshot_id,
                    score=score,
                )
        except Exception:
            logger.exception(
                "evidence_snapshot_self_healing_update_error",
                snapshot_id=self.snapshot_id,
            )
            raise


async def get_latest_snapshot(incident_id: str) -> EvidenceSnapshot | None:
    """
    Load the most recently collected EvidenceSnapshot of an incident.

    Rows are filtered by ``incident_id`` and ordered by ``collected_at``
    descending; only the newest one is returned. ``alert_info`` and
    ``extra_diagnosis`` are not restored from the row.

    Per the original note, mainly used by tests and the Phase 3 learning
    pipeline.

    Args:
        incident_id: incident whose latest snapshot is wanted.

    Returns:
        The rebuilt snapshot, or None when the incident has no snapshot or
        the lookup failed (failures are logged, never raised).
    """
    from sqlalchemy import desc, select

    try:
        async with get_db_context() as db:
            result = await db.execute(
                select(IncidentEvidence)
                .where(IncidentEvidence.incident_id == incident_id)
                .order_by(desc(IncidentEvidence.collected_at))
                .limit(1)
            )
            row = result.scalar_one_or_none()

        if row is None:
            return None

        return EvidenceSnapshot(
            incident_id=row.incident_id,
            snapshot_id=row.id,
            schema_version=row.schema_version,
            collected_at=row.collected_at,
            k8s_state=row.k8s_state,
            recent_logs=row.recent_logs,
            metrics_snapshot=row.metrics_snapshot,
            recent_deployments=row.recent_deployments,
            business_metrics=row.business_metrics,
            historical_context=row.historical_context,
            peer_health=row.peer_health,
            dependency_topology=row.dependency_topology,
            anomaly_context=row.anomaly_context,
            # Nullable columns fall back to the dataclass defaults.
            mcp_health=row.mcp_health or {},
            collection_duration_ms=row.collection_duration_ms,
            sensors_attempted=row.sensors_attempted or 0,
            sensors_succeeded=row.sensors_succeeded or 0,
            evidence_summary=row.evidence_summary,
            pre_execution_state=row.pre_execution_state,
            post_execution_state=row.post_execution_state,
            verification_result=row.verification_result,
            # Restore the values written by update_self_healing(); getattr
            # keeps the lookup working if the model lacks these attributes.
            self_healing_score=getattr(row, "self_healing_score", None),
            self_healing_detail=getattr(row, "self_healing_detail", None),
            matched_playbook_id=row.matched_playbook_id,
        )

    except Exception:
        # Deliberate best-effort: log with traceback and report "not found".
        logger.exception("evidence_snapshot_get_error", incident_id=incident_id)
        return None