# awoooi/apps/api/src/services/finetune_exporter.py

"""
AWOOOI AIOps Phase 3 — Fine-tune JSONL 匯出器
=============================================
職責：每週將（EvidenceSnapshot × AgentSession × AutoRepairExecution）
      組合成訓練對（instruction, input, output），匯出為 JSONL 檔案供 LLM 微調。

為什麼需要 fine-tune 管線？
  EWMA Playbook trust 只調整「選哪個 Playbook」，
  但 LLM 本身的推理模式（症狀識別、根因分析、行動描述格式）無法從 EWMA 學習。
  Fine-tune 資料管線讓 AI 從真實成功案例中學習「如何推理」，
  不只學習「信任哪個 Playbook」。

匯出策略：
  - 查詢 incident_evidence 中 verification_result = 'success' 且有 evidence_summary 的記錄
  - 聯結同 incident_id 的 AgentSession（coordinator turn）取得推理決策
  - 聯結 auto_repair_executions 取得實際執行動作
  - 組合成 JSONL 格式（Alpaca instruction-input-output 格式）
  - 輸出到 FINETUNE_EXPORT_PATH（預設 /tmp/finetune/）；MinIO 支援待設定

JSONL 格式（每行 1 個 JSON 物件）：
  {
    "instruction": "根據 AIOps 情報摘要，分析告警根因並提出修復建議",
    "input": "<evidence_summary>",
    "output": "<coordinator 推理決策 + 執行動作>",
    "metadata": {
      "incident_id": "...",
      "alertname": "...",
      "verification_result": "success",
      "collected_at": "...",
      "schema_version": "v1",
      "matched_playbook_id": "..."
    }
  }

設計原則：
  1. 只匯出 verification_result = 'success' 的記錄（負向案例不入訓練集，避免強化錯誤模式）
  2. 每次匯出加時間戳前綴（不覆蓋舊檔）
  3. 每批最多 500 筆（大規模訓練集需分批）
  4. 失敗只記錄 error，不影響主路徑

ADR-083 Phase 3: Fine-tune 管線（L7×D4）
2026-04-15 ogt + Claude Sonnet 4.6（亞太）: Phase 3 初始建立
"""

from __future__ import annotations

import asyncio
import json
import os
from datetime import timedelta
from pathlib import Path

import structlog
from sqlalchemy import and_, select, text as sql_text

from src.db.base import get_db_context
from src.db.models import AgentSession, AutoRepairExecution, IncidentEvidence
from src.utils.timezone import now_taipei

# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)

# ─────────────────────────────────────────────────────────────────────────────
# 常數
# ─────────────────────────────────────────────────────────────────────────────

# Directory that receives the JSONL files; overridable through the
# FINETUNE_EXPORT_PATH environment variable.
FINETUNE_EXPORT_PATH = os.environ.get("FINETUNE_EXPORT_PATH", "/tmp/finetune")

# Maximum number of evidence rows fetched per export run.
BATCH_LIMIT = 500

# Only evidence collected within the last N days is exported.
EXPORT_LOOKBACK_DAYS = 7

# Sleep time between two export runs: one week, in seconds.
WEEKLY_INTERVAL_SEC = 7 * 24 * 60 * 60

# Fixed "instruction" field shared by every exported training pair.
INSTRUCTION = "".join(
    [
        "根據以下 AIOps 情報摘要（EvidenceSnapshot），",
        "分析告警根因並提出具體的修復建議，說明修復動作的理由。",
    ]
)


# ─────────────────────────────────────────────────────────────────────────────
# Fine-tune Exporter
# ─────────────────────────────────────────────────────────────────────────────

class FineTuneExporter:
    """
    Fine-tune JSONL exporter (intended to run weekly).

    Builds Alpaca-style (instruction, input, output) training pairs from
    successfully verified IncidentEvidence rows combined with the latest
    coordinator AgentSession and the latest AutoRepairExecution of the same
    incident, writes them to a timestamped JSONL file under
    FINETUNE_EXPORT_PATH, and records the export in ``finetune_exports``.

    Usage:
        exporter = FineTuneExporter()
        path, count = await exporter.export()
    """

    async def export(self) -> tuple[str | None, int]:
        """
        Export training data.

        Returns:
            (output_file_path, row_count); (None, 0) when the feature flag is
            off, when there is nothing to export, or when the export failed.
            Failures are logged and never raised to the caller.
        """
        # Feature flags are imported at call time, as in the original code.
        from src.core.feature_flags import aiops_flags
        if not aiops_flags.AIOPS_P3_FINETUNE_EXPORT:
            logger.debug("finetune_exporter_skipped_feature_flag")
            return None, 0

        try:
            return await self._run_export()
        except Exception as e:
            # Best-effort by design: an export failure must not affect the
            # main path, so it is only logged.
            logger.error("finetune_exporter_error", error=str(e))
            return None, 0

    async def _run_export(self) -> tuple[str | None, int]:
        """
        Query evidence, build the training rows, write the file and record it.

        Returns:
            (output_file_path, row_count), or (None, 0) when no evidence
            matched or no evidence produced a usable row.
        """
        cutoff = now_taipei() - timedelta(days=EXPORT_LOOKBACK_DAYS)
        rows: list[dict] = []

        async with get_db_context() as db:
            # 1. Successfully verified evidence with a summary, inside the
            #    lookback window. LIMIT without ORDER BY returns an arbitrary
            #    subset, so order newest-first (id as tie-breaker) to make the
            #    selected batch deterministic.
            stmt = (
                select(IncidentEvidence)
                .where(
                    and_(
                        IncidentEvidence.verification_result == "success",
                        IncidentEvidence.evidence_summary.isnot(None),
                        IncidentEvidence.collected_at >= cutoff,
                    )
                )
                .order_by(
                    IncidentEvidence.collected_at.desc(),
                    IncidentEvidence.id.desc(),
                )
                .limit(BATCH_LIMIT)
            )

            result = await db.execute(stmt)
            evidences = result.scalars().all()

            if not evidences:
                logger.info("finetune_exporter_no_data", lookback_days=EXPORT_LOOKBACK_DAYS)
                return None, 0

            # Snapshot the ids while the session is still open. Reading ORM
            # attributes after the context exits can fail on detached/expired
            # instances (depends on how get_db_context configures the
            # session), which previously could make the finetune_exports
            # write fail silently.
            source_ids = [str(ev.id) for ev in evidences]

            # 2. Build one training pair per evidence row; rows without usable
            #    input/output are skipped.
            for ev in evidences:
                row = await self._build_row(db, ev)
                if row:
                    rows.append(row)

        if not rows:
            return None, 0

        # 3. Write the JSONL file.
        output_path = await self._write_jsonl(rows)
        logger.info(
            "finetune_export_done",
            row_count=len(rows),
            path=output_path,
        )

        # 4. ADR-090-D: record the export in finetune_exports (KPI source).
        await self._record_export(output_path, len(rows), source_ids)

        return output_path, len(rows)

    async def _record_export(
        self,
        output_path: str,
        row_count: int,
        source_ids: list[str],
    ) -> None:
        """
        Insert one bookkeeping row into ``finetune_exports``.

        Best-effort: any failure is logged as a warning and swallowed so the
        already-written JSONL file is still reported to the caller.

        Args:
            output_path: Path of the JSONL file that was written.
            row_count: Number of training pairs in the file.
            source_ids: Stringified ids of the evidence rows that were queried
                for this export.
        """
        try:
            size_bytes, checksum = self._file_fingerprint(output_path)
            async with get_db_context() as db:
                await db.execute(
                    sql_text("""
                        INSERT INTO finetune_exports (
                            export_type, source_table, source_ids,
                            file_path, record_count, size_bytes, checksum_sha256,
                            metadata
                        ) VALUES (
                            'evidence_snapshot', 'incident_evidence', :ids,
                            :fp, :rc, :sz, :cs, CAST(:md AS jsonb)
                        )
                    """),
                    {
                        "ids": source_ids,
                        "fp": output_path,
                        "rc": row_count,
                        "sz": size_bytes,
                        "cs": checksum,
                        "md": json.dumps({"lookback_days": EXPORT_LOOKBACK_DAYS}),
                    },
                )
        except Exception as db_err:
            logger.warning("finetune_exports_db_write_failed", error=str(db_err))

    @staticmethod
    def _file_fingerprint(path: str | None) -> tuple[int | None, str | None]:
        """
        Return ``(size_bytes, sha256_hex)`` for the file at ``path``.

        Returns ``(None, None)`` when ``path`` is empty or the file does not
        exist. The file is read in chunks in a single pass, so there is no
        exists()/open() race and the whole file is never held in memory.
        Other OS errors propagate to the caller.
        """
        import hashlib

        if not path:
            return None, None

        digest = hashlib.sha256()
        size = 0
        try:
            with open(path, "rb") as fh:
                for chunk in iter(lambda: fh.read(65536), b""):
                    digest.update(chunk)
                    size += len(chunk)
        except FileNotFoundError:
            return None, None
        return size, digest.hexdigest()

    async def _build_row(self, db, ev: IncidentEvidence) -> dict | None:
        """
        Build a single training pair for one evidence row.

        Loads the most recent coordinator AgentSession and the most recent
        AutoRepairExecution for the evidence's incident, then assembles the
        row. Returns None when the row should be skipped.
        """
        # Latest coordinator agent turn for this incident (if any).
        agent_stmt = select(AgentSession).where(
            and_(
                AgentSession.incident_id == ev.incident_id,
                AgentSession.agent_role == "coordinator",
            )
        ).order_by(AgentSession.created_at.desc()).limit(1)
        agent_result = await db.execute(agent_stmt)
        coordinator = agent_result.scalar_one_or_none()

        # Latest auto-repair execution for this incident (if any).
        exec_stmt = select(AutoRepairExecution).where(
            AutoRepairExecution.incident_id == ev.incident_id,
        ).order_by(AutoRepairExecution.created_at.desc()).limit(1)
        exec_result = await db.execute(exec_stmt)
        execution = exec_result.scalar_one_or_none()

        return self._assemble_row(ev, coordinator, execution)

    @classmethod
    def _assemble_row(cls, ev, coordinator, execution) -> dict | None:
        """
        Combine evidence, coordinator turn and execution into one JSONL row.

        Returns None when the evidence summary is empty/whitespace-only (the
        query only filters NULL, and a pair with an empty input is useless
        for training) or when neither the coordinator nor the execution
        yields any output text.
        """
        summary = (ev.evidence_summary or "").strip()
        if not summary:
            return None

        output_text = cls._compose_output(coordinator, execution)
        if output_text is None:
            return None  # no output data, skip

        # NOTE(review): alertname is meant to come from the incident's signal
        # labels; that lookup is not implemented, so incident_id is used as a
        # fallback value.
        alertname = ev.incident_id

        return {
            "instruction": INSTRUCTION,
            "input": summary,
            "output": output_text,
            "metadata": {
                "incident_id": ev.incident_id,
                "alertname": alertname,
                "verification_result": ev.verification_result,
                "collected_at": ev.collected_at.isoformat() if ev.collected_at else None,
                "schema_version": ev.schema_version,
                "matched_playbook_id": ev.matched_playbook_id,
            },
        }

    @staticmethod
    def _compose_output(coordinator, execution) -> str | None:
        """
        Render the "output" text of a training pair.

        Args:
            coordinator: Object with an ``output_json`` attribute, or None.
                Only dict values are used; ``reasoning`` is preferred, then
                ``decision``, then the first 500 chars of the dict's repr.
            execution: Object with ``playbook_name``, ``executed_steps`` and
                ``success`` attributes, or None. The action text is taken
                from the first executed step (``action`` or ``step`` key)
                when available, otherwise from the playbook name.

        Returns:
            The sections joined by blank lines, or None when there is
            nothing to render.
        """
        parts: list[str] = []

        if coordinator and coordinator.output_json:
            coord_out = coordinator.output_json
            if isinstance(coord_out, dict):
                reasoning = (
                    coord_out.get("reasoning")
                    or coord_out.get("decision")
                    or str(coord_out)[:500]
                )
                parts.append(f"[AI 決策]\n{reasoning}")

        if execution:
            action_desc = execution.playbook_name or "未知"
            steps = execution.executed_steps
            if isinstance(steps, list) and steps:
                first = steps[0]
                if isinstance(first, dict):
                    action_desc = first.get("action") or first.get("step") or action_desc
            parts.append(f"[執行動作]\n{action_desc}")
            parts.append(
                f"[執行結果] {'成功' if execution.success else '失敗'}"
            )

        if not parts:
            return None
        return "\n\n".join(parts)

    async def _write_jsonl(self, rows: list[dict]) -> str:
        """
        Write ``rows`` as JSONL into FINETUNE_EXPORT_PATH.

        The directory is created if needed. The file name carries a
        second-resolution timestamp so earlier exports are not overwritten.

        Returns:
            The path of the written file, as a string.
        """
        export_dir = Path(FINETUNE_EXPORT_PATH)
        export_dir.mkdir(parents=True, exist_ok=True)

        ts = now_taipei().strftime("%Y%m%d-%H%M%S")
        output_path = export_dir / f"finetune-{ts}.jsonl"

        with open(output_path, "w", encoding="utf-8") as f:
            # One JSON object per line; keep CJK text readable in the file.
            f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)

        return str(output_path)


# ─────────────────────────────────────────────────────────────────────────────
# Loop（掛載到 main.py）
# ─────────────────────────────────────────────────────────────────────────────

async def run_finetune_export_loop() -> None:
    """
    Run the fine-tune export forever, once every WEEKLY_INTERVAL_SEC seconds.

    Meant to be scheduled from main.py at startup via asyncio.create_task.
    The first export happens immediately; any exception raised by a run is
    logged and the loop keeps going.
    """
    exporter = FineTuneExporter()

    async def _tick() -> None:
        # One export attempt; errors are logged, never propagated.
        try:
            exported_path, exported_rows = await exporter.export()
            if exported_rows > 0:
                logger.info(
                    "finetune_export_loop_tick",
                    rows=exported_rows,
                    path=exported_path,
                )
        except Exception as exc:
            logger.error("finetune_export_loop_error", error=str(exc))

    while True:
        await _tick()
        await asyncio.sleep(WEEKLY_INTERVAL_SEC)