Files
awoooi/apps/api/src/services/finetune_exporter.py
Your Name ff30c61c4c
All checks were successful
Code Review / ai-code-review (push) Successful in 21s
CD Pipeline / tests (push) Successful in 1m20s
CD Pipeline / build-and-deploy (push) Successful in 4m15s
CD Pipeline / post-deploy-checks (push) Successful in 1m58s
fix(rls): 收斂 API DB access context
2026-05-12 19:55:13 +08:00

280 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI AIOps Phase 3 — Fine-tune JSONL 匯出器
=============================================
職責: 每週將 EvidenceSnapshot × AgentSession × AutoRepairExecution
組合成訓練對 (instruction, input, output), 匯出為 JSONL 檔案供 LLM 微調。
為什麼需要 fine-tune 管線?
EWMA Playbook trust 只調整「選哪個 Playbook」
但 LLM 本身的推理模式(症狀識別、根因分析、行動描述格式)無法從 EWMA 學習。
Fine-tune 資料管線讓 AI 從真實成功案例中學習「如何推理」,
不只學習「信任哪個 Playbook」。
匯出策略:
- 查詢 incident_evidence 中 verification_result = 'success' 且有 evidence_summary 的記錄
- 聯結同 incident_id 的 AgentSession (coordinator turn), 取得推理決策
- 聯結 auto_repair_executions 取得實際執行動作
- 組合成 JSONL 格式 (Alpaca instruction-input-output 格式)
- 輸出到 FINETUNE_EXPORT_PATH (預設 /tmp/finetune/, MinIO 支援待設定)
JSONL 格式(每行 1 個 JSON 物件):
{
"instruction": "根據 AIOps 情報摘要,分析告警根因並提出修復建議",
"input": "<evidence_summary>",
"output": "<coordinator 推理決策 + 執行動作>",
"metadata": {
"incident_id": "...",
"alertname": "...",
"verification_result": "success",
"collected_at": "...",
"schema_version": "v1"
}
}
設計原則:
1. 只匯出 verification_result = 'success' 的記錄(負向案例不入訓練集,避免強化錯誤模式)
2. 每次匯出加時間戳前綴(不覆蓋舊檔)
3. 每批最多 500 筆(大規模訓練集需分批)
4. 失敗只記錄 error不影響主路徑
ADR-083 Phase 3: Fine-tune 管線 (L7×D4)
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 初始建立
"""
from __future__ import annotations
import asyncio
import json
import os
from datetime import timedelta
from pathlib import Path
import structlog
from sqlalchemy import and_, select, text as sql_text
from src.db.base import get_db_context
from src.db.models import AgentSession, AutoRepairExecution, IncidentEvidence
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# ─────────────────────────────────────────────────────────────────────────────
# 常數
# ─────────────────────────────────────────────────────────────────────────────
# Directory that receives the exported JSONL files; overridable via the
# FINETUNE_EXPORT_PATH environment variable.
FINETUNE_EXPORT_PATH = os.environ.get("FINETUNE_EXPORT_PATH", "/tmp/finetune")
# Upper bound on the number of evidence rows fetched per export run.
BATCH_LIMIT = 500
# Only evidence collected within the past N days is exported.
EXPORT_LOOKBACK_DAYS = 7
# Sleep between two export runs: seven days expressed in seconds.
WEEKLY_INTERVAL_SEC = 7 * 24 * 60 * 60
# Fixed "instruction" field written into every training pair.
INSTRUCTION = "根據以下 AIOps 情報摘要EvidenceSnapshot分析告警根因並提出具體的修復建議,說明修復動作的理由。"
# ─────────────────────────────────────────────────────────────────────────────
# Fine-tune Exporter
# ─────────────────────────────────────────────────────────────────────────────
class FineTuneExporter:
    """
    Fine-tune JSONL exporter (intended to run weekly).

    Builds instruction/input/output training pairs from successfully verified
    IncidentEvidence rows, the newest coordinator AgentSession and the newest
    AutoRepairExecution of the same incident, and writes them to a JSONL file.

    Usage:
        exporter = FineTuneExporter()
        path, count = await exporter.export()
    """

    async def export(self) -> tuple[str | None, int]:
        """
        Export training data without propagating export failures.

        Returns:
            (output_file_path, row_count). (None, 0) is returned when the
            AIOPS_P3_FINETUNE_EXPORT flag is off, when there is nothing to
            export, or when the export raised (the error is logged).
        """
        # Function-scope import, kept exactly where the original code had it.
        from src.core.feature_flags import aiops_flags

        if not aiops_flags.AIOPS_P3_FINETUNE_EXPORT:
            logger.debug("finetune_exporter_skipped_feature_flag")
            return None, 0
        try:
            return await self._run_export()
        except Exception as e:
            # Top-level boundary: a failed export must not break the caller.
            logger.error("finetune_exporter_error", error=str(e))
            return None, 0

    async def _run_export(self) -> tuple[str | None, int]:
        """
        Query evidence, build rows, write the JSONL file and record the export.

        Returns:
            (output_file_path, row_count), or (None, 0) when no evidence
            matched or no evidence produced a usable training pair.
        """
        cutoff = now_taipei() - timedelta(days=EXPORT_LOOKBACK_DAYS)
        rows: list[dict] = []
        # Ids of the evidence rows that actually made it into the file, so the
        # audit record agrees with record_count and with the file contents.
        exported_ids: list[str] = []

        async with get_db_context() as db:
            # 1. Successfully verified evidence that has a summary and falls
            #    inside the lookback window. ORDER BY makes the LIMIT
            #    deterministic: the newest BATCH_LIMIT rows are taken.
            stmt = (
                select(IncidentEvidence)
                .where(
                    and_(
                        IncidentEvidence.verification_result == "success",
                        IncidentEvidence.evidence_summary.isnot(None),
                        IncidentEvidence.collected_at >= cutoff,
                    )
                )
                .order_by(IncidentEvidence.collected_at.desc())
                .limit(BATCH_LIMIT)
            )
            result = await db.execute(stmt)
            evidences = result.scalars().all()
            if not evidences:
                logger.info("finetune_exporter_no_data", lookback_days=EXPORT_LOOKBACK_DAYS)
                return None, 0

            # 2. Pair every evidence row with its coordinator session and its
            #    latest repair execution; rows without any output are skipped.
            for ev in evidences:
                row = await self._build_row(db, ev)
                if row:
                    rows.append(row)
                    exported_ids.append(str(ev.id))

        if not rows:
            return None, 0

        # 3. Write the JSONL file.
        output_path = await self._write_jsonl(rows)
        logger.info(
            "finetune_export_done",
            row_count=len(rows),
            path=output_path,
        )

        # 4. Record the export in finetune_exports (ADR-090-D). Best effort:
        #    a failure here is logged and does not fail the export.
        await self._record_export(output_path, exported_ids, len(rows))
        return output_path, len(rows)

    async def _record_export(
        self,
        output_path: str,
        source_ids: list[str],
        record_count: int,
    ) -> None:
        """
        Insert one row describing the export into the finetune_exports table.

        Args:
            output_path: Path of the JSONL file that was written.
            source_ids: Ids (as strings) of the evidence rows in the file.
            record_count: Number of rows written to the file.

        Any exception is caught and logged as a warning; nothing is raised.
        """
        try:
            import hashlib

            size_bytes = None
            checksum = None
            export_file = Path(output_path) if output_path else None
            if export_file is not None and export_file.exists():
                size_bytes = export_file.stat().st_size
                # Hash in chunks so the whole file is never held in memory.
                digest = hashlib.sha256()
                with export_file.open("rb") as fh:
                    for chunk in iter(lambda: fh.read(1 << 20), b""):
                        digest.update(chunk)
                checksum = digest.hexdigest()

            # NOTE(review): no explicit commit is issued here; presumably
            # get_db_context() commits on exit — confirm.
            async with get_db_context() as audit_db:
                await audit_db.execute(
                    sql_text("""
                        INSERT INTO finetune_exports (
                            export_type, source_table, source_ids,
                            file_path, record_count, size_bytes, checksum_sha256,
                            metadata
                        ) VALUES (
                            'evidence_snapshot', 'incident_evidence', :ids,
                            :fp, :rc, :sz, :cs, CAST(:md AS jsonb)
                        )
                    """),
                    {
                        "ids": source_ids,
                        "fp": output_path,
                        "rc": record_count,
                        "sz": size_bytes,
                        "cs": checksum,
                        "md": json.dumps({"lookback_days": EXPORT_LOOKBACK_DAYS}),
                    },
                )
        except Exception as db_err:
            logger.warning("finetune_exports_db_write_failed", error=str(db_err))

    async def _build_row(self, db, ev: IncidentEvidence) -> dict | None:
        """
        Build a single training pair for one evidence row.

        Args:
            db: Database session used to run the two lookup queries.
            ev: Evidence row supplying the input text and the metadata.

        Returns:
            A dict with "instruction", "input", "output" and "metadata", or
            None when neither a coordinator decision nor an execution exists.
        """
        # Newest coordinator agent turn of the incident, if any.
        agent_stmt = (
            select(AgentSession)
            .where(
                and_(
                    AgentSession.incident_id == ev.incident_id,
                    AgentSession.agent_role == "coordinator",
                )
            )
            .order_by(AgentSession.created_at.desc())
            .limit(1)
        )
        coordinator = (await db.execute(agent_stmt)).scalar_one_or_none()

        # Newest auto-repair execution of the incident, if any.
        exec_stmt = (
            select(AutoRepairExecution)
            .where(AutoRepairExecution.incident_id == ev.incident_id)
            .order_by(AutoRepairExecution.created_at.desc())
            .limit(1)
        )
        execution = (await db.execute(exec_stmt)).scalar_one_or_none()

        output_text = self._compose_output(coordinator, execution)
        if output_text is None:
            return None  # no output data, skip this evidence row

        # No alert name is looked up here; the incident id is used in its place.
        alertname = ev.incident_id
        return {
            "instruction": INSTRUCTION,
            "input": (ev.evidence_summary or "").strip(),
            "output": output_text,
            "metadata": {
                "incident_id": ev.incident_id,
                "alertname": alertname,
                "verification_result": ev.verification_result,
                "collected_at": ev.collected_at.isoformat() if ev.collected_at else None,
                "schema_version": ev.schema_version,
                "matched_playbook_id": ev.matched_playbook_id,
            },
        }

    @staticmethod
    def _compose_output(coordinator, execution) -> str | None:
        """
        Render the "output" text of a training pair.

        Args:
            coordinator: Object exposing ``output_json``, or None.
            execution: Object exposing ``playbook_name``, ``executed_steps``
                and ``success``, or None.

        Returns:
            The sections joined by blank lines, or None when there is neither
            a dict-shaped coordinator output nor an execution.
        """
        parts: list[str] = []

        if coordinator and coordinator.output_json:
            coord_out = coordinator.output_json
            if isinstance(coord_out, dict):
                # Prefer "reasoning", then "decision"; otherwise fall back to
                # the first 500 characters of the dict's string form.
                reasoning = (
                    coord_out.get("reasoning")
                    or coord_out.get("decision")
                    or str(coord_out)[:500]
                )
                parts.append(f"[AI 決策]\n{reasoning}")

        if execution:
            action_desc = execution.playbook_name or "未知"
            steps = execution.executed_steps
            # Only the first executed step is used to describe the action.
            if isinstance(steps, list) and steps:
                first = steps[0]
                if isinstance(first, dict):
                    action_desc = first.get("action") or first.get("step") or action_desc
            parts.append(f"[執行動作]\n{action_desc}")
            parts.append(f"[執行結果] {'成功' if execution.success else '失敗'}")

        return "\n\n".join(parts) if parts else None

    async def _write_jsonl(self, rows: list[dict]) -> str:
        """
        Write the rows as JSON Lines under FINETUNE_EXPORT_PATH.

        The file is named ``finetune-<YYYYmmdd-HHMMSS>.jsonl``; the directory
        is created when missing. The timestamp has one-second resolution and
        the file is opened in "w" mode, so a second export within the same
        second would replace the first file.

        Returns:
            The path of the written file as a string.
        """
        export_dir = Path(FINETUNE_EXPORT_PATH)
        export_dir.mkdir(parents=True, exist_ok=True)
        ts = now_taipei().strftime("%Y%m%d-%H%M%S")
        output_path = export_dir / f"finetune-{ts}.jsonl"
        with open(output_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps the non-ASCII text readable in the file.
            f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
        return str(output_path)
# ─────────────────────────────────────────────────────────────────────────────
# Loop (掛載到 main.py)
# ─────────────────────────────────────────────────────────────────────────────
async def run_finetune_export_loop() -> None:
    """
    Run the fine-tune export forever, once every WEEKLY_INTERVAL_SEC seconds.

    The first export runs immediately when the coroutine starts. Any exception
    raised by a run is logged and the loop keeps going. Per the original note,
    this is meant to be scheduled from main.py startup via
    asyncio.create_task.
    """
    job = FineTuneExporter()
    while True:
        try:
            outcome = await job.export()
            exported_file, exported_rows = outcome
            # Only log a tick when something was actually exported.
            if exported_rows > 0:
                logger.info("finetune_export_loop_tick", rows=exported_rows, path=exported_file)
        except Exception as exc:
            logger.error("finetune_export_loop_error", error=str(exc))
        await asyncio.sleep(WEEKLY_INTERVAL_SEC)