280 lines
12 KiB
Python
280 lines
12 KiB
Python
"""
|
||
AWOOOI AIOps Phase 3 — Fine-tune JSONL 匯出器
|
||
=============================================
|
||
職責:每週將(EvidenceSnapshot × AgentSession × AutoRepairExecution)
|
||
組合成訓練對(instruction, input, output),匯出為 JSONL 檔案供 LLM 微調。
|
||
|
||
為什麼需要 fine-tune 管線?
|
||
EWMA Playbook trust 只調整「選哪個 Playbook」,
|
||
但 LLM 本身的推理模式(症狀識別、根因分析、行動描述格式)無法從 EWMA 學習。
|
||
Fine-tune 資料管線讓 AI 從真實成功案例中學習「如何推理」,
|
||
不只學習「信任哪個 Playbook」。
|
||
|
||
匯出策略:
|
||
- 查詢 incident_evidence 中 verification_result = 'success' 且有 evidence_summary 的記錄
|
||
- 聯結同 incident_id 的 AgentSession(coordinator turn)取得推理決策
|
||
- 聯結 auto_repair_executions 取得實際執行動作
|
||
- 組合成 JSONL 格式(Alpaca instruction-input-output 格式)
|
||
- 輸出到 FINETUNE_EXPORT_PATH(預設 /tmp/finetune/);MinIO 支援待設定
|
||
|
||
JSONL 格式(每行 1 個 JSON 物件):
|
||
{
|
||
"instruction": "根據 AIOps 情報摘要,分析告警根因並提出修復建議",
|
||
"input": "<evidence_summary>",
|
||
"output": "<coordinator 推理決策 + 執行動作>",
|
||
"metadata": {
|
||
"incident_id": "...",
|
||
"alertname": "...",
|
||
"verification_result": "success",
|
||
"collected_at": "...",
|
||
"schema_version": "v1"
|
||
}
|
||
}
|
||
|
||
設計原則:
|
||
1. 只匯出 verification_result = 'success' 的記錄(負向案例不入訓練集,避免強化錯誤模式)
|
||
2. 每次匯出加時間戳前綴(不覆蓋舊檔)
|
||
3. 每批最多 500 筆(大規模訓練集需分批)
|
||
4. 失敗只記錄 error,不影響主路徑
|
||
|
||
ADR-083 Phase 3: Fine-tune 管線(L7×D4)
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import json
|
||
import os
|
||
from datetime import timedelta
|
||
from pathlib import Path
|
||
|
||
import structlog
|
||
from sqlalchemy import and_, select, text as sql_text
|
||
|
||
from src.db.base import get_db_context
|
||
from src.db.models import AgentSession, AutoRepairExecution, IncidentEvidence
|
||
from src.utils.timezone import now_taipei
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 常數
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Directory the JSONL files are written to; overridable through the
# FINETUNE_EXPORT_PATH environment variable.
FINETUNE_EXPORT_PATH = os.environ.get("FINETUNE_EXPORT_PATH", "/tmp/finetune")

# Upper bound on the number of evidence rows fetched per export run.
BATCH_LIMIT = 500

# Only evidence collected within the last N days is exported.
EXPORT_LOOKBACK_DAYS = 7

# Pause between two iterations of the export loop: one week, in seconds.
WEEKLY_INTERVAL_SEC = 86_400 * 7

# Fixed "instruction" field shared by every exported training record.
INSTRUCTION = "根據以下 AIOps 情報摘要(EvidenceSnapshot),分析告警根因並提出具體的修復建議,說明修復動作的理由。"
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Fine-tune Exporter
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class FineTuneExporter:
    """Exporter that turns verified incidents into a fine-tune JSONL file.

    Each exported line is one JSON object with ``instruction`` / ``input`` /
    ``output`` / ``metadata`` keys. It is built from an ``IncidentEvidence``
    row plus the most recent coordinator ``AgentSession`` and the most recent
    ``AutoRepairExecution`` sharing the same ``incident_id``.

    Usage:
        exporter = FineTuneExporter()
        path, count = await exporter.export()
    """

    async def export(self) -> tuple[str | None, int]:
        """Run one export pass behind the feature flag.

        Any exception raised by the export itself is logged and swallowed so
        that a failing export never propagates to the caller.

        Returns:
            ``(output_file_path, row_count)``. ``(None, 0)`` when the feature
            flag is off, when nothing qualified for export, or when the export
            failed.
        """
        # Function-scope import kept from the original implementation.
        # NOTE(review): presumably avoids an import cycle — confirm.
        from src.core.feature_flags import aiops_flags

        if not aiops_flags.AIOPS_P3_FINETUNE_EXPORT:
            logger.debug("finetune_exporter_skipped_feature_flag")
            return None, 0

        try:
            return await self._run_export()
        except Exception as e:
            logger.error("finetune_exporter_error", error=str(e))
            return None, 0

    async def _run_export(self) -> tuple[str | None, int]:
        """Query evidence, build training rows, write the file, record the export.

        Returns:
            ``(output_file_path, row_count)``, or ``(None, 0)`` when no
            evidence matched or no evidence produced a usable row.
        """
        cutoff = now_taipei() - timedelta(days=EXPORT_LOOKBACK_DAYS)

        rows: list[dict] = []
        # Ids of the evidence rows that actually made it into the file. They
        # are captured as plain strings while the session is open so nothing
        # below touches ORM attributes after the session has been closed.
        exported_ids: list[str] = []

        async with get_db_context() as db:
            # 1. Successfully verified evidence that carries a summary.
            #    The explicit ordering makes the LIMIT deterministic: when more
            #    than BATCH_LIMIT rows qualify, the newest ones are kept.
            stmt = (
                select(IncidentEvidence)
                .where(
                    and_(
                        IncidentEvidence.verification_result == "success",
                        IncidentEvidence.evidence_summary.isnot(None),
                        IncidentEvidence.collected_at >= cutoff,
                    )
                )
                .order_by(IncidentEvidence.collected_at.desc())
                .limit(BATCH_LIMIT)
            )
            result = await db.execute(stmt)
            evidences = result.scalars().all()

            if not evidences:
                logger.info("finetune_exporter_no_data", lookback_days=EXPORT_LOOKBACK_DAYS)
                return None, 0

            # 2. Attach the coordinator session and latest execution to each
            #    evidence row; rows without usable content are skipped.
            for ev in evidences:
                row = await self._build_row(db, ev)
                if row:
                    rows.append(row)
                    exported_ids.append(str(ev.id))

        if not rows:
            return None, 0

        # 3. Write the JSONL file.
        output_path = await self._write_jsonl(rows)
        logger.info(
            "finetune_export_done",
            row_count=len(rows),
            path=output_path,
        )

        # ADR-090-D (2026-04-18): record the export in the finetune_exports
        # table (data source for the MASTER §7.1 #3 KPI). Best effort only.
        await self._record_export(output_path, exported_ids, len(rows))

        return output_path, len(rows)

    async def _record_export(
        self,
        output_path: str,
        source_ids: list[str],
        record_count: int,
    ) -> None:
        """Insert one audit row describing the export into ``finetune_exports``.

        Failures are logged as a warning and never raised: the JSONL file has
        already been written at this point.

        Args:
            output_path: Path of the JSONL file that was written.
            source_ids: Ids of the evidence rows contained in the file.
            record_count: Number of JSONL records in the file.
        """
        try:
            size_bytes, checksum = self._file_stats(output_path)
            # NOTE(review): no explicit commit here, as in the original; this
            # relies on get_db_context() committing on exit — confirm.
            async with get_db_context() as audit_db:
                await audit_db.execute(
                    sql_text("""
                        INSERT INTO finetune_exports (
                            export_type, source_table, source_ids,
                            file_path, record_count, size_bytes, checksum_sha256,
                            metadata
                        ) VALUES (
                            'evidence_snapshot', 'incident_evidence', :ids,
                            :fp, :rc, :sz, :cs, CAST(:md AS jsonb)
                        )
                    """),
                    {
                        "ids": source_ids,
                        "fp": output_path,
                        "rc": record_count,
                        "sz": size_bytes,
                        "cs": checksum,
                        "md": json.dumps({"lookback_days": EXPORT_LOOKBACK_DAYS}),
                    },
                )
        except Exception as _db_e:
            logger.warning("finetune_exports_db_write_failed", error=str(_db_e))

    @staticmethod
    def _file_stats(path: str | None) -> tuple[int | None, str | None]:
        """Return ``(size_in_bytes, sha256_hex)`` for *path*.

        Returns ``(None, None)`` when *path* is empty or does not exist.
        The file is hashed in chunks so it is never loaded into memory whole.
        """
        import hashlib

        if not path or not os.path.exists(path):
            return None, None

        digest = hashlib.sha256()
        with open(path, "rb") as fh:
            for chunk in iter(lambda: fh.read(65536), b""):
                digest.update(chunk)
        return os.path.getsize(path), digest.hexdigest()

    @staticmethod
    def _compose_output(coordinator, execution) -> list[str]:
        """Build the text sections that make up a record's ``output`` field.

        Args:
            coordinator: Object exposing ``output_json``, or ``None``.
            execution: Object exposing ``playbook_name``, ``executed_steps``
                and ``success``, or ``None``.

        Returns:
            A list of sections; empty when neither argument contributes.
        """
        output_parts: list[str] = []

        if coordinator and coordinator.output_json:
            coord_out = coordinator.output_json
            # Only dict payloads are used; anything else contributes nothing.
            if isinstance(coord_out, dict):
                # Prefer "reasoning", then "decision", else a truncated dump.
                reasoning = (
                    coord_out.get("reasoning")
                    or coord_out.get("decision")
                    or str(coord_out)[:500]
                )
                output_parts.append(f"[AI 決策]\n{reasoning}")

        if execution:
            action_desc = execution.playbook_name or "未知"
            steps = execution.executed_steps
            if isinstance(steps, list) and steps:
                first = steps[0]
                if isinstance(first, dict):
                    # The first executed step overrides the playbook name.
                    action_desc = first.get("action") or first.get("step") or action_desc
            output_parts.append(f"[執行動作]\n{action_desc}")
            output_parts.append(
                f"[執行結果] {'成功' if execution.success else '失敗'}"
            )

        return output_parts

    async def _build_row(self, db, ev: IncidentEvidence) -> dict | None:
        """Assemble one training record for *ev*.

        Args:
            db: Open database session used for the two lookup queries.
            ev: Evidence row to export.

        Returns:
            The record dict, or ``None`` when the evidence summary is blank or
            when neither a coordinator decision nor an execution is available.
        """
        # A blank summary would give a training pair with an empty input;
        # skip it before spending two queries on it.
        summary = (ev.evidence_summary or "").strip()
        if not summary:
            return None

        # Latest coordinator agent turn for this incident, if any.
        agent_stmt = (
            select(AgentSession)
            .where(
                and_(
                    AgentSession.incident_id == ev.incident_id,
                    AgentSession.agent_role == "coordinator",
                )
            )
            .order_by(AgentSession.created_at.desc())
            .limit(1)
        )
        agent_result = await db.execute(agent_stmt)
        coordinator = agent_result.scalar_one_or_none()

        # Latest auto-repair execution for this incident, if any.
        exec_stmt = (
            select(AutoRepairExecution)
            .where(AutoRepairExecution.incident_id == ev.incident_id)
            .order_by(AutoRepairExecution.created_at.desc())
            .limit(1)
        )
        exec_result = await db.execute(exec_stmt)
        execution = exec_result.scalar_one_or_none()

        output_parts = self._compose_output(coordinator, execution)
        if not output_parts:
            return None  # nothing to learn from, skip

        # NOTE(review): the real alert name is not looked up; the incident id
        # is stored under "alertname" as a fallback, as in the original.
        alertname = ev.incident_id

        return {
            "instruction": INSTRUCTION,
            "input": summary,
            "output": "\n\n".join(output_parts),
            "metadata": {
                "incident_id": ev.incident_id,
                "alertname": alertname,
                "verification_result": ev.verification_result,
                "collected_at": ev.collected_at.isoformat() if ev.collected_at else None,
                "schema_version": ev.schema_version,
                "matched_playbook_id": ev.matched_playbook_id,
            },
        }

    async def _write_jsonl(self, rows: list[dict]) -> str:
        """Write *rows* as JSONL into ``FINETUNE_EXPORT_PATH``.

        The directory is created if missing. The file name carries a
        second-resolution timestamp (``finetune-YYYYmmdd-HHMMSS.jsonl``).

        Returns:
            The path of the written file, as a string.
        """
        export_dir = Path(FINETUNE_EXPORT_PATH)
        export_dir.mkdir(parents=True, exist_ok=True)

        ts = now_taipei().strftime("%Y%m%d-%H%M%S")
        filename = f"finetune-{ts}.jsonl"
        output_path = export_dir / filename

        with open(output_path, "w", encoding="utf-8") as f:
            for row in rows:
                # ensure_ascii=False keeps the non-ASCII text readable on disk.
                f.write(json.dumps(row, ensure_ascii=False) + "\n")

        return str(output_path)
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Loop(掛載到 main.py)
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
async def run_finetune_export_loop() -> None:
    """Run the fine-tune export forever, once every ``WEEKLY_INTERVAL_SEC``.

    The first export happens immediately; the coroutine then sleeps for the
    interval before each following run. Errors raised by a run are logged and
    do not stop the loop. Intended to be scheduled from main.py at startup
    with ``asyncio.create_task``.
    """
    job = FineTuneExporter()
    while True:
        try:
            outcome = await job.export()
            exported_path, exported_rows = outcome
            # Only report runs that actually produced rows.
            if exported_rows > 0:
                logger.info(
                    "finetune_export_loop_tick",
                    rows=exported_rows,
                    path=exported_path,
                )
        except Exception as exc:
            logger.error("finetune_export_loop_error", error=str(exc))

        await asyncio.sleep(WEEKLY_INTERVAL_SEC)
|