Files
awoooi/apps/api/src/services/km_conversion_service.py
OG T 587d745a50
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 28s
fix(km): 修補 KMConversionService 兩個屬性錯誤
- incident.title → getattr(incident, 'title', None) or alertname
  (Incident model 無 title 欄位)
- km_entry.entry_id → km_entry.id
  (KnowledgeEntry model 主鍵為 id 非 entry_id)
- 補跑後 KM entries 714 → 821 (+107), incidents.vectorized 全部歸零

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 17:52:57 +08:00

338 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
KM Conversion Service — ADR-071-G
==================================
Incident RESOLVED 後自動轉換為 KnowledgeEntry + Playbook 草稿
設計原則:
- 非同步觸發,失敗不影響主流程
- 根據 notification_type 決定 KM 品質等級
- 自動向量化(embedding)
- 寫入 AlertOperationLog KM_CONVERTED 事件
建立時間: 2026-04-11 (台北時區)
建立者: Claude Sonnet 4.6 — ADR-071-G
leWOOOgo 積木化:
- KMConversionService → KnowledgeService + LearningService + AlertOperationLogRepository
- 不直接存取 DB,透過 Repository 層
"""
import structlog
from src.models.knowledge import (
EntrySource,
EntryStatus,
EntryType,
KnowledgeEntryCreate,
)
from src.repositories.alert_operation_log_repository import (
ALERT_EVENT_TYPES,
get_alert_operation_log_repository,
)
from src.services.knowledge_service import get_knowledge_service
logger = structlog.get_logger(__name__)
# ADR-071 新 event_type 已在 migrations/adr071_notification_lifecycle.sql 的 DB enum 中定義
# 不在此做 runtime ALERT_EVENT_TYPES.update():避免模組副作用污染全局狀態 + 測試隔離失敗
# 若 repository 層有 ALERT_EVENT_TYPES 白名單 validation,
# 請在 alert_operation_log_repository.py 的 ALERT_EVENT_TYPES 靜態集合中加入以下值:
# "KM_CONVERTED", "NOTIFICATION_CLASSIFIED", "MANUAL_FIX_RECORDED",
# "PLAYBOOK_DRAFT_CREATED", "STATE_GUARD_BLOCKED"
# Notification type → KM quality level (the EntryStatus given to the created entry).
# A value of None means incidents of that type are not converted into a KM entry.
_TYPE_TO_STATUS = {
"TYPE-2": EntryStatus.APPROVED, # auto-remediation succeeded; highest quality
"TYPE-3": EntryStatus.REVIEW, # executed after human review
"TYPE-4": EntryStatus.DRAFT, # AI could not decide; draft
"TYPE-4D": EntryStatus.DRAFT, # config drift; draft
"TYPE-1": None, # informational only; not converted to KM
}
# Notification type → EntrySource recorded on the created KM entry.
# "TYPE-1" has no entry here; lookups for unmapped types are expected to
# supply their own default.
_TYPE_TO_SOURCE = {
"TYPE-2": EntrySource.AI_EXTRACTED,
"TYPE-3": EntrySource.AI_EXTRACTED,
"TYPE-4": EntrySource.HUMAN,
"TYPE-4D": EntrySource.AI_EXTRACTED,
}
class KMConversionService:
"""
Incident → KM 自動轉換服務
觸發時機:
1. Incident 狀態變為 RESOLVED主要路徑
2. 使用者點擊 [手動修復後記錄] 後ADR-071-H
3. 每日 03:00 cron 補轉換vectorized=False + RESOLVED
"""
def __init__(self) -> None:
self._knowledge_svc = get_knowledge_service()
self._op_log_repo = get_alert_operation_log_repository()
async def convert(self, incident) -> dict | None:
"""
將 Incident 轉換為 KnowledgeEntry
Args:
incident: Incident ORM 物件(呼叫前確保 signals 已 eager load
Returns:
dict with km_entry_id and quality_level, or None if skipped
"""
try:
return await self._convert_inner(incident)
except Exception as e:
logger.error(
"km_conversion_error",
incident_id=getattr(incident, "incident_id", "unknown"),
error=str(e),
)
raise
async def _convert_inner(self, incident) -> dict | None:
notification_type = getattr(incident, "notification_type", None) or "TYPE-3"
# TYPE-1 不轉 KM
target_status = _TYPE_TO_STATUS.get(notification_type)
if target_status is None:
logger.debug(
"km_conversion_skipped",
incident_id=incident.incident_id,
reason="TYPE-1 純資訊,不轉 KM",
)
return None
entry_source = _TYPE_TO_SOURCE.get(notification_type, EntrySource.AI_EXTRACTED)
alert_category = getattr(incident, "alert_category", None) or "unknown"
# 提取 label 資訊
labels = incident.signals[0].labels if incident.signals else {}
alertname = labels.get("alertname", "unknown")
severity = labels.get("severity", "unknown")
affected_services = ", ".join(incident.affected_services or ["unknown"])
# 計算修復耗時
resolution_time = ""
if incident.resolved_at and incident.created_at:
try:
delta = incident.resolved_at - incident.created_at
resolution_time = f"{int(delta.total_seconds())}s"
except Exception:
pass
# 建立 KM 內容(標準格式)
content = self._build_content(
incident=incident,
alertname=alertname,
affected_services=affected_services,
severity=severity,
resolution_time=resolution_time,
)
_inc_title = getattr(incident, "title", None) or alertname
title = f"{alertname} @ {affected_services[:40]}{_inc_title[:60]}"
tags = [alertname, affected_services, severity, notification_type]
if alert_category != "unknown":
tags.append(alert_category)
km_entry = await self._knowledge_svc.create_entry(
KnowledgeEntryCreate(
title=title[:255],
content=content,
entry_type=EntryType.INCIDENT_CASE,
category=alert_category,
tags=[t for t in tags if t],
source=entry_source,
status=target_status,
related_incident_id=incident.incident_id,
)
)
# 寫入操作日誌
try:
await self._op_log_repo.append(
event_type="KM_CONVERTED",
incident_id=incident.incident_id,
actor="km_conversion_service",
action_detail=f"KM entry created: {km_entry.id}",
success=True,
context={
"km_entry_id": km_entry.id,
"quality_level": target_status.value,
"notification_type": notification_type,
},
)
except Exception as _e:
logger.warning("km_op_log_failed", incident_id=incident.incident_id, error=str(_e))
# BUG-004 修復 2026-04-11: KM entry 建立後knowledge_service 背景觸發 embedding
# 但 incidents.vectorized 沒有被設為 True → 飛輪閉環ADR-068學習效果歸零。
# 等 embedding 背景任務啟動後(短延遲)更新 incidents.vectorized = True。
# 注意embedding 為背景 asyncio task此處標記 vectorized=True 代表「已觸發向量化」
# 真正完成以 knowledge_embedding_saved log 為準,但 vectorized flag 用於篩選補轉換
try:
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from sqlalchemy import update as _sa_update
async with get_db_context() as _db:
await _db.execute(
_sa_update(IncidentRecord)
.where(IncidentRecord.incident_id == incident.incident_id)
.values(vectorized=True)
)
await _db.commit()
logger.info(
"km_incident_vectorized_flagged",
incident_id=incident.incident_id,
km_entry_id=km_entry.id,
)
except Exception as _ve:
logger.warning(
"km_vectorized_flag_failed",
incident_id=incident.incident_id,
error=str(_ve),
)
# C2 修復 2026-04-11: DB 更新後,同步更新 Redis Working Memory 中的 vectorized 欄位
# 審計查 Redis Incident 物件,若不同步則審計仍顯示 vectorized=False
# Key 格式: incident:{incident_id}(與 incident_service.save_to_working_memory 一致)
try:
import json as _json
from src.core.redis_client import get_redis
_redis = get_redis()
_redis_key = f"incident:{incident.incident_id}"
_raw = await _redis.get(_redis_key)
if _raw:
_data = _json.loads(_raw)
if not _data.get("vectorized"):
_data["vectorized"] = True
_ttl = await _redis.ttl(_redis_key)
_ex = _ttl if _ttl and _ttl > 0 else 604800
await _redis.set(_redis_key, _json.dumps(_data), ex=_ex)
logger.info(
"km_incident_vectorized_redis_synced",
incident_id=incident.incident_id,
)
except Exception as _re:
logger.debug(
"km_vectorized_redis_sync_failed",
incident_id=incident.incident_id,
error=str(_re),
)
logger.info(
"km_converted",
incident_id=incident.incident_id,
km_entry_id=km_entry.id,
quality_level=target_status.value,
notification_type=notification_type,
)
return {
"km_entry_id": km_entry.id,
"quality_level": target_status.value,
}
def _build_content(
self,
incident,
alertname: str,
affected_services: str,
severity: str,
resolution_time: str,
) -> str:
"""
建立 KM 條目內容(標準格式)
"""
created_at_str = str(incident.created_at) if incident.created_at else "未知"
resolved_at_str = str(incident.resolved_at) if incident.resolved_at else "未知"
context_summary = ""
if incident.context_bundle:
context_summary = str(incident.context_bundle.get("summary", ""))
# 決策鏈資訊
decision_chain = getattr(incident, "decision_chain", None)
root_cause = ""
action_type = ""
action_command = ""
if decision_chain and isinstance(decision_chain, dict):
root_cause = decision_chain.get("root_cause", "")
action_type = decision_chain.get("action_type", "")
action_command = decision_chain.get("action", "")
# 指標快照(若有)— ADR-071-J: 使用精簡 delta 格式 (2026-04-11 Claude Sonnet 4.6)
metrics_section = ""
if incident.metrics_before or incident.metrics_after:
mb = incident.metrics_before or {}
ma = incident.metrics_after or {}
delta_parts = []
for key, label in (("cpu_pct", "CPU"), ("mem_pct", "Mem"), ("disk_pct", "Disk")):
if key in mb and key in ma:
delta_parts.append(f"{label} {mb[key]:.0f}%→{ma[key]:.0f}%")
elif key in ma:
delta_parts.append(f"{label} 後={ma[key]:.0f}%")
if "restart_count" in mb and "restart_count" in ma:
delta_parts.append(f"重啟 {mb['restart_count']}{ma['restart_count']}")
delta_str = " | ".join(delta_parts) if delta_parts else "(無量化數據)"
# K8s 狀態(若有)
k8s_state = getattr(incident, "k8s_state_after", None)
k8s_line = f"\n- K8s 狀態: {k8s_state}" if k8s_state else ""
metrics_section = (
f"\n## 效果驗證\n"
f"- 指標變化: {delta_str}\n"
f"- 恢復耗時: {resolution_time}\n"
+ k8s_line
+ "\n"
)
# 驗證結果(若有)
verify_section = ""
if incident.verification_result:
verify_section = f"- 驗證方式: {incident.verification_result}\n"
manual_section = ""
if incident.manual_fix_steps:
manual_section = (
f"\n## 手動修復步驟\n"
f"- 執行者: {incident.manual_fix_by or '未知'}\n"
f"```\n{incident.manual_fix_steps}\n```\n"
)
return (
f"## 症狀\n"
f"- 告警: {alertname}\n"
f"- 服務: {affected_services}\n"
f"- 嚴重度: {severity}\n"
f"- 觸發時間: {created_at_str}\n"
f"- 解決時間: {resolved_at_str}\n"
+ (f"- 即時情境: {context_summary}\n" if context_summary else "")
+ f"\n## 根因分析\n{root_cause or getattr(incident, 'title', None) or '未知'}\n"
+ (
f"\n## 執行動作\n"
f"- 類型: {action_type}\n"
f"- 指令: {action_command}\n"
if action_type or action_command else ""
)
+ metrics_section
+ verify_section
+ manual_section
)
# Singleton (模組層級)
_km_conversion_service: KMConversionService | None = None
def get_km_conversion_service() -> KMConversionService:
    """Return the module-level KMConversionService, creating it on first use."""
    global _km_conversion_service
    instance = _km_conversion_service
    if instance is None:
        # First call: build the service and cache it for later callers.
        instance = KMConversionService()
        _km_conversion_service = instance
    return instance