fix(auto-repair): 首席架構師 Review — 4 Critical/Important 修復
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 7m2s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 7m2s
Critical #1: KM write task 移出 try/except
- _trigger_learning 的 KM 寫入原在 try 內,learning 失敗時不寫 KM
- 移至 except 後確保成功/失敗都寫入
- 移除冗餘 import asyncio(已在頂層 import)
- Minor: approval.incident_id or None 防空字串

Important #2: migration 加 PRIMARY KEY
- playbook_id 從 UNIQUE 升為 PRIMARY KEY
- prod DB 已執行 ALTER TABLE ADD PRIMARY KEY

Important #3: s.sequence→s.step_number, s.description→s.command
- embed_playbook() 使用不存在的欄位名,RAG 向量索引靜默失敗
- RepairStep 正確欄位: step_number, command

Important #1: PlaybookService._get_rag_service 不再於 Service 層快取
- 改為每次呼叫工廠 get_playbook_rag_service()
- 避免舊實例繞過工廠的 is_closed 重建邏輯

冷啟動修復 (首席架構師建議B+C):
- _trigger_playbook_extraction 執行成功後自動設定 execution_success=True, effectiveness_score=4, status=RESOLVED
- skip 路徑 logger.debug → logger.info 提升可觀測性

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,7 +6,8 @@
|
||||
|
||||
CREATE TABLE IF NOT EXISTS playbooks (
|
||||
-- 識別
|
||||
playbook_id VARCHAR(32) UNIQUE NOT NULL,
|
||||
-- 2026-04-04 ogt: 首席架構師 Review — 加 PRIMARY KEY,移除多餘 UNIQUE
|
||||
playbook_id VARCHAR(32) PRIMARY KEY,
|
||||
|
||||
-- 元資料
|
||||
name VARCHAR(256) NOT NULL,
|
||||
|
||||
@@ -239,13 +239,6 @@ class ApprovalExecutionService:
|
||||
result=result,
|
||||
)
|
||||
|
||||
# 2026-04-04 ogt: 執行結果沉澱到 KM
|
||||
# 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM
|
||||
import asyncio
|
||||
asyncio.create_task(
|
||||
self._write_execution_result_to_km(approval, success, error_message)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# 學習失敗不影響主流程
|
||||
logger.warning(
|
||||
@@ -254,6 +247,12 @@ class ApprovalExecutionService:
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
# 2026-04-04 ogt: 執行結果沉澱到 KM — 移出 try/except 確保 learning 失敗也寫入
|
||||
# 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM
|
||||
asyncio.create_task(
|
||||
self._write_execution_result_to_km(approval, success, error_message)
|
||||
)
|
||||
|
||||
async def _write_execution_result_to_km(
|
||||
self,
|
||||
approval: "ApprovalRequest",
|
||||
@@ -288,7 +287,7 @@ class ApprovalExecutionService:
|
||||
category="execution_result",
|
||||
tags=["execution", "auto_repair" if success else "execution_failed"],
|
||||
source=EntrySource.AI_EXTRACTED,
|
||||
related_incident_id=approval.incident_id,
|
||||
related_incident_id=approval.incident_id or None,
|
||||
created_by="approval_execution",
|
||||
)
|
||||
await get_knowledge_service().create_entry(entry_data)
|
||||
@@ -395,8 +394,8 @@ class ApprovalExecutionService:
|
||||
# 暫時從 description 或 action 解析
|
||||
incident_id = self._extract_incident_id_from_approval(approval)
|
||||
if not incident_id:
|
||||
logger.debug(
|
||||
"playbook_extraction_skip",
|
||||
logger.info(
|
||||
"playbook_extraction_skipped",
|
||||
approval_id=str(approval.id),
|
||||
reason="No incident_id found",
|
||||
)
|
||||
@@ -409,43 +408,46 @@ class ApprovalExecutionService:
|
||||
incident = await incident_service.get_incident(incident_id)
|
||||
|
||||
if not incident:
|
||||
logger.debug(
|
||||
"playbook_extraction_skip",
|
||||
logger.info(
|
||||
"playbook_extraction_skipped",
|
||||
approval_id=str(approval.id),
|
||||
incident_id=incident_id,
|
||||
reason="Incident not found",
|
||||
)
|
||||
return
|
||||
|
||||
# 3. 檢查 Incident 狀態
|
||||
from src.models.incident import IncidentStatus
|
||||
# 3. 執行成功後自動設定 outcome (冷啟動關鍵)
|
||||
# 2026-04-04 ogt: 首席架構師 Review — 補上 execution_success + effectiveness_score
|
||||
# 確保 Playbook 萃取前置條件能成立,不再依賴人工填分
|
||||
from src.models.incident import IncidentOutcome, IncidentStatus
|
||||
from src.utils.timezone import now_taipei
|
||||
|
||||
if incident.outcome is None:
|
||||
incident.outcome = IncidentOutcome()
|
||||
if not incident.outcome.execution_success:
|
||||
incident.outcome.execution_success = True
|
||||
if incident.outcome.effectiveness_score is None or incident.outcome.effectiveness_score < 4:
|
||||
incident.outcome.effectiveness_score = 4 # 系統判斷:K8s 執行成功 = 有效
|
||||
if incident.status not in [IncidentStatus.RESOLVED, IncidentStatus.CLOSED]:
|
||||
logger.debug(
|
||||
"playbook_extraction_skip",
|
||||
approval_id=str(approval.id),
|
||||
incident_id=incident_id,
|
||||
incident_status=incident.status.value,
|
||||
reason="Incident not resolved",
|
||||
)
|
||||
return
|
||||
incident.status = IncidentStatus.RESOLVED
|
||||
incident.resolved_at = now_taipei()
|
||||
|
||||
# 4. 檢查 effectiveness_score
|
||||
effectiveness = incident.outcome.effectiveness_score if incident.outcome else 0
|
||||
if effectiveness < 4:
|
||||
logger.debug(
|
||||
"playbook_extraction_skip",
|
||||
approval_id=str(approval.id),
|
||||
incident_id=incident_id,
|
||||
effectiveness=effectiveness,
|
||||
reason="Low effectiveness score",
|
||||
)
|
||||
return
|
||||
# 回存 Incident(fire-and-forget 路徑,失敗不影響主流程)
|
||||
await incident_service.save_to_working_memory(incident)
|
||||
|
||||
# 5. 觸發萃取
|
||||
logger.info(
|
||||
"playbook_extraction_incident_updated",
|
||||
approval_id=str(approval.id),
|
||||
incident_id=incident_id,
|
||||
effectiveness_score=incident.outcome.effectiveness_score,
|
||||
status=incident.status.value,
|
||||
)
|
||||
|
||||
# 4. 觸發萃取(effectiveness 已保證 >= 4)
|
||||
from src.services.playbook_service import get_playbook_service
|
||||
|
||||
playbook_service = get_playbook_service()
|
||||
effectiveness = incident.outcome.effectiveness_score or 4
|
||||
playbook = await playbook_service.extract_from_incident(
|
||||
incident=incident,
|
||||
auto_approve=effectiveness >= 5, # 滿分自動核准
|
||||
|
||||
@@ -239,9 +239,10 @@ class PlaybookRAGService:
|
||||
text_parts.append(f"描述: {playbook.description}")
|
||||
|
||||
# 修復步驟
|
||||
# 2026-04-04 ogt: 修正欄位名稱 s.sequence→s.step_number, s.description→s.command
|
||||
if playbook.repair_steps:
|
||||
steps_text = "; ".join(
|
||||
f"{s.sequence}. {s.description}"
|
||||
f"{s.step_number}. {s.command}"
|
||||
for s in playbook.repair_steps[:5] # 最多 5 步
|
||||
)
|
||||
text_parts.append(f"步驟: {steps_text}")
|
||||
|
||||
@@ -87,14 +87,16 @@ class PlaybookService:
|
||||
|
||||
def __init__(self, repository: IPlaybookRepository | None = None):
|
||||
self._repository = repository or get_playbook_repository()
|
||||
# 2026-03-27 ogt: RAG Service 改為 lazy initialization (async factory)
|
||||
self._rag_service = None
|
||||
|
||||
async def _get_rag_service(self):
|
||||
"""Lazy initialization for RAG service (2026-03-27 async factory)"""
|
||||
if self._rag_service is None:
|
||||
self._rag_service = await get_playbook_rag_service()
|
||||
return self._rag_service
|
||||
"""
|
||||
取得 RAG Service — 每次走工廠,不在 Service 層快取
|
||||
|
||||
2026-04-04 ogt: 首席架構師 Review — 移除 Service 層快取
|
||||
原因: PlaybookService 快取舊實例會繞過工廠的 is_closed 重建邏輯
|
||||
由 get_playbook_rag_service() 工廠統一管理生命週期
|
||||
"""
|
||||
return await get_playbook_rag_service()
|
||||
|
||||
# === Core Operations ===
|
||||
|
||||
|
||||
Reference in New Issue
Block a user