feat(auto-repair): 打通自動修復閉環 (ADR-058)
問題: 告警鏈路從未呼叫 auto_repair_service,機制完全死路

修正:
1. webhooks.py: alertmanager_webhook 建立 Incident 後觸發 _try_auto_repair_background
2. playbook.py: is_high_quality 門檻降低 (冷啟動期)
   - success_count: 10 → 3
   - success_rate: 95% → 80%
3. tests: test_evaluate_not_high_quality 更新為新門檻

流程: Alertmanager → API → Incident → evaluate → P2以下+高品質Playbook → 自動執行 → Telegram通知

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -53,6 +53,7 @@ from src.services.approval_db import get_approval_service
|
||||
|
||||
# Phase 17 P0: Service 層 (消除 Router 直接存取 Redis)
|
||||
from src.services.incident_service import get_incident_service
|
||||
from src.services.auto_repair_service import AutoRepairService
|
||||
|
||||
# Phase 5: OpenClaw AI Engine
|
||||
from src.services.openclaw import get_openclaw
|
||||
@@ -145,6 +146,89 @@ async def create_incident_for_approval(
|
||||
return incident.incident_id
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 2026-04-05 ogt: 自動修復背景任務 (ADR-058 閉環)
|
||||
# =============================================================================
|
||||
|
||||
async def _try_auto_repair_background(
    incident_id: str,
    approval_id: str,
    alert_type: str,
    target_resource: str,
    namespace: str,
) -> None:
    """Evaluate an incident in the background and run auto-repair if allowed.

    Flow:
        1. Reload the incident from working memory.
        2. Call ``evaluate_auto_repair()``; per the original design note this
           checks for P2-or-lower severity, a high-quality playbook, and low
           risk.
        3. If repair is allowed, call ``execute_auto_repair()``.
        4. If repair is not allowed, return silently; the incident is left
           for manual approval.
        5. When a result is returned, send a summary through the Telegram
           gateway (best effort).

    Args:
        incident_id: ID of the incident to reload and evaluate.
        approval_id: ID of the paired approval; used only for log correlation.
        alert_type: Alert type, shown in the Telegram summary.
        target_resource: Affected resource name, shown in the Telegram summary.
        namespace: Namespace of the resource, shown in the Telegram summary.

    Returns:
        None. Every exception is caught and logged so that a failure here
        never propagates out of the background task.
    """
    try:
        incident_service = get_incident_service()
        incident = await incident_service.get_from_working_memory(incident_id)
        if not incident:
            logger.warning("auto_repair_incident_not_found", incident_id=incident_id)
            return

        repair_service = AutoRepairService()
        decision = await repair_service.evaluate_auto_repair(incident)

        logger.info(
            "auto_repair_decision",
            incident_id=incident_id,
            approval_id=approval_id,
            can_auto_repair=decision.can_auto_repair,
            reason=decision.reason,
            blocked_by=decision.blocked_by,
        )

        if not decision.can_auto_repair:
            return

        # A positive decision without a playbook cannot be executed. Stop
        # here with an explicit warning instead of handing None to
        # execute_auto_repair() and relying on the generic handler below.
        playbook = decision.playbook
        if playbook is None:
            logger.warning(
                "auto_repair_playbook_missing",
                incident_id=incident_id,
                approval_id=approval_id,
            )
            return

        # Execute the auto-repair.
        logger.info(
            "auto_repair_executing",
            incident_id=incident_id,
            playbook_id=playbook.playbook_id,
        )
        result = await repair_service.execute_auto_repair(
            incident=incident,
            playbook=playbook,
        )

        logger.info(
            "auto_repair_result",
            incident_id=incident_id,
            success=result.success if result else False,
        )

        # Notify Telegram of the outcome. Notification is best effort: a
        # failure is logged and must not mask the repair result.
        if result:
            try:
                telegram = get_telegram_gateway()
                status_icon = "✅" if result.success else "❌"
                # Only the first three executed steps are listed to keep the
                # message short; "-" is shown when there are none.
                steps_summary = "\n".join(f" • {s}" for s in result.executed_steps[:3]) or "-"
                # NOTE(review): the message uses Markdown-style markers; if the
                # gateway sends with a Markdown parse mode, unescaped special
                # characters in the interpolated values may need escaping —
                # TODO confirm against the gateway implementation.
                await telegram.send_message(
                    f"{status_icon} *自動修復{'完成' if result.success else '失敗'}*\n"
                    f"資源: `{target_resource}` ({namespace})\n"
                    f"告警: {alert_type}\n"
                    f"耗時: {result.execution_time_ms}ms\n"
                    f"步驟:\n{steps_summary}"
                )
            except Exception as tg_err:
                logger.warning("auto_repair_telegram_notify_failed", error=str(tg_err))

    except Exception as e:
        # Top-level boundary of a background task: log with traceback and
        # swallow, since there is no caller to handle the error.
        logger.exception(
            "auto_repair_background_failed",
            incident_id=incident_id,
            error=str(e),
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Phase 5: Telegram 背景推送任務 (非阻塞)
|
||||
# =============================================================================
|
||||
@@ -1113,7 +1197,7 @@ async def alertmanager_webhook(
|
||||
# ================================================================
|
||||
# Incident-Approval 同步 (鐵律: 必須同時創建)
|
||||
# ================================================================
|
||||
await create_incident_for_approval(
|
||||
incident_id = await create_incident_for_approval(
|
||||
approval_id=str(approval.id),
|
||||
risk_level=risk_level.value,
|
||||
target_resource=target_resource,
|
||||
@@ -1128,6 +1212,20 @@ async def alertmanager_webhook(
|
||||
primary_responsibility = analysis_result.primary_responsibility or "COLLAB"
|
||||
confidence = analysis_result.confidence
|
||||
|
||||
# ================================================================
|
||||
# 2026-04-05 ogt: 自動修復評估 (ADR-058 閉環)
|
||||
# Incident 建立後立即評估是否可自動修復
|
||||
# P2 以下 + 高品質 Playbook + 低風險 → 背景自動執行
|
||||
# ================================================================
|
||||
background_tasks.add_task(
|
||||
_try_auto_repair_background,
|
||||
incident_id=incident_id,
|
||||
approval_id=str(approval.id),
|
||||
alert_type=alert_type,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
)
|
||||
|
||||
# 推送 Telegram
|
||||
background_tasks.add_task(
|
||||
_push_to_telegram_background,
|
||||
|
||||
@@ -237,13 +237,16 @@ class Playbook(BaseModel):
|
||||
|
||||
條件:
|
||||
- 狀態為 APPROVED
|
||||
- 成功率 >= 95%
|
||||
- 成功次數 >= 10
|
||||
- 成功率 >= 80% (冷啟動期: 原 95%,2026-04-05 ogt 降低以打破零執行惡性循環)
|
||||
- 成功次數 >= 3 (冷啟動期: 原 10,累積足夠後再收緊)
|
||||
|
||||
待成熟後收緊為: success_rate >= 0.95, success_count >= 10
|
||||
"""
|
||||
# 2026-04-05 ogt: 冷啟動調整,打破 zero-execution 惡性循環
|
||||
return (
|
||||
self.status == PlaybookStatus.APPROVED
|
||||
and self.success_rate >= 0.95
|
||||
and self.success_count >= 10
|
||||
and self.success_rate >= 0.80
|
||||
and self.success_count >= 3
|
||||
)
|
||||
|
||||
@property
|
||||
|
||||
@@ -189,7 +189,7 @@ class TestAutoRepairService:
|
||||
affected_services=["test-service"],
|
||||
),
|
||||
repair_steps=[],
|
||||
success_count=5, # < 10
|
||||
success_count=2, # < 3 (冷啟動門檻 2026-04-05)
|
||||
failure_count=0,
|
||||
)
|
||||
mock_playbook_service.add_playbook(playbook)
|
||||
|
||||
Reference in New Issue
Block a user