feat(auto-repair): 打通自動修復閉環 (ADR-058)
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Type Sync Check / check-type-sync (push) Failing after 53s

問題: 告警鏈路從未呼叫 auto_repair_service,機制完全死路
修正:
1. webhooks.py: alertmanager_webhook 建立 Incident 後觸發 _try_auto_repair_background
2. playbook.py: is_high_quality 門檻降低 (冷啟動期)
   - success_count: 10 → 3
   - success_rate: 95% → 80%
3. tests: test_evaluate_not_high_quality 更新為新門檻

流程: Alertmanager → API → Incident → evaluate → P2以下+高品質Playbook → 自動執行 → Telegram通知

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-05 22:08:08 +08:00
parent 9629367bc2
commit 5499169996
3 changed files with 107 additions and 6 deletions

View File

@@ -53,6 +53,7 @@ from src.services.approval_db import get_approval_service
# Phase 17 P0: Service 層 (消除 Router 直接存取 Redis)
from src.services.incident_service import get_incident_service
from src.services.auto_repair_service import AutoRepairService
# Phase 5: OpenClaw AI Engine
from src.services.openclaw import get_openclaw
@@ -145,6 +146,89 @@ async def create_incident_for_approval(
return incident.incident_id
# =============================================================================
# 2026-04-05 ogt: 自動修復背景任務 (ADR-058 閉環)
# =============================================================================
async def _try_auto_repair_background(
    incident_id: str,
    approval_id: str,
    alert_type: str,
    target_resource: str,
    namespace: str,
) -> None:
    """
    Evaluate an incident for auto-repair in the background and run it if allowed.

    Flow:
    1. Reload the Incident from working memory.
    2. evaluate_auto_repair() decides whether the incident qualifies
       (per the original design note: P2 or lower + high-quality Playbook
       + low risk).
    3. Repairable -> execute_auto_repair() runs the selected playbook and the
       outcome is reported through the Telegram gateway.
    4. Not repairable -> stay silent and wait for manual approval.

    Args:
        incident_id: ID of the incident to reload and evaluate.
        approval_id: ID of the paired approval; only used for log correlation.
        alert_type: Alert type shown in the Telegram notification.
        target_resource: Affected resource shown in the Telegram notification.
        namespace: Namespace shown in the Telegram notification.

    Returns:
        None. Exceptions raised while evaluating, executing or notifying are
        logged instead of propagated, so a failure here does not surface to
        the caller that scheduled this task.
    """
    try:
        incident_service = get_incident_service()
        incident = await incident_service.get_from_working_memory(incident_id)
        if not incident:
            logger.warning("auto_repair_incident_not_found", incident_id=incident_id)
            return

        repair_service = AutoRepairService()
        decision = await repair_service.evaluate_auto_repair(incident)
        logger.info(
            "auto_repair_decision",
            incident_id=incident_id,
            approval_id=approval_id,
            can_auto_repair=decision.can_auto_repair,
            reason=decision.reason,
            blocked_by=decision.blocked_by,
        )
        if not decision.can_auto_repair:
            return

        # A positive decision without a playbook cannot be executed. Bail out
        # explicitly instead of handing None to execute_auto_repair(); the
        # incident then stays on the manual-approval path.
        playbook = decision.playbook
        if playbook is None:
            logger.warning(
                "auto_repair_playbook_missing",
                incident_id=incident_id,
                approval_id=approval_id,
            )
            return

        # Run the auto-repair.
        logger.info(
            "auto_repair_executing",
            incident_id=incident_id,
            playbook_id=playbook.playbook_id,
        )
        result = await repair_service.execute_auto_repair(
            incident=incident,
            playbook=playbook,
        )
        logger.info(
            "auto_repair_result",
            incident_id=incident_id,
            success=result.success if result else False,
        )
        if not result:
            return

        # Report the outcome to Telegram. Notification is best-effort: a
        # failure to send must not be reported as a failed repair, so it gets
        # its own handler and is only logged as a warning.
        try:
            telegram = get_telegram_gateway()
            # NOTE(review): both branches were empty strings in the reviewed
            # source (icons presumably lost); confirm the intended glyphs.
            status_icon = "✅" if result.success else "❌"
            # Only the first three executed steps are listed to keep the
            # message short; "-" stands in when no step was recorded.
            steps_summary = "\n".join(str(s) for s in result.executed_steps[:3]) or "-"
            await telegram.send_message(
                f"{status_icon} *自動修復{'完成' if result.success else '失敗'}*\n"
                f"資源: `{target_resource}` ({namespace})\n"
                f"告警: {alert_type}\n"
                f"耗時: {result.execution_time_ms}ms\n"
                f"步驟:\n{steps_summary}"
            )
        except Exception as tg_err:
            logger.warning("auto_repair_telegram_notify_failed", error=str(tg_err))
    except Exception as e:
        # Top-level boundary of the background task: record the traceback and
        # swallow the error so it never escapes the task.
        logger.exception(
            "auto_repair_background_failed",
            incident_id=incident_id,
            error=str(e),
        )
# =============================================================================
# Phase 5: Telegram 背景推送任務 (非阻塞)
# =============================================================================
@@ -1113,7 +1197,7 @@ async def alertmanager_webhook(
# ================================================================
# Incident-Approval 同步 (鐵律: 必須同時創建)
# ================================================================
await create_incident_for_approval(
incident_id = await create_incident_for_approval(
approval_id=str(approval.id),
risk_level=risk_level.value,
target_resource=target_resource,
@@ -1128,6 +1212,20 @@ async def alertmanager_webhook(
primary_responsibility = analysis_result.primary_responsibility or "COLLAB"
confidence = analysis_result.confidence
# ================================================================
# 2026-04-05 ogt: 自動修復評估 (ADR-058 閉環)
# Incident 建立後立即評估是否可自動修復
# P2 以下 + 高品質 Playbook + 低風險 → 背景自動執行
# ================================================================
background_tasks.add_task(
_try_auto_repair_background,
incident_id=incident_id,
approval_id=str(approval.id),
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
)
# 推送 Telegram
background_tasks.add_task(
_push_to_telegram_background,

View File

@@ -237,13 +237,16 @@ class Playbook(BaseModel):
條件:
- 狀態為 APPROVED
- 成功率 >= 95%
- 成功次數 >= 10
- 成功率 >= 80% (冷啟動期: 原 95%,2026-04-05 ogt 降低以打破零執行惡性循環)
- 成功次數 >= 3 (冷啟動期: 原 10,累積足夠後再收緊)
待成熟後收緊為: success_rate >= 0.95, success_count >= 10
"""
# 2026-04-05 ogt: 冷啟動調整,打破 zero-execution 惡性循環
return (
self.status == PlaybookStatus.APPROVED
and self.success_rate >= 0.95
and self.success_count >= 10
and self.success_rate >= 0.80
and self.success_count >= 3
)
@property

View File

@@ -189,7 +189,7 @@ class TestAutoRepairService:
affected_services=["test-service"],
),
repair_steps=[],
success_count=5, # < 10
success_count=2, # < 3 (冷啟動門檻 2026-04-05)
failure_count=0,
)
mock_playbook_service.add_playbook(playbook)