From 5499169996d0945c39224f77ad37745c76286fbb Mon Sep 17 00:00:00 2001 From: OG T Date: Sun, 5 Apr 2026 22:08:08 +0800 Subject: [PATCH] =?UTF-8?q?feat(auto-repair):=20=E6=89=93=E9=80=9A?= =?UTF-8?q?=E8=87=AA=E5=8B=95=E4=BF=AE=E5=BE=A9=E9=96=89=E7=92=B0=20(ADR-0?= =?UTF-8?q?58)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 問題: 告警鏈路從未呼叫 auto_repair_service,機制完全死路 修正: 1. webhooks.py: alertmanager_webhook 建立 Incident 後觸發 _try_auto_repair_background 2. playbook.py: is_high_quality 門檻降低 (冷啟動期) - success_count: 10 → 3 - success_rate: 95% → 80% 3. tests: test_evaluate_not_high_quality 更新為新門檻 流程: Alertmanager → API → Incident → evaluate → P2以下+高品質Playbook → 自動執行 → Telegram通知 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/api/v1/webhooks.py | 100 ++++++++++++++++++++- apps/api/src/models/playbook.py | 11 ++- apps/api/tests/test_auto_repair_service.py | 2 +- 3 files changed, 107 insertions(+), 6 deletions(-) diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 70b53ed5..bf7b8a18 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -53,6 +53,7 @@ from src.services.approval_db import get_approval_service # Phase 17 P0: Service 層 (消除 Router 直接存取 Redis) from src.services.incident_service import get_incident_service +from src.services.auto_repair_service import AutoRepairService # Phase 5: OpenClaw AI Engine from src.services.openclaw import get_openclaw @@ -145,6 +146,89 @@ async def create_incident_for_approval( return incident.incident_id +# ============================================================================= +# 2026-04-05 ogt: 自動修復背景任務 (ADR-058 閉環) +# ============================================================================= + +async def _try_auto_repair_background( + incident_id: str, + approval_id: str, + alert_type: str, + target_resource: str, + namespace: str, +) -> None: + """ + 背景評估並執行自動修復 + + 流程: + 1. 重新載入 Incident + 2. evaluate_auto_repair() — 檢查 P2以下 + 高品質Playbook + 低風險 + 3. 可修復 → execute_auto_repair() 執行 + 4. 不可修復 → 靜默,等人工批准 + """ + try: + incident_service = get_incident_service() + incident = await incident_service.get_from_working_memory(incident_id) + if not incident: + logger.warning("auto_repair_incident_not_found", incident_id=incident_id) + return + + repair_service = AutoRepairService() + decision = await repair_service.evaluate_auto_repair(incident) + + logger.info( + "auto_repair_decision", + incident_id=incident_id, + approval_id=approval_id, + can_auto_repair=decision.can_auto_repair, + reason=decision.reason, + blocked_by=decision.blocked_by, + ) + + if not decision.can_auto_repair: + return + + # 執行自動修復 + logger.info( + "auto_repair_executing", + incident_id=incident_id, + playbook_id=decision.playbook.playbook_id if decision.playbook else None, + ) + result = await repair_service.execute_auto_repair( + incident=incident, + playbook=decision.playbook, + ) + + logger.info( + "auto_repair_result", + incident_id=incident_id, + success=result.success if result else False, + ) + + # 通知 Telegram 自動修復結果 + if result: + try: + telegram = get_telegram_gateway() + status_icon = "✅" if result.success else "❌" + steps_summary = "\n".join(f" • {s}" for s in result.executed_steps[:3]) or "-" + await telegram.send_message( + f"{status_icon} *自動修復{'完成' if result.success else '失敗'}*\n" + f"資源: `{target_resource}` ({namespace})\n" + f"告警: {alert_type}\n" + f"耗時: {result.execution_time_ms}ms\n" + f"步驟:\n{steps_summary}" + ) + except Exception as tg_err: + logger.warning("auto_repair_telegram_notify_failed", error=str(tg_err)) + + except Exception as e: + logger.exception( + "auto_repair_background_failed", + incident_id=incident_id, + error=str(e), + ) + + # ============================================================================= # Phase 5: Telegram 背景推送任務 (非阻塞) # ============================================================================= @@ -1113,7 +1197,7 @@ async def alertmanager_webhook( # ================================================================ # Incident-Approval 同步 (鐵律: 必須同時創建) # ================================================================ - await create_incident_for_approval( + incident_id = await create_incident_for_approval( approval_id=str(approval.id), risk_level=risk_level.value, target_resource=target_resource, @@ -1128,6 +1212,20 @@ async def alertmanager_webhook( primary_responsibility = analysis_result.primary_responsibility or "COLLAB" confidence = analysis_result.confidence + # ================================================================ + # 2026-04-05 ogt: 自動修復評估 (ADR-058 閉環) + # Incident 建立後立即評估是否可自動修復 + # P2 以下 + 高品質 Playbook + 低風險 → 背景自動執行 + # ================================================================ + background_tasks.add_task( + _try_auto_repair_background, + incident_id=incident_id, + approval_id=str(approval.id), + alert_type=alert_type, + target_resource=target_resource, + namespace=namespace, + ) + # 推送 Telegram background_tasks.add_task( _push_to_telegram_background, diff --git a/apps/api/src/models/playbook.py b/apps/api/src/models/playbook.py index ede4d257..b7f98508 100644 --- a/apps/api/src/models/playbook.py +++ b/apps/api/src/models/playbook.py @@ -237,13 +237,16 @@ class Playbook(BaseModel): 條件: - 狀態為 APPROVED - - 成功率 >= 95% - - 成功次數 >= 10 + - 成功率 >= 80% (冷啟動期: 原 95%,2026-04-05 ogt 降低以打破零執行惡性循環) + - 成功次數 >= 3 (冷啟動期: 原 10,累積足夠後再收緊) + + 待成熟後收緊為: success_rate >= 0.95, success_count >= 10 """ + # 2026-04-05 ogt: 冷啟動調整,打破 zero-execution 惡性循環 return ( self.status == PlaybookStatus.APPROVED - and self.success_rate >= 0.95 - and self.success_count >= 10 + and self.success_rate >= 0.80 + and self.success_count >= 3 ) @property diff --git a/apps/api/tests/test_auto_repair_service.py b/apps/api/tests/test_auto_repair_service.py index 1dc18fe7..0f6de065 100644 --- a/apps/api/tests/test_auto_repair_service.py +++ b/apps/api/tests/test_auto_repair_service.py @@ -189,7 +189,7 @@ class TestAutoRepairService: affected_services=["test-service"], ), repair_steps=[], - success_count=5, # < 10 + success_count=2, # < 3 (冷啟動門檻 2026-04-05) failure_count=0, ) mock_playbook_service.add_playbook(playbook)