diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 20a846e4..3f4d9a4e 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -263,6 +263,7 @@ async def _try_auto_repair_background( playbook=decision.playbook, is_cold_start=decision.is_cold_start, similarity_score=decision.similarity_score, + run_post_verification=False, ) logger.info( diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py index 11a08f24..a03e7b3d 100644 --- a/apps/api/src/services/auto_repair_service.py +++ b/apps/api/src/services/auto_repair_service.py @@ -108,6 +108,7 @@ class IAutoRepairService(Protocol): self, incident: Incident, playbook: Playbook, + run_post_verification: bool = True, ) -> AutoRepairResult: """ 執行自動修復 @@ -378,6 +379,7 @@ class AutoRepairService: playbook: Playbook, is_cold_start: bool = False, similarity_score: float | None = None, + run_post_verification: bool = True, ) -> AutoRepairResult: """ 執行自動修復 @@ -579,10 +581,17 @@ class AutoRepairService: error=str(_inner_e), ) - _vl_task = _asyncio.create_task(_verify_and_learn()) - if hasattr(self, "_pending_tasks"): - self._pending_tasks.add(_vl_task) - _vl_task.add_done_callback(self._pending_tasks.discard) + if run_post_verification: + _vl_task = _asyncio.create_task(_verify_and_learn()) + if hasattr(self, "_pending_tasks"): + self._pending_tasks.add(_vl_task) + _vl_task.add_done_callback(self._pending_tasks.discard) + else: + logger.info( + "auto_repair_service_post_verify_delegated", + incident_id=incident.incident_id, + playbook_id=playbook.playbook_id, + ) except Exception as _vl_e: logger.warning("auto_repair_verifier_setup_failed", error=str(_vl_e)) diff --git a/apps/api/src/services/post_execution_verifier.py b/apps/api/src/services/post_execution_verifier.py index 0ca0f4a9..e52c66cb 100644 --- a/apps/api/src/services/post_execution_verifier.py +++ b/apps/api/src/services/post_execution_verifier.py @@ -126,11 +126,25 @@ class PostExecutionVerifier: logger.warning("verifier_timeout", incident_id=incident_id) if snapshot: await _update_snapshot(snapshot, {}, "timeout") + else: + await _persist_fallback_snapshot( + incident=incident, + post_state={}, + result="timeout", + action_taken=action_taken, + ) return "timeout" except Exception: logger.exception("verifier_collect_error", incident_id=incident_id) if snapshot: await _update_snapshot(snapshot, {}, "failed") + else: + await _persist_fallback_snapshot( + incident=incident, + post_state={}, + result="failed", + action_taken=action_taken, + ) return "failed" # 3. 對比前後狀態 @@ -140,6 +154,13 @@ class PostExecutionVerifier: # 4. 更新 EvidenceSnapshot if snapshot: await _update_snapshot(snapshot, post_state, result) + else: + await _persist_fallback_snapshot( + incident=incident, + post_state=post_state, + result=result, + action_taken=action_taken, + ) logger.info( "verifier_done", @@ -492,6 +513,60 @@ async def _update_snapshot( logger.exception("verifier_snapshot_update_failed", snapshot_id=snapshot.snapshot_id) +async def _persist_fallback_snapshot( + *, + incident: "Incident", + post_state: dict[str, Any], + result: str, + action_taken: str, +) -> None: + """ + Persist verifier outcome even when the pre-decision snapshot is unavailable. + + Live T14 evidence showed auto_repair rows with verifier decisions in logs but + NULL incident_evidence.verification_result because verify(snapshot=None) had + no durable target. This fallback makes the verification gate auditable without + pretending a pre-execution baseline existed. + """ + incident_id = _get_incident_id(incident) + try: + snapshot = EvidenceSnapshot(incident_id=incident_id) + snapshot.post_execution_state = post_state + snapshot.verification_result = result + snapshot.matched_playbook_id = _extract_playbook_id(action_taken) + snapshot.sensors_attempted = 1 + snapshot.sensors_succeeded = 1 if post_state else 0 + snapshot.mcp_health = {"post_execution_verifier": bool(post_state)} + snapshot.evidence_summary = ( + "[PostExecutionVerifier] fallback verification snapshot; " + f"action={action_taken[:160]}; result={result}; " + "pre_execution_state=missing" + ) + await snapshot.save() + logger.info( + "verifier_fallback_snapshot_saved", + incident_id=incident_id, + snapshot_id=snapshot.snapshot_id, + result=result, + action=action_taken, + ) + except Exception: + logger.warning( + "verifier_fallback_snapshot_save_failed", + incident_id=incident_id, + result=result, + exc_info=True, + ) + + +def _extract_playbook_id(action_taken: str) -> str | None: + for prefix in ("auto_repair_playbook:", "auto_repair:"): + if action_taken.startswith(prefix): + playbook_id = action_taken.removeprefix(prefix).split(":", 1)[0].strip() + return playbook_id or None + return None + + # ───────────────────────────────────────────────────────────────────────────── # Singleton # ───────────────────────────────────────────────────────────────────────────── diff --git a/apps/api/tests/test_learning_chain_e2e.py b/apps/api/tests/test_learning_chain_e2e.py index 977bde76..e81e4b2d 100644 --- a/apps/api/tests/test_learning_chain_e2e.py +++ b/apps/api/tests/test_learning_chain_e2e.py @@ -233,6 +233,45 @@ async def test_auto_repair_success_triggers_verify_and_learn(monkeypatch): assert call["matched_playbook_id"] == playbook.playbook_id +@pytest.mark.asyncio +async def test_auto_repair_can_delegate_post_verification(monkeypatch): + """ + webhook 路徑會自行 await PostExecutionVerifier;service 層需可跳過內部 + fire-and-forget 驗證,避免同一個修復產生兩組驗證與 Telegram 升級。 + """ + stub_verifier = StubVerifier(result="success") + stub_learning = StubLearningService() + + import src.services.post_execution_verifier as _pev_mod + monkeypatch.setattr(_pev_mod, "_verifier", stub_verifier) + + import src.services.learning_service as _ls_mod + monkeypatch.setattr(_ls_mod, "_learning_service", stub_learning) + + playbook = _make_playbook() + pb_service = StubPlaybookService() + pb_service.add_playbook(playbook) + + service = AutoRepairService( + playbook_service=pb_service, + cooldown_checker=_no_cooldown, + ) + + incident = _make_incident() + result = await service.execute_auto_repair( + incident, + playbook, + run_post_verification=False, + ) + + assert result.success is True + + await asyncio.sleep(0.05) + + assert stub_verifier.calls == [] + assert stub_learning.verification_calls == [] + + @pytest.mark.asyncio async def test_auto_repair_failure_does_not_call_verifier(monkeypatch): """ diff --git a/apps/api/tests/test_post_execution_verifier.py b/apps/api/tests/test_post_execution_verifier.py index bada10d8..b0ff82ac 100644 --- a/apps/api/tests/test_post_execution_verifier.py +++ b/apps/api/tests/test_post_execution_verifier.py @@ -208,6 +208,37 @@ class TestVerify: assert result == "failed" + @pytest.mark.asyncio + async def test_snapshot_missing_persists_fallback_verification(self): + """snapshot=None 也必須把 verification_result 寫成可稽核 fallback snapshot。""" + verifier = PostExecutionVerifier() + incident = _stub_incident() + persist = AsyncMock() + + with patch.object( + verifier, + "_collect_post_state", + new=AsyncMock(return_value={"status": "Running"}), + ): + with patch( + "src.services.post_execution_verifier._persist_fallback_snapshot", + new=persist, + ): + result = await verifier.verify( + incident=incident, + snapshot=None, + action_taken="auto_repair_playbook:PB-TEST", + warmup_sec=0.0, + ) + + assert result == "success" + persist.assert_awaited_once() + call_kwargs = persist.await_args.kwargs + assert call_kwargs["incident"] is incident + assert call_kwargs["post_state"] == {"status": "Running"} + assert call_kwargs["result"] == "success" + assert call_kwargs["action_taken"] == "auto_repair_playbook:PB-TEST" + @pytest.mark.asyncio async def test_collect_timeout_returns_timeout(self): """MCP 蒐集超時 → "timeout",不 raise"""