Merge remote-tracking branch 'gitea/main' into codex/security-supply-chain-contracts-20260512

This commit is contained in:
Your Name
2026-05-13 18:48:36 +08:00
5 changed files with 159 additions and 4 deletions

View File

@@ -263,6 +263,7 @@ async def _try_auto_repair_background(
playbook=decision.playbook,
is_cold_start=decision.is_cold_start,
similarity_score=decision.similarity_score,
run_post_verification=False,
)
logger.info(

View File

@@ -108,6 +108,7 @@ class IAutoRepairService(Protocol):
self,
incident: Incident,
playbook: Playbook,
run_post_verification: bool = True,
) -> AutoRepairResult:
"""
執行自動修復
@@ -378,6 +379,7 @@ class AutoRepairService:
playbook: Playbook,
is_cold_start: bool = False,
similarity_score: float | None = None,
run_post_verification: bool = True,
) -> AutoRepairResult:
"""
執行自動修復
@@ -579,10 +581,17 @@ class AutoRepairService:
error=str(_inner_e),
)
_vl_task = _asyncio.create_task(_verify_and_learn())
if hasattr(self, "_pending_tasks"):
self._pending_tasks.add(_vl_task)
_vl_task.add_done_callback(self._pending_tasks.discard)
if run_post_verification:
_vl_task = _asyncio.create_task(_verify_and_learn())
if hasattr(self, "_pending_tasks"):
self._pending_tasks.add(_vl_task)
_vl_task.add_done_callback(self._pending_tasks.discard)
else:
logger.info(
"auto_repair_service_post_verify_delegated",
incident_id=incident.incident_id,
playbook_id=playbook.playbook_id,
)
except Exception as _vl_e:
logger.warning("auto_repair_verifier_setup_failed", error=str(_vl_e))

View File

@@ -126,11 +126,25 @@ class PostExecutionVerifier:
logger.warning("verifier_timeout", incident_id=incident_id)
if snapshot:
await _update_snapshot(snapshot, {}, "timeout")
else:
await _persist_fallback_snapshot(
incident=incident,
post_state={},
result="timeout",
action_taken=action_taken,
)
return "timeout"
except Exception:
logger.exception("verifier_collect_error", incident_id=incident_id)
if snapshot:
await _update_snapshot(snapshot, {}, "failed")
else:
await _persist_fallback_snapshot(
incident=incident,
post_state={},
result="failed",
action_taken=action_taken,
)
return "failed"
# 3. 對比前後狀態
@@ -140,6 +154,13 @@ class PostExecutionVerifier:
# 4. 更新 EvidenceSnapshot
if snapshot:
await _update_snapshot(snapshot, post_state, result)
else:
await _persist_fallback_snapshot(
incident=incident,
post_state=post_state,
result=result,
action_taken=action_taken,
)
logger.info(
"verifier_done",
@@ -492,6 +513,60 @@ async def _update_snapshot(
logger.exception("verifier_snapshot_update_failed", snapshot_id=snapshot.snapshot_id)
async def _persist_fallback_snapshot(
    *,
    incident: "Incident",
    post_state: dict[str, Any],
    result: str,
    action_taken: str,
) -> None:
    """
    Save a standalone EvidenceSnapshot that carries the verifier outcome.

    This is the durable target for verification runs that have no
    pre-decision snapshot. Live T14 evidence showed auto_repair rows whose
    verifier decision appeared in logs while
    incident_evidence.verification_result stayed NULL, because
    verify(snapshot=None) had nowhere to write. The stored summary states
    that the pre-execution state is missing instead of implying a baseline.

    Args:
        incident: Incident whose id the new snapshot is filed under.
        post_state: Post-execution state gathered by the verifier; an empty
            mapping is recorded as "no sensor succeeded".
        result: Verification verdict to store on the snapshot.
        action_taken: Action descriptor. A playbook id is extracted from it,
            and only its first 160 characters go into the summary text.

    Errors raised while building or saving the snapshot are logged as a
    warning and swallowed.
    """
    incident_id = _get_incident_id(incident)
    try:
        fallback = EvidenceSnapshot(incident_id=incident_id)

        # An empty post_state means the single verifier "sensor" got nothing.
        state_collected = bool(post_state)
        summary_text = (
            "[PostExecutionVerifier] fallback verification snapshot; "
            f"action={action_taken[:160]}; result={result}; "
            "pre_execution_state=missing"
        )

        fallback.post_execution_state = post_state
        fallback.verification_result = result
        fallback.matched_playbook_id = _extract_playbook_id(action_taken)
        fallback.sensors_attempted = 1
        fallback.sensors_succeeded = int(state_collected)
        fallback.mcp_health = {"post_execution_verifier": state_collected}
        fallback.evidence_summary = summary_text

        await fallback.save()
    except Exception:
        logger.warning(
            "verifier_fallback_snapshot_save_failed",
            incident_id=incident_id,
            result=result,
            exc_info=True,
        )
    else:
        logger.info(
            "verifier_fallback_snapshot_saved",
            incident_id=incident_id,
            snapshot_id=fallback.snapshot_id,
            result=result,
            action=action_taken,
        )
def _extract_playbook_id(action_taken: str) -> str | None:
for prefix in ("auto_repair_playbook:", "auto_repair:"):
if action_taken.startswith(prefix):
playbook_id = action_taken.removeprefix(prefix).split(":", 1)[0].strip()
return playbook_id or None
return None
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────

View File

@@ -233,6 +233,45 @@ async def test_auto_repair_success_triggers_verify_and_learn(monkeypatch):
assert call["matched_playbook_id"] == playbook.playbook_id
@pytest.mark.asyncio
async def test_auto_repair_can_delegate_post_verification(monkeypatch):
    """
    The webhook path awaits PostExecutionVerifier on its own, so the service
    layer must be able to skip its internal fire-and-forget verification.
    Otherwise a single repair would produce two verification runs and
    duplicate Telegram escalations.
    """
    verifier_stub = StubVerifier(result="success")
    learning_stub = StubLearningService()

    # Swap the module-level singletons so any accidental call is recorded.
    import src.services.post_execution_verifier as verifier_module

    monkeypatch.setattr(verifier_module, "_verifier", verifier_stub)

    import src.services.learning_service as learning_module

    monkeypatch.setattr(learning_module, "_learning_service", learning_stub)

    playbook = _make_playbook()
    playbook_store = StubPlaybookService()
    playbook_store.add_playbook(playbook)

    repair_service = AutoRepairService(
        playbook_service=playbook_store,
        cooldown_checker=_no_cooldown,
    )
    incident = _make_incident()

    outcome = await repair_service.execute_auto_repair(
        incident,
        playbook,
        run_post_verification=False,
    )
    assert outcome.success is True

    # Give any (unexpected) background task a chance to run before asserting.
    await asyncio.sleep(0.05)

    assert verifier_stub.calls == []
    assert learning_stub.verification_calls == []
@pytest.mark.asyncio
async def test_auto_repair_failure_does_not_call_verifier(monkeypatch):
"""

View File

@@ -208,6 +208,37 @@ class TestVerify:
assert result == "failed"
@pytest.mark.asyncio
async def test_snapshot_missing_persists_fallback_verification(self):
    """With snapshot=None, verification_result must still be written to an auditable fallback snapshot."""
    verifier = PostExecutionVerifier()
    incident = _stub_incident()
    persist = AsyncMock()
    collect_stub = AsyncMock(return_value={"status": "Running"})

    # Stub state collection and intercept the fallback writer in one go.
    with patch.object(
        verifier,
        "_collect_post_state",
        new=collect_stub,
    ), patch(
        "src.services.post_execution_verifier._persist_fallback_snapshot",
        new=persist,
    ):
        result = await verifier.verify(
            incident=incident,
            snapshot=None,
            action_taken="auto_repair_playbook:PB-TEST",
            warmup_sec=0.0,
        )

    assert result == "success"
    persist.assert_awaited_once()

    forwarded = persist.await_args.kwargs
    assert forwarded["incident"] is incident
    assert forwarded["post_state"] == {"status": "Running"}
    assert forwarded["result"] == "success"
    assert forwarded["action_taken"] == "auto_repair_playbook:PB-TEST"
@pytest.mark.asyncio
async def test_collect_timeout_returns_timeout(self):
"""MCP 蒐集超時 → "timeout",不 raise"""