Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m38s
W2 (onboarder 4 週飛輪 80→90 路徑第二週) + critic PR review 5 個 critical/major 全部修完,default flag=false 安全無爆炸風險。

## W2 三件 PR

### PR-R2 — AOL → catalog confidence EWMA 回灌(修飛輪斷鏈 C2)
- 新檔 `apps/api/src/jobs/aol_to_catalog_writeback_job.py`
- 邏輯:每小時掃 AOL 計算 EWMA confidence (alpha=0.3) 回灌 alert_rule_catalog
- 失敗閾值 N=5 連續低成功率 → review_status='draft'
- Hermes _fetch_noisy_rules SQL 加 OR review_status='draft'
- ENABLE_AOL_WRITEBACK_JOB=false (default)
- 8 個測試(mock path 修正:lazy import → patch src.db.base.get_db_context)

### PR-V1 — self_healing_validator 串接 (修飛輪斷鏈 C6)
- 新檔 `apps/api/src/services/self_healing_validator.py`(純函數 assess_self_healing)
- post_execution_verifier.py step 5 串接(feature flag gate)
- evidence_snapshot.py 加 self_healing_score / self_healing_detail 欄位
- db/models.py + base.py ALTER IF NOT EXISTS
- score < 0.5 → 觸發 rollback 提案 Telegram alert(不自動執行)
- ENABLE_SELF_HEALING_VALIDATOR=false (default)
- 7 個測試

### PR-L1 — KM ↔ Playbook 雙向回路 (修飛輪斷鏈 C3+C4)
- learning_service.py 三條新邏輯:
  1. _write_playbook_evolution_km:promote/demote 寫 KM 演化條目
  2. _check_and_mark_playbook_review:N=5 累積觸發 review_required
  3. _demote_alert_rule_catalog_confidence:DEPRECATED → confidence×=0.5
- PlaybookRecord 加 review_required 欄位(schema migration via base.py)
- ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP=false (default)
- KM_PLAYBOOK_REVIEW_THRESHOLD=5 可調
- 6 個測試

## KMWriter Critic 5 個 Critical/Major 修復(之前 critic PR review 發現)

之前 push commit c5753e1c 已修,本 commit 補回 stash 中的對應檔案:
- C1 km_writer.py:194 backfill 自打臉(已修:同步 await + DLQ)
- C2 km_writer.py:391 KM_WRITE_AWAIT=false 路徑收緊
- M1 decision_manager.py:2178/2203 移除 _fire_and_forget
- M2 incident_service.py:1099 自製 path 加 retry+DLQ
- M3 km_writer.py:166 冪等聲明對齊(UPSERT + partial unique index)

## 驗證
- 1635 unit tests 全綠(+27 from 1608)
- 與 fb0c72db(推翻 A2 Ollama primary) 共存無衝突
- 所有新 Job/Service default flag=false(不爆炸)

## 期望影響

飛輪斷鏈 C2 + C3 + C4 + C6 全修
飛輪自主化評分:65 → 85 預估(W2 完成後)

啟用順序(待 prod fb0c72db 驗證 OLLAMA primary 跑得起來後):
1. ENABLE_AOL_WRITEBACK_JOB=true
2. ENABLE_KM_PLAYBOOK_FEEDBACK_LOOP=true
3. ENABLE_SELF_HEALING_VALIDATOR=true

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
353 lines
14 KiB
Python
353 lines
14 KiB
Python
"""
SelfHealingValidator integration tests
======================================
W2 PR-V1: acceptance tests for the flywheel broken-link C6 fix

Test items:
  1. test_validator_called_after_verification
     — with ENABLE=True, assess_self_healing is called after verify() completes

  2. test_low_score_triggers_rollback_proposal
     — when score < 0.5, the Telegram rollback proposal is sent

  3. test_high_score_no_action
     — when score >= 0.5, Telegram is not triggered

  4. test_validator_failure_does_not_block_main_flow
     — assess_self_healing raises, verify() still returns the correct result

  5. test_feature_flag_disabled_skips
     — with ENABLE=False, assess_self_healing is not called

  6. test_snapshot_self_healing_score_updated
     — the score is written back through EvidenceSnapshot.update_self_healing

The module also contains pure-function unit tests for assess_self_healing
(class TestAssessSelfHealing).

2026-04-28 ogt + Claude Sonnet 4.6: W2 PR-V1 initial version
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import pytest
|
||
from unittest.mock import AsyncMock, MagicMock, patch
|
||
|
||
from src.services.post_execution_verifier import PostExecutionVerifier
|
||
from src.services.evidence_snapshot import EvidenceSnapshot
|
||
from src.services.self_healing_validator import assess_self_healing
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
# Stubs
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _stub_incident(
    alertname: str = "KubePodCrashLooping",
    namespace: str = "awoooi-prod",
    pod: str = "api-xyz",
) -> object:
    """Build a minimal incident stand-in for the verifier tests.

    Args:
        alertname: Value stored under the ``"alertname"`` label.
        namespace: Value stored under the ``"namespace"`` label.
        pod: Value stored under the ``"pod"`` label.

    Returns:
        An object exposing ``incident_id`` (always ``"INC-TEST"``) and
        ``signals``, a one-element list whose item carries a ``labels`` dict
        built from the three arguments.
    """

    class _Signal:
        # Class bodies nested in a function can read the enclosing function's
        # arguments, so the labels reflect this particular call.
        labels = {
            "alertname": alertname,
            "namespace": namespace,
            "pod": pod,
        }

    class _Incident:
        incident_id = "INC-TEST"
        signals = [_Signal()]

    return _Incident()
|
||
|
||
|
||
def _stub_snapshot(incident_id: str = "INC-TEST") -> EvidenceSnapshot:
    """Create an EvidenceSnapshot pre-populated with a crash-looping pre-state.

    Args:
        incident_id: Identifier passed to the ``EvidenceSnapshot`` constructor.

    Returns:
        The snapshot, with ``pre_execution_state`` set to
        ``{"status": "CrashLoopBackOff"}``.
    """
    snap = EvidenceSnapshot(incident_id=incident_id)
    snap.pre_execution_state = {"status": "CrashLoopBackOff"}
    return snap
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
# Unit tests for assess_self_healing (no I/O)
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class TestAssessSelfHealing:
    """Pure-function tests for assess_self_healing().

    Float scores are compared with ``pytest.approx`` so the tests do not
    depend on the exact binary representation of the computed value.
    """

    def test_success_result_gives_high_score(self):
        """A successful recovery from CrashLoopBackOff scores at least 0.5."""
        result = assess_self_healing(
            pre_state={"status": "CrashLoopBackOff"},
            post_state={"status": "Running", "containers": "1/1"},
            verification_result="success",
            action_taken="restart_service:api",
        )
        assert result["score"] >= 0.5
        assert result["root_cause_cleared"] is True

    def test_failed_result_gives_zero_score(self):
        """A failed verification scores 0 and leaves the root cause uncleared."""
        result = assess_self_healing(
            pre_state={"status": "Running"},
            post_state={"status": "CrashLoopBackOff"},
            verification_result="failed",
            action_taken="patch_config",
        )
        assert result["score"] == pytest.approx(0.0)
        assert result["root_cause_cleared"] is False

    def test_degraded_result_gives_low_score(self):
        """A degraded verification scores below the 0.5 threshold."""
        result = assess_self_healing(
            pre_state=None,
            post_state={"status": "Pending"},
            verification_result="degraded",
            action_taken="scale_up",
        )
        assert result["score"] < 0.5

    def test_regression_reduces_score(self):
        """A new CrashLoopBackOff after execution incurs a regression penalty."""
        result = assess_self_healing(
            pre_state={"status": "Running"},
            post_state={"status": "Running", "reason": "CrashLoopBackOff"},
            verification_result="success",
            action_taken="restart_service",
        )
        # The regression must be reported...
        assert "crashloopbackoff" in result["regressions"]
        # ...and must lower the score even though verification_result is "success".
        assert result["score"] < 1.0

    def test_no_regression_full_score_on_success(self):
        """Clean success: no regression, root cause cleared -> score is 1.0."""
        result = assess_self_healing(
            pre_state={"status": "CrashLoopBackOff"},
            post_state={"status": "Running", "containers": "1/1"},
            verification_result="success",
            action_taken="restart_service:api",
        )
        assert result["score"] == pytest.approx(1.0)
        assert result["regressions"] == []

    def test_timeout_gives_low_base_score(self):
        """A timeout with an empty post-state scores 0.2."""
        result = assess_self_healing(
            pre_state=None,
            post_state={},
            verification_result="timeout",
            action_taken="restart_service",
        )
        assert result["score"] == pytest.approx(0.2)

    def test_detail_is_human_readable(self):
        """The detail string includes the ``base=`` component."""
        result = assess_self_healing(
            pre_state=None,
            post_state={"status": "Running"},
            verification_result="success",
            action_taken="restart",
        )
        assert "base=" in result["detail"]
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
# Integration tests: verify() → _run_self_healing_validator
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class TestVerifyIntegration:
    """Integration tests wiring PostExecutionVerifier.verify() to the self-healing validator."""

    @pytest.mark.asyncio
    async def test_validator_called_after_verification(self):
        """verify() invokes _run_self_healing_validator once with its outcome.

        The hook is replaced by an AsyncMock, so this test only checks the
        hand-off (incident id and verification result), not the scoring.
        """
        verifier = PostExecutionVerifier()
        incident = _stub_incident()

        with (
            patch.object(
                verifier,
                "_collect_post_state",
                new=AsyncMock(return_value={"status": "Running"}),
            ),
            patch("src.services.post_execution_verifier._update_snapshot", new=AsyncMock()),
            patch(
                "src.services.post_execution_verifier._run_self_healing_validator",
                new=AsyncMock(),
            ) as mock_validator,
        ):
            await verifier.verify(
                incident=incident,
                snapshot=None,
                action_taken="restart_service:api",
                warmup_sec=0.0,
            )

        mock_validator.assert_called_once()
        call_kwargs = mock_validator.call_args.kwargs
        assert call_kwargs["incident_id"] == "INC-TEST"
        assert call_kwargs["verification_result"] == "success"

    @pytest.mark.asyncio
    async def test_low_score_triggers_rollback_proposal(self):
        """score < 0.5 -> the rollback proposal alert is sent."""
        with (
            patch(
                "src.services.self_healing_validator.assess_self_healing",
                return_value={
                    "score": 0.2,
                    "root_cause_cleared": False,
                    "regressions": ["crashloopbackoff"],
                    "detail": "base=0.40; regression_penalty=0.15",
                    "verification_result": "degraded",
                    "action_taken": "restart_service",
                },
            ),
            patch(
                "src.services.post_execution_verifier._send_rollback_proposal_alert",
                new=AsyncMock(),
            ) as mock_send,
            patch(
                "src.core.config.get_settings",
                return_value=MagicMock(ENABLE_SELF_HEALING_VALIDATOR=True),
            ),
        ):
            from src.services.post_execution_verifier import _run_self_healing_validator

            await _run_self_healing_validator(
                incident_id="INC-LOW",
                snapshot=None,
                pre_state={"status": "Running"},
                post_state={"status": "CrashLoopBackOff"},
                verification_result="degraded",
                action_taken="restart_service",
            )

        mock_send.assert_called_once()
        call_kwargs = mock_send.call_args.kwargs
        assert call_kwargs["score"] == 0.2
        assert call_kwargs["incident_id"] == "INC-LOW"

    @pytest.mark.asyncio
    async def test_high_score_no_action(self):
        """score >= 0.5 -> the rollback proposal alert is not sent."""
        with (
            patch(
                "src.services.self_healing_validator.assess_self_healing",
                return_value={
                    "score": 1.0,
                    "root_cause_cleared": True,
                    "regressions": [],
                    "detail": "base=1.00",
                    "verification_result": "success",
                    "action_taken": "restart_service",
                },
            ),
            patch(
                "src.services.post_execution_verifier._send_rollback_proposal_alert",
                new=AsyncMock(),
            ) as mock_send,
            patch(
                "src.core.config.get_settings",
                return_value=MagicMock(ENABLE_SELF_HEALING_VALIDATOR=True),
            ),
        ):
            from src.services.post_execution_verifier import _run_self_healing_validator

            await _run_self_healing_validator(
                incident_id="INC-HIGH",
                snapshot=None,
                pre_state={"status": "CrashLoopBackOff"},
                post_state={"status": "Running"},
                verification_result="success",
                action_taken="restart_service",
            )

        mock_send.assert_not_called()

    @pytest.mark.asyncio
    async def test_validator_failure_does_not_block_main_flow(self):
        """_run_self_healing_validator raising must not change verify()'s result."""
        verifier = PostExecutionVerifier()
        incident = _stub_incident()

        with (
            patch.object(
                verifier,
                "_collect_post_state",
                new=AsyncMock(return_value={"status": "Running"}),
            ),
            patch("src.services.post_execution_verifier._update_snapshot", new=AsyncMock()),
            # _run_self_healing_validator itself raises -> should be swallowed.
            patch(
                "src.services.post_execution_verifier._run_self_healing_validator",
                new=AsyncMock(side_effect=RuntimeError("validator exploded")),
            ),
        ):
            # verify() should not raise and should still return "success".
            result = await verifier.verify(
                incident=incident,
                snapshot=None,
                action_taken="restart_service:api",
                warmup_sec=0.0,
            )

        # The main-flow result of verify() must be unaffected.
        # Note: _run_self_healing_validator is awaited directly by verify();
        # its exception is contained either by verify()'s try/except
        # (approve_execution level) or by the validator's own wrapper.
        # This test confirms that even if the validator blows up, the result
        # is still the correct verification outcome.
        assert result == "success"

    @pytest.mark.asyncio
    async def test_feature_flag_disabled_skips(self):
        """ENABLE_SELF_HEALING_VALIDATOR=False -> assess_self_healing is not called."""
        import src.services.self_healing_validator as _shv

        with (
            patch.object(_shv, "assess_self_healing") as mock_assess,
            patch(
                "src.core.config.get_settings",
                return_value=MagicMock(ENABLE_SELF_HEALING_VALIDATOR=False),
            ),
        ):
            from src.services.post_execution_verifier import _run_self_healing_validator

            await _run_self_healing_validator(
                incident_id="INC-FLAG",
                snapshot=None,
                pre_state=None,
                post_state={"status": "Running"},
                verification_result="success",
                action_taken="restart_service",
            )

        mock_assess.assert_not_called()

    @pytest.mark.asyncio
    async def test_snapshot_self_healing_score_updated(self):
        """The score is written back via EvidenceSnapshot.update_self_healing."""
        snap = _stub_snapshot()
        # Replace the persistence call so its arguments can be inspected.
        snap.update_self_healing = AsyncMock()

        with (
            patch(
                "src.services.self_healing_validator.assess_self_healing",
                return_value={
                    "score": 0.85,
                    "root_cause_cleared": True,
                    "regressions": [],
                    "detail": "base=1.00",
                    "verification_result": "success",
                    "action_taken": "restart_service",
                },
            ),
            patch(
                "src.services.post_execution_verifier._send_rollback_proposal_alert",
                new=AsyncMock(),
            ),
            patch(
                "src.core.config.get_settings",
                return_value=MagicMock(ENABLE_SELF_HEALING_VALIDATOR=True),
            ),
        ):
            from src.services.post_execution_verifier import _run_self_healing_validator

            await _run_self_healing_validator(
                incident_id="INC-SNAP",
                snapshot=snap,
                pre_state={"status": "CrashLoopBackOff"},
                post_state={"status": "Running"},
                verification_result="success",
                action_taken="restart_service",
            )

        snap.update_self_healing.assert_called_once()
        call_kwargs = snap.update_self_healing.call_args.kwargs
        assert call_kwargs["score"] == 0.85
        assert call_kwargs["detail"]["root_cause_cleared"] is True
        assert call_kwargs["detail"]["regressions"] == []
|