feat(api): 首次信任機制 — 打破自動修復冷啟動雞生蛋問題

問題: Playbook 需要 success_count >= 3 才算 is_high_quality,
但沒有自動修復就不會有成功紀錄 → 永遠達不到門檻。

方案 C: 首次信任 (Cold Start Trust)
- APPROVED 狀態 + 全步驟 risk=LOW + 執行次數 < 3 → 自動放行
- Redis counter 限制每日最多 5 次首次信任自動修復
- 累積 3 次執行後 (total_executions >= 3) 不再適用首次信任,回歸正常 is_high_quality 門檻

安全邊界:
- 只有 LOW risk 步驟才能首次信任 (重啟容器等)
- HIGH/CRITICAL 仍需人工審核
- P0/P1 嚴重度仍需人工審核
- 每日上限防止失控

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-07 11:21:00 +08:00
parent 2fe8062fb8
commit 53b2daeaca

View File

@@ -32,6 +32,7 @@ from src.models.incident import Incident, Severity
from src.models.playbook import (
ActionType,
Playbook,
PlaybookStatus,
RiskLevel,
SymptomPattern,
)
@@ -135,6 +136,12 @@ class AutoRepairService:
MAX_AUTO_REPAIR_SEVERITY = Severity.P2 # 最高允許自動修復的嚴重度
MIN_SIMILARITY_SCORE = 0.7 # 最低相似度門檻
# 2026-04-07 Claude Code: 首次信任機制 — 打破冷啟動雞生蛋問題
# 條件: APPROVED + 全部步驟 risk=LOW + 執行次數 < 3
# 每日最多 5 次首次信任自動修復,防止失控
COLD_START_TRUST_MAX_EXECUTIONS = 3 # 累積幾次後回歸正常門檻
COLD_START_TRUST_DAILY_LIMIT = 5 # 每日首次信任上限
def __init__(
self,
playbook_service: IPlaybookService | None = None,
@@ -255,17 +262,47 @@ class AutoRepairService:
blocked_by="LOW_SIMILARITY",
)
# 高品質檢查
# 高品質檢查 + 首次信任機制
# 2026-04-07 Claude Code: 方案 C — 打破冷啟動雞生蛋問題
max_risk = self._get_max_risk_level(best_match.playbook)
if not best_match.playbook.is_high_quality:
return AutoRepairDecision(
can_auto_repair=False,
playbook=best_match.playbook,
reason=f"Playbook 尚未達到高品質標準 (成功率: {best_match.playbook.success_rate:.0%}, 執行次數: {best_match.playbook.total_executions})",
blocked_by="NOT_HIGH_QUALITY",
# 首次信任: APPROVED + 全步驟 LOW risk + 執行次數 < N
cold_start_eligible = (
best_match.playbook.status == PlaybookStatus.APPROVED
and max_risk == RiskLevel.LOW
and best_match.playbook.total_executions < self.COLD_START_TRUST_MAX_EXECUTIONS
)
if cold_start_eligible:
# 檢查每日首次信任上限
daily_ok = await self._check_cold_start_daily_limit()
if daily_ok:
logger.info(
"auto_repair_cold_start_trust",
incident_id=incident.incident_id,
playbook_id=best_match.playbook.playbook_id,
playbook_name=best_match.playbook.name,
total_executions=best_match.playbook.total_executions,
max_risk=max_risk.value,
)
# 跳過 is_high_quality 門檻,直接進入風險檢查
else:
return AutoRepairDecision(
can_auto_repair=False,
playbook=best_match.playbook,
reason=f"首次信任每日上限已達 {self.COLD_START_TRUST_DAILY_LIMIT}",
blocked_by="COLD_START_DAILY_LIMIT",
)
else:
return AutoRepairDecision(
can_auto_repair=False,
playbook=best_match.playbook,
reason=f"Playbook 尚未達到高品質標準 (成功率: {best_match.playbook.success_rate:.0%}, 執行次數: {best_match.playbook.total_executions})",
blocked_by="NOT_HIGH_QUALITY",
)
# 5. 檢查動作風險等級
max_risk = self._get_max_risk_level(best_match.playbook)
if self._risk_exceeds_threshold(max_risk):
return AutoRepairDecision(
@@ -468,6 +505,42 @@ class AutoRepairService:
high_risks = {RiskLevel.HIGH, RiskLevel.CRITICAL}
return risk in high_risks
async def _check_cold_start_daily_limit(self) -> bool:
    """
    Consume one unit of today's cold-start-trust quota and report whether
    the call is still within the daily limit.

    A Redis counter keyed by date (``cold_start_trust:YYYY-MM-DD``) is
    incremented on every call, so calling this method has a side effect:
    each invocation counts against the quota, including invocations that
    end up being rejected.

    Returns:
        True when the incremented counter is at or below
        ``COLD_START_TRUST_DAILY_LIMIT``. False when the limit is exceeded,
        when ``get_redis()`` returns None, or when any step of the check
        raises (fail closed: no cold-start auto repair without a working
        limiter).
    """
    # 25 hours: longer than the day the key is scoped to, so the key is
    # cleaned up after the date rolls over.
    key_ttl_seconds = 25 * 60 * 60

    try:
        from src.core.redis_client import get_redis

        redis = await get_redis()
        if redis is None:
            # Redis unavailable -> conservatively refuse.
            return False

        from src.utils.timezone import now_taipei

        # The date part comes from now_taipei(); presumably Asia/Taipei
        # local time, so the quota resets at local midnight (confirm in
        # src.utils.timezone).
        today_key = f"cold_start_trust:{now_taipei().strftime('%Y-%m-%d')}"
        count = await redis.incr(today_key)

        # Re-apply the TTL on every call instead of only when count == 1.
        # INCR and EXPIRE are two separate commands; if the one-time EXPIRE
        # failed (or the process died in between), the key would otherwise
        # never expire. The key is date-scoped, so refreshing the TTL only
        # bounds its lifetime to 25 hours after the last increment.
        await redis.expire(today_key, key_ttl_seconds)

        if count > self.COLD_START_TRUST_DAILY_LIMIT:
            logger.warning(
                "cold_start_daily_limit_reached",
                today_key=today_key,
                count=count,
                limit=self.COLD_START_TRUST_DAILY_LIMIT,
            )
            return False
        return True
    except Exception as e:
        # Deliberately broad: any failure of the limiter (import, connection,
        # command error) must degrade to "not allowed", never to "allowed".
        logger.warning("cold_start_daily_limit_check_failed", error=str(e))
        return False
async def _execute_step(self, incident: Incident, step) -> str:
"""
執行單一修復步驟