P1 測試修復:
- test_smart_router.py: 更新至當前 API (IntentResult + DIAGNOSE/CONFIG 規範化)
- test_auto_repair_service.py: 注入 _no_cooldown fixture 隔離 Redis 依賴
- test_global_repair_cooldown.py: 加 @pytest.mark.integration 標記

P2 架構改進:
- AutoRepairService: 新增 cooldown_checker DI 參數 (Callable | None)
- global_repair_cooldown: get_redis() 移入 try-except 防止未捕獲 RuntimeError

P3 配置:
- pyproject.toml: 登記 integration pytest marker

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
179 lines
5.3 KiB
Python
179 lines
5.3 KiB
Python
"""
Global auto-repair circuit breaker
==================================
ADR-039: prevent repair loops that cascade across resources.

Design principles:
- A Redis counter with a TTL-bound window (15 minutes). The expiry is set
  when the counter key is created, so the window starts at the first
  recorded repair.
- Fail-safe degradation: when Redis is unavailable, automatic repair is
  conservatively skipped and manual confirmation is required.

Follows the leWOOOgo building-block (modular) rules:
- This module belongs to the Service layer.
- It depends on core/redis_client.
- It contains decision logic only; the one piece of external state it
  touches is the counter kept in Redis.

2026-03-29 ogt - ADR-039 implementation
"""
|
||
|
||
import structlog
|
||
|
||
from src.core.redis_client import get_redis
|
||
|
||
# Module-level structlog logger; the functions below emit events with keyword fields.
logger = structlog.get_logger(__name__)
|
||
|
||
# Redis key that holds the number of automatic repairs recorded in the
# current window.
GLOBAL_COOLDOWN_KEY = "global:auto_repair:count"

# Length of the counting window, in seconds (15 minutes).
GLOBAL_COOLDOWN_TTL = 15 * 60

# Automatic repair is frozen once the counter reaches this value.
GLOBAL_COOLDOWN_THRESHOLD = 5

# ADR-039: stateful services that must never be restarted automatically.
# Entries are lowercase and are matched as substrings of the lowercased
# service name by check_global_repair_cooldown.
STATEFUL_SERVICE_BLACKLIST = frozenset(
    (
        # PostgreSQL
        "postgres",
        "postgresql",
        "awoooi-postgres",
        # Redis
        "redis",
        "awoooi-redis",
        "redis-stack",
        # ClickHouse (SigNoz)
        "clickhouse",
        "signoz-clickhouse",
        # Other stateful services
        "elasticsearch",
        "etcd",
        "minio",
        "awoooi-minio",
        "kafka",
        "zookeeper",
    )
)
|
||
|
||
|
||
async def check_global_repair_cooldown(
    incident_id: str,
    affected_services: list[str] | None = None,
) -> tuple[bool, str]:
    """Decide whether an automatic repair may run for this incident.

    Two gates are evaluated in order:

    1. Stateful-service blacklist. If the lowercased name of any affected
       service contains an entry of ``STATEFUL_SERVICE_BLACKLIST``, the
       repair is refused. This step does not touch Redis.
    2. Global cooldown. The counter stored under ``GLOBAL_COOLDOWN_KEY`` is
       read; when it is at or above ``GLOBAL_COOLDOWN_THRESHOLD`` the repair
       is refused.

    Any exception raised while obtaining the Redis client or reading the
    counter also results in a refusal (fail closed).

    Args:
        incident_id: Incident identifier, used only to tag log events.
        affected_services: Names of the services the repair would touch.
            ``None`` is treated as an empty list.

    Returns:
        ``(can_repair, reason)`` where ``reason`` is a human-readable
        explanation of the decision.
    """
    # --- Hard ban: stateful-service blacklist (pure logic, no Redis) ---
    for service in affected_services or []:
        lowered = service.lower()
        hit = next(
            (entry for entry in STATEFUL_SERVICE_BLACKLIST if entry in lowered),
            None,
        )
        if hit is None:
            continue

        logger.warning(
            "stateful_service_blocked",
            service=service,
            incident_id=incident_id,
            blacklist_match=hit,
        )
        return False, f"服務 {service} 為有狀態服務,禁止自動重啟,請統帥手動介入"

    # --- Global cooldown: counter kept in Redis ---
    # get_redis() is called inside the try so that a failure to obtain the
    # client goes through the same fail-closed handler as a failed read.
    try:
        client = get_redis()
        stored = await client.get(GLOBAL_COOLDOWN_KEY)
        # A missing (or otherwise falsy) value counts as zero repairs.
        current_count = int(stored) if stored else 0

        if current_count < GLOBAL_COOLDOWN_THRESHOLD:
            return True, "允許自動修復"

        logger.warning(
            "global_repair_cooldown_active",
            current_count=current_count,
            threshold=GLOBAL_COOLDOWN_THRESHOLD,
            incident_id=incident_id,
        )
        return False, (
            f"系統在過去 15 分鐘內已自動修復 {current_count} 次,"
            f"超出安全閾值 {GLOBAL_COOLDOWN_THRESHOLD},"
            "強制轉為人工審核模式"
        )

    except Exception as e:
        # Redis failure -> conservative strategy: refuse automatic repair.
        logger.error(
            "global_repair_cooldown_redis_error",
            error=str(e),
            fallback="blocking_auto_repair_for_safety",
            incident_id=incident_id,
        )
        return False, f"Redis 連線異常,保守禁止自動修復(原因:{e})"
|
||
|
||
|
||
async def record_global_repair_action() -> None:
    """Record one automatic repair in the global cooldown counter.

    Increments the Redis counter stored under ``GLOBAL_COOLDOWN_KEY`` and
    makes sure the key carries an expiry of ``GLOBAL_COOLDOWN_TTL`` seconds.
    The expiry is applied when the key is created and is not refreshed by
    later increments, so the window is fixed: it starts at the first
    recorded repair rather than sliding with each one.

    If the key is found without any expiry (for example because an earlier
    call incremented it but never got to set the expiry), the expiry is
    applied now. Without this the counter would never reset and automatic
    repair would stay frozen once the threshold was reached.

    Failures are logged and swallowed so that bookkeeping problems do not
    interrupt the caller's main flow.
    """
    try:
        redis = get_redis()
        # INCR is atomic on the Redis side, so concurrent workers each get a
        # distinct post-increment value.
        count = await redis.incr(GLOBAL_COOLDOWN_KEY)

        # INCR and EXPIRE are two separate commands. Set the expiry on the
        # first increment only (re-setting it every time would keep
        # extending the window), but also repair a key that exists with no
        # expiry: TTL reports -1 in that case.
        if count == 1 or await redis.ttl(GLOBAL_COOLDOWN_KEY) == -1:
            await redis.expire(GLOBAL_COOLDOWN_KEY, GLOBAL_COOLDOWN_TTL)

        logger.info(
            "global_repair_action_recorded",
            count=count,
            threshold=GLOBAL_COOLDOWN_THRESHOLD,
            ttl_seconds=GLOBAL_COOLDOWN_TTL,
        )

    except Exception as e:
        # Redis failure: log and continue; recording is best effort.
        logger.warning("global_repair_record_failed", error=str(e))
|
||
|
||
|
||
async def get_global_repair_status() -> dict:
    """Return a snapshot of the global repair counter for APIs and monitoring.

    Returns:
        A dict that carries the same core keys whether or not Redis could
        be read:

        - ``current_count`` (int): value of the counter, or ``-1`` when it
          could not be read.
        - ``threshold`` (int): ``GLOBAL_COOLDOWN_THRESHOLD``.
        - ``is_frozen`` (bool): ``True`` when the count is at or above the
          threshold, and also ``True`` when the counter could not be read
          (conservative assumption).
        - ``ttl_remaining`` (int | None): seconds until the counter key
          expires, or ``None`` when the reported TTL is not positive or the
          counter could not be read.
        - ``window_seconds`` (int): ``GLOBAL_COOLDOWN_TTL``.
        - ``error`` (str): present only when the counter could not be read.
    """
    try:
        redis = get_redis()
        count_raw = await redis.get(GLOBAL_COOLDOWN_KEY)
        # A missing (or otherwise falsy) value counts as zero repairs.
        current_count = int(count_raw) if count_raw else 0
        ttl = await redis.ttl(GLOBAL_COOLDOWN_KEY)

        return {
            "current_count": current_count,
            "threshold": GLOBAL_COOLDOWN_THRESHOLD,
            "is_frozen": current_count >= GLOBAL_COOLDOWN_THRESHOLD,
            # Non-positive TTL values are reported as None.
            "ttl_remaining": ttl if ttl > 0 else None,
            "window_seconds": GLOBAL_COOLDOWN_TTL,
        }

    except Exception as e:
        logger.warning("get_global_repair_status_failed", error=str(e))
        return {
            "current_count": -1,
            "threshold": GLOBAL_COOLDOWN_THRESHOLD,
            "is_frozen": True,  # conservative assumption: frozen
            "ttl_remaining": None,
            # Kept in the failure payload too, so consumers can index the
            # same keys on both paths.
            "window_seconds": GLOBAL_COOLDOWN_TTL,
            "error": str(e),
        }
|