Files
awoooi/apps/api/src/services/global_repair_cooldown.py
OG T 59902f270d fix(tests): 首席架構師審查修復 - 測試套件 + DI 強化 (96/100 OUTSTANDING)
P1 測試修復:
- test_smart_router.py: 更新至當前 API (IntentResult + DIAGNOSE/CONFIG 規範化)
- test_auto_repair_service.py: 注入 _no_cooldown fixture 隔離 Redis 依賴
- test_global_repair_cooldown.py: 加 @pytest.mark.integration 標記

P2 架構改進:
- AutoRepairService: 新增 cooldown_checker DI 參數 (Callable | None)
- global_repair_cooldown: get_redis() 移入 try-except 防止未捕獲 RuntimeError

P3 配置:
- pyproject.toml: 登記 integration pytest marker

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 11:11:50 +08:00

179 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
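Global auto-repair circuit breaker
==================================
ADR-039: prevent repair loops that cascade across resources.
Design principles:
- Redis counter with a TTL window (15 minutes). The original note calls this a
  "sliding" window; the TTL is only set when the counter is first created, so
  in practice it is a fixed window starting at the first recorded repair.
- Fail closed: when Redis is unavailable, conservatively skip auto-repair and
  require manual confirmation.
Follows the leWOOOgo building-block rules:
- This module belongs to the Service layer.
- It depends on core/redis_client.
- Side-effect-free decision logic (NOTE(review): this holds for the check
  path; record_global_repair_action does write the Redis counter).
2026-03-29 ogt - ADR-039 implementation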
"""
import structlog
from src.core.redis_client import get_redis
# Module-level structlog logger; the functions below emit events as an event
# name plus keyword fields (incident_id, count, error, ...).
logger = structlog.get_logger(__name__)
# Redis key that stores how many auto-repair actions were recorded in the
# current window.
GLOBAL_COOLDOWN_KEY = "global:auto_repair:count"

# Length of the counting window in seconds (15 minutes).
GLOBAL_COOLDOWN_TTL = 15 * 60

# Auto-repair is frozen once the counter reaches this value (compared with >=).
GLOBAL_COOLDOWN_THRESHOLD = 5

# ADR-039: stateful services that must never be restarted automatically.
STATEFUL_SERVICE_BLACKLIST = frozenset(
    [
        # PostgreSQL
        "postgres", "postgresql", "awoooi-postgres",
        # Redis
        "redis", "awoooi-redis", "redis-stack",
        # ClickHouse (SignOz)
        "clickhouse", "signoz-clickhouse",
        # Other stateful services
        "elasticsearch", "etcd",
        "minio", "awoooi-minio",
        "kafka", "zookeeper",
    ]
)
async def check_global_repair_cooldown(
    incident_id: str,
    affected_services: list[str] | None = None,
) -> tuple[bool, str]:
    """
    Decide whether an automatic repair may proceed for an incident.

    Two gates are evaluated in order:

    1. Stateful-service blacklist (no Redis access): if the lower-cased name
       of any affected service contains an entry of
       ``STATEFUL_SERVICE_BLACKLIST`` as a substring, auto-repair is refused.
    2. Global cooldown: if the counter stored under ``GLOBAL_COOLDOWN_KEY``
       has reached ``GLOBAL_COOLDOWN_THRESHOLD``, auto-repair is refused.

    Any exception raised while obtaining the Redis client or reading/parsing
    the counter is caught and treated as "do not repair" (fail closed).

    Args:
        incident_id: Incident identifier, used only for log correlation.
        affected_services: Names of the services the repair would touch.
            ``None`` is treated as an empty list.

    Returns:
        ``(can_repair, reason)`` where ``reason`` is a human-readable
        explanation of the decision.
    """
    # === Hard ban: stateful-service blacklist (pure logic, no Redis) ===
    for service in affected_services or []:
        service_lower = service.lower()
        matches = [
            entry for entry in STATEFUL_SERVICE_BLACKLIST if entry in service_lower
        ]
        if not matches:
            continue
        # Several entries can match the same name (e.g. "postgres" and
        # "postgresql"). Report the longest one, with a lexical tie-break, so
        # the logged match does not depend on frozenset iteration order.
        blacklisted = max(matches, key=lambda entry: (len(entry), entry))
        logger.warning(
            "stateful_service_blocked",
            service=service,
            incident_id=incident_id,
            blacklist_match=blacklisted,
        )
        return False, f"服務 {service} 為有狀態服務,禁止自動重啟,請統帥手動介入"

    # === Global cooldown (Redis counter) ===
    # get_redis() stays inside the try so that an uninitialised client is
    # handled like any other Redis failure instead of propagating.
    try:
        redis = get_redis()
        count_raw = await redis.get(GLOBAL_COOLDOWN_KEY)
        current_count = int(count_raw) if count_raw else 0
    except Exception as e:
        # Redis failure -> conservative strategy: forbid auto-repair.
        logger.error(
            "global_repair_cooldown_redis_error",
            error=str(e),
            fallback="blocking_auto_repair_for_safety",
            incident_id=incident_id,
        )
        return False, f"Redis 連線異常,保守禁止自動修復(原因:{e})"

    if current_count >= GLOBAL_COOLDOWN_THRESHOLD:
        # Derive the window length from the constant so the message cannot
        # drift from the configured TTL.
        window_minutes = GLOBAL_COOLDOWN_TTL // 60
        logger.warning(
            "global_repair_cooldown_active",
            current_count=current_count,
            threshold=GLOBAL_COOLDOWN_THRESHOLD,
            incident_id=incident_id,
        )
        reason = (
            f"系統在過去 {window_minutes} 分鐘內已自動修復 {current_count} 次,"
            f"超出安全閾值 {GLOBAL_COOLDOWN_THRESHOLD},"
            "強制轉為人工審核模式"
        )
        return False, reason

    return True, "允許自動修復"
async def record_global_repair_action() -> None:
    """
    Record one global auto-repair action in the Redis counter.

    The counter under ``GLOBAL_COOLDOWN_KEY`` is incremented with INCR. The
    TTL (``GLOBAL_COOLDOWN_TTL`` seconds) is set only when the counter is
    created, so the window is fixed: it starts at the first recorded action
    and is not extended by later ones.

    INCR and EXPIRE are two separate commands. If the EXPIRE after the first
    INCR never ran (crash, dropped connection), the key would live forever and
    auto-repair would stay frozen once the threshold is reached. To recover
    from that, later calls re-arm the TTL when the key has no expiry.

    Redis errors are logged and swallowed on purpose: failing to record must
    not break the caller's main flow.
    """
    try:
        redis = get_redis()
        count = await redis.incr(GLOBAL_COOLDOWN_KEY)
        if count == 1:
            # First action of a new window: start the TTL clock.
            await redis.expire(GLOBAL_COOLDOWN_KEY, GLOBAL_COOLDOWN_TTL)
        elif await redis.ttl(GLOBAL_COOLDOWN_KEY) == -1:
            # TTL == -1 means the key exists without an expiry, i.e. an
            # earlier EXPIRE was lost. Re-arm it so the counter can reset.
            await redis.expire(GLOBAL_COOLDOWN_KEY, GLOBAL_COOLDOWN_TTL)
            logger.warning(
                "global_repair_ttl_restored",
                count=count,
                ttl_seconds=GLOBAL_COOLDOWN_TTL,
            )
        logger.info(
            "global_repair_action_recorded",
            count=count,
            threshold=GLOBAL_COOLDOWN_THRESHOLD,
            ttl_seconds=GLOBAL_COOLDOWN_TTL,
        )
    except Exception as e:
        # Redis failure: best-effort only, do not affect the main flow.
        logger.warning("global_repair_record_failed", error=str(e))
async def get_global_repair_status() -> dict:
    """
    Return the current global auto-repair status (for API / monitoring use).

    Returns:
        A dict with the keys:

        - ``current_count`` (int): counter value, or ``-1`` if Redis could
          not be read.
        - ``threshold`` (int): ``GLOBAL_COOLDOWN_THRESHOLD``.
        - ``is_frozen`` (bool): whether the counter has reached the
          threshold; ``True`` when Redis could not be read (conservative).
        - ``ttl_remaining`` (int | None): seconds until the counter key
          expires, or ``None`` when Redis reports no positive TTL.
        - ``window_seconds`` (int): ``GLOBAL_COOLDOWN_TTL``.
        - ``error`` (str): present only when Redis could not be read.
    """
    try:
        redis = get_redis()
        count_raw = await redis.get(GLOBAL_COOLDOWN_KEY)
        current_count = int(count_raw) if count_raw else 0
        ttl = await redis.ttl(GLOBAL_COOLDOWN_KEY)
    except Exception as e:
        logger.warning("get_global_repair_status_failed", error=str(e))
        return {
            "current_count": -1,
            "threshold": GLOBAL_COOLDOWN_THRESHOLD,
            "is_frozen": True,  # conservative assumption: frozen
            "ttl_remaining": None,
            # Same key as the success path, so consumers see one shape.
            "window_seconds": GLOBAL_COOLDOWN_TTL,
            "error": str(e),
        }

    return {
        "current_count": current_count,
        "threshold": GLOBAL_COOLDOWN_THRESHOLD,
        "is_frozen": current_count >= GLOBAL_COOLDOWN_THRESHOLD,
        # Non-positive TTL values (no key / no expiry) are reported as None.
        "ttl_remaining": ttl if ttl > 0 else None,
        "window_seconds": GLOBAL_COOLDOWN_TTL,
    }