ADR-038: OpenClaw 雙層保護 - Layer 1: Circuit Breaker (5 failures → 60s cooldown) - Layer 2: Concurrency Semaphore (max 3 concurrent) - 新增 src/core/circuit_breaker.py ADR-039: 全域修復熔斷 - Global Cooldown: 5 repairs/15min → freeze - StatefulSet Blacklist: postgres/redis/clickhouse 禁止自動重啟 - 新增 src/services/global_repair_cooldown.py - 整合到 auto_repair_service.py 測試: - test_circuit_breaker.py (狀態轉換 + Semaphore) - test_global_repair_cooldown.py (黑名單 + 計數閾值) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
167 lines
5.2 KiB
Python
167 lines
5.2 KiB
Python
"""
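Global Repair Cooldown tests
============================
ADR-039: global repair circuit-breaker mechanism

Covered scenarios:
- Stateful-service blacklist check
- Global repair-count threshold
- Redis failure degradation

Note: a Redis environment is required; the tests use a separate key prefix.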
"""
|
|
|
|
import pytest
|
|
|
|
from src.services.global_repair_cooldown import (
|
|
GLOBAL_COOLDOWN_KEY,
|
|
GLOBAL_COOLDOWN_THRESHOLD,
|
|
STATEFUL_SERVICE_BLACKLIST,
|
|
check_global_repair_cooldown,
|
|
get_global_repair_status,
|
|
record_global_repair_action,
|
|
)
|
|
|
|
|
|
class TestStatefulServiceBlacklist:
    """Tests for the stateful-service blacklist applied by the cooldown check."""

    @staticmethod
    async def _evaluate(incident, services):
        """Call the cooldown check and return its ``(allowed, message)`` pair."""
        outcome = await check_global_repair_cooldown(
            incident_id=incident,
            affected_services=services,
        )
        allowed, message = outcome
        return allowed, message

    @pytest.mark.asyncio
    async def test_postgres_blocked(self):
        """A PostgreSQL service name must be refused."""
        allowed, message = await self._evaluate("test-001", ["awoooi-postgres"])

        assert not allowed
        # Both the "stateful service" and the "auto-restart forbidden"
        # fragments are expected in the refusal message.
        for fragment in ("有狀態服務", "禁止自動重啟"):
            assert fragment in message

    @pytest.mark.asyncio
    async def test_redis_blocked(self):
        """A Redis service name must be refused."""
        allowed, message = await self._evaluate("test-002", ["redis-stack"])

        assert not allowed
        assert "有狀態服務" in message

    @pytest.mark.asyncio
    async def test_clickhouse_blocked(self):
        """A ClickHouse service name must be refused."""
        allowed, message = await self._evaluate(
            "test-003", ["signoz-clickhouse-0"]
        )

        assert not allowed
        assert "有狀態服務" in message

    @pytest.mark.asyncio
    async def test_stateless_service_allowed(self):
        """A service name that is not blacklisted must be allowed."""
        allowed, message = await self._evaluate(
            "test-004", ["awoooi-api-deployment"]
        )

        assert allowed
        assert "允許" in message

    @pytest.mark.asyncio
    async def test_empty_services_allowed(self):
        """An empty service list must be allowed."""
        allowed, _ = await self._evaluate("test-005", [])

        assert allowed

    @pytest.mark.asyncio
    async def test_none_services_allowed(self):
        """``None`` in place of a service list must be allowed."""
        allowed, _ = await self._evaluate("test-006", None)

        assert allowed

    def test_blacklist_contains_common_stateful_services(self):
        """The blacklist must include the common stateful service names."""
        expected_entries = (
            "postgres",
            "redis",
            "clickhouse",
            "elasticsearch",
            "etcd",
            "minio",
        )
        for entry in expected_entries:
            assert entry in STATEFUL_SERVICE_BLACKLIST
|
|
|
|
|
|
class TestGlobalCooldown:
    """Global cooldown counter tests; these require a reachable Redis."""

    @pytest.fixture
    async def clean_redis(self):
        """Delete the cooldown counter key before and after the test.

        NOTE(review): this is an async-generator fixture declared with plain
        ``@pytest.fixture``. That presumably relies on pytest-asyncio running
        in auto mode -- confirm against the project's pytest configuration.
        """
        from src.core.redis_client import get_redis

        redis = get_redis()
        await redis.delete(GLOBAL_COOLDOWN_KEY)
        yield
        await redis.delete(GLOBAL_COOLDOWN_KEY)

    @pytest.mark.asyncio
    async def test_record_increments_counter(self, clean_redis):
        """Recording one repair action must leave the counter at 1."""
        from src.core.redis_client import get_redis

        redis = get_redis()

        # Record a single repair action.
        await record_global_repair_action()

        count = await redis.get(GLOBAL_COOLDOWN_KEY)
        # A missing key would come back as None (assuming a redis-py style
        # client); fail with a readable message instead of the TypeError
        # that int(None) would raise.
        assert count is not None, "cooldown counter key was not created"
        assert int(count) == 1

    @pytest.mark.asyncio
    async def test_record_sets_ttl(self, clean_redis):
        """The first recorded action must put an expiry on the counter key."""
        from src.core.redis_client import get_redis

        redis = get_redis()

        await record_global_repair_action()

        ttl = await redis.ttl(GLOBAL_COOLDOWN_KEY)
        assert ttl > 0
        assert ttl <= 900  # 15 minutes

    @pytest.mark.asyncio
    async def test_cooldown_triggers_after_threshold(self, clean_redis):
        """Reaching the threshold must make the cooldown check refuse repairs."""
        # Record GLOBAL_COOLDOWN_THRESHOLD actions so the threshold is reached.
        for _ in range(GLOBAL_COOLDOWN_THRESHOLD):
            await record_global_repair_action()

        can_repair, reason = await check_global_repair_cooldown(
            incident_id="test-threshold",
            affected_services=["awoooi-api"],
        )

        assert not can_repair
        # The refusal must carry the "exceeded safety threshold" fragment
        # and the threshold value itself.
        assert "超出安全閾值" in reason
        assert str(GLOBAL_COOLDOWN_THRESHOLD) in reason

    @pytest.mark.asyncio
    async def test_get_status_returns_correct_info(self, clean_redis):
        """The status call must report count, threshold, frozen flag and TTL."""
        # Record two repair actions.
        await record_global_repair_action()
        await record_global_repair_action()

        status = await get_global_repair_status()

        assert status["current_count"] == 2
        assert status["threshold"] == GLOBAL_COOLDOWN_THRESHOLD
        assert not status["is_frozen"]
        assert status["ttl_remaining"] is not None
        assert status["ttl_remaining"] > 0
|