Files
awoooi/apps/api/tests/test_failover_alerter.py
Your Name 4452a006bf
All checks were successful
Code Review / ai-code-review (push) Successful in 13s
CD Pipeline / tests (push) Successful in 6m20s
CD Pipeline / build-and-deploy (push) Successful in 4m54s
CD Pipeline / post-deploy-checks (push) Successful in 1m48s
feat(governance): show knowledge degradation ownership
2026-05-19 20:38:29 +08:00

352 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""FailoverAlerter 單元測試 — P1.5 Telegram 容災告警
四大 testcase覆蓋 status 文件 line 99 指定範圍):
1. test_alert_failover_dedup — 同 to_provider 第二次被 10min dedup
2. test_alert_recovery_send — 正常發送 + Markdown 訊息結構
3. test_no_telegram_chat_id_noop — chat_id 缺失時不發送fail-soft
4. test_quota_alert_dedup_24h — quota 告警 86400s TTL每日一次
2026-04-26 P1.5 補測 by Claude Opus 4.7
"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from src.services.failover_alerter import (
DEDUP_TTL_SEC,
QUOTA_DEDUP_TTL_SEC,
FailoverAlerter,
_lines_from_list,
_sanitize_telegram_error,
configure_alerter,
format_governance_alert_card,
get_failover_alerter,
reset_failover_alerter,
)
@pytest.fixture(autouse=True)
def _reset_singleton():
    """Reset the module-level alerter singleton before and after every test.

    Declared ``autouse`` so each test in this module gets it without asking;
    the double reset keeps singleton state from leaking between tests.
    """
    # Setup: start the test from a clean singleton.
    reset_failover_alerter()

    yield

    # Teardown: discard whatever instance the test may have installed.
    reset_failover_alerter()
@pytest.fixture
def mock_redis():
    """Provide a fake Redis client whose async ``set`` alternates True / None.

    ``True`` stands for a successful ``SET NX`` (key was free) and ``None``
    for a rejected one (key already exists). The sequence covers four calls.
    """
    # Successive results returned by redis.set(): hit, miss, hit, miss.
    nx_results = [True, None, True, None]

    client = MagicMock()
    client.set = AsyncMock(side_effect=nx_results)
    return client
@pytest.fixture
def mock_telegram_send():
    """Patch the Telegram gateway factory and settings; yield the fake gateway.

    The original note on this fixture states that ``_send()`` imports its
    dependencies inline inside the function, so the patches must target the
    modules that define ``get_telegram_gateway`` / ``get_settings`` rather
    than the alerter module.

    Yields:
        A ``MagicMock`` gateway whose ``send_alert_notification`` is an
        ``AsyncMock``; both chat-id settings are populated.
    """
    fake_gateway = MagicMock()
    fake_gateway.send_alert_notification = AsyncMock()
    fake_settings = MagicMock(SRE_GROUP_CHAT_ID="-100123", OPENCLAW_TG_CHAT_ID="-100456")

    with patch("src.services.telegram_gateway.get_telegram_gateway", return_value=fake_gateway):
        with patch("src.core.config.get_settings", return_value=fake_settings):
            yield fake_gateway
# =============================================================================
# Case 1: failover dedup (a second alert for the same to_provider is blocked)
# =============================================================================
@pytest.mark.asyncio
async def test_alert_failover_dedup(mock_redis, mock_telegram_send):
    """A repeated failover alert for the same event is sent only once.

    Also checks that the first Redis ``set`` call used ``nx=True`` and the
    failover dedup TTL constant.
    """
    alerter = FailoverAlerter(redis_client=mock_redis)
    send = mock_telegram_send.send_alert_notification
    failover_event = {
        "to_provider": "gemini",
        "reason": "111 unhealthy",
        "model": "qwen3:8b",
        "fallback_chain_str": "gemini → ollama_local",
    }

    # First alert: dedup passes (mock set -> True), so a message goes out.
    await alerter.alert_failover(failover_event)
    assert send.await_count == 1

    # Second alert: dedup hits (mock set -> None), so the count stays at 1.
    await alerter.alert_failover(failover_event)
    assert send.await_count == 1

    # The dedup key must be written with NX and the failover TTL.
    first_set_call = mock_redis.set.await_args_list[0]
    assert first_set_call.kwargs["ex"] == DEDUP_TTL_SEC
    assert first_set_call.kwargs["nx"] is True
# =============================================================================
# Case 2: recovery alert is sent normally
# =============================================================================
@pytest.mark.asyncio
async def test_alert_recovery_send(mock_redis, mock_telegram_send):
    """A recovery alert is sent once, as MarkdownV2, with the expected wording."""
    alerter = FailoverAlerter(redis_client=mock_redis)
    recovery_event = {
        "from_provider": "gemini",
        "to_provider": "ollama_111",
        "stable_count": 3,
    }

    await alerter.alert_recovery(recovery_event)

    send = mock_telegram_send.send_alert_notification
    assert send.await_count == 1

    call_kwargs = send.await_args.kwargs
    assert call_kwargs["parse_mode"] == "MarkdownV2"

    # The text should mention the recovery and the 3 consecutive healthy checks.
    message = call_kwargs["text"]
    for fragment in ("Ollama 自動恢復", "連續 3"):
        assert fragment in message
# =============================================================================
# Case 3: chat_id missing → fail-soft (nothing sent, nothing raised)
# =============================================================================
@pytest.mark.asyncio
async def test_no_telegram_chat_id_noop(mock_redis):
    """With both chat ids unset, alerting neither raises nor sends a message."""
    alerter = FailoverAlerter(redis_client=mock_redis)

    gateway = MagicMock()
    gateway.send_alert_notification = AsyncMock()
    settings_without_chat = MagicMock(SRE_GROUP_CHAT_ID=None, OPENCLAW_TG_CHAT_ID=None)

    with patch("src.services.telegram_gateway.get_telegram_gateway", return_value=gateway):
        with patch("src.core.config.get_settings", return_value=settings_without_chat):
            # Must not raise: dedup passes, but with no chat id nothing is sent.
            await alerter.alert_failover({"to_provider": "gemini"})

    assert gateway.send_alert_notification.await_count == 0
# =============================================================================
# Case 4: quota alert 24h dedup
# =============================================================================
@pytest.mark.asyncio
async def test_quota_alert_dedup_24h(mock_redis, mock_telegram_send):
    """The Gemini quota alert is sent once and deduplicated with a 24 h TTL."""
    alerter = FailoverAlerter(redis_client=mock_redis)
    quota_event = {
        "quota": 1000,
        "current_count": 1003,
    }

    await alerter.alert_gemini_quota_exceeded(quota_event)

    send = mock_telegram_send.send_alert_notification
    assert send.await_count == 1

    message = send.await_args.kwargs["text"]
    expected_fragments = (
        "Gemini 每日配額耗盡",
        "1000",
        "1003",
        "\\-",  # MarkdownV2 date hyphens must be escaped
    )
    for fragment in expected_fragments:
        assert fragment in message

    # The dedup key must carry the quota TTL, and that constant must be one day.
    first_set_call = mock_redis.set.await_args_list[0]
    assert first_set_call.kwargs["ex"] == QUOTA_DEDUP_TTL_SEC
    assert QUOTA_DEDUP_TTL_SEC == 86400
# =============================================================================
# Extra: configure_alerter / get_failover_alerter behavior checks
# =============================================================================
def test_configure_alerter_replaces_singleton(mock_redis):
    """configure_alerter() swaps in a new singleton that holds the given Redis."""
    before = get_failover_alerter()
    # The default singleton is expected to have no Redis client.
    assert before._redis is None

    configure_alerter(mock_redis)
    after = get_failover_alerter()

    # The injected client is now in use ...
    assert after._redis is mock_redis
    # ... and it lives on a fresh instance, not the earlier one.
    assert before is not after
@pytest.mark.asyncio
async def test_dedup_fail_open_when_no_redis():
    """Without Redis, the first dedup check for a key is allowed.

    Per the original note, later checks fall under in-memory dedup rather
    than failing open every time; only the first check is asserted here.
    """
    alerter = FailoverAlerter(redis_client=None)

    # No record exists yet for this key, so the check should pass.
    first_check = await alerter._check_dedup("any:key", ttl=600)
    assert first_check is True
# =============================================================================
# Wave8-X2: new tests for the in-memory dedup fallback
# =============================================================================
@pytest.mark.asyncio
async def test_dedup_redis_unavailable_uses_memory():
    """When Redis raises, dedup still works instead of failing open.

    Wave8-X2 fix (per the original note): the former fail-open behavior was
    replaced by an in-memory dedup fallback. With ``set()`` raising, a second
    ``_check_dedup`` on the same key must return False.
    """
    broken_redis = MagicMock()
    broken_redis.set = AsyncMock(side_effect=ConnectionError("Redis is down"))
    alerter = FailoverAlerter(redis_client=broken_redis)

    dedup_key = "alert:test:dedup_memory"
    ttl_sec = 600

    # First check: nothing recorded in memory yet, so it is allowed.
    first_check = await alerter._check_dedup(dedup_key, ttl=ttl_sec)
    assert first_check is True

    # Second check: the key is recorded and inside its TTL, so it is rejected.
    second_check = await alerter._check_dedup(dedup_key, ttl=ttl_sec)
    assert second_check is False
@pytest.mark.asyncio
async def test_memory_dedup_max_size_gc():
    """Expired in-memory dedup entries are purged once the dict is full.

    Wave8-X2 fix (per the original note): the alerter caps its in-memory
    dedup dict at 1000 entries and garbage-collects when that is reached.
    The test seeds 999 expired entries plus one fresh entry, triggers a
    check, and expects the dict to shrink to a handful of keys.
    """
    import time

    alerter = FailoverAlerter(redis_client=None)

    # 999 stale entries: a last-sent timestamp of 0.0 is far older than the
    # 600 s TTL used below.
    alerter._memory_dedup.update({f"stale:key:{i}": 0.0 for i in range(999)})
    # One entry stamped "now", which is still inside its TTL.
    alerter._memory_dedup["fresh:key"] = time.time()

    # The dict now holds exactly 1000 entries.
    assert len(alerter._memory_dedup) == 1000

    # Checking a brand-new key should trigger the cleanup and be allowed.
    allowed = await alerter._check_dedup("trigger:gc:key", ttl=600)
    assert allowed is True

    # Only fresh:key and trigger:gc:key should survive; one spare entry is
    # tolerated for boundary effects.
    assert len(alerter._memory_dedup) <= 3
def test_lines_from_list_escapes_markdown_v2_numbered_periods() -> None:
    """Numbered list output escapes the period after the index and hyphens in items."""
    rendered = _lines_from_list(["修復 node-exporter-110"])

    escaped_fragments = (
        "1\\.",  # the "1." list marker
        "node\\-exporter\\-110",  # hyphens inside the item text
    )
    for fragment in escaped_fragments:
        assert fragment in rendered
def test_sanitize_telegram_error_redacts_bot_token_url() -> None:
    """A bot token embedded in a Telegram API URL is redacted from error text."""
    error_text = "HTTP error for https://api.telegram.org/bot123456:SECRET/sendMessage"

    cleaned = _sanitize_telegram_error(error_text)

    # The secret part is gone and the redaction placeholder is present.
    assert "SECRET" not in cleaned
    assert "bot<redacted>" in cleaned
def test_governance_alert_card_formats_knowledge_degradation() -> None:
    """A structured knowledge_degradation payload renders the full alert card.

    Checks the section headings, ownership lines, impact summary, and next
    step, and that the fallback "supplementary fields" section is absent.
    """
    # NOTE(review): some expected fragments (e.g. "主責Hermes") look as if a
    # full-width colon was lost in this copy of the file — confirm against the
    # formatter's real output before relying on them.
    impact = {
        "stale_count": 948,
        "stale_days": 7,
        "stale_ratio": 0.521,
        "threshold": 0.2,
        "total_count": 1819,
    }
    remediation = {
        "items": [
            "啟動 KM 反查與自動補齊流程",
            "關鍵服務告警自動同步到 KM 任務",
        ],
        "next_action": "run_kb_growth_healthcheck",
    }
    actionable = {
        "items": [
            "每日檢查 ANTI_PATTERN 更新結果",
            "安排 owner 對 stale 條目做快速人工審核",
        ]
    }
    payload = {
        "status": "warning",
        "impact": impact,
        "remediation": remediation,
        "actionable": actionable,
    }

    card = format_governance_alert_card("knowledge_degradation", payload)

    expected_fragments = (
        "*AI 治理警報KM 需要更新",
        "💬 *白話說明*",
        "🧩 *AI 流程狀態*",
        "👥 *負責分工*",
        "主責Hermes",
        "OpenClaw提供告警分類",
        "ElephantAlpharead\\-only 稽核",
        "人工覆核KM owner / SRE owner",
        "✅ *現在要做*",
        "queued\\_kb\\_healthcheck",
        "AwoooP Work Items",
        "🧭 *影響摘要*",
        "陳舊 KM948",
        "陳舊比例52\\.1%",
        "▶️ 下一步run\\_kb\\_growth\\_healthcheck",
    )
    for fragment in expected_fragments:
        assert fragment in card

    # Known signal types must not fall back to the generic field dump.
    for fragment in ("📎 *補充欄位*", "欄位快覽"):
        assert fragment not in card
def test_governance_alert_card_accepts_legacy_knowledge_degradation_payload() -> None:
    """The flat (legacy) knowledge_degradation payload renders the same card.

    The legacy shape puts the counts at top level and uses ``total``,
    ``next_step``, and ``automatable_work`` with a list-valued
    ``remediation``. The card must still show real numbers, not "? / ?".
    """
    # NOTE(review): some expected fragments (e.g. "主責Hermes") look as if a
    # full-width colon was lost in this copy of the file — confirm against the
    # formatter's real output before relying on them.
    legacy_payload = {
        "status": "warning",
        "stale_count": 1425,
        "total": 1856,
        "stale_ratio": 0.768,
        "threshold": 0.2,
        "stale_days": 7,
        "remediation": [
            "啟動 KM 反查與自動補齊流程",
            "關鍵服務告警自動同步到 KM 任務",
        ],
        "next_step": "run_kb_growth_healthcheck",
        "automatable_work": [
            "每日檢查 ANTI_PATTERN 更新結果",
            "安排至少 2 位 owner 對 stale 條目做快速人工審核",
        ],
    }

    card = format_governance_alert_card("knowledge_degradation", legacy_payload)

    expected_fragments = (
        "1425 / 1856 筆 KM",
        "陳舊 KM1425",
        "總 KM1856",
        "陳舊比例76\\.8%",
        "主責Hermes",
        "人工覆核KM owner / SRE owner",
        "▶️ 下一步run\\_kb\\_growth\\_healthcheck",
        "每日檢查 ANTI\\_PATTERN 更新結果",
    )
    for fragment in expected_fragments:
        assert fragment in card

    # No generic field dump and no placeholder counts.
    for fragment in ("📎 *補充欄位*", "? / ?"):
        assert fragment not in card
def test_governance_alert_card_limits_fallback_fields() -> None:
    """For an unknown signal type, the generic field section is truncated.

    Five extra fields are supplied. The card must contain the supplementary
    fields heading and the "more fields" notice, and must omit ``field_e``.
    """
    payload = {
        "status": "warning",
        "field_a": "a",
        "field_b": "b",
        "field_c": "c",
        "field_d": "d",
        "field_e": "e",
    }

    card = format_governance_alert_card("custom_signal", payload)

    for fragment in ("📎 *補充欄位*", "更多欄位已收斂至 AwoooP 稽核資料"):
        assert fragment in card

    # The last field is past the cutoff and must not be rendered.
    assert "field\\_e" not in card