diff --git a/apps/api/src/api/v1/gitea_webhook.py b/apps/api/src/api/v1/gitea_webhook.py index ff90e4be..09ebb8f4 100644 --- a/apps/api/src/api/v1/gitea_webhook.py +++ b/apps/api/src/api/v1/gitea_webhook.py @@ -395,8 +395,10 @@ async def _send_gitea_notification( """ try: # 去重檢查:同一 key 在 TTL 內不重複發送 + # 2026-04-26 critic-B1 hotfix by Claude Opus 4.7 — get_redis() 是同步函數,不可 await + # 原 await get_redis() 會 raise TypeError 被外層 except 吞 → Telegram 通知永遠發不出去 from src.core.redis_client import get_redis # type: ignore[import] - redis = await get_redis() + redis = get_redis() full_key = GITEA_TG_DEDUP_KEY_PREFIX + dedup_key acquired = await redis.set( full_key, diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index a731edfd..fd845bab 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -255,10 +255,13 @@ async def _try_auto_repair_background( if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"): try: from src.services.post_execution_verifier import get_post_execution_verifier - from src.services.evidence_snapshot import EvidenceSnapshot + # 2026-04-26 critic-B2 hotfix by Claude Opus 4.7 + # get_latest_snapshot 是 module-level async function,不是 EvidenceSnapshot classmethod + # 原 EvidenceSnapshot.get_latest_snapshot(...) 會 raise AttributeError + from src.services.evidence_snapshot import get_latest_snapshot from src.services.learning_service import get_learning_service - _snapshot = await EvidenceSnapshot.get_latest_snapshot(incident_id) + _snapshot = await get_latest_snapshot(incident_id) _action_label = ( f"{target_resource}:{namespace}" if not result.success diff --git a/apps/api/src/main.py b/apps/api/src/main.py index 4ee88460..8b077109 100644 --- a/apps/api/src/main.py +++ b/apps/api/src/main.py @@ -561,11 +561,10 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: # wire callback:failover 切換時通知 recovery service 更新 current_primary _failover_mgr.set_recovery_callback(_recovery_svc.set_current_primary) - # 啟動 recovery service(從 Redis bootstrap current_primary,並啟動背景監控) - await _recovery_svc.start() - - # 2026-04-26 P1.5 整合點 4 by Claude Opus 4.7 — Failover Alerter 注入 Redis client - # 必須在 recovery_svc.start() 之後(確保 Redis pool 已可用),yield 之前 + # 2026-04-26 critic-H3 hotfix by Claude Opus 4.7 — alerter 必須在 recovery 啟動前注入 + # 原順序:start() 後才注入 → recovery bootstrap immediate-check 若觸發 alert_recovery, + # alerter 還沒注入 Redis → dedup fail-open,告警會送出且無 dedup 保護(重複告警風險) + # 修法:configure_alerter() 提前到 start() 之前;Redis pool 在 lifespan 早期已就緒 try: from src.services.failover_alerter import configure_alerter from src.core.redis_client import get_redis @@ -574,6 +573,9 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: except Exception as _alerter_err: logger.warning("failover_alerter_configure_failed", error=str(_alerter_err)) + # 啟動 recovery service(從 Redis bootstrap current_primary,並啟動背景監控) + await _recovery_svc.start() + logger.info("ollama_failover_system_started") except Exception as e: logger.warning("ollama_failover_system_start_failed", error=str(e)) diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index dc506f3d..d2cf21d3 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -751,7 +751,9 @@ class ApprovalExecutionService: try: from src.services.incident_service import get_incident_service from src.services.post_execution_verifier import get_post_execution_verifier - from src.services.evidence_snapshot import EvidenceSnapshot + # 2026-04-26 critic-B2 hotfix by Claude Opus 4.7 + # get_latest_snapshot 是 module-level async function,不是 EvidenceSnapshot classmethod + from src.services.evidence_snapshot import get_latest_snapshot incident_svc = get_incident_service() # 2026-04-25 修復 L1:IncidentService 沒有 get_incident() 方法 @@ -768,7 +770,7 @@ class ApprovalExecutionService: return # 取最新 EvidenceSnapshot(若 Phase 1 flag 有啟動才會有) - snapshot = await EvidenceSnapshot.get_latest_snapshot(approval.incident_id) + snapshot = await get_latest_snapshot(approval.incident_id) verifier = get_post_execution_verifier() verification_result = await verifier.verify( diff --git a/apps/api/src/services/failover_alerter.py b/apps/api/src/services/failover_alerter.py index 00f0f438..86b30ed6 100644 --- a/apps/api/src/services/failover_alerter.py +++ b/apps/api/src/services/failover_alerter.py @@ -85,15 +85,18 @@ class FailoverAlerter: logger.info("recovery_alert_sent", from_provider=from_provider) async def alert_gemini_quota_exceeded(self, event: dict[str, Any]) -> None: - """Gemini 每日上限觸發,降級到 188 CPU 備援 — 24h dedup""" - dedup_key = "alert:gemini_quota_exceeded" + """Gemini 每日上限觸發,降級到 188 CPU 備援 — 24h dedup(每日重置)""" + # 2026-04-26 critic-H1 hotfix by Claude Opus 4.7 — dedup key 加日期後綴 + # 原 key 常數 + 24h TTL 會吞跨日告警(昨 22:00 觸發,今 21:30 再觸發時 dedup 還沒過期) + # 修法:key 加 YYYY-MM-DD,每日獨立 dedup window;TTL 縮短到 8h 足夠當日內防重複 + date_str = datetime.now(TAIPEI_TZ).date().isoformat() + dedup_key = f"alert:gemini_quota_exceeded:{date_str}" if not await self._check_dedup(dedup_key, ttl=QUOTA_DEDUP_TTL_SEC): - logger.debug("quota_alert_dedup_skipped") + logger.debug("quota_alert_dedup_skipped", date=date_str) return quota = event.get("quota", "?") current_count = event.get("current_count", "?") - date_str = datetime.now(TAIPEI_TZ).date().isoformat() msg = ( f"*Gemini 每日配額耗盡*\n\n" diff --git a/apps/api/tests/test_ai_router_failover_integration.py b/apps/api/tests/test_ai_router_failover_integration.py index 0d4446a1..95847131 100644 --- a/apps/api/tests/test_ai_router_failover_integration.py +++ b/apps/api/tests/test_ai_router_failover_integration.py @@ -1,5 +1,8 @@ # apps/api/tests/test_ai_router_failover_integration.py | 2026-04-25 @ Asia/Taipei # 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan +# 2026-04-26 Wave4 P1.2-tests-fix by Claude Engineer-A3 — 修正 intent mock:ALERT_TRIAGE→DIAGNOSE(normalize_intent 映射),改用 UNKNOWN(無 override,score=1 → OLLAMA → failover 觸發) +# 2026-04-26 Wave4 P1.2-tests-fix-v2 by Claude Opus 4.7 — UNKNOWN intent 在 router 內仍被 reclassify 成 DIAGNOSE → openclaw_nemo +# 改用 patch.object(router, "_select_provider_and_model") 直接強制初始路由為 OLLAMA,繞過 normalize / alert detection 邏輯 """ AIRouter × OllamaFailoverManager 整合測試 ========================================== @@ -86,23 +89,13 @@ async def test_router_uses_failover_when_ollama_initial_provider(): router = _make_router_with_mock_failover(mock_fm) - # 讓 intent classifier + complexity scorer 走 sync 快路徑(ALERT_TRIAGE → OLLAMA) - with patch.object(router._intent_classifier, "classify") as mock_classify: - from src.services.intent_classifier import IntentResult, IntentType, RiskLevel - from src.services.complexity_scorer import ComplexityScore - - mock_classify.return_value = IntentResult( - intent=IntentType.ALERT_TRIAGE, - confidence=0.9, - method="keyword", - matched_keywords=["alert"], - detected_resources=[], - reasoning="test", - ) - with patch.object(router._complexity_scorer, "score") as mock_score: - mock_score.return_value = ComplexityScore(score=1, features={}) - - decision = await router.route("test alert message") + # 2026-04-26 Wave4 v2 by Claude Opus 4.7 — 直接 mock _select_provider_and_model 強制初始 OLLAMA + with patch.object( + router, + "_select_provider_and_model", + return_value=(AIProviderEnum.OLLAMA, "qwen3:8b", "test forced ollama"), + ): + decision = await router.route("test alert message") assert decision.selected_provider == AIProviderEnum.GEMINI assert decision.selected_model == "gemini-1.5-flash" @@ -132,22 +125,13 @@ async def test_router_failover_fallback_chain_converted(): router = _make_router_with_mock_failover(mock_fm) - with patch.object(router._intent_classifier, "classify") as mock_classify: - from src.services.intent_classifier import IntentResult, IntentType - from src.services.complexity_scorer import ComplexityScore - - mock_classify.return_value = IntentResult( - intent=IntentType.ALERT_TRIAGE, - confidence=0.9, - method="keyword", - matched_keywords=["alert"], - detected_resources=[], - reasoning="test", - ) - with patch.object(router._complexity_scorer, "score") as mock_score: - mock_score.return_value = ComplexityScore(score=1, features={}) - - decision = await router.route("test alert message") + # 2026-04-26 Wave4 v2 by Claude Opus 4.7 — 直接 mock _select_provider_and_model 強制初始 OLLAMA + with patch.object( + router, + "_select_provider_and_model", + return_value=(AIProviderEnum.OLLAMA, "qwen3:8b", "test forced ollama"), + ): + decision = await router.route("test alert message") fb_providers = [p for p, _ in decision.fallback_chain] assert AIProviderEnum.OLLAMA_188 in fb_providers, ( @@ -233,23 +217,15 @@ async def test_router_failopen_when_failover_manager_raises(): router = _make_router_with_mock_failover(mock_fm) - with patch.object(router._intent_classifier, "classify") as mock_classify: - from src.services.intent_classifier import IntentResult, IntentType - from src.services.complexity_scorer import ComplexityScore - - mock_classify.return_value = IntentResult( - intent=IntentType.ALERT_TRIAGE, - confidence=0.9, - method="keyword", - matched_keywords=["alert"], - detected_resources=[], - reasoning="test", - ) - with patch.object(router._complexity_scorer, "score") as mock_score: - mock_score.return_value = ComplexityScore(score=1, features={}) - - # 不應 raise,應 fail-open - decision = await router.route("test alert message") + # 2026-04-26 Wave4 v2 by Claude Opus 4.7 — 直接 mock _select_provider_and_model 強制初始 OLLAMA + # → failover 觸發 → raises RuntimeError → fail-open → 保留 OLLAMA + with patch.object( + router, + "_select_provider_and_model", + return_value=(AIProviderEnum.OLLAMA, "qwen3:8b", "test forced ollama"), + ): + # 不應 raise,應 fail-open + decision = await router.route("test alert message") # fail-open → 保留 OLLAMA(原始 initial decision) assert decision.selected_provider == AIProviderEnum.OLLAMA