fix(api): resolve db-only stale incidents
All checks were successful
CD Pipeline / tests (push) Successful in 1m33s
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / build-and-deploy (push) Successful in 4m54s
CD Pipeline / post-deploy-checks (push) Successful in 2m8s

This commit is contained in:
Your Name
2026-05-29 11:15:43 +08:00
parent aeaa77bbe1
commit 92316dda04
2 changed files with 84 additions and 5 deletions

View File

@@ -1168,12 +1168,14 @@ class IncidentService:
from src.utils.timezone import now_taipei
# 1. 從 Working Memory 讀取；Redis TTL 過期時退回 PostgreSQL。
hydrated_from_episodic = False
incident = await self.get_from_working_memory(incident_id)
if incident is None:
incident = await self.get_from_episodic_memory(incident_id)
if incident is None:
logger.warning("incident_not_found_for_resolve", incident_id=incident_id)
return None
hydrated_from_episodic = True
logger.info(
"incident_resolve_hydrated_from_episodic_memory",
incident_id=incident_id,
@@ -1197,11 +1199,15 @@ class IncidentService:
incident.resolved_at = now_taipei()
incident.updated_at = now_taipei()
# 3. 寫入 Working Memory
redis_success = await self.save_to_working_memory(incident)
if not redis_success:
logger.error("resolve_redis_write_failed", incident_id=incident_id)
return None
# 3. 寫入 Working Memory。Redis TTL 已過的歷史 DB-only 事件只更新 DB，
# 不重新灌回 Redis working memory，避免舊事件回流成 active workload。
if hydrated_from_episodic:
logger.info("resolve_db_only_incident", incident_id=incident_id)
else:
redis_success = await self.save_to_working_memory(incident)
if not redis_success:
logger.error("resolve_redis_write_failed", incident_id=incident_id)
return None
# 4. 同步到 Episodic Memory
try:

View File

@@ -12,6 +12,7 @@ test_incident_service_resolve_idempotency
重新放大「resolve_incident 重複觸發 postmortem 洗版」的舊風險。
"""
import inspect
from datetime import UTC, datetime
from types import SimpleNamespace
from unittest.mock import AsyncMock
@@ -27,6 +28,42 @@ from src.services.incident_service import (
)
def _patch_resolve_side_effects(monkeypatch):
    """Neutralise downstream work so active-incident resolve tests check only the resolve contract.

    Swaps ``asyncio.create_task`` and the knowledge-extractor, KM-conversion
    and anomaly-counter entry points for inert stand-ins via ``monkeypatch``.

    Args:
        monkeypatch: object exposing ``setattr(target, name, value)``
            (the pytest ``monkeypatch`` fixture in the tests that call this).
    """
    import asyncio

    import src.services.anomaly_counter as anomaly_counter
    import src.services.km_conversion_service as km_conversion_service
    import src.services.knowledge_extractor_service as knowledge_extractor_service

    def _discard_task(coro):
        # Close the coroutine rather than scheduling it, so the background
        # work is never run (presumably also silencing "never awaited"
        # warnings), then hand back a dummy object in place of a Task.
        if inspect.iscoroutine(coro):
            coro.close()
        return SimpleNamespace()

    def _fake_knowledge_extractor():
        # A fresh stub per call whose extract_from_incident awaits to None.
        return SimpleNamespace(extract_from_incident=AsyncMock(return_value=None))

    def _fake_km_conversion_service():
        # A fresh stub per call whose convert awaits to None.
        return SimpleNamespace(convert=AsyncMock(return_value=None))

    def _fake_anomaly_counter():
        return SimpleNamespace()

    # Stand-in for the AnomalyCounter class: key derivation always yields None.
    fake_counter_cls = SimpleNamespace(
        derive_key_from_incident=staticmethod(lambda _incident: None),
    )

    replacements = (
        (asyncio, "create_task", _discard_task),
        (knowledge_extractor_service, "get_knowledge_extractor", _fake_knowledge_extractor),
        (km_conversion_service, "get_km_conversion_service", _fake_km_conversion_service),
        (anomaly_counter, "AnomalyCounter", fake_counter_cls),
        (anomaly_counter, "get_anomaly_counter", _fake_anomaly_counter),
    )
    for target, attribute, stand_in in replacements:
        monkeypatch.setattr(target, attribute, stand_in)
@pytest.mark.asyncio
async def test_resolve_incident_skips_when_already_resolved(monkeypatch):
"""RESOLVED 的 incident 重複 resolve 應 idempotent。"""
@@ -53,6 +90,42 @@ async def test_resolve_incident_skips_when_already_resolved(monkeypatch):
save_mock.assert_not_called()
@pytest.mark.asyncio
async def test_resolve_incident_can_close_db_only_active_incident(monkeypatch):
    """An incident whose Redis TTL expired but is still active in the DB must resolve without being written back to Redis."""
    incident_id = "INC-DB-FALLBACK-ACTIVE-001"
    db_only_incident = SimpleNamespace(
        incident_id=incident_id,
        status=IncidentStatus.INVESTIGATING,
        resolved_at=None,
        updated_at=None,
        signals=[],
        affected_services=[],
        decision_chain=None,
        outcome=None,
    )

    service = IncidentService()
    # Working memory misses, episodic memory supplies the incident, and any
    # write back to working memory is recorded so it can be asserted absent.
    episodic_read = AsyncMock(return_value=db_only_incident)
    working_memory_write = AsyncMock(return_value=True)
    stubs = {
        "get_from_working_memory": AsyncMock(return_value=None),
        "get_from_episodic_memory": episodic_read,
        "save_to_working_memory": working_memory_write,
    }
    for method_name, stub in stubs.items():
        monkeypatch.setattr(service, method_name, stub)
    _patch_resolve_side_effects(monkeypatch)

    resolved = await service.resolve_incident(incident_id, emit_postmortem=False)

    assert resolved is db_only_incident
    assert db_only_incident.status is IncidentStatus.RESOLVED
    assert db_only_incident.resolved_at is not None
    episodic_read.assert_awaited_once_with(incident_id)
    working_memory_write.assert_not_called()
@pytest.mark.asyncio
async def test_resolve_incident_returns_none_when_not_found(monkeypatch):
"""incident 不存在時 return None。確保 guard 不影響 not-found 路徑。"""