diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py
index 1387e6a4..22390ebc 100644
--- a/apps/api/src/services/approval_execution.py
+++ b/apps/api/src/services/approval_execution.py
@@ -222,6 +222,7 @@ class ApprovalExecutionService:
                     approval_id=str(approval.id),
                     action=approval.action,
                     reason="NO_ACTION - 純調查/觀察類,不執行破壞動作",
+                    path="no_action",
                 )
                 # 標為 SUCCESS (觀察/調查本身就是成功完成)
                 await service.update_execution_status(approval.id, success=True)
@@ -248,6 +249,45 @@ class ApprovalExecutionService:
                     duration_ms=int((time.time() - _aol_started_ms) * 1000),
                     output={"reason": "NO_ACTION", "action": approval.action[:200]},
                 )
+                # F2 (2026-05-07 ogt + Claude Sonnet 4.6 + Codex):
+                # The NO_ACTION path must push the incident to RESOLVED, otherwise it
+                # stays stuck in INVESTIGATING forever (FlywheelExecutionRateMissing
+                # dead alert + root cause #1 of the growing 566 stuck incidents).
+                # resolve_incident has a RESOLVED idempotency guard, so a repeated
+                # resolve returns the existing incident and does not re-trigger the
+                # postmortem. It returns None when the incident is not found, so
+                # "resolved" is only logged when an incident actually came back.
+                if approval.incident_id:
+                    try:
+                        from src.services.incident_service import get_incident_service
+
+                        resolved = await get_incident_service().resolve_incident(
+                            approval.incident_id
+                        )
+                        if resolved is None:
+                            logger.warning(
+                                "incident_resolve_after_no_action_execution_skipped",
+                                incident_id=approval.incident_id,
+                                approval_id=str(approval.id),
+                                path="no_action",
+                                reason="resolve_incident_returned_none",
+                            )
+                        else:
+                            logger.info(
+                                "incident_resolved_after_no_action_execution",
+                                incident_id=approval.incident_id,
+                                approval_id=str(approval.id),
+                                path="no_action",
+                            )
+                    except Exception as _resolve_e:
+                        # Best effort: NO_ACTION still counts as completed.
+                        logger.warning(
+                            "incident_resolve_after_no_action_execution_failed",
+                            incident_id=approval.incident_id,
+                            approval_id=str(approval.id),
+                            path="no_action",
+                            error=str(_resolve_e),
+                        )
                 return True  # NO_ACTION 視為成功完成
 
             # 真解析失敗 (非 NO_ACTION)
diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py
index bcf463a7..8f4df164 100644
--- a/apps/api/src/services/incident_service.py
+++ b/apps/api/src/services/incident_service.py
@@ -1103,6 +1103,19 @@ class IncidentService:
             logger.warning("incident_not_found_for_resolve", incident_id=incident_id)
             return None
 
+        # 1.5 F2 (2026-05-07 ogt + Codex + Claude Sonnet 4.6) - idempotency guard:
+        # an already-RESOLVED incident is returned as-is so the later side effects
+        # (postmortem / KB extract / KM convert / disposition / Telegram) do not
+        # fire again. The F2 NO_ACTION path calls resolve_incident often, so this
+        # check must stay ahead of the status mutation.
+        if incident.status == IncidentStatus.RESOLVED:
+            logger.info(
+                "incident_resolve_skipped_already_resolved",
+                incident_id=incident_id,
+                resolution_type=resolution_type,
+            )
+            return incident
+
         # 2. 更新狀態
         incident.status = IncidentStatus.RESOLVED
         incident.resolved_at = now_taipei()
diff --git a/apps/api/tests/test_approval_execution_no_action.py b/apps/api/tests/test_approval_execution_no_action.py
new file mode 100644
index 00000000..13ae42bf
--- /dev/null
+++ b/apps/api/tests/test_approval_execution_no_action.py
@@ -0,0 +1,110 @@
+"""Tests for the NO_ACTION branch of ``execute_approved_action``.
+
+A NO_ACTION approval is an observe-only action. These tests expect
+``execute_approved_action`` to return True for it and to ask the incident
+service to resolve the linked incident, without letting the outcome of that
+resolve call change the result.
+"""
+
+from types import SimpleNamespace
+from unittest.mock import AsyncMock
+
+import pytest
+
+from src.services.approval_execution import ApprovalExecutionService
+
+
+def _patch_no_action_dependencies(monkeypatch, incident_service):
+    """Replace the collaborators used on the NO_ACTION path with stubs.
+
+    ``incident_service`` is what ``get_incident_service()`` will return. The
+    getter is patched on ``src.services.incident_service`` because the code
+    under test imports it from that module at call time.
+    """
+    monkeypatch.setattr(
+        "src.services.approval_execution.get_approval_service",
+        lambda: SimpleNamespace(update_execution_status=AsyncMock()),
+    )
+    monkeypatch.setattr(
+        "src.services.approval_execution.get_timeline_service",
+        lambda: SimpleNamespace(add_event=AsyncMock()),
+    )
+    monkeypatch.setattr(
+        "src.services.approval_execution.parse_operation_from_action",
+        lambda _: SimpleNamespace(
+            operation_type=None, resource_name=None, namespace=None
+        ),
+    )
+    monkeypatch.setattr(
+        "src.services.incident_service.get_incident_service",
+        lambda: incident_service,
+    )
+    monkeypatch.setattr(
+        "src.services.approval_execution.ApprovalExecutionService._push_execution_result_to_alert",
+        AsyncMock(return_value=None),
+    )
+    monkeypatch.setattr(
+        "src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
+        AsyncMock(return_value=None),
+    )
+
+
+@pytest.mark.asyncio
+async def test_no_action_execution_resolves_incident_once(monkeypatch):
+    """A NO_ACTION approval resolves its incident exactly once."""
+    approval = SimpleNamespace(
+        id="approval-noaction-1",
+        action="NO_ACTION: 先做觀察",
+        incident_id="INC-TEST-001",
+    )
+    incident_service = SimpleNamespace(resolve_incident=AsyncMock())
+    _patch_no_action_dependencies(monkeypatch, incident_service)
+
+    result = await ApprovalExecutionService().execute_approved_action(approval)
+
+    assert result is True
+    incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-001")
+
+
+@pytest.mark.asyncio
+async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch):
+    """NO_ACTION still returns True when resolve_incident raises.
+
+    Contract: NO_ACTION is an observe-only action that counts as completed
+    successfully (the service comments say this keeps it from polluting the
+    auto_execute KPI). A resolve failure should only produce a warning log and
+    must not degrade the result to False.
+    """
+    approval = SimpleNamespace(
+        id="approval-noaction-2",
+        action="NO_ACTION: 觀察",
+        incident_id="INC-TEST-002",
+    )
+    incident_service = SimpleNamespace(
+        resolve_incident=AsyncMock(side_effect=RuntimeError("redis down"))
+    )
+    _patch_no_action_dependencies(monkeypatch, incident_service)
+
+    result = await ApprovalExecutionService().execute_approved_action(approval)
+
+    assert result is True
+    incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-002")
+
+
+@pytest.mark.asyncio
+async def test_no_action_execution_returns_true_when_incident_missing(monkeypatch):
+    """NO_ACTION still returns True when resolve_incident returns None."""
+    approval = SimpleNamespace(
+        id="approval-noaction-3",
+        action="NO_ACTION: 觀察",
+        incident_id="INC-TEST-003",
+    )
+    incident_service = SimpleNamespace(
+        resolve_incident=AsyncMock(return_value=None)
+    )
+    _patch_no_action_dependencies(monkeypatch, incident_service)
+
+    result = await ApprovalExecutionService().execute_approved_action(approval)
+
+    assert result is True
+    incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-003")
diff --git a/apps/api/tests/test_incident_service_resolve_idempotency.py b/apps/api/tests/test_incident_service_resolve_idempotency.py
new file mode 100644
index 00000000..5a0cc473
--- /dev/null
+++ b/apps/api/tests/test_incident_service_resolve_idempotency.py
@@ -0,0 +1,72 @@
+"""
+test_incident_service_resolve_idempotency
+==========================================
+
+Verifies that ``IncidentService.resolve_incident`` is idempotent for an
+incident that is already RESOLVED:
+- it returns the existing incident object
+- it does not call save_to_working_memory (no repeated Redis write)
+- it does not touch the incident (``resolved_at`` is never assigned), which
+  pins the guard ahead of the status-mutation step
+
+The repeated DB write (incident_repository.update_status) and the postmortem /
+KB extract / KM convert / disposition side effects are meant to be skipped by
+the same early return; they are not asserted one by one here.
+
+Addresses critic must-fix #2: without this unit test, someone moving the guard
+later could silently break it and bring back the old risk of resolve_incident
+repeatedly triggering postmortem spam.
+"""
+
+from types import SimpleNamespace
+from unittest.mock import AsyncMock
+
+import pytest
+
+from src.models.incident import IncidentStatus
+from src.services.incident_service import IncidentService
+
+
+@pytest.mark.asyncio
+async def test_resolve_incident_skips_when_already_resolved(monkeypatch):
+    """Resolving an already-RESOLVED incident is a no-op that returns it."""
+    fake_incident = SimpleNamespace(
+        incident_id="INC-IDEMPO-001",
+        status=IncidentStatus.RESOLVED,
+    )
+
+    svc = IncidentService()
+
+    # Entry-point read returns the RESOLVED incident.
+    monkeypatch.setattr(
+        svc, "get_from_working_memory", AsyncMock(return_value=fake_incident)
+    )
+    # Spy on the working-memory write so the test can tell if it was called.
+    save_mock = AsyncMock(return_value=True)
+    monkeypatch.setattr(svc, "save_to_working_memory", save_mock)
+
+    result = await svc.resolve_incident("INC-IDEMPO-001")
+
+    # The existing incident object is returned as-is.
+    assert result is fake_incident
+    # The guard must run before the status mutation: nothing is saved, and
+    # resolved_at (assigned together with the status) is never set.
+    save_mock.assert_not_called()
+    assert not hasattr(fake_incident, "resolved_at")
+
+
+@pytest.mark.asyncio
+async def test_resolve_incident_returns_none_when_not_found(monkeypatch):
+    """A missing incident returns None; the guard must not affect this path."""
+    svc = IncidentService()
+
+    monkeypatch.setattr(
+        svc, "get_from_working_memory", AsyncMock(return_value=None)
+    )
+    save_mock = AsyncMock(return_value=True)
+    monkeypatch.setattr(svc, "save_to_working_memory", save_mock)
+
+    result = await svc.resolve_incident("INC-NOT-EXIST")
+
+    assert result is None
+    save_mock.assert_not_called()