【根因】
INC-20260507-99ADF2 飛輪斷流,566+ stuck incidents(30秒漲 1)核心原因:NO_ACTION 路徑 (approval_execution.py:251) 提前 return True,跳過 line 482-495 已有的 resolve_incident 呼叫,incident 永遠卡 INVESTIGATING。

【修法】
- approval_execution.py NO_ACTION 分支補 resolve_incident 呼叫 + 成功/失敗 log,背景 log 加 path="no_action" 用於 prod 量化修法生效率(debugger 全鏈分析 + critic 1st/2nd 審查必修 #1)。
- incident_service.py resolve_incident 在 line 1106 加 RESOLVED 冪等 guard,早於所有副作用(status mutation / Redis / DB / postmortem / KB / KM / disposition),順帶修 success path line 482-495 重觸 postmortem 的潛在老風險(critic 必修 #2)。

【遵守 Codex 5/6 設計(feedback_respect_codex_design_intent.md)】
- 不動 flywheel_stats_service.py / heartbeat_report_service.py / auto_repair_service.record_auto_repair() / metrics_repository UPPER(status)。
- resolve_incident 不寫 auto_repair_executions 表(Codex 5/6 source of truth),不污染 24h KPI 計算。

【Test 覆蓋】
- test_approval_execution_no_action.py:NO_ACTION → resolve 被呼叫一次 + resolve raise 時仍 return True(NO_ACTION 不能因 resolve 失敗退化成 False,否則污染 auto_execute KPI line 207-208 註解契約)。
- test_incident_service_resolve_idempotency.py:RESOLVED → return existing + save_to_working_memory 不被呼叫;not_found → return None。

【驗收條件(部署後 24h)】
1. grep `path="no_action"` 中 incident_resolved_after_no_action_execution 數量 vs background_execution_noop 數量,1:1 才算修復成功。
2. awoooi_flywheel_incidents_stuck 從每 30 秒漲 1 變平緩。
3. SRE 群 24h 內若湧入 >20 份 NO_ACTION postmortem 觸發 follow-up 評估 resolution_type="no_action" 跳過 postmortem(critic Minor #3 方案 B)。

Refs: INC-20260507-99ADF2, debugger root cause #1 (鏈 A), critic 1st 必修 #1 #2, critic 2nd 必修 #1 #2 #3

Co-Authored-By: Codex (aider) <noreply@anthropic.com>
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
102 lines
3.4 KiB
Python
102 lines
3.4 KiB
Python
from types import SimpleNamespace
|
||
|
||
from unittest.mock import AsyncMock
|
||
|
||
import pytest
|
||
|
||
from src.services.approval_execution import ApprovalExecutionService
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_no_action_execution_resolves_incident_once(monkeypatch):
    """A NO_ACTION approval returns True and resolves its incident exactly once."""
    # Arrange: the approval carries a NO_ACTION-prefixed action string.
    resolve_mock = AsyncMock()
    fake_incident_service = SimpleNamespace(resolve_incident=resolve_mock)
    noop_approval = SimpleNamespace(
        id="approval-noaction-1",
        action="NO_ACTION: 先做觀察",
        incident_id="INC-TEST-001",
    )

    def fake_approval_service():
        # A fresh stub per call, mirroring a factory-style getter.
        return SimpleNamespace(update_execution_status=AsyncMock())

    def fake_timeline_service():
        return SimpleNamespace(add_event=AsyncMock())

    def fake_parse_operation(_):
        # Parsed operation with every field empty.
        return SimpleNamespace(
            operation_type=None,
            resource_name=None,
            namespace=None,
        )

    def fake_get_incident_service():
        return fake_incident_service

    # Applied in this exact order. Note that get_incident_service is patched on
    # the incident_service module itself, not on approval_execution.
    # NOTE(review): presumably the code under test imports it lazily - confirm.
    stubbed_targets = (
        (
            "src.services.approval_execution.get_approval_service",
            fake_approval_service,
        ),
        (
            "src.services.approval_execution.get_timeline_service",
            fake_timeline_service,
        ),
        (
            "src.services.approval_execution.parse_operation_from_action",
            fake_parse_operation,
        ),
        (
            "src.services.incident_service.get_incident_service",
            fake_get_incident_service,
        ),
        (
            "src.services.approval_execution.ApprovalExecutionService._push_execution_result_to_alert",
            AsyncMock(return_value=None),
        ),
        (
            "src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
            AsyncMock(return_value=None),
        ),
    )
    for dotted_target, stub in stubbed_targets:
        monkeypatch.setattr(dotted_target, stub)

    # Act
    service = ApprovalExecutionService()
    outcome = await service.execute_approved_action(noop_approval)

    # Assert
    assert outcome is True
    resolve_mock.assert_awaited_once_with("INC-TEST-001")
|
||
|
||
|
||
@pytest.mark.asyncio
async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch):
    """NO_ACTION must still return True when resolve_incident raises.

    Contract: NO_ACTION is an "observation-only" successful completion (the
    comment at lines 207-208 states this avoids polluting the auto_execute
    KPI). A resolve failure should only produce a warning log; it must not
    degrade the result to False.
    """
    # Arrange: resolve_incident fails on every await.
    failing_resolve = AsyncMock(side_effect=RuntimeError("redis down"))
    fake_incident_service = SimpleNamespace(resolve_incident=failing_resolve)
    noop_approval = SimpleNamespace(
        id="approval-noaction-2",
        action="NO_ACTION: 觀察",
        incident_id="INC-TEST-002",
    )

    def fake_approval_service():
        # A fresh stub per call, mirroring a factory-style getter.
        return SimpleNamespace(update_execution_status=AsyncMock())

    def fake_timeline_service():
        return SimpleNamespace(add_event=AsyncMock())

    def fake_parse_operation(_):
        # Parsed operation with every field empty.
        return SimpleNamespace(
            operation_type=None,
            resource_name=None,
            namespace=None,
        )

    def fake_get_incident_service():
        return fake_incident_service

    # Applied in this exact order. Note that get_incident_service is patched on
    # the incident_service module itself, not on approval_execution.
    # NOTE(review): presumably the code under test imports it lazily - confirm.
    stubbed_targets = (
        (
            "src.services.approval_execution.get_approval_service",
            fake_approval_service,
        ),
        (
            "src.services.approval_execution.get_timeline_service",
            fake_timeline_service,
        ),
        (
            "src.services.approval_execution.parse_operation_from_action",
            fake_parse_operation,
        ),
        (
            "src.services.incident_service.get_incident_service",
            fake_get_incident_service,
        ),
        (
            "src.services.approval_execution.ApprovalExecutionService._push_execution_result_to_alert",
            AsyncMock(return_value=None),
        ),
        (
            "src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
            AsyncMock(return_value=None),
        ),
    )
    for dotted_target, stub in stubbed_targets:
        monkeypatch.setattr(dotted_target, stub)

    # Act
    service = ApprovalExecutionService()
    outcome = await service.execute_approved_action(noop_approval)

    # Assert: the failure is swallowed, yet resolve was attempted exactly once.
    assert outcome is True
    failing_resolve.assert_awaited_once_with("INC-TEST-002")
|