Compare commits
4 Commits
drift/adop
...
drift/adop
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d356cd32fc | ||
|
|
80c36ba801 | ||
|
|
afb5f9556e | ||
|
|
b3dc41fcd4 |
@@ -80,6 +80,7 @@ from src.core.feature_flags import aiops_flags # ADR-080: AI 自主化飛輪 fe
|
||||
from src.core.http_client import close_all_http_clients, init_all_http_clients
|
||||
from src.core.logging import get_logger, setup_logging
|
||||
from src.core.redis_client import close_redis_pool, init_redis_pool
|
||||
from src.services.flywheel_stats_service import get_flywheel_stats_service
|
||||
from src.core.sse import get_publisher
|
||||
from src.core.telemetry import setup_telemetry, shutdown_telemetry
|
||||
|
||||
@@ -1005,10 +1006,17 @@ app.include_router(platform_v1.router, prefix="/api/v1/platform", tags=["AwoooP
|
||||
@app.get("/metrics", include_in_schema=False)
async def prometheus_metrics() -> Response:
    """Serve Prometheus metrics, including the flywheel metrics, for alerting.

    The default registry output of ``generate_latest()`` is decoded to text and
    the flywheel metrics are appended to it, so a single scrape of ``/metrics``
    returns both.

    Returns:
        A ``Response`` with media type ``CONTENT_TYPE_LATEST``. If computing or
        rendering the flywheel metrics raises, the base registry output is
        still returned and a warning is logged.
    """
    content = generate_latest().decode("utf-8")
    # 2026-05-07 ogt + Claude Sonnet 4.6 — fix for INC-20260507-99ADF2.
    # The flywheel metrics (awoooi_flywheel_*) used to be exposed only at
    # /api/v1/stats/flywheel/metrics, so the "110 Prom" awoooi-api job that
    # scrapes /metrics never saw them and FlywheelExecutionRateMissing kept
    # firing permanently.
    # Fix: append the flywheel metrics here so the existing scrape job picks
    # them up without adding a new job.
    try:
        flywheel_metrics = await get_flywheel_stats_service().compute()
        content += flywheel_metrics.to_prometheus_lines()
    except Exception as exc:
        # Best-effort on purpose: a flywheel failure must not take down the
        # whole /metrics endpoint. Record the cause so the failure is
        # diagnosable instead of silently dropped.
        logger.warning("prometheus_metrics_flywheel_error", error=str(exc))
    return Response(content=content, media_type=CONTENT_TYPE_LATEST)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
@@ -222,6 +222,7 @@ class ApprovalExecutionService:
|
||||
approval_id=str(approval.id),
|
||||
action=approval.action,
|
||||
reason="NO_ACTION - 純調查/觀察類,不執行破壞動作",
|
||||
path="no_action",
|
||||
)
|
||||
# 標為 SUCCESS (觀察/調查本身就是成功完成)
|
||||
await service.update_execution_status(approval.id, success=True)
|
||||
@@ -248,6 +249,29 @@ class ApprovalExecutionService:
|
||||
duration_ms=int((time.time() - _aol_started_ms) * 1000),
|
||||
output={"reason": "NO_ACTION", "action": approval.action[:200]},
|
||||
)
|
||||
# F2 (2026-05-07 ogt + Claude Sonnet 4.6 + Codex):
|
||||
# NO_ACTION 路徑要把 incident 推到 RESOLVED,否則 incident 永遠卡
|
||||
# INVESTIGATING(FlywheelExecutionRateMissing 死告警 + 566 stuck 增長根因 #1)。
|
||||
# resolve_incident 內已加 RESOLVED 冪等 guard,重複 resolve 會 idempotent
|
||||
# return existing incident 不會重觸發 postmortem。
|
||||
if approval.incident_id:
|
||||
try:
|
||||
from src.services.incident_service import get_incident_service
|
||||
|
||||
await get_incident_service().resolve_incident(approval.incident_id)
|
||||
logger.info(
|
||||
"incident_resolved_after_no_action_execution",
|
||||
incident_id=approval.incident_id,
|
||||
approval_id=str(approval.id),
|
||||
path="no_action",
|
||||
)
|
||||
except Exception as _resolve_e:
|
||||
logger.warning(
|
||||
"incident_resolve_after_no_action_execution_failed",
|
||||
incident_id=approval.incident_id,
|
||||
approval_id=str(approval.id),
|
||||
error=str(_resolve_e),
|
||||
)
|
||||
return True # NO_ACTION 視為成功完成
|
||||
|
||||
# 真解析失敗 (非 NO_ACTION)
|
||||
|
||||
@@ -1103,6 +1103,18 @@ class IncidentService:
|
||||
logger.warning("incident_not_found_for_resolve", incident_id=incident_id)
|
||||
return None
|
||||
|
||||
# 1.5 F2 (2026-05-07 ogt + Codex + Claude Sonnet 4.6) — 冪等保護:
|
||||
# 已經 RESOLVED 的 incident 直接 return existing,避免後續所有副作用
|
||||
# 重複觸發(postmortem / KB extract / KM convert / disposition / Telegram)。
|
||||
# F2 NO_ACTION 路徑會頻繁呼叫 resolve_incident,必須擋在 status mutation 之前。
|
||||
if incident.status == IncidentStatus.RESOLVED:
|
||||
logger.info(
|
||||
"incident_resolve_skipped_already_resolved",
|
||||
incident_id=incident_id,
|
||||
resolution_type=resolution_type,
|
||||
)
|
||||
return incident
|
||||
|
||||
# 2. 更新狀態
|
||||
incident.status = IncidentStatus.RESOLVED
|
||||
incident.resolved_at = now_taipei()
|
||||
|
||||
101
apps/api/tests/test_approval_execution_no_action.py
Normal file
101
apps/api/tests/test_approval_execution_no_action.py
Normal file
@@ -0,0 +1,101 @@
|
||||
from types import SimpleNamespace
|
||||
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from src.services.approval_execution import ApprovalExecutionService
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_no_action_execution_resolves_incident_once(monkeypatch):
    """A NO_ACTION approval returns True and resolves its incident exactly once."""
    # Arrange: every collaborator is swapped for an in-memory fake.
    incident_service = SimpleNamespace(resolve_incident=AsyncMock())
    approval = SimpleNamespace(
        id="approval-noaction-1",
        action="NO_ACTION: 先做觀察",
        incident_id="INC-TEST-001",
    )
    patches = [
        (
            "src.services.approval_execution.get_approval_service",
            lambda: SimpleNamespace(update_execution_status=AsyncMock()),
        ),
        (
            "src.services.approval_execution.get_timeline_service",
            lambda: SimpleNamespace(add_event=AsyncMock()),
        ),
        (
            "src.services.approval_execution.parse_operation_from_action",
            lambda _: SimpleNamespace(
                operation_type=None, resource_name=None, namespace=None
            ),
        ),
        (
            "src.services.incident_service.get_incident_service",
            lambda: incident_service,
        ),
        (
            "src.services.approval_execution.ApprovalExecutionService._push_execution_result_to_alert",
            AsyncMock(return_value=None),
        ),
        (
            "src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
            AsyncMock(return_value=None),
        ),
    ]
    for target, replacement in patches:
        monkeypatch.setattr(target, replacement)

    # Act
    outcome = await ApprovalExecutionService().execute_approved_action(approval)

    # Assert
    assert outcome is True
    incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-001")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_no_action_execution_returns_true_when_resolve_raises(monkeypatch):
    """NO_ACTION must still return True when resolve_incident raises.

    Contract: NO_ACTION is a pure-observation action that counts as a
    successful completion (the comments on the NO_ACTION branch say this is to
    avoid polluting the auto_execute KPI). A failed resolve should only produce
    a warning log; it must not degrade the result to False.
    """
    # Arrange: resolve_incident is rigged to fail on every call.
    incident_service = SimpleNamespace(
        resolve_incident=AsyncMock(side_effect=RuntimeError("redis down"))
    )
    approval = SimpleNamespace(
        id="approval-noaction-2",
        action="NO_ACTION: 觀察",
        incident_id="INC-TEST-002",
    )
    patches = [
        (
            "src.services.approval_execution.get_approval_service",
            lambda: SimpleNamespace(update_execution_status=AsyncMock()),
        ),
        (
            "src.services.approval_execution.get_timeline_service",
            lambda: SimpleNamespace(add_event=AsyncMock()),
        ),
        (
            "src.services.approval_execution.parse_operation_from_action",
            lambda _: SimpleNamespace(
                operation_type=None, resource_name=None, namespace=None
            ),
        ),
        (
            "src.services.incident_service.get_incident_service",
            lambda: incident_service,
        ),
        (
            "src.services.approval_execution.ApprovalExecutionService._push_execution_result_to_alert",
            AsyncMock(return_value=None),
        ),
        (
            "src.services.approval_execution.ApprovalExecutionService._log_aol_completed",
            AsyncMock(return_value=None),
        ),
    ]
    for target, replacement in patches:
        monkeypatch.setattr(target, replacement)

    # Act
    outcome = await ApprovalExecutionService().execute_approved_action(approval)

    # Assert: the failure is swallowed, and resolve was attempted exactly once.
    assert outcome is True
    incident_service.resolve_incident.assert_awaited_once_with("INC-TEST-002")
|
||||
64
apps/api/tests/test_incident_service_resolve_idempotency.py
Normal file
64
apps/api/tests/test_incident_service_resolve_idempotency.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""
|
||||
test_incident_service_resolve_idempotency
|
||||
==========================================
|
||||
|
||||
驗證 `IncidentService.resolve_incident` 對已經 RESOLVED 的 incident 必須 idempotent:
|
||||
- 直接 return existing incident
|
||||
- 不呼叫 save_to_working_memory(避免重複 Redis write)
|
||||
- 不呼叫 incident_repository.update_status(避免重複 DB write)
|
||||
- 不觸發 postmortem / KB extract / KM convert / disposition 副作用
|
||||
|
||||
對應 critic 必修 #2 — 沒這個單測,未來有人挪 guard 位置會悄悄破功,
|
||||
重新放大「resolve_incident 重複觸發 postmortem 洗版」的舊風險。
|
||||
"""
|
||||
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from src.models.incident import IncidentStatus
|
||||
from src.services.incident_service import IncidentService
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_resolve_incident_skips_when_already_resolved(monkeypatch):
    """Resolving an incident that is already RESOLVED must be idempotent."""
    already_resolved = SimpleNamespace(
        incident_id="INC-IDEMPO-001",
        status=IncidentStatus.RESOLVED,
    )
    svc = IncidentService()

    # The entry-point read hands back the RESOLVED incident.
    read_stub = AsyncMock(return_value=already_resolved)
    monkeypatch.setattr(svc, "get_from_working_memory", read_stub)
    # Spy on the write path so any later side effect would be detected.
    # NOTE(review): only save_to_working_memory is observed here; the module
    # docstring also promises no DB update / postmortem — confirm whether those
    # need their own assertions.
    write_spy = AsyncMock(return_value=True)
    monkeypatch.setattr(svc, "save_to_working_memory", write_spy)

    returned = await svc.resolve_incident("INC-IDEMPO-001")

    # The very same incident object comes back...
    assert returned is already_resolved
    # ...and nothing was written, i.e. the guard ran before the status mutation.
    write_spy.assert_not_called()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_resolve_incident_returns_none_when_not_found(monkeypatch):
    """A missing incident yields None; the idempotency guard must not affect this path."""
    svc = IncidentService()

    # The lookup finds nothing.
    read_stub = AsyncMock(return_value=None)
    monkeypatch.setattr(svc, "get_from_working_memory", read_stub)
    write_spy = AsyncMock(return_value=True)
    monkeypatch.setattr(svc, "save_to_working_memory", write_spy)

    returned = await svc.resolve_incident("INC-NOT-EXIST")

    assert returned is None
    write_spy.assert_not_called()
|
||||
@@ -40,7 +40,7 @@ resources:
|
||||
images:
|
||||
- name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
newName: 192.168.0.110:5000/awoooi/api
|
||||
newTag: 72d86ba70bc9db6035871e22c2d4a0410e1d7cc1
|
||||
newTag: 80c36ba801b6eae5127afec9c25aa904e1dde270
|
||||
- name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER
|
||||
newName: 192.168.0.110:5000/awoooi/web
|
||||
newTag: 72d86ba70bc9db6035871e22c2d4a0410e1d7cc1
|
||||
newTag: 80c36ba801b6eae5127afec9c25aa904e1dde270
|
||||
|
||||
Reference in New Issue
Block a user