fix(incidents): keep list endpoint pure read
This commit is contained in:
@@ -17,9 +17,10 @@ Phase 6.4 核心功能:
|
||||
- Proposal 必須關聯到 Incident
|
||||
"""
|
||||
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, HTTPException, status
|
||||
from fastapi import APIRouter, HTTPException, Query, status
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from src.core.logging import get_logger
|
||||
@@ -148,18 +149,26 @@ class IncidentTimelineResponse(BaseModel):
|
||||
|
||||
Phase 6.5 升級:
|
||||
- 每個事件自動附帶 decision_token
|
||||
- 確保 UI 永遠有決策可操作
|
||||
- 雙軌引擎: LLM (主) + Expert System (備)
|
||||
- 預設只讀取已存在的 decision_token
|
||||
- 需要新決策時改由明確的 proposal / operator run 入口觸發
|
||||
""",
|
||||
)
|
||||
async def list_incidents() -> IncidentListResponse:
|
||||
async def list_incidents(
|
||||
generate_missing_decisions: bool = Query(
|
||||
False,
|
||||
description=(
|
||||
"預設 false,列表查詢只讀既有 decision token;"
|
||||
"true 僅供明確維運操作使用,會背景產生缺少的決策。"
|
||||
),
|
||||
),
|
||||
) -> IncidentListResponse:
|
||||
"""
|
||||
取得活躍事件清單
|
||||
|
||||
Phase 6.5: 自動為每個事件生成決策令牌
|
||||
- P0/P1 事件優先處理
|
||||
- 30 秒內保證有決策
|
||||
- LLM 失敗時 Expert System 保底
|
||||
Phase 6.5: 附帶既有決策令牌
|
||||
- 列表查詢必須是低成本純讀路徑
|
||||
- 不可因為前端輪詢就背景觸發 LLM / Ollama / OpenClaw
|
||||
- 需要新決策時,呼叫 POST /api/v1/incidents/{incident_id}/proposal
|
||||
|
||||
Returns:
|
||||
IncidentListResponse: 事件清單與計數 (含決策令牌)
|
||||
@@ -174,8 +183,6 @@ async def list_incidents() -> IncidentListResponse:
|
||||
|
||||
# 按時間排序 (最新優先)
|
||||
# 2026-03-26 修復: 處理 timezone-aware 與 naive datetime 混合問題
|
||||
from datetime import UTC
|
||||
|
||||
def safe_created_at(i: Incident) -> float:
|
||||
"""安全取得 timestamp,處理 timezone 混合問題"""
|
||||
dt = i.created_at
|
||||
@@ -189,7 +196,13 @@ async def list_incidents() -> IncidentListResponse:
|
||||
# 2026-04-09 Claude Sonnet 4.6: 效能修復 — list endpoint 不同步等待 AI
|
||||
# 原設計: 每個 incident await AI 決策 (120-180s timeout),多 incident 時乘積爆炸
|
||||
# 修復: 只取已存在的決策 token,若無則背景觸發生成,前端 poll 單筆 GET 取得結果
|
||||
import asyncio
|
||||
#
|
||||
# 2026-05-06 Codex: 成本與推理槽修復 — 預設不再背景觸發 AI。
|
||||
# 根因: 多個前端頁面會輪詢 GET /incidents;若列表查詢偷偷 create_task,
|
||||
# 每次頁面載入都可能消耗 GCP Ollama / OpenClaw 推理槽,甚至 fallback 到 Gemini。
|
||||
# 新規則: GET list 是純讀;生成新修復建議必須走明確 proposal/operator-run 入口。
|
||||
if generate_missing_decisions:
|
||||
import asyncio
|
||||
|
||||
responses = []
|
||||
background_tasks = []
|
||||
@@ -207,17 +220,20 @@ async def list_incidents() -> IncidentListResponse:
|
||||
)
|
||||
responses.append(IncidentResponse.from_incident(incident, decision_info))
|
||||
else:
|
||||
# 無快取 → 背景觸發,本次返回 None(前端看到 decision=null 會 poll)
|
||||
# 無快取 → 本次返回 None。列表查詢預設不觸發 AI;
|
||||
# 前端若需要修復建議,必須呼叫明確的 proposal 入口。
|
||||
responses.append(IncidentResponse.from_incident(incident, None))
|
||||
if not generate_missing_decisions:
|
||||
continue
|
||||
|
||||
# 2026-04-16 Claude Sonnet 4.6: 只對 48h 內的 incident 觸發 AI 分析
|
||||
# 舊 incident token 每小時過期,若不限制會反覆重新分析歷史事件 → Telegram 洪水
|
||||
from datetime import datetime, timezone, timedelta
|
||||
_created = getattr(incident, "created_at", None)
|
||||
_too_old = False
|
||||
if _created:
|
||||
if _created.tzinfo is None:
|
||||
_created = _created.replace(tzinfo=timezone.utc)
|
||||
_too_old = (_created < datetime.now(timezone.utc) - timedelta(hours=48))
|
||||
_created = _created.replace(tzinfo=UTC)
|
||||
_too_old = (_created < datetime.now(UTC) - timedelta(hours=48))
|
||||
if not _too_old:
|
||||
timeout = 120.0 if incident.severity in (Severity.P0, Severity.P1) else 180.0
|
||||
background_tasks.append(
|
||||
@@ -240,6 +256,7 @@ async def list_incidents() -> IncidentListResponse:
|
||||
"incidents_listed",
|
||||
count=len(incidents),
|
||||
with_decisions=sum(1 for r in responses if r.decision is not None),
|
||||
generate_missing_decisions=generate_missing_decisions,
|
||||
)
|
||||
|
||||
return IncidentListResponse(
|
||||
|
||||
@@ -17,8 +17,8 @@ Incident API 整合測試
|
||||
- GET /api/v1/monitoring/status — 監控工具狀態
|
||||
|
||||
注意: GET /api/v1/incidents 不在此測試
|
||||
原因: 該端點觸發 AI 決策生成 (LLM 呼叫),回應時間 30s+,
|
||||
不適合整合測試。應使用 smoke_test_alert_chain.py 驗證。
|
||||
原因: 該端點是前端輪詢路徑,應維持純讀與低延遲;
|
||||
是否背景觸發 AI 決策由獨立 unit test 驗證,修復建議生成改走 proposal/operator-run 入口。
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
53
apps/api/tests/test_incidents_list_pure_read.py
Normal file
53
apps/api/tests/test_incidents_list_pure_read.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""
|
||||
Incident list endpoint pure-read regression tests.
|
||||
|
||||
目的:
|
||||
- GET /api/v1/incidents 是前端輪詢與 AwoooP Console 的列表路徑。
|
||||
- 預設不得因為缺少 decision token 就背景啟動 LLM / Ollama / OpenClaw。
|
||||
"""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
|
||||
import pytest
|
||||
|
||||
from src.api.v1 import incidents as incidents_api
|
||||
from src.models.incident import Incident, Severity
|
||||
|
||||
|
||||
class _IncidentService:
    """Stub incident service that always reports one fixed active incident."""

    async def get_active_incidents(self) -> list[Incident]:
        """Return a single P2 incident with timezone-aware (UTC) timestamps."""
        opened_at = datetime(2026, 5, 6, 12, 0, tzinfo=UTC)
        # updated_at is one minute after created_at, as in the original fixture.
        touched_at = opened_at.replace(minute=1)
        only_incident = Incident(
            incident_id="INC-20260506-PURE01",
            severity=Severity.P2,
            affected_services=["awoooi-api"],
            created_at=opened_at,
            updated_at=touched_at,
        )
        return [only_incident]
|
||||
|
||||
|
||||
class _DecisionManager:
    """Stub decision manager: no cached tokens, and it counts creation attempts."""

    def __init__(self) -> None:
        # Number of times get_or_create_decision() has been entered.
        self.created = 0

    async def _find_existing_token(self, incident_id: str):
        """Report that no decision token exists, whatever the incident id."""
        return None

    async def get_or_create_decision(self, *args, **kwargs):
        """Record the call, then fail loudly.

        The counter is bumped before raising so a caller can still detect the
        attempt by inspecting ``created`` even if the exception is not observed.
        """
        self.created = self.created + 1
        raise AssertionError("GET /incidents must not create AI decisions by default")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_list_incidents_does_not_trigger_ai_decision_by_default(monkeypatch):
    """Regression: a default list query must not start AI decision generation.

    The endpoint's service and decision-manager factories are replaced with
    stubs: one active incident, no cached decision token, and a
    ``get_or_create_decision`` that counts calls and raises.

    The test then checks that the incident is returned with ``decision=None``
    and that the stub's creation counter is still zero.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    decision_manager = _DecisionManager()
    monkeypatch.setattr(incidents_api, "get_incident_service", lambda: _IncidentService())
    monkeypatch.setattr(incidents_api, "get_decision_manager", lambda: decision_manager)

    result = await incidents_api.list_incidents(generate_missing_decisions=False)

    # A decision scheduled as a background task (e.g. via asyncio.create_task)
    # does not start running until the event loop regains control. The stubs
    # never suspend, so without yielding here the counter would still be 0
    # even if the endpoint had queued a task. Yield a few times so any such
    # task (possibly wrapped in another awaitable) gets a chance to run.
    for _ in range(3):
        await asyncio.sleep(0)

    assert result.count == 1
    assert result.incidents[0].incident_id == "INC-20260506-PURE01"
    assert result.incidents[0].decision is None
    assert decision_manager.created == 0
|
||||
@@ -1,3 +1,22 @@
|
||||
## 2026-05-06 | Incident 列表改回純讀,停止前端輪詢觸發 AI 推理
|
||||
|
||||
**背景**:部署 AwoooP 首頁後,production log 顯示載入 `/zh-TW/awooop` 期間會打 `GET /api/v1/incidents`,接著出現 `phase24_ai_router_used provider=ollama` 與 GCP-A Ollama 推理耗時約 55 秒。這代表列表查詢仍會背景啟動 AI 決策,導致前端輪詢佔用 GCP Ollama 推理槽,極端情況下也可能 fallback 到 Gemini 產生成本。
|
||||
|
||||
**根因**:
|
||||
- `GET /api/v1/incidents` 註解雖寫「不等待 AI」,但對缺少 decision token 的 incident 仍會 `asyncio.create_task(decision_manager.get_or_create_decision(...))`。
|
||||
- 多個前端頁面與面板會輪詢 `/api/v1/incidents`,所以「只是查列表」等同於「背景產生 proposal」。
|
||||
|
||||
**本次修補**:
|
||||
- `GET /api/v1/incidents` 新增 `generate_missing_decisions=false` 預設參數。
|
||||
- 預設只讀取既有 decision token;缺少 token 時回傳 `decision=null`,不再背景觸發 Ollama / OpenClaw / Gemini。
|
||||
- 若維運人員明確需要舊行為,可用 `generate_missing_decisions=true` 觸發背景生成;正式修復建議仍應走 `POST /api/v1/incidents/{incident_id}/proposal` 或 AwoooP Operator Run。
|
||||
- 新增 regression test,鎖定列表查詢預設不會呼叫 `get_or_create_decision()`。
|
||||
|
||||
**驗證**:
|
||||
- `python -m py_compile apps/api/src/api/v1/incidents.py apps/api/tests/test_incidents_list_pure_read.py`
|
||||
- `pytest tests/test_incidents_list_pure_read.py tests/test_telegram_message_templates.py -q` → 18 passed。
|
||||
- `ruff check src/api/v1/incidents.py tests/test_incidents_list_pure_read.py` → All checks passed。
|
||||
|
||||
## 2026-05-06 | Telegram 事故通知語義收斂與 AwoooP 首頁總覽
|
||||
|
||||
**背景**:SRE 戰情室截圖顯示 ACTION REQUIRED、AI 自動修復失敗、Escalation、Code Review、Config Drift 等訊息混在同一條流中;值班者很難快速分辨哪些是 AI 已修復、哪些是 AI 無法修復需要人工、哪些只是報表或治理通知。
|
||||
|
||||
Reference in New Issue
Block a user