fix(incidents): keep list endpoint pure read
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / tests (push) Successful in 1m7s
CD Pipeline / build-and-deploy (push) Successful in 3m26s
CD Pipeline / post-deploy-checks (push) Successful in 1m17s

This commit is contained in:
Your Name
2026-05-06 21:17:25 +08:00
parent ea6b7d8f27
commit a0179cec6e
4 changed files with 106 additions and 17 deletions

View File

@@ -17,9 +17,10 @@ Phase 6.4 核心功能:
- Proposal 必須關聯到 Incident
"""
from datetime import UTC, datetime, timedelta
from typing import Any
from fastapi import APIRouter, HTTPException, status
from fastapi import APIRouter, HTTPException, Query, status
from pydantic import BaseModel, Field
from src.core.logging import get_logger
@@ -148,18 +149,26 @@ class IncidentTimelineResponse(BaseModel):
Phase 6.5 升級:
- 每個事件自動附帶 decision_token
- 確保 UI 永遠有決策可操作
- 雙軌引擎: LLM (主) + Expert System (備)
- 預設只讀取已存在的 decision_token
- 需要新決策時改由明確的 proposal / operator run 入口觸發
""",
)
async def list_incidents() -> IncidentListResponse:
async def list_incidents(
generate_missing_decisions: bool = Query(
False,
description=(
"預設 false列表查詢只讀既有 decision token"
"true 僅供明確維運操作使用,會背景產生缺少的決策。"
),
),
) -> IncidentListResponse:
"""
取得活躍事件清單
Phase 6.5: 自動為每個事件生成決策令牌
- P0/P1 事件優先處理
- 30 秒內保證有決策
- LLM 失敗時 Expert System 保底
Phase 6.5: 附帶既有決策令牌
- 列表查詢必須是低成本純讀路徑
- 不可因為前端輪詢就背景觸發 LLM / Ollama / OpenClaw
- 需要新決策時,呼叫 POST /api/v1/incidents/{incident_id}/proposal
Returns:
IncidentListResponse: 事件清單與計數 (含決策令牌)
@@ -174,8 +183,6 @@ async def list_incidents() -> IncidentListResponse:
# 按時間排序 (最新優先)
# 2026-03-26 修復: 處理 timezone-aware 與 naive datetime 混合問題
from datetime import UTC
def safe_created_at(i: Incident) -> float:
"""安全取得 timestamp處理 timezone 混合問題"""
dt = i.created_at
@@ -189,7 +196,13 @@ async def list_incidents() -> IncidentListResponse:
# 2026-04-09 Claude Sonnet 4.6: 效能修復 — list endpoint 不同步等待 AI
# 原設計: 每個 incident await AI 決策 (120-180s timeout),多 incident 時乘積爆炸
# 修復: 只取已存在的決策 token若無則背景觸發生成前端 poll 單筆 GET 取得結果
import asyncio
#
# 2026-05-06 Codex: 成本與推理槽修復 — 預設不再背景觸發 AI。
# 根因: 多個前端頁面會輪詢 GET /incidents若列表查詢偷偷 create_task
# 每次頁面載入都可能消耗 GCP Ollama / OpenClaw 推理槽,甚至 fallback 到 Gemini。
# 新規則: GET list 是純讀;生成新修復建議必須走明確 proposal/operator-run 入口。
if generate_missing_decisions:
import asyncio
responses = []
background_tasks = []
@@ -207,17 +220,20 @@ async def list_incidents() -> IncidentListResponse:
)
responses.append(IncidentResponse.from_incident(incident, decision_info))
else:
# 無快取 → 背景觸發,本次返回 None(前端看到 decision=null 會 poll
# 無快取 → 本次返回 None。列表查詢預設不觸發 AI
# 前端若需要修復建議,必須呼叫明確的 proposal 入口。
responses.append(IncidentResponse.from_incident(incident, None))
if not generate_missing_decisions:
continue
# 2026-04-16 Claude Sonnet 4.6: 只對 48h 內的 incident 觸發 AI 分析
# 舊 incident token 每小時過期,若不限制會反覆重新分析歷史事件 → Telegram 洪水
from datetime import datetime, timezone, timedelta
_created = getattr(incident, "created_at", None)
_too_old = False
if _created:
if _created.tzinfo is None:
_created = _created.replace(tzinfo=timezone.utc)
_too_old = (_created < datetime.now(timezone.utc) - timedelta(hours=48))
_created = _created.replace(tzinfo=UTC)
_too_old = (_created < datetime.now(UTC) - timedelta(hours=48))
if not _too_old:
timeout = 120.0 if incident.severity in (Severity.P0, Severity.P1) else 180.0
background_tasks.append(
@@ -240,6 +256,7 @@ async def list_incidents() -> IncidentListResponse:
"incidents_listed",
count=len(incidents),
with_decisions=sum(1 for r in responses if r.decision is not None),
generate_missing_decisions=generate_missing_decisions,
)
return IncidentListResponse(

View File

@@ -17,8 +17,8 @@ Incident API 整合測試
- GET /api/v1/monitoring/status — 監控工具狀態
注意: GET /api/v1/incidents 不在此測試
原因: 該端點觸發 AI 決策生成 (LLM 呼叫),回應時間 30s+
不適合整合測試。應使用 smoke_test_alert_chain.py 驗證
原因: 該端點是前端輪詢路徑,應維持純讀與低延遲;
是否背景觸發 AI 決策由獨立 unit test 驗證,修復建議生成改走 proposal/operator-run 入口
"""
import os

View File

@@ -0,0 +1,53 @@
"""
Incident list endpoint pure-read regression tests.
目的:
- GET /api/v1/incidents 是前端輪詢與 AwoooP Console 的列表路徑。
- 預設不得因為缺少 decision token 就背景啟動 LLM / Ollama / OpenClaw。
"""
from datetime import UTC, datetime
import pytest
from src.api.v1 import incidents as incidents_api
from src.models.incident import Incident, Severity
class _IncidentService:
    """Stub incident service that always reports one fixed active incident."""

    async def get_active_incidents(self) -> list[Incident]:
        """Return a single-element list holding a P2 incident with fixed UTC timestamps."""
        opened_at = datetime(2026, 5, 6, 12, 0, tzinfo=UTC)
        touched_at = datetime(2026, 5, 6, 12, 1, tzinfo=UTC)
        sample = Incident(
            incident_id="INC-20260506-PURE01",
            severity=Severity.P2,
            affected_services=["awoooi-api"],
            created_at=opened_at,
            updated_at=touched_at,
        )
        return [sample]
class _DecisionManager:
    """Stub decision manager: holds no cached tokens and records creation attempts."""

    def __init__(self) -> None:
        # Number of times the body of get_or_create_decision() has executed.
        self.created = 0

    async def _find_existing_token(self, incident_id: str) -> None:
        """Report that no decision token exists, whatever incident is asked for."""
        return

    async def get_or_create_decision(self, *args, **kwargs):
        """Record the attempt, then fail: the list endpoint must not reach this path."""
        self.created = self.created + 1
        failure = "GET /incidents must not create AI decisions by default"
        raise AssertionError(failure)
@pytest.mark.asyncio
async def test_list_incidents_does_not_trigger_ai_decision_by_default(monkeypatch):
    """Listing incidents with decision generation disabled must stay a pure read.

    The incident service and decision manager are replaced with stubs: one
    active incident exists and no decision token is cached for it. The
    endpoint must return that incident with ``decision`` set to ``None`` and
    must never execute ``get_or_create_decision``.
    """
    import asyncio

    decision_manager = _DecisionManager()
    monkeypatch.setattr(incidents_api, "get_incident_service", lambda: _IncidentService())
    monkeypatch.setattr(incidents_api, "get_decision_manager", lambda: decision_manager)

    # The flag is passed explicitly because the handler is called directly,
    # outside FastAPI's request handling, so the framework is not the one
    # supplying the parameter value here.
    result = await incidents_api.list_incidents(generate_missing_decisions=False)

    # A task scheduled with asyncio.create_task() only starts running once the
    # caller yields to the event loop. Yield once so that a wrongly scheduled
    # background decision task would execute and bump the stub's counter
    # before the assertions below; without this the counter check could pass
    # even though a task had been scheduled.
    await asyncio.sleep(0)

    assert result.count == 1
    assert result.incidents[0].incident_id == "INC-20260506-PURE01"
    assert result.incidents[0].decision is None
    assert decision_manager.created == 0

View File

@@ -1,3 +1,22 @@
## 2026-05-06 | Incident 列表改回純讀,停止前端輪詢觸發 AI 推理
**背景**:部署 AwoooP 首頁後,production log 顯示載入 `/zh-TW/awooop` 期間會打 `GET /api/v1/incidents`,接著出現 `phase24_ai_router_used provider=ollama` 與 GCP-A Ollama 推理耗時約 55 秒。這代表列表查詢仍會背景啟動 AI 決策,導致前端輪詢佔用 GCP Ollama 推理槽,極端情況下也可能 fallback 到 Gemini 產生成本。
**根因**:
- `GET /api/v1/incidents` 註解雖寫「不等待 AI」,但對缺少 decision token 的 incident 仍會 `asyncio.create_task(decision_manager.get_or_create_decision(...))`。
- 多個前端頁面與面板會輪詢 `/api/v1/incidents`,所以「只是查列表」等同於「背景產生 proposal」。
**本次修補**:
- `GET /api/v1/incidents` 新增 `generate_missing_decisions=false` 預設參數。
- 預設只讀取既有 decision token;缺少 token 時回傳 `decision=null`,不再背景觸發 Ollama / OpenClaw / Gemini。
- 若維運人員明確需要舊行為,可用 `generate_missing_decisions=true` 觸發背景生成;正式修復建議仍應走 `POST /api/v1/incidents/{incident_id}/proposal` 或 AwoooP Operator Run。
- 新增 regression test,鎖定列表查詢預設不會呼叫 `get_or_create_decision()`。
**驗證**:
- `python -m py_compile apps/api/src/api/v1/incidents.py apps/api/tests/test_incidents_list_pure_read.py`
- `pytest tests/test_incidents_list_pure_read.py tests/test_telegram_message_templates.py -q` → 18 passed。
- `ruff check src/api/v1/incidents.py tests/test_incidents_list_pure_read.py` → All checks passed。
## 2026-05-06 | Telegram 事故通知語義收斂與 AwoooP 首頁總覽
**背景**:SRE 戰情室截圖顯示 ACTION REQUIRED、AI 自動修復失敗、Escalation、Code Review、Config Drift 等訊息混在同一條流中;值班者很難快速分辨哪些是 AI 已修復、哪些是 AI 無法修復需要人工、哪些只是報表或治理通知。