fix(incidents): keep list endpoint pure read

2026-05-06 21:17:25 +08:00
parent ea6b7d8f27
commit a0179cec6e
4 changed files with 106 additions and 17 deletions
--- a/apps/api/src/api/v1/incidents.py
+++ b/apps/api/src/api/v1/incidents.py
@@ -17,9 +17,10 @@ Phase 6.4 核心功能:
 - Proposal 必須關聯到 Incident
 """

+from datetime import UTC, datetime, timedelta
 from typing import Any

-from fastapi import APIRouter, HTTPException, status
+from fastapi import APIRouter, HTTPException, Query, status
 from pydantic import BaseModel, Field

 from src.core.logging import get_logger
@@ -148,18 +149,26 @@ class IncidentTimelineResponse(BaseModel):

    Phase 6.5 升級:
    - 每個事件自動附帶 decision_token
-    - 確保 UI 永遠有決策可操作
-    - 雙軌引擎: LLM (主) + Expert System (備)
+    - 預設只讀取已存在的 decision_token
+    - 需要新決策時改由明確的 proposal / operator run 入口觸發
    """,
 )
-async def list_incidents() -> IncidentListResponse:
+async def list_incidents(
+    generate_missing_decisions: bool = Query(
+        False,
+        description=(
+            "預設 false，列表查詢只讀既有 decision token；"
+            "true 僅供明確維運操作使用，會背景產生缺少的決策。"
+        ),
+    ),
+) -> IncidentListResponse:
    """
    取得活躍事件清單

-    Phase 6.5: 自動為每個事件生成決策令牌
-    - P0/P1 事件優先處理
-    - 30 秒內保證有決策
-    - LLM 失敗時 Expert System 保底
+    Phase 6.5: 附帶既有決策令牌
+    - 列表查詢必須是低成本純讀路徑
+    - 不可因為前端輪詢就背景觸發 LLM / Ollama / OpenClaw
+    - 需要新決策時，呼叫 POST /api/v1/incidents/{incident_id}/proposal

    Returns:
        IncidentListResponse: 事件清單與計數 (含決策令牌)
@@ -174,8 +183,6 @@ async def list_incidents() -> IncidentListResponse:

        # 按時間排序 (最新優先)
        # 2026-03-26 修復: 處理 timezone-aware 與 naive datetime 混合問題
-        from datetime import UTC
-
        def safe_created_at(i: Incident) -> float:
            """安全取得 timestamp，處理 timezone 混合問題"""
            dt = i.created_at
@@ -189,7 +196,13 @@ async def list_incidents() -> IncidentListResponse:
        # 2026-04-09 Claude Sonnet 4.6: 效能修復 — list endpoint 不同步等待 AI
        # 原設計: 每個 incident await AI 決策 (120-180s timeout)，多 incident 時乘積爆炸
        # 修復: 只取已存在的決策 token，若無則背景觸發生成，前端 poll 單筆 GET 取得結果
-        import asyncio
+        #
+        # 2026-05-06 Codex: 成本與推理槽修復 — 預設不再背景觸發 AI。
+        # 根因: 多個前端頁面會輪詢 GET /incidents；若列表查詢偷偷 create_task，
+        # 每次頁面載入都可能消耗 GCP Ollama / OpenClaw 推理槽，甚至 fallback 到 Gemini。
+        # 新規則: GET list 是純讀；生成新修復建議必須走明確 proposal/operator-run 入口。
+        if generate_missing_decisions:
+            import asyncio

        responses = []
        background_tasks = []
@@ -207,17 +220,20 @@ async def list_incidents() -> IncidentListResponse:
                    )
                    responses.append(IncidentResponse.from_incident(incident, decision_info))
                else:
-                    # 無快取 → 背景觸發，本次返回 None（前端看到 decision=null 會 poll）
+                    # 無快取 → 本次返回 None。列表查詢預設不觸發 AI；
+                    # 前端若需要修復建議，必須呼叫明確的 proposal 入口。
                    responses.append(IncidentResponse.from_incident(incident, None))
+                    if not generate_missing_decisions:
+                        continue
+
                    # 2026-04-16 Claude Sonnet 4.6: 只對 48h 內的 incident 觸發 AI 分析
                    # 舊 incident token 每小時過期，若不限制會反覆重新分析歷史事件 → Telegram 洪水
-                    from datetime import datetime, timezone, timedelta
                    _created = getattr(incident, "created_at", None)
                    _too_old = False
                    if _created:
                        if _created.tzinfo is None:
-                            _created = _created.replace(tzinfo=timezone.utc)
-                        _too_old = (_created < datetime.now(timezone.utc) - timedelta(hours=48))
+                            _created = _created.replace(tzinfo=UTC)
+                        _too_old = (_created < datetime.now(UTC) - timedelta(hours=48))
                    if not _too_old:
                        timeout = 120.0 if incident.severity in (Severity.P0, Severity.P1) else 180.0
                        background_tasks.append(
@@ -240,6 +256,7 @@ async def list_incidents() -> IncidentListResponse:
            "incidents_listed",
            count=len(incidents),
            with_decisions=sum(1 for r in responses if r.decision is not None),
+            generate_missing_decisions=generate_missing_decisions,
        )

        return IncidentListResponse(
--- a/apps/api/tests/integration/test_incident_api.py
+++ b/apps/api/tests/integration/test_incident_api.py
@@ -17,8 +17,8 @@ Incident API 整合測試
 - GET /api/v1/monitoring/status — 監控工具狀態

 注意: GET /api/v1/incidents 不在此測試
-  原因: 該端點觸發 AI 決策生成 (LLM 呼叫)，回應時間 30s+，
-  不適合整合測試。應使用 smoke_test_alert_chain.py 驗證。
+  原因: 該端點是前端輪詢路徑，預設為純讀且不觸發 AI 決策，應維持低延遲；
+  「預設不背景觸發 AI 決策」由獨立 unit test (tests/test_incidents_list_pure_read.py) 驗證，修復建議生成改走 proposal/operator-run 入口。
 """

 import os
--- a/apps/api/tests/test_incidents_list_pure_read.py
+++ b/apps/api/tests/test_incidents_list_pure_read.py
@@ -0,0 +1,53 @@
+"""
+Incident list endpoint pure-read regression tests.
+
+目的:
+- GET /api/v1/incidents 是前端輪詢與 AwoooP Console 的列表路徑。
+- 預設不得因為缺少 decision token 就背景啟動 LLM / Ollama / OpenClaw。
+"""
+
+from datetime import UTC, datetime
+
+import pytest
+
+from src.api.v1 import incidents as incidents_api
+from src.models.incident import Incident, Severity
+
+
+class _IncidentService:
+    async def get_active_incidents(self) -> list[Incident]:
+        return [
+            Incident(
+                incident_id="INC-20260506-PURE01",
+                severity=Severity.P2,
+                affected_services=["awoooi-api"],
+                created_at=datetime(2026, 5, 6, 12, 0, tzinfo=UTC),
+                updated_at=datetime(2026, 5, 6, 12, 1, tzinfo=UTC),
+            )
+        ]
+
+
+class _DecisionManager:
+    def __init__(self) -> None:
+        self.created = 0
+
+    async def _find_existing_token(self, incident_id: str):
+        return None
+
+    async def get_or_create_decision(self, *args, **kwargs):
+        self.created += 1
+        raise AssertionError("GET /incidents must not create AI decisions by default")
+
+
+@pytest.mark.asyncio
+async def test_list_incidents_does_not_trigger_ai_decision_by_default(monkeypatch):
+    decision_manager = _DecisionManager()
+    monkeypatch.setattr(incidents_api, "get_incident_service", lambda: _IncidentService())
+    monkeypatch.setattr(incidents_api, "get_decision_manager", lambda: decision_manager)
+
+    result = await incidents_api.list_incidents(generate_missing_decisions=False)
+
+    assert result.count == 1
+    assert result.incidents[0].incident_id == "INC-20260506-PURE01"
+    assert result.incidents[0].decision is None
+    assert decision_manager.created == 0
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,22 @@
+## 2026-05-06 | Incident 列表改回純讀，停止前端輪詢觸發 AI 推理
+
+**背景**：部署 AwoooP 首頁後，production log 顯示載入 `/zh-TW/awooop` 期間會打 `GET /api/v1/incidents`，接著出現 `phase24_ai_router_used provider=ollama` 與 GCP-A Ollama 推理耗時約 55 秒。這代表列表查詢仍會背景啟動 AI 決策，導致前端輪詢佔用 GCP Ollama 推理槽，極端情況下也可能 fallback 到 Gemini 產生成本。
+
+**根因**：
+- `GET /api/v1/incidents` 註解雖寫「不等待 AI」，但對缺少 decision token 的 incident 仍會 `asyncio.create_task(decision_manager.get_or_create_decision(...))`。
+- 多個前端頁面與面板會輪詢 `/api/v1/incidents`，所以「只是查列表」等同於「背景產生 proposal」。
+
+**本次修補**：
+- `GET /api/v1/incidents` 新增 query 參數 `generate_missing_decisions`，預設值為 `false`。
+- 預設只讀取既有 decision token；缺少 token 時回傳 `decision=null`，不再背景觸發 Ollama / OpenClaw / Gemini。
+- 若維運人員明確需要舊行為，可用 `generate_missing_decisions=true` 觸發背景生成（僅針對缺少 decision token 的 incident，且仍會略過建立超過 48 小時者）；正式修復建議仍應走 `POST /api/v1/incidents/{incident_id}/proposal` 或 AwoooP Operator Run。
+- 新增 regression test，鎖定列表查詢預設不會呼叫 `get_or_create_decision()`。
+
+**驗證**：
+- `python -m py_compile apps/api/src/api/v1/incidents.py apps/api/tests/test_incidents_list_pure_read.py`
+- `pytest tests/test_incidents_list_pure_read.py tests/test_telegram_message_templates.py -q` → 18 passed。
+- `ruff check src/api/v1/incidents.py tests/test_incidents_list_pure_read.py` → All checks passed。
+
 ## 2026-05-06 | Telegram 事故通知語義收斂與 AwoooP 首頁總覽

 **背景**：SRE 戰情室截圖顯示 ACTION REQUIRED、AI 自動修復失敗、Escalation、Code Review、Config Drift 等訊息混在同一條流中；值班者很難快速分辨哪些是 AI 已修復、哪些是 AI 無法修復需要人工、哪些只是報表或治理通知。