From a0179cec6e447f0839a844159fbb739dae1ec500 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 6 May 2026 21:17:25 +0800 Subject: [PATCH] fix(incidents): keep list endpoint pure read --- apps/api/src/api/v1/incidents.py | 47 ++++++++++------ .../tests/integration/test_incident_api.py | 4 +- .../tests/test_incidents_list_pure_read.py | 53 +++++++++++++++++++ docs/LOGBOOK.md | 19 +++++++ 4 files changed, 106 insertions(+), 17 deletions(-) create mode 100644 apps/api/tests/test_incidents_list_pure_read.py diff --git a/apps/api/src/api/v1/incidents.py b/apps/api/src/api/v1/incidents.py index 7eac5bb9..552b237e 100644 --- a/apps/api/src/api/v1/incidents.py +++ b/apps/api/src/api/v1/incidents.py @@ -17,9 +17,10 @@ Phase 6.4 核心功能: - Proposal 必須關聯到 Incident """ +from datetime import UTC, datetime, timedelta from typing import Any -from fastapi import APIRouter, HTTPException, status +from fastapi import APIRouter, HTTPException, Query, status from pydantic import BaseModel, Field from src.core.logging import get_logger @@ -148,18 +149,26 @@ class IncidentTimelineResponse(BaseModel): Phase 6.5 升級: - 每個事件自動附帶 decision_token - - 確保 UI 永遠有決策可操作 - - 雙軌引擎: LLM (主) + Expert System (備) + - 預設只讀取已存在的 decision_token + - 需要新決策時改由明確的 proposal / operator run 入口觸發 """, ) -async def list_incidents() -> IncidentListResponse: +async def list_incidents( + generate_missing_decisions: bool = Query( + False, + description=( + "預設 false,列表查詢只讀既有 decision token;" + "true 僅供明確維運操作使用,會背景產生缺少的決策。" + ), + ), +) -> IncidentListResponse: """ 取得活躍事件清單 - Phase 6.5: 自動為每個事件生成決策令牌 - - P0/P1 事件優先處理 - - 30 秒內保證有決策 - - LLM 失敗時 Expert System 保底 + Phase 6.5: 附帶既有決策令牌 + - 列表查詢必須是低成本純讀路徑 + - 不可因為前端輪詢就背景觸發 LLM / Ollama / OpenClaw + - 需要新決策時,呼叫 POST /api/v1/incidents/{incident_id}/proposal Returns: IncidentListResponse: 事件清單與計數 (含決策令牌) @@ -174,8 +183,6 @@ async def list_incidents() -> IncidentListResponse: # 按時間排序 (最新優先) # 2026-03-26 修復: 處理 timezone-aware 與 naive datetime 混合問題 - from datetime import UTC - def safe_created_at(i: Incident) -> float: """安全取得 timestamp,處理 timezone 混合問題""" dt = i.created_at @@ -189,7 +196,13 @@ async def list_incidents() -> IncidentListResponse: # 2026-04-09 Claude Sonnet 4.6: 效能修復 — list endpoint 不同步等待 AI # 原設計: 每個 incident await AI 決策 (120-180s timeout),多 incident 時乘積爆炸 # 修復: 只取已存在的決策 token,若無則背景觸發生成,前端 poll 單筆 GET 取得結果 - import asyncio + # + # 2026-05-06 Codex: 成本與推理槽修復 — 預設不再背景觸發 AI。 + # 根因: 多個前端頁面會輪詢 GET /incidents;若列表查詢偷偷 create_task, + # 每次頁面載入都可能消耗 GCP Ollama / OpenClaw 推理槽,甚至 fallback 到 Gemini。 + # 新規則: GET list 是純讀;生成新修復建議必須走明確 proposal/operator-run 入口。 + if generate_missing_decisions: + import asyncio responses = [] background_tasks = [] @@ -207,17 +220,20 @@ async def list_incidents() -> IncidentListResponse: ) responses.append(IncidentResponse.from_incident(incident, decision_info)) else: - # 無快取 → 背景觸發,本次返回 None(前端看到 decision=null 會 poll) + # 無快取 → 本次返回 None。列表查詢預設不觸發 AI; + # 前端若需要修復建議,必須呼叫明確的 proposal 入口。 responses.append(IncidentResponse.from_incident(incident, None)) + if not generate_missing_decisions: + continue + # 2026-04-16 Claude Sonnet 4.6: 只對 48h 內的 incident 觸發 AI 分析 # 舊 incident token 每小時過期,若不限制會反覆重新分析歷史事件 → Telegram 洪水 - from datetime import datetime, timezone, timedelta _created = getattr(incident, "created_at", None) _too_old = False if _created: if _created.tzinfo is None: - _created = _created.replace(tzinfo=timezone.utc) - _too_old = (_created < datetime.now(timezone.utc) - timedelta(hours=48)) + _created = _created.replace(tzinfo=UTC) + _too_old = (_created < datetime.now(UTC) - timedelta(hours=48)) if not _too_old: timeout = 120.0 if incident.severity in (Severity.P0, Severity.P1) else 180.0 background_tasks.append( @@ -240,6 +256,7 @@ async def list_incidents() -> IncidentListResponse: "incidents_listed", count=len(incidents), with_decisions=sum(1 for r in responses if r.decision is not None), + generate_missing_decisions=generate_missing_decisions, ) return IncidentListResponse( diff --git a/apps/api/tests/integration/test_incident_api.py b/apps/api/tests/integration/test_incident_api.py index 94799912..54894e11 100644 --- a/apps/api/tests/integration/test_incident_api.py +++ b/apps/api/tests/integration/test_incident_api.py @@ -17,8 +17,8 @@ Incident API 整合測試 - GET /api/v1/monitoring/status — 監控工具狀態 注意: GET /api/v1/incidents 不在此測試 - 原因: 該端點觸發 AI 決策生成 (LLM 呼叫),回應時間 30s+, - 不適合整合測試。應使用 smoke_test_alert_chain.py 驗證。 + 原因: 該端點是前端輪詢路徑,應維持純讀與低延遲; + 是否背景觸發 AI 決策由獨立 unit test 驗證,修復建議生成改走 proposal/operator-run 入口。 """ import os diff --git a/apps/api/tests/test_incidents_list_pure_read.py b/apps/api/tests/test_incidents_list_pure_read.py new file mode 100644 index 00000000..61f7e867 --- /dev/null +++ b/apps/api/tests/test_incidents_list_pure_read.py @@ -0,0 +1,53 @@ +""" +Incident list endpoint pure-read regression tests. + +目的: +- GET /api/v1/incidents 是前端輪詢與 AwoooP Console 的列表路徑。 +- 預設不得因為缺少 decision token 就背景啟動 LLM / Ollama / OpenClaw。 +""" + +from datetime import UTC, datetime + +import pytest + +from src.api.v1 import incidents as incidents_api +from src.models.incident import Incident, Severity + + +class _IncidentService: + async def get_active_incidents(self) -> list[Incident]: + return [ + Incident( + incident_id="INC-20260506-PURE01", + severity=Severity.P2, + affected_services=["awoooi-api"], + created_at=datetime(2026, 5, 6, 12, 0, tzinfo=UTC), + updated_at=datetime(2026, 5, 6, 12, 1, tzinfo=UTC), + ) + ] + + +class _DecisionManager: + def __init__(self) -> None: + self.created = 0 + + async def _find_existing_token(self, incident_id: str): + return None + + async def get_or_create_decision(self, *args, **kwargs): + self.created += 1 + raise AssertionError("GET /incidents must not create AI decisions by default") + + +@pytest.mark.asyncio +async def test_list_incidents_does_not_trigger_ai_decision_by_default(monkeypatch): + decision_manager = _DecisionManager() + monkeypatch.setattr(incidents_api, "get_incident_service", lambda: _IncidentService()) + monkeypatch.setattr(incidents_api, "get_decision_manager", lambda: decision_manager) + + result = await incidents_api.list_incidents(generate_missing_decisions=False) + + assert result.count == 1 + assert result.incidents[0].incident_id == "INC-20260506-PURE01" + assert result.incidents[0].decision is None + assert decision_manager.created == 0 diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 44229db5..0463b62e 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,22 @@ +## 2026-05-06 | Incident 列表改回純讀,停止前端輪詢觸發 AI 推理 + +**背景**:部署 AwoooP 首頁後,production log 顯示載入 `/zh-TW/awooop` 期間會打 `GET /api/v1/incidents`,接著出現 `phase24_ai_router_used provider=ollama` 與 GCP-A Ollama 推理耗時約 55 秒。這代表列表查詢仍會背景啟動 AI 決策,導致前端輪詢佔用 GCP Ollama 推理槽,極端情況下也可能 fallback 到 Gemini 產生成本。 + +**根因**: +- `GET /api/v1/incidents` 註解雖寫「不等待 AI」,但對缺少 decision token 的 incident 仍會 `asyncio.create_task(decision_manager.get_or_create_decision(...))`。 +- 多個前端頁面與面板會輪詢 `/api/v1/incidents`,所以「只是查列表」等同於「背景產生 proposal」。 + +**本次修補**: +- `GET /api/v1/incidents` 新增 `generate_missing_decisions=false` 預設參數。 +- 預設只讀取既有 decision token;缺少 token 時回傳 `decision=null`,不再背景觸發 Ollama / OpenClaw / Gemini。 +- 若維運人員明確需要舊行為,可用 `generate_missing_decisions=true` 觸發背景生成;正式修復建議仍應走 `POST /api/v1/incidents/{incident_id}/proposal` 或 AwoooP Operator Run。 +- 新增 regression test,鎖定列表查詢預設不會呼叫 `get_or_create_decision()`。 + +**驗證**: +- `python -m py_compile apps/api/src/api/v1/incidents.py apps/api/tests/test_incidents_list_pure_read.py` +- `pytest tests/test_incidents_list_pure_read.py tests/test_telegram_message_templates.py -q` → 18 passed。 +- `ruff check src/api/v1/incidents.py tests/test_incidents_list_pure_read.py` → All checks passed。 + ## 2026-05-06 | Telegram 事故通知語義收斂與 AwoooP 首頁總覽 **背景**:SRE 戰情室截圖顯示 ACTION REQUIRED、AI 自動修復失敗、Escalation、Code Review、Config Drift 等訊息混在同一條流中;值班者很難快速分辨哪些是 AI 已修復、哪些是 AI 無法修復需要人工、哪些只是報表或治理通知。