fix(api): prefilter Runs incident drilldown

2026-06-25 21:47:52 +08:00
parent 4e329bce24
commit d6d3f666a3
3 changed files with 129 additions and 0 deletions
--- a/apps/api/src/services/platform_operator_service.py
+++ b/apps/api/src/services/platform_operator_service.py
@@ -992,6 +992,16 @@ async def list_runs(
            stmt = stmt.where(AwoooPRunState.project_id == project_id)
        if state is not None:
            stmt = stmt.where(AwoooPRunState.state == state)
+        if incident_id is not None:
+            incident_run_ids = await _find_run_ids_for_incident_filter(
+                db,
+                project_id=project_id,
+                incident_id=incident_id,
+                limit=max(per_page * 20, _MAX_LIST_CONTEXT_ROWS),
+            )
+            if not incident_run_ids:
+                return {"runs": [], "total": 0, "page": page, "per_page": per_page}
+            stmt = stmt.where(AwoooPRunState.run_id.in_(incident_run_ids))

        offset = (page - 1) * per_page
        if remediation_status or incident_id or callback_reply_status:
@@ -3696,6 +3706,92 @@ def _collect_run_incident_ids(
    return incident_ids


+async def _find_run_ids_for_incident_filter(
+    db: Any,
+    *,
+    project_id: str | None,
+    incident_id: str,
+    limit: int,
+) -> list[UUID]:
+    """Pre-filter run ids for a single incident before loading list context.
+
+    The old list filter loaded every run for a project and then searched message
+    sidecars in Python. That becomes too expensive on production-sized history
+    and can turn a single incident drilldown into a gateway timeout.
+    """
+
+    params: dict[str, Any] = {
+        "incident_id": incident_id,
+        "incident_like": f"%{incident_id}%",
+        "limit": max(int(limit), 1),
+    }
+    project_run_where = ""
+    event_project_where = ""
+    outbound_project_where = ""
+    if project_id is not None:
+        params["project_id"] = project_id
+        project_run_where = "AND r.project_id = :project_id"
+        event_project_where = "AND e.project_id = :project_id"
+        outbound_project_where = "AND m.project_id = :project_id"
+
+    query = text(f"""
+        WITH matched AS (
+            SELECT r.run_id::text AS run_id, r.created_at AS ts
+            FROM awooop_run_state r
+            WHERE (
+                r.trigger_ref ILIKE :incident_like
+                OR r.error_detail ILIKE :incident_like
+            )
+            {project_run_where}
+
+            UNION ALL
+
+            SELECT e.run_id::text AS run_id, e.received_at AS ts
+            FROM awooop_conversation_event e
+            WHERE e.run_id IS NOT NULL
+              AND (
+                e.source_envelope #> '{{source_refs,incident_ids}}' ? :incident_id
+                OR e.content_preview ILIKE :incident_like
+                OR e.content_redacted ILIKE :incident_like
+                OR e.provider_event_id ILIKE :incident_like
+              )
+              {event_project_where}
+
+            UNION ALL
+
+            SELECT m.run_id::text AS run_id, COALESCE(m.sent_at, m.queued_at) AS ts
+            FROM awooop_outbound_message m
+            WHERE (
+                m.source_envelope #> '{{source_refs,incident_ids}}' ? :incident_id
+                OR m.source_envelope #> '{{awooop_status_chain,incident_ids}}' ? :incident_id
+                OR m.source_envelope #>> '{{awooop_status_chain,source_id}}' = :incident_id
+                OR m.source_envelope #>> '{{callback_reply,incident_id}}' = :incident_id
+                OR m.content_preview ILIKE :incident_like
+                OR m.content_redacted ILIKE :incident_like
+                OR m.send_error ILIKE :incident_like
+            )
+            {outbound_project_where}
+        )
+        SELECT run_id
+        FROM matched
+        WHERE run_id IS NOT NULL
+        GROUP BY run_id
+        ORDER BY MAX(ts) DESC
+        LIMIT :limit
+    """)
+    result = await db.execute(query, params)
+    run_ids: list[UUID] = []
+    for row in result.mappings().all():
+        raw_run_id = str(row.get("run_id") or "")
+        try:
+            run_id = uuid.UUID(raw_run_id)
+        except ValueError:
+            continue
+        if run_id not in run_ids:
+            run_ids.append(run_id)
+    return run_ids
+
+
 async def _load_run_message_context(
    db: Any,
    runs: list[AwoooPRunState],
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,22 @@
+## 2026-06-25｜Runs incident filter 預篩選修正避免 502
+
+**背景**：`4e329bce` 部署後，正式 API `/api/v1/platform/status-chain?incident_id=INC-20260625-977E5F` 已回 `automation_handoff.kind=ansible_check_mode_apply_gate`，但 `/zh-TW/awooop/runs?incident_id=INC-20260625-977E5F` 看不到 `乾跑後套用閘門`。追查發現 `/api/v1/platform/runs/list` 不帶 incident filter 回 `200`，一帶 `INC-20260625-977E5F` 回 `502 Bad Gateway`；原因是舊 filter 先載入 project 下大量 runs，再逐筆聚合 message context，production 歷史量一大就 timeout。
+
+**完成**：
+- `list_runs` 在 `incident_id` filter 存在時先呼叫 incident 專用 run_id 預篩選。
+- 預篩選直接從 `awooop_run_state`、`awooop_conversation_event`、`awooop_outbound_message` 的 trigger / source_refs / callback_reply / persisted status-chain / redacted content 找相關 run_id，再進入原本 summary filter。
+- 找不到 run_id 時直接回空列表，避免全表聚合與 Nginx 502。
+
+**驗證**：
+- `python3 -m py_compile apps/api/src/services/platform_operator_service.py` 通過。
+- `DATABASE_URL=sqlite:///tmp/awoooi-test.db pytest apps/api/tests/test_awooop_operator_timeline_labels.py -q`：`66 passed`。
+
+**完成度同步**：
+- Runs incident drilldown 穩定性：`60% -> 72%`。
+- AwoooP Runs apply-gate 正式頁驗證：仍待下一個 deploy marker 後重跑。
+
+**邊界**：本段只修查詢預篩選；不寫 DB、不改 run 狀態、不執行修復、不發 Telegram、不開 runtime gate。
+
 ## 2026-06-25｜Status-chain 新增乾跑後套用閘門 handoff

 **背景**：`INC-20260625-977E5F` 類告警目前已能辨識 Ansible `check_mode` 乾跑成功、`apply_total=0`、`verifier=missing`，但 operator 在 Runs / Telegram 看見的下一步仍容易停在「需人工」或長文字。這會讓 AI 自動化看起來像只是把問題丟回人工，而不是清楚交付下一個可審核 gate。
--- a/docs/workplans/2026-06-25-awoooi-product-uiux-inventory.md
+++ b/docs/workplans/2026-06-25-awoooi-product-uiux-inventory.md
@@ -250,6 +250,20 @@ Tenants 目前已讀到：

 完成度同步：AwoooP status-chain handoff 可判讀性 `67% -> 70%`；AwoooP Runs 可判讀性 `68% -> 71%`；真正 AI 自動化 verified repair 成功率仍不提高。

+### 2.5.9 Runs incident filter 502 修正
+
+2026-06-25 `4e329bce` 部署後，正式 status-chain API 已回 `automation_handoff`，但 Runs 頁帶 `incident_id=INC-20260625-977E5F` 仍看不到新區塊；追查後確認 `/api/v1/platform/runs/list` 單一事故 filter 會回 `502`。
+
+| 項目 | 完成 |
+|---|---|
+| 問題 | incident filter 舊流程先載入 project 下大量 runs，再逐筆聚合 message context，production 歷史量大時 timeout |
+| 修正 | `list_runs` 在 incident filter 存在時，先從 run trigger、conversation event、outbound message、source_refs、callback reply、persisted status-chain 預篩出 run_id |
+| 效果 | 找不到 run_id 時直接回空列表；找到時只對相關 runs 建立 summary，不再先載入 project 下全部 runs 再逐筆聚合 message context |
+| 驗證 | `py_compile` 通過；`pytest apps/api/tests/test_awooop_operator_timeline_labels.py -q`：`66 passed` |
+| 邊界 | 只讀查詢修正；不寫 DB、不改 run 狀態、不執行修復、不發 Telegram、不開 runtime gate |
+
+完成度同步：Runs incident drilldown 穩定性 `60% -> 72%`；Runs apply-gate 正式頁驗證需下一個 deploy marker 後重跑。
+
 ## 3. 頁面 UI/UX 現況盤點

 2026-06-25 對正式站桌機 / mobile 抽查：