From d6d3f666a3abc202858e723f6973b7b9ac12c2ae Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 25 Jun 2026 21:47:52 +0800 Subject: [PATCH] fix(api): prefilter Runs incident drilldown --- .../src/services/platform_operator_service.py | 96 +++++++++++++++++++ docs/LOGBOOK.md | 19 ++++ ...026-06-25-awoooi-product-uiux-inventory.md | 14 +++ 3 files changed, 129 insertions(+) diff --git a/apps/api/src/services/platform_operator_service.py b/apps/api/src/services/platform_operator_service.py index 25f3fc85..a01e6350 100644 --- a/apps/api/src/services/platform_operator_service.py +++ b/apps/api/src/services/platform_operator_service.py @@ -992,6 +992,16 @@ async def list_runs( stmt = stmt.where(AwoooPRunState.project_id == project_id) if state is not None: stmt = stmt.where(AwoooPRunState.state == state) + if incident_id is not None: + incident_run_ids = await _find_run_ids_for_incident_filter( + db, + project_id=project_id, + incident_id=incident_id, + limit=max(per_page * 20, _MAX_LIST_CONTEXT_ROWS), + ) + if not incident_run_ids: + return {"runs": [], "total": 0, "page": page, "per_page": per_page} + stmt = stmt.where(AwoooPRunState.run_id.in_(incident_run_ids)) offset = (page - 1) * per_page if remediation_status or incident_id or callback_reply_status: @@ -3696,6 +3706,92 @@ def _collect_run_incident_ids( return incident_ids +async def _find_run_ids_for_incident_filter( + db: Any, + *, + project_id: str | None, + incident_id: str, + limit: int, +) -> list[UUID]: + """Pre-filter run ids for a single incident before loading list context. + + The old list filter loaded every run for a project and then searched message + sidecars in Python. That becomes too expensive on production-sized history + and can turn a single incident drilldown into a gateway timeout. 
+ """ + + params: dict[str, Any] = { + "incident_id": incident_id, + "incident_like": f"%{incident_id}%", + "limit": max(int(limit), 1), + } + project_run_where = "" + event_project_where = "" + outbound_project_where = "" + if project_id is not None: + params["project_id"] = project_id + project_run_where = "AND r.project_id = :project_id" + event_project_where = "AND e.project_id = :project_id" + outbound_project_where = "AND m.project_id = :project_id" + + query = text(f""" + WITH matched AS ( + SELECT r.run_id::text AS run_id, r.created_at AS ts + FROM awooop_run_state r + WHERE ( + r.trigger_ref ILIKE :incident_like + OR r.error_detail ILIKE :incident_like + ) + {project_run_where} + + UNION ALL + + SELECT e.run_id::text AS run_id, e.received_at AS ts + FROM awooop_conversation_event e + WHERE e.run_id IS NOT NULL + AND ( + e.source_envelope #> '{{source_refs,incident_ids}}' ? :incident_id + OR e.content_preview ILIKE :incident_like + OR e.content_redacted ILIKE :incident_like + OR e.provider_event_id ILIKE :incident_like + ) + {event_project_where} + + UNION ALL + + SELECT m.run_id::text AS run_id, COALESCE(m.sent_at, m.queued_at) AS ts + FROM awooop_outbound_message m + WHERE ( + m.source_envelope #> '{{source_refs,incident_ids}}' ? :incident_id + OR m.source_envelope #> '{{awooop_status_chain,incident_ids}}' ? 
:incident_id + OR m.source_envelope #>> '{{awooop_status_chain,source_id}}' = :incident_id + OR m.source_envelope #>> '{{callback_reply,incident_id}}' = :incident_id + OR m.content_preview ILIKE :incident_like + OR m.content_redacted ILIKE :incident_like + OR m.send_error ILIKE :incident_like + ) + {outbound_project_where} + ) + SELECT run_id + FROM matched + WHERE run_id IS NOT NULL + GROUP BY run_id + ORDER BY MAX(ts) DESC + LIMIT :limit + """) + result = await db.execute(query, params) + run_ids: list[UUID] = [] + for row in result.mappings().all(): + raw_run_id = str(row.get("run_id") or "") + try: + run_id = uuid.UUID(raw_run_id) + except ValueError: + continue + if run_id not in run_ids: + run_ids.append(run_id) + return run_ids + + async def _load_run_message_context( db: Any, runs: list[AwoooPRunState], diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index d547deb6..8e3693ca 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,22 @@ +## 2026-06-25|Runs incident filter 預篩選修正避免 502 + +**背景**:`4e329bce` 部署後,正式 API `/api/v1/platform/status-chain?incident_id=INC-20260625-977E5F` 已回 `automation_handoff.kind=ansible_check_mode_apply_gate`,但 `/zh-TW/awooop/runs?incident_id=INC-20260625-977E5F` 看不到 `乾跑後套用閘門`。追查發現 `/api/v1/platform/runs/list` 不帶 incident filter 回 `200`,一帶 `INC-20260625-977E5F` 回 `502 Bad Gateway`;原因是舊 filter 先載入 project 下大量 runs,再逐筆聚合 message context,production 歷史量一大就 timeout。 + +**完成**: +- `list_runs` 在 `incident_id` filter 存在時先呼叫 incident 專用 run_id 預篩選。 +- 預篩選直接從 `awooop_run_state`、`awooop_conversation_event`、`awooop_outbound_message` 的 trigger / source_refs / callback_reply / persisted status-chain / redacted content 找相關 run_id,再進入原本 summary filter。 +- 找不到 run_id 時直接回空列表,避免全表聚合與 Nginx 502。 + +**驗證**: +- `python3 -m py_compile apps/api/src/services/platform_operator_service.py` 通過。 +- `DATABASE_URL=sqlite:///tmp/awoooi-test.db pytest apps/api/tests/test_awooop_operator_timeline_labels.py -q`:`66 passed`。 + +**完成度同步**: +- Runs incident 
drilldown 穩定性:`60% -> 72%`。 +- AwoooP Runs apply-gate 正式頁驗證:仍待下一個 deploy marker 後重跑。 + +**邊界**:本段只修查詢預篩選;不寫 DB、不改 run 狀態、不執行修復、不發 Telegram、不開 runtime gate。 + ## 2026-06-25|Status-chain 新增乾跑後套用閘門 handoff **背景**:`INC-20260625-977E5F` 類告警目前已能辨識 Ansible `check_mode` 乾跑成功、`apply_total=0`、`verifier=missing`,但 operator 在 Runs / Telegram 看見的下一步仍容易停在「需人工」或長文字。這會讓 AI 自動化看起來像只是把問題丟回人工,而不是清楚交付下一個可審核 gate。 diff --git a/docs/workplans/2026-06-25-awoooi-product-uiux-inventory.md b/docs/workplans/2026-06-25-awoooi-product-uiux-inventory.md index 7ac5f93e..ecd5e7fc 100644 --- a/docs/workplans/2026-06-25-awoooi-product-uiux-inventory.md +++ b/docs/workplans/2026-06-25-awoooi-product-uiux-inventory.md @@ -250,6 +250,20 @@ Tenants 目前已讀到: 完成度同步:AwoooP status-chain handoff 可判讀性 `67% -> 70%`;AwoooP Runs 可判讀性 `68% -> 71%`;真正 AI 自動化 verified repair 成功率仍不提高。 +### 2.5.9 Runs incident filter 502 修正 + +2026-06-25 `4e329bce` 部署後,正式 status-chain API 已回 `automation_handoff`,但 Runs 頁帶 `incident_id=INC-20260625-977E5F` 仍看不到新區塊;追查後確認 `/api/v1/platform/runs/list` 單一事故 filter 會回 `502`。 + +| 項目 | 完成 | +|---|---| +| 問題 | incident filter 舊流程先載入 project 下大量 runs,再逐筆聚合 message context,production 歷史量大時 timeout | +| 修正 | `list_runs` 在 incident filter 存在時,先從 run trigger、conversation event、outbound message、source_refs、callback reply、persisted status-chain 預篩出 run_id | +| 效果 | 找不到 run_id 時直接回空列表;找到時只對相關 runs 建立 summary,不再於應用層載入整個 project 的 runs 逐筆聚合;預篩選的 `ILIKE` 比對若無對應索引,仍會在資料庫端掃描 | +| 驗證 | `py_compile` 通過;`pytest apps/api/tests/test_awooop_operator_timeline_labels.py -q`:`66 passed` | +| 邊界 | 只讀查詢修正;不寫 DB、不改 run 狀態、不執行修復、不發 Telegram、不開 runtime gate | + +完成度同步:Runs incident drilldown 穩定性 `60% -> 72%`;Runs apply-gate 正式頁驗證需下一個 deploy marker 後重跑。 + ## 3. 頁面 UI/UX 現況盤點 2026-06-25 對正式站桌機 / mobile 抽查: