diff --git a/apps/api/src/services/governance_query_service.py b/apps/api/src/services/governance_query_service.py index 40582329..67f14cb3 100644 --- a/apps/api/src/services/governance_query_service.py +++ b/apps/api/src/services/governance_query_service.py @@ -85,8 +85,38 @@ def _extract_impact(details: dict) -> str: return "" +def _extract_remediation(details: dict) -> str | None: + """ + 將治理事件 details.remediation 正規化為前端可顯示的短字串。 + + Production 事件已出現 dict 形態(例如 {"items": [...]}),API response + schema 則是字串。這裡做 read-side normalization,避免歷史資料讓 + /governance events 變成 500。 + """ + remediation = details.get("remediation") + if remediation is None: + return None + if isinstance(remediation, str): + return remediation[:160] + if isinstance(remediation, dict): + for key in ("summary", "message", "reason", "action"): + value = remediation.get(key) + if isinstance(value, str) and value: + return value[:160] + items = remediation.get("items") + if isinstance(items, list): + normalized = [str(item) for item in items if item is not None] + if normalized: + return ";".join(normalized[:3])[:160] + return str(remediation)[:160] + if isinstance(remediation, list): + normalized = [str(item) for item in remediation if item is not None] + return ";".join(normalized[:3])[:160] if normalized else None + return str(remediation)[:160] + + def _to_governance_event(row: AiGovernanceEvent) -> GovernanceEvent: - details = row.details or {} + details = row.details if isinstance(row.details, dict) else {} return GovernanceEvent( id=row.id, event_type=row.event_type, @@ -96,7 +126,7 @@ def _to_governance_event(row: AiGovernanceEvent) -> GovernanceEvent: resolved_at=row.resolved_at, impact=_extract_impact(details), details=details, - remediation=details.get("remediation"), + remediation=_extract_remediation(details), dispatch_ids=details.get("dispatch_ids", []), ) @@ -239,14 +269,14 @@ async def _query_dispatch_table( d.dispatch_status, d.decision_context, d.playbook_id, - d.created_at, + 
class TestEventsReadSideNormalization:
    """Read-side normalization of ``details.remediation`` on governance events."""

    def test_remediation_dict_is_normalized_to_string(self):
        """A dict-shaped remediation must still be returned as a string."""
        steps = [
            "補齊 ADR-100 SLO emitter",
            "設置 PROMETHEUS_MULTIPROC_DIR",
        ]
        details = {"remediation": {"items": steps}}

        result = _extract_remediation(details)

        assert result == "補齊 ADR-100 SLO emitter;設置 PROMETHEUS_MULTIPROC_DIR"

    def test_governance_event_accepts_dict_remediation(self):
        """A dict remediation must not make GovernanceEvent validation fail."""
        event_details = {
            "message": "SLO metrics missing",
            "remediation": {"items": ["補齊 SLO emitter"]},
        }
        row_attrs = {
            "id": "evt-001",
            "event_type": "governance_slo_data_gap",
            "triggered_at": NOW,
            "resolved": False,
            "resolved_at": None,
            "details": event_details,
        }
        # Lightweight stand-in for an ORM row: attributes live on an ad-hoc class.
        fake_row = type("Row", (), row_attrs)()

        converted = _to_governance_event(fake_row)

        assert converted.remediation == "補齊 SLO emitter"
        assert converted.impact == "SLO metrics missing"
============================================================================= # 3. queue endpoint graceful fallback # ============================================================================= @@ -240,6 +279,18 @@ class TestQueueEndpoint: r = client.get("/api/v1/ai/governance/queue?dispatch_status=unknown") assert r.status_code == 422 + def test_queue_query_uses_production_dispatch_schema(self): + """queue 查詢必須對齊 migration schema:使用 dispatched_at,不讀不存在的 created_at/operator_note.""" + import inspect + + source = inspect.getsource(_query_dispatch_table) + + assert "d.dispatched_at AS created_at" in source + assert "ORDER BY d.dispatched_at DESC" in source + assert "NULL::text AS operator_note" in source + assert "d.created_at" not in source + assert "d.operator_note" not in source + # ============================================================================= # 4. summary endpoint compliance_rate diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 52d670e8..77a8a37a 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,28 @@ +## 2026-05-14 | T17b 治理事件 / dispatch API 查詢修復,解除前端工作鏈路紅燈 + +**背景**:T17A production smoke 顯示 `/awooop/work-items` 可見治理 dispatch 阻塞,但 API 層本身仍有兩個紅燈:`GET /api/v1/ai/governance/events?...` 回 500,`GET /api/v1/ai/governance/queue?dispatch_status=pending` 回 `table_pending=true`。統帥要求前端要能呈現完整流程,不能讓治理告警與 dispatch 階段停在 API 黑盒。 + +**根因**: +- `governance/events`:production `ai_governance_events.details.remediation` 已有 dict 形態,例如 `{"items": [...]}`;read model `GovernanceEvent.remediation` 期待字串,Pydantic validation 造成 500。 +- `governance/queue`:查詢仍讀 `governance_remediation_dispatch.created_at` / `operator_note`,但 production migration schema 實際是 `dispatched_at` / `created_by`,沒有 `created_at` / `operator_note`。 + +**修正**: +- `governance_query_service._extract_remediation()` 將 `details.remediation` 的 string / dict / list 正規化成短字串,避免歷史治理事件破壞 response schema。 +- `_to_governance_event()` 對非 dict details 做 read-side guard。 +- `_query_dispatch_table()` 對齊 
production schema:以 `d.dispatched_at AS created_at`、`NULL::text AS operator_note` 相容現有前端 DTO,不改 DB schema。 +- 補測 `test_ai_governance_endpoints.py`,覆蓋 dict remediation normalization 與 queue 查詢欄位相容性。 + +**本地驗證**: +- `python3 -m py_compile apps/api/src/services/governance_query_service.py apps/api/tests/test_ai_governance_endpoints.py`:pass。 +- `pytest tests/test_ai_governance_endpoints.py tests/test_governance_remediation_dispatch.py -q`:53 passed。 +- `ruff check --select F,E9 src/services/governance_query_service.py tests/test_ai_governance_endpoints.py`:pass。 +- `git diff --check`:pass。 + +**目前整體進度**: +- Alertmanager 低風險自動修復主線:約 96%。 +- 完整 AI 自動化管理產品化:約 75%。 +- T17A 前端工作鏈路已部署;T17B 正在推 governance API 紅燈修復。 + ## 2026-05-14 | T17 AwoooP 工作鏈路前端動態化 + Telegram 歷史補 truth-chain **背景**:統帥要求已完成與推進中的 AI 自動化工作必須在前端頁面可見,不要只靠 Telegram 卡片推測流程階段。同時截圖顯示 Telegram「詳情 / 歷史」在 incident 已有 execution / evidence / KM 記錄時,仍可能回覆「舊 incident 或 Redis 已超期」,造成 operator 無法判斷是否已 AI 自動修復、是否卡在人工作業。 diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index f4e2cba9..5edca0df 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -2117,6 +2117,13 @@ Phase 6 完成後 - Production deploy:`e8c4512a` 已推 Gitea main;Code Review run `2149` success;CD run `2148` tests / build-and-deploy / post-deploy-checks 全 success;deploy marker `687f37d8 chore(cd): deploy e8c4512 [skip ci]`;API / Worker / Web image 均為 `e8c4512a4068d9a781ebcfb97d28be424389c610`;K8s rollout success;health 200 healthy;`/zh-TW/awooop/work-items` 回 200。 - 目前進度更新:Alertmanager 低風險自動修復主線約 96%;完整 AI 自動化管理產品化約 73%。下一步仍是 governance leader/dedupe + ADR-100 SLO emitter、KM stale refresh、Ansible check-mode/apply/rollback audit、write/admin MCP Gateway enforcement、以及 Operator Console 將 Approvals / Monitoring / Tickets / Cost 串成同一工作流。 +**T17b Governance API 
紅燈修復(2026-05-14 台北)**: +- 觸發:T17A 前端工作鏈路已能顯示治理卡點,但 live smoke 發現 `governance/events` 500、`governance/queue` 回 `table_pending=true`,會讓 Operator Console 無法可信呈現治理告警是否被 dispatch、跳過、修復或卡人工。 +- 根因:`ai_governance_events.details.remediation` 在 production 已有 dict/list 形態,read model 仍只收字串;`governance_remediation_dispatch` production schema 使用 `dispatched_at`,查詢卻讀不存在的 `created_at` / `operator_note`。 +- 修正:read-side normalization 將 remediation string/dict/list 正規化成短字串;queue query 改用 `d.dispatched_at AS created_at` 與 `NULL::text AS operator_note` 相容既有 DTO,不改 DB schema。 +- 驗證:`py_compile` pass;`pytest tests/test_ai_governance_endpoints.py tests/test_governance_remediation_dispatch.py -q` 53 passed;ruff F/E9 pass;diff check pass。 +- 目前進度更新:Alertmanager 低風險自動修復主線約 96%;完整 AI 自動化管理產品化約 75%。T17B 推版後,下一段收斂 governance dispatcher skipped reason / leader-dedupe / SLO emitter。 + --- ### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d)