fix(governance): normalize event and dispatch queries
This commit is contained in:
@@ -85,8 +85,38 @@ def _extract_impact(details: dict) -> str:
|
||||
return ""
|
||||
|
||||
|
||||
def _extract_remediation(details: dict) -> str | None:
|
||||
"""
|
||||
將治理事件 details.remediation 正規化為前端可顯示的短字串。
|
||||
|
||||
Production 事件已出現 dict 形態(例如 {"items": [...]}),API response
|
||||
schema 則是字串。這裡做 read-side normalization,避免歷史資料讓
|
||||
/governance events 變成 500。
|
||||
"""
|
||||
remediation = details.get("remediation")
|
||||
if remediation is None:
|
||||
return None
|
||||
if isinstance(remediation, str):
|
||||
return remediation[:160]
|
||||
if isinstance(remediation, dict):
|
||||
for key in ("summary", "message", "reason", "action"):
|
||||
value = remediation.get(key)
|
||||
if isinstance(value, str) and value:
|
||||
return value[:160]
|
||||
items = remediation.get("items")
|
||||
if isinstance(items, list):
|
||||
normalized = [str(item) for item in items if item is not None]
|
||||
if normalized:
|
||||
return ";".join(normalized[:3])[:160]
|
||||
return str(remediation)[:160]
|
||||
if isinstance(remediation, list):
|
||||
normalized = [str(item) for item in remediation if item is not None]
|
||||
return ";".join(normalized[:3])[:160] if normalized else None
|
||||
return str(remediation)[:160]
|
||||
|
||||
|
||||
def _to_governance_event(row: AiGovernanceEvent) -> GovernanceEvent:
|
||||
details = row.details or {}
|
||||
details = row.details if isinstance(row.details, dict) else {}
|
||||
return GovernanceEvent(
|
||||
id=row.id,
|
||||
event_type=row.event_type,
|
||||
@@ -96,7 +126,7 @@ def _to_governance_event(row: AiGovernanceEvent) -> GovernanceEvent:
|
||||
resolved_at=row.resolved_at,
|
||||
impact=_extract_impact(details),
|
||||
details=details,
|
||||
remediation=details.get("remediation"),
|
||||
remediation=_extract_remediation(details),
|
||||
dispatch_ids=details.get("dispatch_ids", []),
|
||||
)
|
||||
|
||||
@@ -239,14 +269,14 @@ async def _query_dispatch_table(
|
||||
d.dispatch_status,
|
||||
d.decision_context,
|
||||
d.playbook_id,
|
||||
d.created_at,
|
||||
d.dispatched_at AS created_at,
|
||||
d.dispatched_at,
|
||||
d.completed_at,
|
||||
d.operator_note
|
||||
NULL::text AS operator_note
|
||||
FROM governance_remediation_dispatch d
|
||||
JOIN ai_governance_events e ON e.id = d.governance_event_id
|
||||
WHERE d.dispatch_status = :dispatch_status
|
||||
ORDER BY d.created_at DESC
|
||||
ORDER BY d.dispatched_at DESC
|
||||
""")
|
||||
|
||||
count_sql = text("""
|
||||
|
||||
@@ -31,6 +31,11 @@ from src.models.governance import (
|
||||
GovernanceSummaryResponse,
|
||||
map_severity,
|
||||
)
|
||||
from src.services.governance_query_service import (
|
||||
_extract_remediation,
|
||||
_query_dispatch_table,
|
||||
_to_governance_event,
|
||||
)
|
||||
|
||||
TAIPEI = timezone(timedelta(hours=8))
|
||||
NOW = datetime(2026, 5, 2, 12, 0, tzinfo=TAIPEI)
|
||||
@@ -183,6 +188,40 @@ class TestEventsEndpoint:
|
||||
assert items[2]["severity"] == "info"
|
||||
|
||||
|
||||
class TestEventsReadSideNormalization:
    """Read-side normalization of governance event payloads."""

    def test_remediation_dict_is_normalized_to_string(self):
        """details.remediation may be a dict; the response must still get a string."""
        steps = [
            "補齊 ADR-100 SLO emitter",
            "設置 PROMETHEUS_MULTIPROC_DIR",
        ]
        details = {"remediation": {"items": steps}}

        result = _extract_remediation(details)

        assert result == "補齊 ADR-100 SLO emitter;設置 PROMETHEUS_MULTIPROC_DIR"

    def test_governance_event_accepts_dict_remediation(self):
        """A dict remediation must not make GovernanceEvent validation fail."""

        # Minimal stand-in exposing the row attributes as class attributes.
        class Row:
            id = "evt-001"
            event_type = "governance_slo_data_gap"
            triggered_at = NOW
            resolved = False
            resolved_at = None
            details = {
                "message": "SLO metrics missing",
                "remediation": {"items": ["補齊 SLO emitter"]},
            }

        converted = _to_governance_event(Row())

        assert converted.remediation == "補齊 SLO emitter"
        assert converted.impact == "SLO metrics missing"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 3. queue endpoint graceful fallback
|
||||
# =============================================================================
|
||||
@@ -240,6 +279,18 @@ class TestQueueEndpoint:
|
||||
r = client.get("/api/v1/ai/governance/queue?dispatch_status=unknown")
|
||||
assert r.status_code == 422
|
||||
|
||||
def test_queue_query_uses_production_dispatch_schema(self):
    """The queue query must match the migration schema.

    It has to select ``dispatched_at`` and must not read the non-existent
    ``created_at`` / ``operator_note`` columns.
    """
    import inspect

    query_source = inspect.getsource(_query_dispatch_table)

    # Fragments the query text must contain.
    required_fragments = (
        "d.dispatched_at AS created_at",
        "ORDER BY d.dispatched_at DESC",
        "NULL::text AS operator_note",
    )
    # Column references that must no longer appear.
    forbidden_fragments = (
        "d.created_at",
        "d.operator_note",
    )

    for fragment in required_fragments:
        assert fragment in query_source
    for fragment in forbidden_fragments:
        assert fragment not in query_source
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 4. summary endpoint compliance_rate
|
||||
|
||||
@@ -1,3 +1,28 @@
|
||||
## 2026-05-14 | T17b 治理事件 / dispatch API 查詢修復,解除前端工作鏈路紅燈
|
||||
|
||||
**背景**:T17A production smoke 顯示 `/awooop/work-items` 可見治理 dispatch 阻塞,但 API 層本身仍有兩個紅燈:`GET /api/v1/ai/governance/events?...` 回 500,`GET /api/v1/ai/governance/queue?dispatch_status=pending` 回 `table_pending=true`。統帥要求前端要能呈現完整流程,不能讓治理告警與 dispatch 階段停在 API 黑盒。
|
||||
|
||||
**根因**:
|
||||
- `governance/events`:production `ai_governance_events.details.remediation` 已有 dict 形態,例如 `{"items": [...]}`;read model `GovernanceEvent.remediation` 期待字串,Pydantic validation 造成 500。
|
||||
- `governance/queue`:查詢仍讀 `governance_remediation_dispatch.created_at` / `operator_note`,但 production migration schema 實際是 `dispatched_at` / `created_by`,沒有 `created_at` / `operator_note`。
|
||||
|
||||
**修正**:
|
||||
- `governance_query_service._extract_remediation()` 將 `details.remediation` 的 string / dict / list 正規化成短字串,避免歷史治理事件破壞 response schema。
|
||||
- `_to_governance_event()` 對非 dict details 做 read-side guard。
|
||||
- `_query_dispatch_table()` 對齊 production schema:以 `d.dispatched_at AS created_at`、`NULL::text AS operator_note` 相容現有前端 DTO,不改 DB schema。
|
||||
- 補測 `test_ai_governance_endpoints.py`,覆蓋 dict remediation normalization 與 queue 查詢欄位相容性。
|
||||
|
||||
**本地驗證**:
|
||||
- `python3 -m py_compile apps/api/src/services/governance_query_service.py apps/api/tests/test_ai_governance_endpoints.py`:pass。
|
||||
- `pytest tests/test_ai_governance_endpoints.py tests/test_governance_remediation_dispatch.py -q`:53 passed。
|
||||
- `ruff check --select F,E9 src/services/governance_query_service.py tests/test_ai_governance_endpoints.py`:pass。
|
||||
- `git diff --check`:pass。
|
||||
|
||||
**目前整體進度**:
|
||||
- Alertmanager 低風險自動修復主線:約 96%。
|
||||
- 完整 AI 自動化管理產品化:約 75%。
|
||||
- T17A 前端工作鏈路已部署;T17b 正在推 governance API 紅燈修復。
|
||||
|
||||
## 2026-05-14 | T17 AwoooP 工作鏈路前端動態化 + Telegram 歷史補 truth-chain
|
||||
|
||||
**背景**:統帥要求已完成與推進中的 AI 自動化工作必須在前端頁面可見,不要只靠 Telegram 卡片推測流程階段。同時截圖顯示 Telegram「詳情 / 歷史」在 incident 已有 execution / evidence / KM 記錄時,仍可能回覆「舊 incident 或 Redis 已超期」,造成 operator 無法判斷是否已 AI 自動修復、是否卡在人工作業。
|
||||
|
||||
@@ -2117,6 +2117,13 @@ Phase 6 完成後
|
||||
- Production deploy:`e8c4512a` 已推 Gitea main;Code Review run `2149` success;CD run `2148` tests / build-and-deploy / post-deploy-checks 全 success;deploy marker `687f37d8 chore(cd): deploy e8c4512 [skip ci]`;API / Worker / Web image 均為 `e8c4512a4068d9a781ebcfb97d28be424389c610`;K8s rollout success;health 200 healthy;`/zh-TW/awooop/work-items` 回 200。
|
||||
- 目前進度更新:Alertmanager 低風險自動修復主線約 96%;完整 AI 自動化管理產品化約 73%。下一步仍是 governance leader/dedupe + ADR-100 SLO emitter、KM stale refresh、Ansible check-mode/apply/rollback audit、write/admin MCP Gateway enforcement、以及 Operator Console 將 Approvals / Monitoring / Tickets / Cost 串成同一工作流。
|
||||
|
||||
**T17b Governance API 紅燈修復(2026-05-14 台北)**:
|
||||
- 觸發:T17A 前端工作鏈路已能顯示治理卡點,但 live smoke 發現 `governance/events` 500、`governance/queue` 回 `table_pending=true`,會讓 Operator Console 無法可信呈現治理告警是否被 dispatch、跳過、修復或卡人工。
|
||||
- 根因:`ai_governance_events.details.remediation` 在 production 已有 dict/list 形態,read model 仍只收字串;`governance_remediation_dispatch` production schema 使用 `dispatched_at`,查詢卻讀不存在的 `created_at` / `operator_note`。
|
||||
- 修正:read-side normalization 將 remediation string/dict/list 正規化成短字串;queue query 改用 `d.dispatched_at AS created_at` 與 `NULL::text AS operator_note` 相容既有 DTO,不改 DB schema。
|
||||
- 驗證:`py_compile` pass;`pytest tests/test_ai_governance_endpoints.py tests/test_governance_remediation_dispatch.py -q` 53 passed;ruff F/E9 pass;diff check pass。
|
||||
- 目前進度更新:Alertmanager 低風險自動修復主線約 96%;完整 AI 自動化管理產品化約 75%。T17b 推版後,下一段收斂 governance dispatcher skipped reason / leader-dedupe / SLO emitter。
|
||||
|
||||
---
|
||||
|
||||
### 2026-04-20 晚 (台北) — C1-C4 全流程串接 — Playbook 鏈路保護(commit de2d34d)
|
||||
|
||||
Reference in New Issue
Block a user