fix(alerts): surface legacy hitl backlog
Some checks failed
CD Pipeline / tests (push) Successful in 1m21s
Code Review / ai-code-review (push) Successful in 13s
Type Sync Check / check-type-sync (push) Failing after 40s
CD Pipeline / build-and-deploy (push) Successful in 5m22s
CD Pipeline / post-deploy-checks (push) Successful in 2m19s
Some checks failed
CD Pipeline / tests (push) Successful in 1m21s
Code Review / ai-code-review (push) Successful in 13s
Type Sync Check / check-type-sync (push) Failing after 40s
CD Pipeline / build-and-deploy (push) Successful in 5m22s
CD Pipeline / post-deploy-checks (push) Successful in 2m19s
This commit is contained in:
@@ -167,6 +167,8 @@ class ApprovalRequest(ApprovalRequestBase):
|
||||
fingerprint: str | None = Field(default=None, description="告警指紋 Hash")
|
||||
hit_count: int = Field(default=1, description="聚合觸發次數")
|
||||
last_seen_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc), description="最後觸發時間")
|
||||
telegram_message_id: int | None = Field(default=None, description="Telegram approval card message ID")
|
||||
telegram_chat_id: int | None = Field(default=None, description="Telegram chat ID for the approval card")
|
||||
# 2026-04-14 Claude Sonnet 4.6: incident_id 已移至 Base(避免 ApprovalRequestCreate 缺欄位)
|
||||
|
||||
@property
|
||||
@@ -216,6 +218,10 @@ class ApprovalRequestResponse(BaseModel):
|
||||
hit_count: int = 1
|
||||
last_seen_at: datetime | None = None
|
||||
# Phase 6.5: Incident 關聯 (用於簽核後更新 Incident 狀態)
|
||||
incident_id: str | None = None
|
||||
matched_playbook_id: str | None = None
|
||||
telegram_message_id: int | None = None
|
||||
telegram_chat_id: int | None = None
|
||||
metadata: dict | None = None
|
||||
|
||||
@classmethod
|
||||
@@ -241,6 +247,10 @@ class ApprovalRequestResponse(BaseModel):
|
||||
hit_count=approval.hit_count,
|
||||
last_seen_at=approval.last_seen_at,
|
||||
# Phase 6.5
|
||||
incident_id=approval.incident_id,
|
||||
matched_playbook_id=approval.matched_playbook_id,
|
||||
telegram_message_id=approval.telegram_message_id,
|
||||
telegram_chat_id=approval.telegram_chat_id,
|
||||
metadata=approval.metadata,
|
||||
)
|
||||
|
||||
|
||||
@@ -106,6 +106,8 @@ def _record_to_request(record: ApprovalRecord) -> ApprovalRequest:
|
||||
# B4 fix 2026-04-24 ogt + Claude Sonnet 4.6: 補回 DB 欄位(人工審核路徑讀回必要)
|
||||
incident_id=getattr(record, "incident_id", None),
|
||||
matched_playbook_id=getattr(record, "matched_playbook_id", None),
|
||||
telegram_message_id=getattr(record, "telegram_message_id", None),
|
||||
telegram_chat_id=getattr(record, "telegram_chat_id", None),
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -110,10 +110,10 @@ def approval_record_to_request(record: ApprovalRecord) -> ApprovalRequest:
|
||||
hit_count=record.hit_count,
|
||||
last_seen_at=record.last_seen_at,
|
||||
# B3 fix 2026-04-24 ogt + Claude Sonnet 4.6: 補回 DB 欄位(人工審核路徑讀回必要)
|
||||
# incident_id / matched_playbook_id 在 ApprovalRequest 基礎模型中有定義
|
||||
# telegram_message_id / telegram_chat_id 只在 DB model,不在 Pydantic ApprovalRequest
|
||||
incident_id=getattr(record, "incident_id", None),
|
||||
matched_playbook_id=getattr(record, "matched_playbook_id", None),
|
||||
telegram_message_id=getattr(record, "telegram_message_id", None),
|
||||
telegram_chat_id=getattr(record, "telegram_chat_id", None),
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -53,6 +53,9 @@ class AlertPipelineStats:
|
||||
total_24h: int = 0
|
||||
auto_resolved_24h: int = 0
|
||||
pending_approval: int = 0
|
||||
pending_actionable: int = 0
|
||||
pending_observe_only: int = 0
|
||||
pending_without_telegram: int = 0
|
||||
execution_success_24h: int = 0
|
||||
execution_failed_24h: int = 0
|
||||
|
||||
@@ -524,18 +527,46 @@ class HeartbeatReportService:
|
||||
from src.db.base import get_db_context
|
||||
async with get_db_context() as db:
|
||||
r = await db.execute(sa_text("""
|
||||
WITH scoped AS (
|
||||
SELECT
|
||||
*,
|
||||
(
|
||||
btrim(coalesce(action, '')) = ''
|
||||
OR UPPER(action) LIKE 'OBSERVE%'
|
||||
OR UPPER(action) LIKE 'INVESTIGATE%'
|
||||
OR UPPER(action) LIKE 'NO_ACTION%'
|
||||
OR UPPER(action) LIKE '% NO_ACTION%'
|
||||
OR UPPER(action) LIKE '%| NO_ACTION%'
|
||||
) AS is_observe_only
|
||||
FROM approval_records
|
||||
WHERE created_at >= NOW() - interval '24 hours'
|
||||
)
|
||||
SELECT
|
||||
COUNT(*) AS total,
|
||||
COUNT(*) FILTER (WHERE UPPER(status::text) = 'PENDING') AS pending,
|
||||
COUNT(*) FILTER (
|
||||
WHERE UPPER(status::text) = 'PENDING'
|
||||
AND NOT is_observe_only
|
||||
) AS pending_actionable,
|
||||
COUNT(*) FILTER (
|
||||
WHERE UPPER(status::text) = 'PENDING'
|
||||
AND is_observe_only
|
||||
) AS pending_observe_only,
|
||||
COUNT(*) FILTER (
|
||||
WHERE UPPER(status::text) = 'PENDING'
|
||||
AND telegram_message_id IS NULL
|
||||
) AS pending_without_telegram,
|
||||
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success,
|
||||
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed,
|
||||
COUNT(*) FILTER (WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED')) AS auto_resolved
|
||||
FROM approval_records
|
||||
WHERE created_at >= NOW() - interval '24 hours'
|
||||
FROM scoped
|
||||
"""))
|
||||
row = r.one()
|
||||
stats.total_24h = int(row.total or 0)
|
||||
stats.pending_approval = int(row.pending or 0)
|
||||
stats.pending_actionable = int(row.pending_actionable or 0)
|
||||
stats.pending_observe_only = int(row.pending_observe_only or 0)
|
||||
stats.pending_without_telegram = int(row.pending_without_telegram or 0)
|
||||
stats.execution_success_24h = int(row.success or 0)
|
||||
stats.execution_failed_24h = int(row.failed or 0)
|
||||
stats.auto_resolved_24h = int(row.auto_resolved or 0)
|
||||
@@ -762,9 +793,12 @@ class HeartbeatReportService:
|
||||
if not report.db_redis.redis_ok:
|
||||
warnings.append(f"Redis: {report.db_redis.redis_status}")
|
||||
|
||||
# Pending 積壓告警
|
||||
if report.alert_pipeline.pending_approval > 10:
|
||||
warnings.append(f"PENDING 積壓 {report.alert_pipeline.pending_approval} 筆,需人工處理")
|
||||
# Pending 積壓告警:只用可執行/有風險待審計數觸發,避免 OBSERVE/NO_ACTION 觀察卡造成假待辦。
|
||||
if report.alert_pipeline.pending_actionable > 10:
|
||||
warnings.append(
|
||||
f"待人工審核 {report.alert_pipeline.pending_actionable} 筆"
|
||||
f"(前台 /awooop/approvals;觀察類 {report.alert_pipeline.pending_observe_only} 筆另列)"
|
||||
)
|
||||
|
||||
# Pod 異常 — 2026-05-03 Claude Opus 4.7 + 統帥 ogt:P0 #3 完整 K8s pod state machine
|
||||
# K8s pod phases (https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/):
|
||||
@@ -906,6 +940,10 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
|
||||
lines.append("")
|
||||
lines.append("📊 <b>告警流水線(24h)</b>")
|
||||
lines.append(f"├─ 總計: {ap.total_24h} PENDING: {ap.pending_approval}")
|
||||
lines.append(
|
||||
f"├─ 待審拆分: 人工 {ap.pending_actionable} 觀察 {ap.pending_observe_only}"
|
||||
f" 無TG {ap.pending_without_telegram}"
|
||||
)
|
||||
if ap.execution_success_24h > 0 and ap.execution_failed_24h == 0:
|
||||
exec_icon = "✅"
|
||||
elif ap.execution_failed_24h > 0:
|
||||
|
||||
122
apps/api/tests/test_approval_pending_visibility.py
Normal file
122
apps/api/tests/test_approval_pending_visibility.py
Normal file
@@ -0,0 +1,122 @@
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from types import SimpleNamespace
|
||||
from uuid import UUID
|
||||
|
||||
from src.models.approval import (
|
||||
ApprovalRequest,
|
||||
ApprovalRequestResponse,
|
||||
ApprovalStatus,
|
||||
RiskLevel,
|
||||
)
|
||||
from src.services.approval_db import approval_record_to_request
|
||||
from src.services.heartbeat_report_service import (
|
||||
AlertPipelineStats,
|
||||
DbRedisStats,
|
||||
FlywheelStats,
|
||||
HeartbeatReport,
|
||||
HeartbeatReportService,
|
||||
)
|
||||
|
||||
|
||||
def test_approval_response_exposes_incident_delivery_and_playbook_fields():
    """Check that the response model copies the incident, playbook and Telegram fields.

    An ``ApprovalRequest`` is built with all four fields set, converted with
    ``ApprovalRequestResponse.from_approval``, and each field is compared
    against the value that was passed in.
    """
    # The four fields under test, kept in one place so the construction and
    # the assertions cannot drift apart.
    visible_fields = {
        "incident_id": "INC-20260531-VISIBLE",
        "matched_playbook_id": "PB-20260531-VISIBLE",
        "telegram_message_id": 98765,
        "telegram_chat_id": -1001234567890,
    }
    request = ApprovalRequest(
        id=UUID("11111111-1111-1111-1111-111111111111"),
        action="kubectl rollout restart deployment/awoooi-api",
        description="manual gate",
        risk_level=RiskLevel.MEDIUM,
        requested_by="OpenClaw",
        required_signatures=1,
        **visible_fields,
    )

    payload = ApprovalRequestResponse.from_approval(request)

    for field_name, expected_value in visible_fields.items():
        assert getattr(payload, field_name) == expected_value
|
||||
|
||||
|
||||
def test_approval_db_converter_preserves_incident_and_telegram_fields():
    """Check that ``approval_record_to_request`` keeps incident, playbook and Telegram fields.

    A ``SimpleNamespace`` stands in for the DB record, so the converter is
    exercised without a database session.
    """
    created = datetime.now(UTC)
    # Fields the converter must carry over unchanged.
    preserved_fields = {
        "incident_id": "INC-20260531-LEGACY",
        "matched_playbook_id": "PB-20260531-LEGACY",
        "telegram_message_id": 45678,
        "telegram_chat_id": -1001234567890,
    }
    no_impact = {
        "affected_pods": 0,
        "estimated_downtime": "0",
        "related_services": [],
        "data_impact": "none",
    }
    record = SimpleNamespace(
        id="22222222-2222-2222-2222-222222222222",
        action="OBSERVE",
        description="[LLM Failed] observe only",
        status=ApprovalStatus.PENDING,
        risk_level=RiskLevel.MEDIUM,
        blast_radius=no_impact,
        dry_run_checks=[],
        required_signatures=1,
        current_signatures=0,
        signatures=[],
        requested_by="OpenClaw (fallback)",
        created_at=created,
        expires_at=created + timedelta(hours=1),
        resolved_at=None,
        rejection_reason=None,
        extra_metadata={"source": "fallback"},
        fingerprint="abc123",
        hit_count=2,
        last_seen_at=created,
        **preserved_fields,
    )

    converted = approval_record_to_request(record)

    for field_name, expected_value in preserved_fields.items():
        assert getattr(converted, field_name) == expected_value
|
||||
|
||||
|
||||
def _report_with_pipeline(stats: AlertPipelineStats) -> HeartbeatReport:
    """Build a ``HeartbeatReport`` around *stats* for the warning tests.

    The DB/Redis section is created with ``db_ok`` and ``redis_ok`` set to
    True and the flywheel section with ``playbook_count=1``; the alert
    pipeline section is the caller-supplied *stats*.
    """
    storage_ok = DbRedisStats(
        db_ok=True,
        db_status="ok",
        redis_ok=True,
        redis_status="ok",
    )
    flywheel = FlywheelStats(playbook_count=1)
    return HeartbeatReport(
        timestamp=datetime.now(UTC),
        flywheel=flywheel,
        db_redis=storage_ok,
        alert_pipeline=stats,
    )
|
||||
|
||||
|
||||
def test_heartbeat_does_not_warn_when_pending_backlog_is_observe_only():
    """Check that no backlog warning is raised when only 1 of 21 pending items is actionable.

    Neither the new manual-review wording nor the old raw-PENDING wording may
    appear in the warnings built for this report.
    """
    pipeline = AlertPipelineStats(
        total_24h=25,
        pending_approval=21,
        pending_actionable=1,
        pending_observe_only=20,
    )
    report = _report_with_pipeline(pipeline)

    warnings = HeartbeatReportService()._build_warnings(report)

    assert all("待人工審核" not in warning for warning in warnings)
    assert all("PENDING 積壓" not in warning for warning in warnings)
|
||||
|
||||
|
||||
def test_heartbeat_warns_with_frontend_route_for_actionable_backlog():
    """Check that 11 actionable pending items produce a warning with the route and observe count.

    At least one warning must mention the ``/awooop/approvals`` front-end
    route, and at least one must report the observe-only count of 10.
    """
    pipeline = AlertPipelineStats(
        total_24h=25,
        pending_approval=21,
        pending_actionable=11,
        pending_observe_only=10,
    )
    report = _report_with_pipeline(pipeline)

    warnings = HeartbeatReportService()._build_warnings(report)

    mentions_route = [w for w in warnings if "/awooop/approvals" in w]
    mentions_observe_count = [w for w in warnings if "觀察類 10" in w]
    assert mentions_route
    assert mentions_observe_count
|
||||
@@ -1,3 +1,44 @@
|
||||
## 2026-05-31|Legacy HITL PENDING 前台可見性與心跳拆分
|
||||
|
||||
**背景**:
|
||||
|
||||
- Production `GET /api/v1/approvals/pending` 顯示 legacy HITL backlog `count=21`,但 `GET /api/v1/platform/approvals` 回 `total=0`,因此 AwoooP 新式 run approvals 看起來是空的。
|
||||
- 這批積壓來自 legacy `approval_records`:其中約 10 筆為 `OpenClaw (fallback) / OBSERVE / medium` 觀察卡,另有舊 fallback kubectl action 與 1 筆 rule-engine MinIO SSH 調查動作。
|
||||
- `/api/v1/approvals/pending` 原本沒有在 response model 外露 `incident_id`、`matched_playbook_id`、`telegram_message_id`、`telegram_chat_id`,前台 legacy panel 即使 fetch 到 backlog,也缺少 incident / Telegram delivery truth。
|
||||
- Heartbeat 原本用 raw `pending_approval > 10` 觸發 `PENDING 積壓 ... 需人工處理`,會把 OBSERVE / NO_ACTION 觀察卡與真正可執行人工審批混在一起,造成持續告警噪音。
|
||||
|
||||
**本次調整**:
|
||||
|
||||
- `ApprovalRequest` / `ApprovalRequestResponse` 補齊 legacy HITL 可見欄位:`incident_id`、`matched_playbook_id`、`telegram_message_id`、`telegram_chat_id`。
|
||||
- `approval_db.approval_record_to_request()` 與 `approval_repository._record_to_request()` 保留 DB 中的 incident / playbook / Telegram 欄位,不再在 Pydantic 轉換時遺失。
|
||||
- `HeartbeatReportService` 將 24h pending 拆成:
|
||||
- `pending_actionable`
|
||||
- `pending_observe_only`
|
||||
- `pending_without_telegram`
|
||||
- Heartbeat warning 改為只在 `pending_actionable > 10` 時提示,訊息帶 `/awooop/approvals` 前台入口與 observe-only 計數;Telegram heartbeat 同步顯示 pending 拆分。
|
||||
- 保留 legacy approvals 的人工決策邊界:沒有批次 approve/reject 生產 PENDING,因為舊 kubectl / SSH action 仍可能造成生產變更,需 operator 在前台逐筆判斷。
|
||||
|
||||
**Verification**:
|
||||
|
||||
```text
|
||||
python3 -m py_compile apps/api/src/models/approval.py apps/api/src/services/approval_db.py apps/api/src/repositories/approval_repository.py apps/api/src/services/heartbeat_report_service.py apps/api/tests/test_approval_pending_visibility.py
|
||||
-> pass
|
||||
/Users/ogt/.pyenv/shims/ruff check apps/api/tests/test_approval_pending_visibility.py
|
||||
-> pass
|
||||
DATABASE_URL='postgresql+asyncpg://test:test@localhost/test' /Users/ogt/.pyenv/shims/pytest apps/api/tests/test_approval_pending_visibility.py -q
|
||||
-> 4 passed
|
||||
DATABASE_URL='postgresql+asyncpg://test:test@localhost/test' /Users/ogt/.pyenv/shims/pytest apps/api/tests/test_heartbeat_ollama_endpoints.py apps/api/tests/test_heartbeat_pod_state_machine.py -q
|
||||
-> 15 passed
|
||||
git diff --check
|
||||
-> pass
|
||||
```
|
||||
|
||||
**判讀 / 下一步**:
|
||||
|
||||
- `需人工處理` 的正確入口是 `/awooop/approvals` 的 legacy HITL backlog;舊 fallback kubectl action 不應盲目批准,應逐筆 reject stale action 或重新診斷。
|
||||
- OBSERVE / NO_ACTION 類卡片不再被當成 emergency manual backlog,但仍會在拆分數字中保留,避免把觀察訊號隱藏。
|
||||
- 後續可再處理 fallback LLM failure branch 為何大量建立 `OBSERVE / medium` 卡片;本輪先修可見性與告警準確度,不改 agent 後續更新 PENDING action 的行為。
|
||||
|
||||
## 2026-05-31|CD source-link gate 過期與 pipefail 修復
|
||||
|
||||
**背景**:
|
||||
|
||||
@@ -2665,6 +2665,12 @@ Phase 6 完成後
|
||||
- 判讀:T135 已把 runner ownership 從雙 runner 搶工收斂到 host runner 單一主控;下一段不要重新啟用 Docker-wrapped runner,而是做 runner pool / repo label 隔離、API image `apt-get` / `chown -R` 分層、Web build cache/offload、Playwright apt source-list hygiene。
|
||||
- 目前進度更新:AwoooP 告警可觀測鏈約 99.998%;Incident-level source correlation 可見性約 98.8%;Source correlation apply 狀態鏈可驗證性約 99.72%;Source correlation freshness / rolling gate 約 98.2%;前端 AI 自動化管理介面同步約 99.999%;Dashboard snapshot / SSE console noise 收斂約 99.2%;CI/CD runner hygiene 約 99.2%;Runner ownership 收斂約 96%;Build host pressure 治理約 82%;完整 AI 自動化管理產品化約 99.960%。
|
||||
|
||||
**T153 Legacy HITL pending visibility + warning split(2026-05-31 台北)**:
|
||||
- 觸發:production `/api/v1/approvals/pending` 有 legacy HITL backlog `count=21`,但 `/api/v1/platform/approvals` 回 `total=0`,operator 會以為前台沒有人工待辦;同時 heartbeat 用 raw PENDING 數量觸發「PENDING 積壓,需人工處理」,把 OBSERVE / NO_ACTION 觀察卡與真正可執行審批混在一起。
|
||||
- 修正:`ApprovalRequest` / `ApprovalRequestResponse` 外露 `incident_id`、`matched_playbook_id`、`telegram_message_id`、`telegram_chat_id`;DB / repository converter 保留 legacy approval record 的 incident / playbook / Telegram delivery 欄位。`HeartbeatReportService` 將 24h pending 拆成 actionable、observe-only、without-TG;warning 只看 actionable backlog,並在訊息指向 `/awooop/approvals`,Telegram heartbeat 顯示「待審拆分」。
|
||||
- Verification:API py_compile pass;targeted ruff for new test pass;`test_approval_pending_visibility.py` 4 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` 15 passed;`git diff --check` pass。
|
||||
- 判讀:T153 不批次 approve/reject 生產 PENDING,也不把觀察卡刪掉;它把「前台看得到 legacy HITL 事實」與「告警只針對真正人工 actionable backlog」補齊。舊 fallback kubectl / SSH action 仍需 operator 在 `/awooop/approvals` 逐筆決策;OBSERVE / NO_ACTION 類不再偽裝成 emergency manual backlog。下一段可追 LLM failure fallback 為何大量產生 `OBSERVE / medium` 卡片,但需避免破壞 agent 後續把 PENDING 更新成可執行 action 的路徑。
|
||||
|
||||
**T152 Ansible runtime readiness surfaced(2026-05-24 台北)**:
|
||||
- 觸發:T151 已讓首頁看到 execution backend / Ansible attribution,但 operator 仍看不到 runtime 端缺什麼,容易把「Ansible 有候選」誤解成「Ansible 已能自動修復」。
|
||||
- 修正:API image 複製 `infra/ansible/` 作 read-only catalog;`truth-chain/quality/summary` 新增 `ansible_runtime`,回報 playbook binary、catalog、inventory、playbook_count、can_run_check_mode、blockers。首頁 execution evidence 同步顯示 runtime 狀態;目前 production 顯示 `runtime 未就緒:ansible_playbook_binary_missing`。未安裝 `ansible-core`、未啟用 check-mode / apply。
|
||||
|
||||
Reference in New Issue
Block a user