From cd17a6777486b52a0d439ef496878ae33ca5b2ba Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 31 May 2026 13:16:18 +0800 Subject: [PATCH] fix(alerts): surface legacy hitl backlog --- apps/api/src/models/approval.py | 10 ++ .../src/repositories/approval_repository.py | 2 + apps/api/src/services/approval_db.py | 4 +- .../src/services/heartbeat_report_service.py | 48 ++++++- .../tests/test_approval_pending_visibility.py | 122 ++++++++++++++++++ docs/LOGBOOK.md | 41 ++++++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 6 + 7 files changed, 226 insertions(+), 7 deletions(-) create mode 100644 apps/api/tests/test_approval_pending_visibility.py diff --git a/apps/api/src/models/approval.py b/apps/api/src/models/approval.py index 9fa15597..6a6ce1a9 100644 --- a/apps/api/src/models/approval.py +++ b/apps/api/src/models/approval.py @@ -167,6 +167,8 @@ class ApprovalRequest(ApprovalRequestBase): fingerprint: str | None = Field(default=None, description="告警指紋 Hash") hit_count: int = Field(default=1, description="聚合觸發次數") last_seen_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc), description="最後觸發時間") + telegram_message_id: int | None = Field(default=None, description="Telegram approval card message ID") + telegram_chat_id: int | None = Field(default=None, description="Telegram chat ID for the approval card") # 2026-04-14 Claude Sonnet 4.6: incident_id 已移至 Base(避免 ApprovalRequestCreate 缺欄位) @property @@ -216,6 +218,10 @@ class ApprovalRequestResponse(BaseModel): hit_count: int = 1 last_seen_at: datetime | None = None # Phase 6.5: Incident 關聯 (用於簽核後更新 Incident 狀態) + incident_id: str | None = None + matched_playbook_id: str | None = None + telegram_message_id: int | None = None + telegram_chat_id: int | None = None metadata: dict | None = None @classmethod @@ -241,6 +247,10 @@ class ApprovalRequestResponse(BaseModel): hit_count=approval.hit_count, last_seen_at=approval.last_seen_at, # Phase 6.5 + incident_id=approval.incident_id, + matched_playbook_id=approval.matched_playbook_id, + telegram_message_id=approval.telegram_message_id, + telegram_chat_id=approval.telegram_chat_id, metadata=approval.metadata, ) diff --git a/apps/api/src/repositories/approval_repository.py b/apps/api/src/repositories/approval_repository.py index 6dfc53cb..496bc9c5 100644 --- a/apps/api/src/repositories/approval_repository.py +++ b/apps/api/src/repositories/approval_repository.py @@ -106,6 +106,8 @@ def _record_to_request(record: ApprovalRecord) -> ApprovalRequest: # B4 fix 2026-04-24 ogt + Claude Sonnet 4.6: 補回 DB 欄位(人工審核路徑讀回必要) incident_id=getattr(record, "incident_id", None), matched_playbook_id=getattr(record, "matched_playbook_id", None), + telegram_message_id=getattr(record, "telegram_message_id", None), + telegram_chat_id=getattr(record, "telegram_chat_id", None), ) diff --git a/apps/api/src/services/approval_db.py b/apps/api/src/services/approval_db.py index 5630b696..608d1121 100644 --- a/apps/api/src/services/approval_db.py +++ b/apps/api/src/services/approval_db.py @@ -110,10 +110,10 @@ def approval_record_to_request(record: ApprovalRecord) -> ApprovalRequest: hit_count=record.hit_count, last_seen_at=record.last_seen_at, # B3 fix 2026-04-24 ogt + Claude Sonnet 4.6: 補回 DB 欄位(人工審核路徑讀回必要) - # incident_id / matched_playbook_id 在 ApprovalRequest 基礎模型中有定義 - # telegram_message_id / telegram_chat_id 只在 DB model,不在 Pydantic ApprovalRequest incident_id=getattr(record, "incident_id", None), matched_playbook_id=getattr(record, "matched_playbook_id", None), + telegram_message_id=getattr(record, "telegram_message_id", None), + telegram_chat_id=getattr(record, "telegram_chat_id", None), ) diff --git a/apps/api/src/services/heartbeat_report_service.py b/apps/api/src/services/heartbeat_report_service.py index dc6b9b74..5768b709 100644 --- a/apps/api/src/services/heartbeat_report_service.py +++ b/apps/api/src/services/heartbeat_report_service.py @@ -53,6 +53,9 @@ class AlertPipelineStats: total_24h: int = 0 auto_resolved_24h: int = 0 pending_approval: int = 0 + pending_actionable: int = 0 + pending_observe_only: int = 0 + pending_without_telegram: int = 0 execution_success_24h: int = 0 execution_failed_24h: int = 0 @@ -524,18 +527,46 @@ class HeartbeatReportService: from src.db.base import get_db_context async with get_db_context() as db: r = await db.execute(sa_text(""" + WITH scoped AS ( + SELECT + *, + ( + btrim(coalesce(action, '')) = '' + OR UPPER(action) LIKE 'OBSERVE%' + OR UPPER(action) LIKE 'INVESTIGATE%' + OR UPPER(action) LIKE 'NO_ACTION%' + OR UPPER(action) LIKE '% NO_ACTION%' + OR UPPER(action) LIKE '%| NO_ACTION%' + ) AS is_observe_only + FROM approval_records + WHERE created_at >= NOW() - interval '24 hours' + ) SELECT COUNT(*) AS total, COUNT(*) FILTER (WHERE UPPER(status::text) = 'PENDING') AS pending, + COUNT(*) FILTER ( + WHERE UPPER(status::text) = 'PENDING' + AND NOT is_observe_only + ) AS pending_actionable, + COUNT(*) FILTER ( + WHERE UPPER(status::text) = 'PENDING' + AND is_observe_only + ) AS pending_observe_only, + COUNT(*) FILTER ( + WHERE UPPER(status::text) = 'PENDING' + AND telegram_message_id IS NULL + ) AS pending_without_telegram, COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success, COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed, COUNT(*) FILTER (WHERE UPPER(status::text) IN ('APPROVED','EXECUTION_SUCCESS','EXECUTION_FAILED')) AS auto_resolved - FROM approval_records - WHERE created_at >= NOW() - interval '24 hours' + FROM scoped """)) row = r.one() stats.total_24h = int(row.total or 0) stats.pending_approval = int(row.pending or 0) + stats.pending_actionable = int(row.pending_actionable or 0) + stats.pending_observe_only = int(row.pending_observe_only or 0) + stats.pending_without_telegram = int(row.pending_without_telegram or 0) stats.execution_success_24h = int(row.success or 0) stats.execution_failed_24h = int(row.failed or 0) stats.auto_resolved_24h = int(row.auto_resolved or 0) @@ -762,9 +793,12 @@ class HeartbeatReportService: if not report.db_redis.redis_ok: warnings.append(f"Redis: {report.db_redis.redis_status}") - # Pending 積壓告警 - if report.alert_pipeline.pending_approval > 10: - warnings.append(f"PENDING 積壓 {report.alert_pipeline.pending_approval} 筆,需人工處理") + # Pending 積壓告警:只用可執行/有風險待審計數觸發,避免 OBSERVE/NO_ACTION 觀察卡造成假待辦。 + if report.alert_pipeline.pending_actionable > 10: + warnings.append( + f"待人工審核 {report.alert_pipeline.pending_actionable} 筆" + f"(前台 /awooop/approvals;觀察類 {report.alert_pipeline.pending_observe_only} 筆另列)" + ) # Pod 異常 — 2026-05-03 Claude Opus 4.7 + 統帥 ogt:P0 #3 完整 K8s pod state machine # K8s pod phases (https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/): @@ -906,6 +940,10 @@ def report_to_telegram_html(report: HeartbeatReport) -> str: lines.append("") lines.append("📊 告警流水線(24h)") lines.append(f"├─ 總計: {ap.total_24h} PENDING: {ap.pending_approval}") + lines.append( + f"├─ 待審拆分: 人工 {ap.pending_actionable} 觀察 {ap.pending_observe_only}" + f" 無TG {ap.pending_without_telegram}" + ) if ap.execution_success_24h > 0 and ap.execution_failed_24h == 0: exec_icon = "✅" elif ap.execution_failed_24h > 0: diff --git a/apps/api/tests/test_approval_pending_visibility.py b/apps/api/tests/test_approval_pending_visibility.py new file mode 100644 index 00000000..5f06cf11 --- /dev/null +++ b/apps/api/tests/test_approval_pending_visibility.py @@ -0,0 +1,122 @@ +from datetime import UTC, datetime, timedelta +from types import SimpleNamespace +from uuid import UUID + +from src.models.approval import ( + ApprovalRequest, + ApprovalRequestResponse, + ApprovalStatus, + RiskLevel, +) +from src.services.approval_db import approval_record_to_request +from src.services.heartbeat_report_service import ( + AlertPipelineStats, + DbRedisStats, + FlywheelStats, + HeartbeatReport, + HeartbeatReportService, +) + + +def test_approval_response_exposes_incident_delivery_and_playbook_fields(): + approval = ApprovalRequest( + id=UUID("11111111-1111-1111-1111-111111111111"), + action="kubectl rollout restart deployment/awoooi-api", + description="manual gate", + risk_level=RiskLevel.MEDIUM, + requested_by="OpenClaw", + required_signatures=1, + incident_id="INC-20260531-VISIBLE", + matched_playbook_id="PB-20260531-VISIBLE", + telegram_message_id=98765, + telegram_chat_id=-1001234567890, + ) + + response = ApprovalRequestResponse.from_approval(approval) + + assert response.incident_id == "INC-20260531-VISIBLE" + assert response.matched_playbook_id == "PB-20260531-VISIBLE" + assert response.telegram_message_id == 98765 + assert response.telegram_chat_id == -1001234567890 + + +def test_approval_db_converter_preserves_incident_and_telegram_fields(): + now = datetime.now(UTC) + record = SimpleNamespace( + id="22222222-2222-2222-2222-222222222222", + action="OBSERVE", + description="[LLM Failed] observe only", + status=ApprovalStatus.PENDING, + risk_level=RiskLevel.MEDIUM, + blast_radius={ + "affected_pods": 0, + "estimated_downtime": "0", + "related_services": [], + "data_impact": "none", + }, + dry_run_checks=[], + required_signatures=1, + current_signatures=0, + signatures=[], + requested_by="OpenClaw (fallback)", + created_at=now, + expires_at=now + timedelta(hours=1), + resolved_at=None, + rejection_reason=None, + extra_metadata={"source": "fallback"}, + fingerprint="abc123", + hit_count=2, + last_seen_at=now, + incident_id="INC-20260531-LEGACY", + matched_playbook_id="PB-20260531-LEGACY", + telegram_message_id=45678, + telegram_chat_id=-1001234567890, + ) + + approval = approval_record_to_request(record) + + assert approval.incident_id == "INC-20260531-LEGACY" + assert approval.matched_playbook_id == "PB-20260531-LEGACY" + assert approval.telegram_message_id == 45678 + assert approval.telegram_chat_id == -1001234567890 + + +def _report_with_pipeline(stats: AlertPipelineStats) -> HeartbeatReport: + return HeartbeatReport( + timestamp=datetime.now(UTC), + flywheel=FlywheelStats(playbook_count=1), + db_redis=DbRedisStats(db_ok=True, db_status="ok", redis_ok=True, redis_status="ok"), + alert_pipeline=stats, + ) + + +def test_heartbeat_does_not_warn_when_pending_backlog_is_observe_only(): + report = _report_with_pipeline( + AlertPipelineStats( + total_24h=25, + pending_approval=21, + pending_actionable=1, + pending_observe_only=20, + ) + ) + + warnings = HeartbeatReportService()._build_warnings(report) + + assert not any("待人工審核" in warning for warning in warnings) + assert not any("PENDING 積壓" in warning for warning in warnings) + + +def test_heartbeat_warns_with_frontend_route_for_actionable_backlog(): + report = _report_with_pipeline( + AlertPipelineStats( + total_24h=25, + pending_approval=21, + pending_actionable=11, + pending_observe_only=10, + ) + ) + + warnings = HeartbeatReportService()._build_warnings(report) + + assert any("/awooop/approvals" in warning for warning in warnings) + assert any("觀察類 10" in warning for warning in warnings) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 64230787..4b6b711c 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,44 @@ +## 2026-05-31|Legacy HITL PENDING 前台可見性與心跳拆分 + +**背景**: + +- Production `GET /api/v1/approvals/pending` 顯示 legacy HITL backlog `count=21`,但 `GET /api/v1/platform/approvals` 回 `total=0`,因此 AwoooP 新式 run approvals 看起來是空的。 +- 這批積壓來自 legacy `approval_records`:其中約 10 筆為 `OpenClaw (fallback) / OBSERVE / medium` 觀察卡,另有舊 fallback kubectl action 與 1 筆 rule-engine MinIO SSH 調查動作。 +- `/api/v1/approvals/pending` 原本沒有在 response model 外露 `incident_id`、`matched_playbook_id`、`telegram_message_id`、`telegram_chat_id`,前台 legacy panel 即使 fetch 到 backlog,也缺少 incident / Telegram delivery truth。 +- Heartbeat 原本用 raw `pending_approval > 10` 觸發 `PENDING 積壓 ... 需人工處理`,會把 OBSERVE / NO_ACTION 觀察卡與真正可執行人工審批混在一起,造成持續告警噪音。 + +**本次調整**: + +- `ApprovalRequest` / `ApprovalRequestResponse` 補齊 legacy HITL 可見欄位:`incident_id`、`matched_playbook_id`、`telegram_message_id`、`telegram_chat_id`。 +- `approval_db.approval_record_to_request()` 與 `approval_repository._record_to_request()` 保留 DB 中的 incident / playbook / Telegram 欄位,不再在 Pydantic 轉換時遺失。 +- `HeartbeatReportService` 將 24h pending 拆成: + - `pending_actionable` + - `pending_observe_only` + - `pending_without_telegram` +- Heartbeat warning 改為只在 `pending_actionable > 10` 時提示,訊息帶 `/awooop/approvals` 前台入口與 observe-only 計數;Telegram heartbeat 同步顯示 pending 拆分。 +- 保留 legacy approvals 的人工決策邊界:沒有批次 approve/reject 生產 PENDING,因為舊 kubectl / SSH action 仍可能造成生產變更,需 operator 在前台逐筆判斷。 + +**Verification**: + +```text +python3 -m py_compile apps/api/src/models/approval.py apps/api/src/services/approval_db.py apps/api/src/repositories/approval_repository.py apps/api/src/services/heartbeat_report_service.py apps/api/tests/test_approval_pending_visibility.py + -> pass +/Users/ogt/.pyenv/shims/ruff check apps/api/tests/test_approval_pending_visibility.py + -> pass +DATABASE_URL='postgresql+asyncpg://test:test@localhost/test' /Users/ogt/.pyenv/shims/pytest apps/api/tests/test_approval_pending_visibility.py -q + -> 4 passed +DATABASE_URL='postgresql+asyncpg://test:test@localhost/test' /Users/ogt/.pyenv/shims/pytest apps/api/tests/test_heartbeat_ollama_endpoints.py apps/api/tests/test_heartbeat_pod_state_machine.py -q + -> 15 passed +git diff --check + -> pass +``` + +**判讀 / 下一步**: + +- `需人工處理` 的正確入口是 `/awooop/approvals` 的 legacy HITL backlog;舊 fallback kubectl action 不應盲目批准,應逐筆 reject stale action 或重新診斷。 +- OBSERVE / NO_ACTION 類卡片不再被當成 emergency manual backlog,但仍會在拆分數字中保留,避免把觀察訊號隱藏。 +- 後續可再處理 fallback LLM failure branch 為何大量建立 `OBSERVE / medium` 卡片;本輪先修可見性與告警準確度,不改 agent 後續更新 PENDING action 的行為。 + ## 2026-05-31|CD source-link gate 過期與 pipefail 修復 **背景**: diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 51ea436f..21a49b7c 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -2665,6 +2665,12 @@ Phase 6 完成後 - 判讀:T135 已把 runner ownership 從雙 runner 搶工收斂到 host runner 單一主控;下一段不要重新啟用 Docker-wrapped runner,而是做 runner pool / repo label 隔離、API image `apt-get` / `chown -R` 分層、Web build cache/offload、Playwright apt source-list hygiene。 - 目前進度更新:AwoooP 告警可觀測鏈約 99.998%;Incident-level source correlation 可見性約 98.8%;Source correlation apply 狀態鏈可驗證性約 99.72%;Source correlation freshness / rolling gate 約 98.2%;前端 AI 自動化管理介面同步約 99.999%;Dashboard snapshot / SSE console noise 收斂約 99.2%;CI/CD runner hygiene 約 99.2%;Runner ownership 收斂約 96%;Build host pressure治理約 82%;完整 AI 自動化管理產品化約 99.960%。 +**T153 Legacy HITL pending visibility + warning split(2026-05-31 台北)**: +- 觸發:production `/api/v1/approvals/pending` 有 legacy HITL backlog `count=21`,但 `/api/v1/platform/approvals` 回 `total=0`,operator 會以為前台沒有人工待辦;同時 heartbeat 用 raw PENDING 數量觸發「PENDING 積壓,需人工處理」,把 OBSERVE / NO_ACTION 觀察卡與真正可執行審批混在一起。 +- 修正:`ApprovalRequest` / `ApprovalRequestResponse` 外露 `incident_id`、`matched_playbook_id`、`telegram_message_id`、`telegram_chat_id`;DB / repository converter 保留 legacy approval record 的 incident / playbook / Telegram delivery 欄位。`HeartbeatReportService` 將 24h pending 拆成 actionable、observe-only、without-TG;warning 只看 actionable backlog,並在訊息指向 `/awooop/approvals`,Telegram heartbeat 顯示「待審拆分」。 +- Verification:API py_compile pass;targeted ruff for new test pass;`test_approval_pending_visibility.py` 4 passed;`test_heartbeat_ollama_endpoints.py` + `test_heartbeat_pod_state_machine.py` 15 passed;`git diff --check` pass。 +- 判讀:T153 不批次 approve/reject 生產 PENDING,也不把觀察卡刪掉;它把「前台看得到 legacy HITL 事實」與「告警只針對真正人工 actionable backlog」補齊。舊 fallback kubectl / SSH action 仍需 operator 在 `/awooop/approvals` 逐筆決策;OBSERVE / NO_ACTION 類不再偽裝成 emergency manual backlog。下一段可追 LLM failure fallback 為何大量產生 `OBSERVE / medium` 卡片,但需避免破壞 agent 後續把 PENDING 更新成可執行 action 的路徑。 + **T152 Ansible runtime readiness surfaced(2026-05-24 台北)**: - 觸發:T151 已讓首頁看到 execution backend / Ansible attribution,但 operator 仍看不到 runtime 端缺什麼,容易把「Ansible 有候選」誤解成「Ansible 已能自動修復」。 - 修正:API image 複製 `infra/ansible/` 作 read-only catalog;`truth-chain/quality/summary` 新增 `ansible_runtime`,回報 playbook binary、catalog、inventory、playbook_count、can_run_check_mode、blockers。首頁 execution evidence 同步顯示 runtime 狀態;目前 production 顯示 `runtime 未就緒:ansible_playbook_binary_missing`。未安裝 `ansible-core`、未啟用 check-mode / apply。