From f045506abd1687972cea0f7f4527800bb1f719d1 Mon Sep 17 00:00:00 2001
From: OG T
Date: Wed, 15 Apr 2026 19:21:16 +0800
Subject: [PATCH] =?UTF-8?q?fix(flywheel):=20P2=20Approval=20=E9=80=BE?=
 =?UTF-8?q?=E6=9C=9F=E4=B8=8D=E7=B5=90=E6=A1=88=20=E2=86=92=20KM=20?=
 =?UTF-8?q?=E5=AD=B8=E7=BF=92=E9=8F=88=E6=96=B7=E9=8F=88=E4=BF=AE=E5=BE=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

問題根因:
  PENDING approval 無人處置超過 48h 後應自動 EXPIRED,
  但 get_pending_approvals() 只在用戶開 UI 時觸發,
  若無人開 UI → Incident 永遠 PENDING → KM 永遠不寫入
  → Phase 6 SLO human_override_rate 低估,EWMA 缺少負向樣本。

修復:
  1. anomaly_counter.py: 新增 "timeout_ignored" disposition 類型,
     與 auto_repair / human_approved / manual_resolved 區分
  2. incident_service.py: resolve_incident() 新增 resolution_type 參數,
     resolution_type="timeout" 時記錄 "timeout_ignored" 而非 "manual_resolved"
  3. jobs/approval_timeout_resolver.py (新): 每小時掃描逾期 PENDING approval,
     批次標記 EXPIRED,對每筆有 incident_id 的記錄呼叫 resolve_incident("timeout")
  4. main.py: startup 掛載 approval_timeout_resolver 排程(interval=3600s)

效果:
  - 告警無人處置 48h → Incident 自動結案 → KM 寫入 → EWMA 取得樣本
  - disposition="timeout_ignored" 讓 SLO 計算正確區分「AI 建議被忽略」
  - 飛輪學習鏈對「無人處置告警」閉環

2026-04-15 ogt + Claude Sonnet 4.6(亞太)

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (this section is ignored by git am):
  - The bulk UPDATE now also requires status == PENDING, so an approval that
    was approved/rejected between the SELECT and the UPDATE is not overwritten.
  - (approval id, incident id) pairs are captured before commit(), so no ORM
    attribute is read after the commit / outside the session.
  - main.py keeps a reference to the created task on _app.state.
  - Line structure was rebuilt from a whitespace-collapsed copy: indentation
    and blank context lines are inferred, and the "index" blob hashes below
    are the original ones. Verify against the target files, or apply with
    `git apply --ignore-whitespace`.

 .../api/src/jobs/approval_timeout_resolver.py | 169 ++++++++++++++++++
 apps/api/src/main.py                          |  14 ++
 apps/api/src/services/anomaly_counter.py      |   2 +-
 apps/api/src/services/incident_service.py     |  15 +-
 4 files changed, 197 insertions(+), 3 deletions(-)
 create mode 100644 apps/api/src/jobs/approval_timeout_resolver.py

diff --git a/apps/api/src/jobs/approval_timeout_resolver.py b/apps/api/src/jobs/approval_timeout_resolver.py
new file mode 100644
index 00000000..a54cd98c
--- /dev/null
+++ b/apps/api/src/jobs/approval_timeout_resolver.py
@@ -0,0 +1,169 @@
+"""
+AWOOOI — Approval Timeout Resolver (auto-close job for overdue approvals)
+================================================================
+Responsibility: once an hour, scan approval_records for rows that are overdue
+(expires_at < now) but still PENDING, mark them EXPIRED, and call
+resolve_incident on the linked Incident so the KM learning chain is closed.
+
+Why is this job needed?
+    get_pending_approvals() has auto-expiry logic, but it only runs when a user
+    opens the pending list. If nobody opens the UI, PENDING rows stay forever,
+    the linked Incident never becomes RESOLVED, km_conversion_service never
+    fires, and the AI learning flywheel is blind to alerts nobody handled.
+
+Disposition recorded:
+    timeout_ignored — distinct from auto_repair / human_approved, so that
+    anomaly_counter statistics reflect "AI suggested, human ignored"; used to
+    correct the Phase 6 SLO human_override_rate.
+
+Design principles:
+    1. Only update the DB, never delete rows (archive_not_delete rule).
+    2. resolve_incident is called with resolution_type="timeout" so the
+       correct disposition is recorded.
+    3. On failure only log an error; never affect the main path.
+    4. Every run reports resolved_count / error_count.
+
+2026-04-15 ogt + Claude Sonnet 4.6 (APAC): P2 flywheel broken-chain fix
+"""
+
+from __future__ import annotations
+
+import asyncio
+from datetime import UTC, datetime, timedelta
+
+import structlog
+from sqlalchemy import and_, select, update
+
+from src.db.base import get_db_context
+from src.db.models import ApprovalRecord
+from src.models.approval import ApprovalStatus
+from src.utils.timezone import now_taipei
+
+logger = structlog.get_logger(__name__)
+
+# Max rows handled per run, so a single run never blocks for too long
+BATCH_LIMIT = 50
+
+
+async def run_approval_timeout_resolver() -> None:
+    """
+    Endless loop: run the overdue-approval scan once per hour.
+    Mounted from main.py startup via asyncio.create_task.
+    """
+    while True:
+        try:
+            resolved, errors = await _resolve_expired_approvals()
+            if resolved > 0 or errors > 0:
+                logger.info(
+                    "approval_timeout_resolver_done",
+                    resolved=resolved,
+                    errors=errors,
+                )
+        except Exception as e:
+            # Log and keep looping: one failed scan must not kill the job
+            logger.error("approval_timeout_resolver_loop_error", error=str(e))
+
+        await asyncio.sleep(3600)  # once per hour
+
+
+async def _resolve_expired_approvals() -> tuple[int, int]:
+    """
+    Find overdue PENDING approvals, mark them EXPIRED and resolve the linked
+    Incident of each one.
+
+    Returns:
+        (resolved_count, error_count)
+    """
+    now = datetime.now(UTC)
+    resolved = 0
+    errors = 0
+
+    # Step 1: find rows that are overdue (expires_at set and in the past) but
+    # still PENDING
+    async with get_db_context() as db:
+        result = await db.execute(
+            select(ApprovalRecord)
+            .where(
+                and_(
+                    ApprovalRecord.status == ApprovalStatus.PENDING,
+                    ApprovalRecord.expires_at.is_not(None),
+                    ApprovalRecord.expires_at < now,
+                )
+            )
+            .order_by(ApprovalRecord.expires_at)
+            .limit(BATCH_LIMIT)
+        )
+        expired_records = result.scalars().all()
+
+        if not expired_records:
+            return 0, 0
+
+        # Snapshot plain values before commit(): depending on how the session
+        # is configured, ORM objects may be expired by the commit, and reading
+        # their attributes afterwards would require another (lazy) load.
+        expired = [
+            (record.id, getattr(record, "incident_id", None))
+            for record in expired_records
+        ]
+        expired_ids = [approval_id for approval_id, _ in expired]
+
+        # Step 2: bulk-mark EXPIRED. The PENDING guard keeps a row that was
+        # approved or rejected between the SELECT above and this UPDATE from
+        # being overwritten with EXPIRED.
+        await db.execute(
+            update(ApprovalRecord)
+            .where(
+                and_(
+                    ApprovalRecord.id.in_(expired_ids),
+                    ApprovalRecord.status == ApprovalStatus.PENDING,
+                )
+            )
+            .values(status=ApprovalStatus.EXPIRED, resolved_at=now)
+        )
+        await db.commit()
+
+    logger.info(
+        "approval_timeout_batch_expired",
+        count=len(expired_ids),
+        ids=[str(i)[:8] for i in expired_ids[:10]],
+    )
+
+    # Step 3: call resolve_incident for every record that has an incident_id
+    from src.services.incident_service import get_incident_service
+
+    inc_svc = get_incident_service()
+
+    for approval_id, incident_id in expired:
+        if not incident_id:
+            continue
+
+        try:
+            incident = await inc_svc.resolve_incident(
+                incident_id=str(incident_id),
+                resolution_type="timeout",
+            )
+            if incident:
+                resolved += 1
+                logger.info(
+                    "approval_timeout_incident_resolved",
+                    approval_id=str(approval_id)[:8],
+                    incident_id=str(incident_id)[:8],
+                )
+            else:
+                # Incident not found or already RESOLVED: not an error
+                logger.debug(
+                    "approval_timeout_incident_skip",
+                    approval_id=str(approval_id)[:8],
+                    incident_id=str(incident_id)[:8],
+                    reason="not_found_or_already_resolved",
+                )
+        except Exception as e:
+            errors += 1
+            logger.error(
+                "approval_timeout_resolve_error",
+                approval_id=str(approval_id)[:8],
+                incident_id=str(incident_id)[:8],
+                error=str(e),
+            )
+
+    return resolved, errors
diff --git a/apps/api/src/main.py b/apps/api/src/main.py
index b6c5cf08..1886eed6 100644
--- a/apps/api/src/main.py
+++ b/apps/api/src/main.py
@@ -336,6 +336,20 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
     except Exception as e:
         logger.warning("daily_report_loop_schedule_failed", error=str(e))
 
+    # ADR-073 P2 fix 2026-04-15: auto-close overdue approvals (hourly).
+    # Makes a PENDING approval older than 48h trigger resolve_incident so the
+    # KM learning chain is closed.
+    try:
+        from src.jobs.approval_timeout_resolver import run_approval_timeout_resolver
+        # Keep a strong reference: the event loop only holds weak references
+        # to tasks, so an unreferenced task can be garbage-collected mid-run.
+        _app.state.approval_timeout_resolver_task = asyncio.create_task(
+            run_approval_timeout_resolver()
+        )
+        logger.info("approval_timeout_resolver_scheduled", interval_sec=3600)
+    except Exception as e:
+        logger.warning("approval_timeout_resolver_schedule_failed", error=str(e))
+
     # Phase 4 ADR-084: 主動巡檢每 5 分鐘執行一次
     # 協調 DynamicBaselineService + LogAnomalyDetector + TrendPredictor
     # Shadow Mode 控制:AIOPS_P4_SHADOW_MODE=True 時只記錄,不觸發 Alert
diff --git a/apps/api/src/services/anomaly_counter.py b/apps/api/src/services/anomaly_counter.py
index 92f9c5a4..a216856a 100644
--- a/apps/api/src/services/anomaly_counter.py
+++ b/apps/api/src/services/anomaly_counter.py
@@ -387,7 +387,7 @@ class AnomalyCounter:
     # 2026-04-07 Claude Code: Sprint 4 A2/A3 — 處置類型統計
     # ==========================================================================
 
-    VALID_DISPOSITION_TYPES = {"auto_repair", "human_approved", "manual_resolved", "cold_start_trust"}
+    VALID_DISPOSITION_TYPES = {"auto_repair", "human_approved", "manual_resolved", "cold_start_trust", "timeout_ignored"}
 
     async def record_disposition(
         self,
diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py
index 4c13e320..5b176a49 100644
--- a/apps/api/src/services/incident_service.py
+++ b/apps/api/src/services/incident_service.py
@@ -915,7 +915,11 @@ class IncidentService:
 
         return incident
 
-    async def resolve_incident(self, incident_id: str) -> Incident | None:
+    async def resolve_incident(
+        self,
+        incident_id: str,
+        resolution_type: str = "manual",
+    ) -> Incident | None:
         """
         將 Incident 狀態更新為 RESOLVED
 
@@ -923,6 +927,12 @@ class IncidentService:
 
         Args:
             incident_id: 事件 ID
+            resolution_type: "manual" (default) | "timeout" (auto-close after Approval 48h expiry)
+
+        ADR-073 patch 2026-04-15 ogt + Claude Sonnet 4.6:
+            Adds the resolution_type="timeout" path, called by approval_timeout_resolver
+            when an Approval is EXPIRED: records the "timeout_ignored" disposition instead
+            of "manual_resolved", so EWMA sampling separates manual closure from timeout.
 
         Returns:
             Incident | None: 更新後的事件,失敗返回 None
@@ -998,7 +1008,8 @@ class IncidentService:
                             or disposition["cold_start_trust"] > 0
                         )
                         if not has_system_resolution:
-                            await counter.record_disposition(anomaly_key, "manual_resolved")
+                            disp = "timeout_ignored" if resolution_type == "timeout" else "manual_resolved"
+                            await counter.record_disposition(anomaly_key, disp)
                 except Exception as _disp_e:
                     logger.warning("disposition_manual_resolve_failed", error=str(_disp_e))
 