diff --git a/apps/api/src/jobs/approval_timeout_resolver.py b/apps/api/src/jobs/approval_timeout_resolver.py
new file mode 100644
index 00000000..a54cd98c
--- /dev/null
+++ b/apps/api/src/jobs/approval_timeout_resolver.py
@@ -0,0 +1,173 @@
+"""
+AWOOOI — Approval Timeout Resolver (auto-close job for overdue approvals)
+========================================================================
+Responsibility: once per hour, scan approval_records for rows that are past
+their deadline (expires_at < now) but still PENDING, mark them EXPIRED, and
+call resolve_incident on each linked Incident so the KM learning chain is
+closed end to end.
+
+Why is this job needed?
+    get_pending_approvals() has auto-expiry logic, but it only runs when a user
+    opens the pending list. If nobody opens the UI, PENDING rows stay forever,
+    the linked Incident never becomes RESOLVED, km_conversion_service never
+    fires, and the AI learning flywheel is blind to alerts nobody handled.
+
+Disposition recorded:
+    timeout_ignored — distinct from auto_repair / human_approved, so that
+    anomaly_counter statistics reflect "AI suggested, human ignored" cases,
+    for calibrating the Phase 6 SLO human_override_rate.
+
+Design principles:
+    1. Only update the DB, never delete rows (archive_not_delete rule).
+    2. resolve_incident is called with resolution_type="timeout" so the
+       correct disposition is recorded.
+    3. On failure, only log the error; never affect the main path.
+    4. Every run reports resolved_count / error_count.
+
+2026-04-15 ogt + Claude Sonnet 4.6 (APAC): P2 flywheel broken-chain fix
+"""
+
+from __future__ import annotations
+
+import asyncio
+from datetime import UTC, datetime, timedelta
+
+import structlog
+from sqlalchemy import and_, select, update
+
+from src.db.base import get_db_context
+from src.db.models import ApprovalRecord
+from src.models.approval import ApprovalStatus
+from src.utils.timezone import now_taipei
+
+logger = structlog.get_logger(__name__)
+
+# Maximum rows handled per run, so a single pass cannot block for too long.
+BATCH_LIMIT = 50
+
+
+async def run_approval_timeout_resolver() -> None:
+    """
+    Run the overdue-approval sweep forever, once per hour.
+
+    Mounted from main.py startup via asyncio.create_task. An exception raised
+    by a sweep is logged and swallowed so one bad run does not end the loop.
+    """
+    while True:
+        try:
+            resolved, errors = await _resolve_expired_approvals()
+            if resolved > 0 or errors > 0:
+                logger.info(
+                    "approval_timeout_resolver_done",
+                    resolved=resolved,
+                    errors=errors,
+                )
+        except Exception as e:
+            logger.error("approval_timeout_resolver_loop_error", error=str(e))
+
+        await asyncio.sleep(3600)  # run once per hour
+
+
+async def _resolve_expired_approvals() -> tuple[int, int]:
+    """
+    Mark overdue PENDING approvals EXPIRED and resolve their Incidents.
+
+    Returns:
+        (resolved_count, error_count)
+    """
+    now = datetime.now(UTC)
+    resolved = 0
+    errors = 0
+
+    # Step 1: find rows whose expires_at is set and in the past but which are
+    # still PENDING.
+    async with get_db_context() as db:
+        result = await db.execute(
+            select(ApprovalRecord)
+            .where(
+                and_(
+                    ApprovalRecord.status == ApprovalStatus.PENDING,
+                    ApprovalRecord.expires_at.is_not(None),
+                    ApprovalRecord.expires_at < now,
+                )
+            )
+            .order_by(ApprovalRecord.expires_at)
+            .limit(BATCH_LIMIT)
+        )
+        expired_records = result.scalars().all()
+
+        if not expired_records:
+            return 0, 0
+
+        # Copy what Step 3 needs into plain tuples while the rows are loaded.
+        # commit() expires ORM attributes by default and the instances are
+        # detached once the session closes, so reading record.id or
+        # record.incident_id afterwards can raise instead of returning a value.
+        expired_pairs = [
+            (r.id, getattr(r, "incident_id", None)) for r in expired_records
+        ]
+        expired_ids = [approval_id for approval_id, _ in expired_pairs]
+
+        # Step 2: mark the batch EXPIRED. The PENDING guard keeps a decision
+        # recorded between the SELECT above and this UPDATE from being
+        # overwritten.
+        await db.execute(
+            update(ApprovalRecord)
+            .where(
+                and_(
+                    ApprovalRecord.id.in_(expired_ids),
+                    ApprovalRecord.status == ApprovalStatus.PENDING,
+                )
+            )
+            .values(status=ApprovalStatus.EXPIRED, resolved_at=now)
+        )
+        await db.commit()
+
+    logger.info(
+        "approval_timeout_batch_expired",
+        count=len(expired_ids),
+        ids=[str(i)[:8] for i in expired_ids[:10]],
+    )
+
+    # Step 3: call resolve_incident for every row that has an incident_id.
+    # NOTE(review): the rows are already committed as EXPIRED at this point, so
+    # a resolve_incident failure below is only logged; later runs select
+    # PENDING rows only and will not retry it.
+    from src.services.incident_service import get_incident_service
+
+    inc_svc = get_incident_service()
+
+    for approval_id, incident_id in expired_pairs:
+        if not incident_id:
+            continue
+
+        try:
+            incident = await inc_svc.resolve_incident(
+                incident_id=str(incident_id),
+                resolution_type="timeout",
+            )
+            if incident:
+                resolved += 1
+                logger.info(
+                    "approval_timeout_incident_resolved",
+                    approval_id=str(approval_id)[:8],
+                    incident_id=str(incident_id)[:8],
+                )
+            else:
+                # Incident not found or already RESOLVED: not an error.
+                logger.debug(
+                    "approval_timeout_incident_skip",
+                    approval_id=str(approval_id)[:8],
+                    incident_id=str(incident_id)[:8],
+                    reason="not_found_or_already_resolved",
+                )
+        except Exception as e:
+            errors += 1
+            logger.error(
+                "approval_timeout_resolve_error",
+                approval_id=str(approval_id)[:8],
+                incident_id=str(incident_id)[:8],
+                error=str(e),
+            )
+
+    return resolved, errors
diff --git a/apps/api/src/main.py b/apps/api/src/main.py
index b6c5cf08..1886eed6 100644
--- a/apps/api/src/main.py
+++ b/apps/api/src/main.py
@@ -336,6 +336,20 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
     except Exception as e:
         logger.warning("daily_report_loop_schedule_failed", error=str(e))
 
+    # ADR-073 P2 fix 2026-04-15: auto-close overdue approvals (hourly).
+    # Ensures a PENDING approval past its 48h expiry triggers resolve_incident
+    # so the KM learning chain is closed.
+    try:
+        from src.jobs.approval_timeout_resolver import run_approval_timeout_resolver
+        # Keep a strong reference: the event loop only holds weak references
+        # to tasks, so an unreferenced task can be garbage-collected mid-run.
+        _app.state.approval_timeout_resolver_task = asyncio.create_task(
+            run_approval_timeout_resolver()
+        )
+        logger.info("approval_timeout_resolver_scheduled", interval_sec=3600)
+    except Exception as e:
+        logger.warning("approval_timeout_resolver_schedule_failed", error=str(e))
+
     # Phase 4 ADR-084: 主動巡檢每 5 分鐘執行一次
     # 協調 DynamicBaselineService + LogAnomalyDetector + TrendPredictor
     # Shadow Mode 控制:AIOPS_P4_SHADOW_MODE=True 時只記錄,不觸發 Alert
diff --git a/apps/api/src/services/anomaly_counter.py b/apps/api/src/services/anomaly_counter.py
index 92f9c5a4..a216856a 100644
--- a/apps/api/src/services/anomaly_counter.py
+++ b/apps/api/src/services/anomaly_counter.py
@@ -387,7 +387,7 @@ class AnomalyCounter:
     # 2026-04-07 Claude Code: Sprint 4 A2/A3 — 處置類型統計
     # ==========================================================================
 
-    VALID_DISPOSITION_TYPES = {"auto_repair", "human_approved", "manual_resolved", "cold_start_trust"}
+    VALID_DISPOSITION_TYPES = {"auto_repair", "human_approved", "manual_resolved", "cold_start_trust", "timeout_ignored"}
 
     async def record_disposition(
         self,
diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py
index 4c13e320..5b176a49 100644
--- a/apps/api/src/services/incident_service.py
+++ b/apps/api/src/services/incident_service.py
@@ -915,7 +915,11 @@ class IncidentService:
 
         return incident
 
-    async def resolve_incident(self, incident_id: str) -> Incident | None:
+    async def resolve_incident(
+        self,
+        incident_id: str,
+        resolution_type: str = "manual",
+    ) -> Incident | None:
         """
         將 Incident 狀態更新為 RESOLVED
 
@@ -923,6 +927,12 @@ class IncidentService:
 
         Args:
             incident_id: 事件 ID
+            resolution_type: "manual" (default) | "timeout" (approval auto-closed after its 48h expiry)
+
+        ADR-073 patch 2026-04-15 ogt + Claude Sonnet 4.6:
+            Adds the resolution_type="timeout" path, used by approval_timeout_resolver
+            when an approval has EXPIRED; it records the "timeout_ignored" disposition
+            instead of "manual_resolved", so EWMA sampling separates manual closes from timeouts.
 
         Returns:
             Incident | None: 更新後的事件,失敗返回 None
@@ -998,7 +1008,8 @@ class IncidentService:
                         or disposition["cold_start_trust"] > 0
                     )
                     if not has_system_resolution:
-                        await counter.record_disposition(anomaly_key, "manual_resolved")
+                        disp = "timeout_ignored" if resolution_type == "timeout" else "manual_resolved"
+                        await counter.record_disposition(anomaly_key, disp)
                 except Exception as _disp_e:
                     logger.warning("disposition_manual_resolve_failed", error=str(_disp_e))
 