fix(flywheel): P2 Approval 逾期不結案 → KM 學習鏈斷鏈修復
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 12m11s

問題根因:
  PENDING approval 無人處置超過 48h 後應自動 EXPIRED,
  但 get_pending_approvals() 只在用戶開 UI 時觸發,
  若無人開 UI → Incident 永遠 PENDING → KM 永遠不寫入
  → Phase 6 SLO human_override_rate 低估,EWMA 缺少負向樣本。

修復:
  1. anomaly_counter.py: 新增 "timeout_ignored" disposition 類型,
     與 auto_repair / human_approved / manual_resolved 區分
  2. incident_service.py: resolve_incident() 新增 resolution_type 參數,
     resolution_type="timeout" 時記錄 "timeout_ignored" 而非 "manual_resolved"
  3. jobs/approval_timeout_resolver.py (新): 每小時掃描逾期 PENDING approval,
     批次標記 EXPIRED,對每筆有 incident_id 的記錄呼叫 resolve_incident("timeout")
  4. main.py: startup 掛載 approval_timeout_resolver 排程(interval=3600s)

效果:
  - 告警無人處置 48h → Incident 自動結案 → KM 寫入 → EWMA 取得樣本
  - disposition="timeout_ignored" 讓 SLO 計算正確區分「AI 建議被忽略」
  - 飛輪學習鏈對「無人處置告警」閉環

2026-04-15 ogt + Claude Sonnet 4.6(亞太)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-15 19:21:16 +08:00
parent 586602e7ff
commit f045506abd
4 changed files with 173 additions and 3 deletions

View File

@@ -0,0 +1,150 @@
"""
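AWOOOI — Approval Timeout Resolver (job that auto-closes overdue approvals)
================================================================
Responsibility: every hour, scan approval_records for rows that are already
overdue (expires_at < now) but whose status is still PENDING, mark them
EXPIRED, and call resolve_incident on the linked Incident so that the KM
learning chain is fully closed.

Why this job is needed:
    get_pending_approvals() has auto-expiry logic, but it only runs when a
    user opens the pending list. If nobody opens the UI, PENDING records stay
    PENDING forever, the linked Incident is never RESOLVED,
    km_conversion_service never fires, and the AI learning flywheel is
    completely blind to alerts that nobody handled.

Disposition recorded:
    timeout_ignored — distinct from auto_repair / human_approved, so that
    anomaly_counter statistics reflect the "AI made a suggestion but humans
    ignored it" case, used to calibrate the Phase 6 SLO human_override_rate.

Design principles:
    1. Only update the DB, never delete records (per the archive_not_delete rule)
    2. resolve_incident is called with resolution_type="timeout" so the
       correct disposition is recorded
    3. On failure → only log an error; the main path is not affected
    4. Every run records resolved_count / error_count

2026-04-15 ogt + Claude Sonnet 4.6 (Asia-Pacific) — P2 flywheel broken-chain fix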
"""
from __future__ import annotations
import asyncio
from datetime import UTC, datetime, timedelta
import structlog
from sqlalchemy import and_, select, update
from src.db.base import get_db_context
from src.db.models import ApprovalRecord
from src.models.approval import ApprovalStatus
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# Maximum number of expired approvals handled per run, so that a single pass
# does not block for too long.
BATCH_LIMIT = 50
async def run_approval_timeout_resolver(interval_sec: float = 3600) -> None:
    """
    Run the expired-approval sweep forever, sleeping between passes.

    Intended to be mounted as a background task (asyncio.create_task) during
    application startup. The first sweep runs immediately; each later sweep
    starts ``interval_sec`` seconds after the previous one finished.

    Args:
        interval_sec: Pause between two sweeps, in seconds. Defaults to 3600
            (one hour), which is the behaviour when called with no arguments.

    Returns:
        Never returns normally; the loop only ends when the task is cancelled.
    """
    while True:
        try:
            resolved, errors = await _resolve_expired_approvals()
            # Stay quiet on idle passes; only log when something happened.
            if resolved > 0 or errors > 0:
                logger.info(
                    "approval_timeout_resolver_done",
                    resolved=resolved,
                    errors=errors,
                )
        except Exception as e:
            # Boundary handler: one failed sweep must not kill the loop.
            # Task cancellation (CancelledError) is not an Exception subclass,
            # so it still propagates and stops the job.
            logger.error("approval_timeout_resolver_loop_error", error=str(e))
        await asyncio.sleep(interval_sec)
async def _resolve_expired_approvals() -> tuple[int, int]:
    """
    Expire overdue PENDING approvals and resolve their linked incidents.

    One pass handles at most BATCH_LIMIT approvals, oldest ``expires_at``
    first. Approvals are flipped to EXPIRED in a single UPDATE and committed;
    afterwards ``resolve_incident(resolution_type="timeout")`` is called once
    for every approval that carries an incident id.

    Returns:
        (resolved_count, error_count) — incidents for which resolve_incident
        returned a truthy value, and resolve_incident calls that raised.
    """
    # NOTE(review): the cutoff is an aware UTC timestamp — confirm that
    # expires_at is stored in a form that compares correctly against it.
    now = datetime.now(UTC)

    # Step 1: find overdue approvals that are still PENDING.
    async with get_db_context() as db:
        overdue_query = (
            select(ApprovalRecord)
            .where(
                and_(
                    ApprovalRecord.status == ApprovalStatus.PENDING,
                    ApprovalRecord.expires_at.is_not(None),
                    ApprovalRecord.expires_at < now,
                )
            )
            .order_by(ApprovalRecord.expires_at)
            .limit(BATCH_LIMIT)
        )
        expired_records = (await db.execute(overdue_query)).scalars().all()
        if not expired_records:
            return 0, 0

        # Snapshot the plain values needed later BEFORE committing. A commit
        # may expire the ORM instances, and touching an expired attribute on
        # an async session triggers an implicit refresh that fails instead of
        # returning the value.
        expired = [
            (record.id, getattr(record, "incident_id", None))
            for record in expired_records
        ]
        expired_ids = [approval_id for approval_id, _ in expired]

        # Step 2: mark the batch EXPIRED. The status guard keeps this UPDATE
        # from overwriting an approval that was decided between the SELECT
        # above and this statement.
        await db.execute(
            update(ApprovalRecord)
            .where(
                and_(
                    ApprovalRecord.id.in_(expired_ids),
                    ApprovalRecord.status == ApprovalStatus.PENDING,
                )
            )
            .values(status=ApprovalStatus.EXPIRED, resolved_at=now)
        )
        await db.commit()
        logger.info(
            "approval_timeout_batch_expired",
            count=len(expired_ids),
            ids=[str(i)[:8] for i in expired_ids[:10]],
        )

    # Step 3: resolve the incident behind every expired approval that has one.
    # Imported here rather than at module top, as in the original
    # (presumably to avoid an import cycle — TODO confirm).
    from src.services.incident_service import get_incident_service

    inc_svc = get_incident_service()
    resolved = 0
    errors = 0
    for approval_id, incident_id in expired:
        if not incident_id:
            continue
        approval_tag = str(approval_id)[:8]
        incident_tag = str(incident_id)[:8]
        try:
            incident = await inc_svc.resolve_incident(
                incident_id=str(incident_id),
                resolution_type="timeout",
            )
        except Exception as e:
            # Best effort: a failure on one incident is counted and logged,
            # and the remaining records are still processed.
            errors += 1
            logger.error(
                "approval_timeout_resolve_error",
                approval_id=approval_tag,
                incident_id=incident_tag,
                error=str(e),
            )
            continue

        if incident:
            resolved += 1
            logger.info(
                "approval_timeout_incident_resolved",
                approval_id=approval_tag,
                incident_id=incident_tag,
            )
        else:
            # Incident not found or already RESOLVED — not counted as an error.
            logger.debug(
                "approval_timeout_incident_skip",
                approval_id=approval_tag,
                incident_id=incident_tag,
                reason="not_found_or_already_resolved",
            )
    return resolved, errors

View File

@@ -336,6 +336,15 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
except Exception as e:
logger.warning("daily_report_loop_schedule_failed", error=str(e))
# ADR-073 P2 修復 2026-04-15: 逾期 Approval 自動結案(每小時)
# 確保 PENDING approval 超過 48h 後觸發 resolve_incident → KM 學習鏈閉環
try:
from src.jobs.approval_timeout_resolver import run_approval_timeout_resolver
asyncio.create_task(run_approval_timeout_resolver())
logger.info("approval_timeout_resolver_scheduled", interval_sec=3600)
except Exception as e:
logger.warning("approval_timeout_resolver_schedule_failed", error=str(e))
# Phase 4 ADR-084: 主動巡檢每 5 分鐘執行一次
# 協調 DynamicBaselineService + LogAnomalyDetector + TrendPredictor
# Shadow Mode 控制AIOPS_P4_SHADOW_MODE=True 時只記錄,不觸發 Alert

View File

@@ -387,7 +387,7 @@ class AnomalyCounter:
# 2026-04-07 Claude Code: Sprint 4 A2/A3 — 處置類型統計
# ==========================================================================
VALID_DISPOSITION_TYPES = {"auto_repair", "human_approved", "manual_resolved", "cold_start_trust"}
VALID_DISPOSITION_TYPES = {"auto_repair", "human_approved", "manual_resolved", "cold_start_trust", "timeout_ignored"}
async def record_disposition(
self,

View File

@@ -915,7 +915,11 @@ class IncidentService:
return incident
async def resolve_incident(self, incident_id: str) -> Incident | None:
async def resolve_incident(
self,
incident_id: str,
resolution_type: str = "manual",
) -> Incident | None:
"""
將 Incident 狀態更新為 RESOLVED
@@ -923,6 +927,12 @@ class IncidentService:
Args:
incident_id: 事件 ID
resolution_type: "manual"(預設)| "timeout"Approval 48h 逾期自動結案)
ADR-073 補丁 2026-04-15 ogt + Claude Sonnet 4.6:
新增 resolution_type="timeout" 路徑 — Approval EXPIRED 時由
approval_timeout_resolver 呼叫,記錄 "timeout_ignored" disposition
而非 "manual_resolved",確保 EWMA 採樣正確區分人工結案與逾期拋棄。
Returns:
Incident | None: 更新後的事件,失敗返回 None
@@ -998,7 +1008,8 @@ class IncidentService:
or disposition["cold_start_trust"] > 0
)
if not has_system_resolution:
await counter.record_disposition(anomaly_key, "manual_resolved")
disp = "timeout_ignored" if resolution_type == "timeout" else "manual_resolved"
await counter.record_disposition(anomaly_key, disp)
except Exception as _disp_e:
logger.warning("disposition_manual_resolve_failed", error=str(_disp_e))