fix(flywheel): P2 Approval 逾期不結案 → KM 學習鏈斷鏈修復
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 12m11s
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 12m11s
問題根因:
PENDING approval 無人處置超過 48h 後應自動 EXPIRED,
但 get_pending_approvals() 只在用戶開 UI 時觸發,
若無人開 UI → Incident 永遠 PENDING → KM 永遠不寫入
→ Phase 6 SLO human_override_rate 低估,EWMA 缺少負向樣本。
修復:
1. anomaly_counter.py: 新增 "timeout_ignored" disposition 類型,
與 auto_repair / human_approved / manual_resolved 區分
2. incident_service.py: resolve_incident() 新增 resolution_type 參數,
resolution_type="timeout" 時記錄 "timeout_ignored" 而非 "manual_resolved"
3. jobs/approval_timeout_resolver.py (新): 每小時掃描逾期 PENDING approval,
批次標記 EXPIRED,對每筆有 incident_id 的記錄呼叫 resolve_incident("timeout")
4. main.py: startup 掛載 approval_timeout_resolver 排程(interval=3600s)
效果:
- 告警無人處置 48h → Incident 自動結案 → KM 寫入 → EWMA 取得樣本
- disposition="timeout_ignored" 讓 SLO 計算正確區分「AI 建議被忽略」
- 飛輪學習鏈對「無人處置告警」閉環
2026-04-15 ogt + Claude Sonnet 4.6(亞太)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
150
apps/api/src/jobs/approval_timeout_resolver.py
Normal file
150
apps/api/src/jobs/approval_timeout_resolver.py
Normal file
@@ -0,0 +1,150 @@
|
|||||||
|
"""
|
||||||
|
AWOOOI — Approval Timeout Resolver(逾期 Approval 自動結案 Job)
|
||||||
|
================================================================
|
||||||
|
職責:每小時掃描 approval_records 中已逾期(expires_at < now)但狀態仍為
|
||||||
|
PENDING 的記錄,標記為 EXPIRED,並對其關聯的 Incident 呼叫 resolve_incident
|
||||||
|
以確保 KM 學習鏈完整閉環。
|
||||||
|
|
||||||
|
為什麼需要這個 Job?
|
||||||
|
get_pending_approvals() 有自動過期邏輯,但只在用戶開啟待處理列表時觸發。
|
||||||
|
若無人開 UI,PENDING 記錄永遠停留,關聯 Incident 不會 RESOLVED,
|
||||||
|
km_conversion_service 永不觸發,AI 學習飛輪對「無人處置的告警」完全盲目。
|
||||||
|
|
||||||
|
disposition 記錄:
|
||||||
|
timeout_ignored — 與 auto_repair / human_approved 區別,
|
||||||
|
讓 anomaly_counter 統計反映「AI 建議但被人類忽略」的現象,
|
||||||
|
供 Phase 6 SLO human_override_rate 校正。
|
||||||
|
|
||||||
|
設計原則:
|
||||||
|
1. 只更新 DB,不刪除記錄(符合 archive_not_delete 鐵律)
|
||||||
|
2. resolve_incident 使用 resolution_type="timeout",記錄正確 disposition
|
||||||
|
3. 失敗 → 只記錄 error,不影響主路徑
|
||||||
|
4. 每次執行記錄 resolved_count / error_count
|
||||||
|
|
||||||
|
2026-04-15 ogt + Claude Sonnet 4.6(亞太):P2 飛輪斷鏈修復
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from datetime import UTC, datetime, timedelta
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
from sqlalchemy import and_, select, update
|
||||||
|
|
||||||
|
from src.db.base import get_db_context
|
||||||
|
from src.db.models import ApprovalRecord
|
||||||
|
from src.models.approval import ApprovalStatus
|
||||||
|
from src.utils.timezone import now_taipei
|
||||||
|
|
||||||
|
logger = structlog.get_logger(__name__)
|
||||||
|
|
||||||
|
# 每次最多處理幾筆,避免單次執行阻塞過長
|
||||||
|
BATCH_LIMIT = 50
|
||||||
|
|
||||||
|
|
||||||
|
async def run_approval_timeout_resolver(interval_sec: float = 3600) -> None:
    """Run the expired-approval sweep forever, sleeping between passes.

    Each pass delegates to ``_resolve_expired_approvals()`` and logs a summary
    only when something was resolved or failed, so idle passes stay quiet.
    Intended to be scheduled as a background task (main.py mounts it with
    ``asyncio.create_task`` during startup).

    Args:
        interval_sec: Seconds to sleep between passes. Defaults to 3600
            (hourly), which matches the interval logged by main.py.

    Returns:
        Never returns normally; the loop ends only when the task is cancelled.
    """
    while True:
        try:
            resolved, errors = await _resolve_expired_approvals()
            if resolved > 0 or errors > 0:
                logger.info(
                    "approval_timeout_resolver_done",
                    resolved=resolved,
                    errors=errors,
                )
        except Exception as e:
            # Top-level boundary of a background job: a failed pass must not
            # kill the loop. exc_info keeps the traceback, which str(e) alone
            # would lose. asyncio.CancelledError is a BaseException and still
            # propagates, so task cancellation keeps working.
            logger.error(
                "approval_timeout_resolver_loop_error",
                error=str(e),
                exc_info=True,
            )

        await asyncio.sleep(interval_sec)
|
||||||
|
|
||||||
|
|
||||||
|
async def _resolve_expired_approvals() -> tuple[int, int]:
    """Expire overdue PENDING approvals and resolve their linked incidents.

    Steps:
      1. Select up to ``BATCH_LIMIT`` approval records whose status is PENDING
         and whose ``expires_at`` is set and earlier than the current UTC time,
         oldest expiry first.
      2. Mark that batch as EXPIRED (with ``resolved_at`` set to the same
         timestamp) and commit. Records are only updated, never deleted.
      3. For every record that carries an ``incident_id``, call
         ``resolve_incident(..., resolution_type="timeout")``.

    A failure while resolving one incident is logged and counted; it does not
    stop the rest of the batch.

    Returns:
        ``(resolved_count, error_count)``: incidents for which
        ``resolve_incident`` returned a truthy value, and incidents for which
        it raised.
    """
    # Deferred import kept from the original code.
    # NOTE(review): presumably avoids a circular import at module load — confirm.
    from src.services.incident_service import get_incident_service

    now = datetime.now(UTC)

    async with get_db_context() as db:
        # Step 1: overdue records that are still PENDING.
        query_result = await db.execute(
            select(ApprovalRecord)
            .where(
                and_(
                    ApprovalRecord.status == ApprovalStatus.PENDING,
                    ApprovalRecord.expires_at.is_not(None),
                    ApprovalRecord.expires_at < now,
                )
            )
            .order_by(ApprovalRecord.expires_at)
            .limit(BATCH_LIMIT)
        )
        expired_records = query_result.scalars().all()

        if not expired_records:
            return 0, 0

        # Snapshot plain values BEFORE committing. With SQLAlchemy's default
        # expire_on_commit=True, ORM attributes are expired by commit(), and
        # touching them afterwards needs a refresh that fails on an async or
        # closed session. Plain tuples do not depend on the session's settings.
        targets = [
            (record.id, getattr(record, "incident_id", None))
            for record in expired_records
        ]
        expired_ids = [approval_id for approval_id, _ in targets]

        # Step 2: mark the batch EXPIRED. The status guard keeps this UPDATE
        # from overwriting a record that was approved/rejected between the
        # SELECT above and this statement.
        await db.execute(
            update(ApprovalRecord)
            .where(
                and_(
                    ApprovalRecord.id.in_(expired_ids),
                    ApprovalRecord.status == ApprovalStatus.PENDING,
                )
            )
            .values(status=ApprovalStatus.EXPIRED, resolved_at=now)
        )
        await db.commit()

    # count is the number of selected candidates; only the first 10 ids are
    # logged, each truncated to 8 characters.
    logger.info(
        "approval_timeout_batch_expired",
        count=len(expired_ids),
        ids=[str(i)[:8] for i in expired_ids[:10]],
    )

    # Step 3: resolve the incident linked to each expired approval.
    inc_svc = get_incident_service()
    resolved = 0
    errors = 0

    for approval_id, incident_id in targets:
        if not incident_id:
            continue

        try:
            incident = await inc_svc.resolve_incident(
                incident_id=str(incident_id),
                resolution_type="timeout",
            )
            if incident:
                resolved += 1
                logger.info(
                    "approval_timeout_incident_resolved",
                    approval_id=str(approval_id)[:8],
                    incident_id=str(incident_id)[:8],
                )
            else:
                # Incident not found or already RESOLVED: not counted as an error.
                logger.debug(
                    "approval_timeout_incident_skip",
                    approval_id=str(approval_id)[:8],
                    incident_id=str(incident_id)[:8],
                    reason="not_found_or_already_resolved",
                )
        except Exception as e:
            # Best effort per record: log and count, then continue the batch.
            errors += 1
            logger.error(
                "approval_timeout_resolve_error",
                approval_id=str(approval_id)[:8],
                incident_id=str(incident_id)[:8],
                error=str(e),
            )

    return resolved, errors
|
||||||
@@ -336,6 +336,15 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("daily_report_loop_schedule_failed", error=str(e))
|
logger.warning("daily_report_loop_schedule_failed", error=str(e))
|
||||||
|
|
||||||
|
# ADR-073 P2 修復 2026-04-15: 逾期 Approval 自動結案(每小時)
|
||||||
|
# 確保 PENDING approval 超過 48h 後觸發 resolve_incident → KM 學習鏈閉環
|
||||||
|
try:
|
||||||
|
from src.jobs.approval_timeout_resolver import run_approval_timeout_resolver
|
||||||
|
asyncio.create_task(run_approval_timeout_resolver())
|
||||||
|
logger.info("approval_timeout_resolver_scheduled", interval_sec=3600)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("approval_timeout_resolver_schedule_failed", error=str(e))
|
||||||
|
|
||||||
# Phase 4 ADR-084: 主動巡檢每 5 分鐘執行一次
|
# Phase 4 ADR-084: 主動巡檢每 5 分鐘執行一次
|
||||||
# 協調 DynamicBaselineService + LogAnomalyDetector + TrendPredictor
|
# 協調 DynamicBaselineService + LogAnomalyDetector + TrendPredictor
|
||||||
# Shadow Mode 控制:AIOPS_P4_SHADOW_MODE=True 時只記錄,不觸發 Alert
|
# Shadow Mode 控制:AIOPS_P4_SHADOW_MODE=True 時只記錄,不觸發 Alert
|
||||||
|
|||||||
@@ -387,7 +387,7 @@ class AnomalyCounter:
|
|||||||
# 2026-04-07 Claude Code: Sprint 4 A2/A3 — 處置類型統計
|
# 2026-04-07 Claude Code: Sprint 4 A2/A3 — 處置類型統計
|
||||||
# ==========================================================================
|
# ==========================================================================
|
||||||
|
|
||||||
VALID_DISPOSITION_TYPES = {"auto_repair", "human_approved", "manual_resolved", "cold_start_trust"}
|
VALID_DISPOSITION_TYPES = {"auto_repair", "human_approved", "manual_resolved", "cold_start_trust", "timeout_ignored"}
|
||||||
|
|
||||||
async def record_disposition(
|
async def record_disposition(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -915,7 +915,11 @@ class IncidentService:
|
|||||||
|
|
||||||
return incident
|
return incident
|
||||||
|
|
||||||
async def resolve_incident(self, incident_id: str) -> Incident | None:
|
async def resolve_incident(
|
||||||
|
self,
|
||||||
|
incident_id: str,
|
||||||
|
resolution_type: str = "manual",
|
||||||
|
) -> Incident | None:
|
||||||
"""
|
"""
|
||||||
將 Incident 狀態更新為 RESOLVED
|
將 Incident 狀態更新為 RESOLVED
|
||||||
|
|
||||||
@@ -923,6 +927,12 @@ class IncidentService:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
incident_id: 事件 ID
|
incident_id: 事件 ID
|
||||||
|
resolution_type: "manual"(預設)| "timeout"(Approval 48h 逾期自動結案)
|
||||||
|
|
||||||
|
ADR-073 補丁 2026-04-15 ogt + Claude Sonnet 4.6:
|
||||||
|
新增 resolution_type="timeout" 路徑 — Approval EXPIRED 時由
|
||||||
|
approval_timeout_resolver 呼叫,記錄 "timeout_ignored" disposition
|
||||||
|
而非 "manual_resolved",確保 EWMA 採樣正確區分人工結案與逾期拋棄。
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Incident | None: 更新後的事件,失敗返回 None
|
Incident | None: 更新後的事件,失敗返回 None
|
||||||
@@ -998,7 +1008,8 @@ class IncidentService:
|
|||||||
or disposition["cold_start_trust"] > 0
|
or disposition["cold_start_trust"] > 0
|
||||||
)
|
)
|
||||||
if not has_system_resolution:
|
if not has_system_resolution:
|
||||||
await counter.record_disposition(anomaly_key, "manual_resolved")
|
disp = "timeout_ignored" if resolution_type == "timeout" else "manual_resolved"
|
||||||
|
await counter.record_disposition(anomaly_key, disp)
|
||||||
except Exception as _disp_e:
|
except Exception as _disp_e:
|
||||||
logger.warning("disposition_manual_resolve_failed", error=str(_disp_e))
|
logger.warning("disposition_manual_resolve_failed", error=str(_disp_e))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user