diff --git a/apps/api/src/jobs/offline_replay_service.py b/apps/api/src/jobs/offline_replay_service.py index 22e7583c..700234dc 100644 --- a/apps/api/src/jobs/offline_replay_service.py +++ b/apps/api/src/jobs/offline_replay_service.py @@ -162,6 +162,24 @@ class OfflineReplayService: # 2. 寫入治理事件 await self._save_governance_event(report) + # 2026-04-27 P3.1-T1 by Claude — integrate the three Tier-1 services + # After the replay event is written, trigger ModelRollbackService degradation detection + # The feature flag (AIOPS_P6_GOVERNANCE_ENABLED) is checked inside model_rollback_service.check() itself + try: + from src.services.model_rollback_service import get_model_rollback_service + _mr_svc = get_model_rollback_service() + _mr_result = await _mr_svc.check() + if _mr_result.retrain_recommended: + logger.warning( + "model_rollback_triggered", + reason="retrain_recommended", + streak=_mr_result.consecutive_declines, + absolute_floor_breached=_mr_result.absolute_floor_breached, + conservative_mode_triggered=_mr_result.conservative_mode_triggered, + ) + except Exception as _mr_e: + logger.exception("model_rollback_check_failed", error=str(_mr_e)) + logger.info( "offline_replay_done", sampled=report.incidents_sampled, diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index d2cf21d3..e7ff93b9 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -154,6 +154,54 @@ class ApprovalExecutionService: resource_name = parsed.resource_name namespace = parsed.namespace + # 2026-04-27 P3.1-T1 by Claude — integrate the three Tier-1 services + # After the kubectl command is parsed, dynamically verify that the resource exists in K8s and apply the normalized name + # Exceptions do not block the main flow; miss/suggestion only logs a warning and increments the RESOURCE_RESOLVE_TOTAL counter, it never blocks execution + if resource_name is not None and operation_type is not None: + try: + from src.services.resource_resolver import get_resource_resolver + from src.core.metrics import RESOURCE_RESOLVE_TOTAL + + _resolver = get_resource_resolver() + _resolve = await _resolver.resolve( + raw_resource=resource_name, + namespace=namespace, + 
resource_kind="deployment", + ) + if _resolve.success and _resolve.resource_name: + if _resolve.resource_name != resource_name: + logger.info( + "resource_name_normalized", + original=resource_name, + normalized=_resolve.resource_name, + namespace=namespace, + ) + resource_name = _resolve.resource_name + RESOURCE_RESOLVE_TOTAL.labels(result="hit").inc() + elif _resolve.candidates: + logger.warning( + "resource_not_found_in_k8s", + resource=resource_name, + namespace=namespace, + suggestions=_resolve.candidates, + ) + RESOURCE_RESOLVE_TOTAL.labels(result="suggestion").inc() + else: + logger.warning( + "resource_not_found_in_k8s", + resource=resource_name, + namespace=namespace, + suggestions=[], + ) + RESOURCE_RESOLVE_TOTAL.labels(result="miss").inc() + except Exception as _rr_e: + logger.warning("resource_resolve_failed", error=str(_rr_e)) + try: + from src.core.metrics import RESOURCE_RESOLVE_TOTAL + RESOURCE_RESOLVE_TOTAL.labels(result="error").inc() + except Exception: + pass + if operation_type is None or resource_name is None: # 2026-04-19 ogt + Claude Opus 4.7: 區分 NO_ACTION vs 真解析失敗 # NO_ACTION 是 AI 刻意選的「純調查不破壞」,不該誤標 EXECUTION_FAILED