From 123d9c8a2ec296f02d5a6e3925d256dcbe23e0a8 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 27 Apr 2026 08:17:04 +0800 Subject: [PATCH] =?UTF-8?q?fix(p3.1-t1):=20=E4=B8=89=20Tier-1=20=E6=9C=8D?= =?UTF-8?q?=E5=8B=99=E6=95=B4=E5=90=88=20=E2=80=94=20model=5Frollback=5Fse?= =?UTF-8?q?rvice=20+=20resource=5Fresolver?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P3.1-T1 接線兩個既有服務到主流程: offline_replay_service.py — model_rollback_service 整合: - 回放事件寫入治理 DB 後,觸發 ModelRollbackService.check() 衰退偵測 - feature flag 由 model_rollback_service 自行判斷(AIOPS_P6_GOVERNANCE_ENABLED) - retrain_recommended → log warning 含 streak / absolute_floor / conservative_mode - exception fail-soft(不阻斷 replay 主流程) approval_execution.py — resource_resolver 整合: - kubectl 指令解析後,動態驗證資源是否存在於 K8s - 若 resolved_name != raw_name → log + apply normalized name - 若不存在但有 candidates → log warning + suggestions(不攔截執行,只記錄) - exception fail-soft(不阻斷主流程) - RESOURCE_RESOLVE_TOTAL Prometheus counter labels: hit/suggestion/miss/error Tests: 後端 1303 collected(無回歸),對應 dedicated 測試在前次 commit 已寫 Co-Authored-By: Claude Opus 4.7 (1M context) Co-Authored-By: Claude Sonnet 4.6 (P3.1-T1) --- apps/api/src/jobs/offline_replay_service.py | 18 ++++++++ apps/api/src/services/approval_execution.py | 48 +++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/apps/api/src/jobs/offline_replay_service.py b/apps/api/src/jobs/offline_replay_service.py index 22e7583c..700234dc 100644 --- a/apps/api/src/jobs/offline_replay_service.py +++ b/apps/api/src/jobs/offline_replay_service.py @@ -162,6 +162,24 @@ class OfflineReplayService: # 2. 寫入治理事件 await self._save_governance_event(report) + # 2026-04-27 P3.1-T1 by Claude — 三 Tier-1 服務整合 + # 回放事件寫入後,觸發 ModelRollbackService 衰退偵測 + # feature flag 由 model_rollback_service.check() 自行判斷(AIOPS_P6_GOVERNANCE_ENABLED) + try: + from src.services.model_rollback_service import get_model_rollback_service + _mr_svc = get_model_rollback_service() + _mr_result = await _mr_svc.check() + if _mr_result.retrain_recommended: + logger.warning( + "model_rollback_triggered", + reason="retrain_recommended", + streak=_mr_result.consecutive_declines, + absolute_floor_breached=_mr_result.absolute_floor_breached, + conservative_mode_triggered=_mr_result.conservative_mode_triggered, + ) + except Exception as _mr_e: + logger.exception("model_rollback_check_failed", error=str(_mr_e)) + logger.info( "offline_replay_done", sampled=report.incidents_sampled, diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index d2cf21d3..e7ff93b9 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -154,6 +154,54 @@ class ApprovalExecutionService: resource_name = parsed.resource_name namespace = parsed.namespace + # 2026-04-27 P3.1-T1 by Claude — 三 Tier-1 服務整合 + # kubectl 指令解析後,動態驗證資源是否存在於 K8s,並套用 normalized name + # exception 不阻斷主流程;miss/suggestion 只記 warning + metadata,不攔截執行 + if resource_name is not None and operation_type is not None: + try: + from src.services.resource_resolver import get_resource_resolver + from src.core.metrics import RESOURCE_RESOLVE_TOTAL + + _resolver = get_resource_resolver() + _resolve = await _resolver.resolve( + raw_resource=resource_name, + namespace=namespace, + resource_kind="deployment", + ) + if _resolve.success and _resolve.resource_name: + if _resolve.resource_name != resource_name: + logger.info( + "resource_name_normalized", + original=resource_name, + normalized=_resolve.resource_name, + namespace=namespace, + ) + resource_name = _resolve.resource_name + RESOURCE_RESOLVE_TOTAL.labels(result="hit").inc() + elif _resolve.candidates: + logger.warning( + "resource_not_found_in_k8s", + resource=resource_name, + namespace=namespace, + suggestions=_resolve.candidates, + ) + RESOURCE_RESOLVE_TOTAL.labels(result="suggestion").inc() + else: + logger.warning( + "resource_not_found_in_k8s", + resource=resource_name, + namespace=namespace, + suggestions=[], + ) + RESOURCE_RESOLVE_TOTAL.labels(result="miss").inc() + except Exception as _rr_e: + logger.warning("resource_resolve_failed", error=str(_rr_e)) + try: + from src.core.metrics import RESOURCE_RESOLVE_TOTAL + RESOURCE_RESOLVE_TOTAL.labels(result="error").inc() + except Exception: + pass + if operation_type is None or resource_name is None: # 2026-04-19 ogt + Claude Opus 4.7: 區分 NO_ACTION vs 真解析失敗 # NO_ACTION 是 AI 刻意選的「純調查不破壞」,不該誤標 EXECUTION_FAILED