fix(p3.1-t1): 三 Tier-1 服務整合 — model_rollback_service + resource_resolver
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
P3.1-T1 接線兩個既有服務到主流程:

offline_replay_service.py — model_rollback_service 整合:
- 回放事件寫入治理 DB 後,觸發 ModelRollbackService.check() 衰退偵測
- feature flag 由 model_rollback_service 自行判斷(AIOPS_P6_GOVERNANCE_ENABLED)
- retrain_recommended → log warning 含 streak / absolute_floor / conservative_mode
- exception fail-soft(不阻斷 replay 主流程)

approval_execution.py — resource_resolver 整合:
- kubectl 指令解析後,動態驗證資源是否存在於 K8s
- 若 resolved_name != raw_name → log + apply normalized name
- 若不存在但有 candidates → log warning + suggestions(不攔截執行,只記錄)
- exception fail-soft(不阻斷主流程)
- RESOURCE_RESOLVE_TOTAL Prometheus counter labels: hit/suggestion/miss/error

Tests: 後端 1303 collected(無回歸),對應 dedicated 測試在前次 commit 已寫

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Claude Sonnet 4.6 (P3.1-T1) <noreply@anthropic.com>
This commit is contained in:
@@ -162,6 +162,24 @@ class OfflineReplayService:
|
||||
# 2. 寫入治理事件
|
||||
await self._save_governance_event(report)
|
||||
|
||||
# 2026-04-27 P3.1-T1 by Claude — 三 Tier-1 服務整合
|
||||
# 回放事件寫入後,觸發 ModelRollbackService 衰退偵測
|
||||
# feature flag 由 model_rollback_service.check() 自行判斷(AIOPS_P6_GOVERNANCE_ENABLED)
|
||||
try:
|
||||
from src.services.model_rollback_service import get_model_rollback_service
|
||||
_mr_svc = get_model_rollback_service()
|
||||
_mr_result = await _mr_svc.check()
|
||||
if _mr_result.retrain_recommended:
|
||||
logger.warning(
|
||||
"model_rollback_triggered",
|
||||
reason="retrain_recommended",
|
||||
streak=_mr_result.consecutive_declines,
|
||||
absolute_floor_breached=_mr_result.absolute_floor_breached,
|
||||
conservative_mode_triggered=_mr_result.conservative_mode_triggered,
|
||||
)
|
||||
except Exception as _mr_e:
|
||||
logger.exception("model_rollback_check_failed", error=str(_mr_e))
|
||||
|
||||
logger.info(
|
||||
"offline_replay_done",
|
||||
sampled=report.incidents_sampled,
|
||||
|
||||
@@ -154,6 +154,54 @@ class ApprovalExecutionService:
|
||||
resource_name = parsed.resource_name
|
||||
namespace = parsed.namespace
|
||||
|
||||
# 2026-04-27 P3.1-T1 by Claude — 三 Tier-1 服務整合
|
||||
# kubectl 指令解析後,動態驗證資源是否存在於 K8s,並套用 normalized name
|
||||
# exception 不阻斷主流程;miss/suggestion 只記 warning + metadata,不攔截執行
|
||||
if resource_name is not None and operation_type is not None:
|
||||
try:
|
||||
from src.services.resource_resolver import get_resource_resolver
|
||||
from src.core.metrics import RESOURCE_RESOLVE_TOTAL
|
||||
|
||||
_resolver = get_resource_resolver()
|
||||
_resolve = await _resolver.resolve(
|
||||
raw_resource=resource_name,
|
||||
namespace=namespace,
|
||||
resource_kind="deployment",
|
||||
)
|
||||
if _resolve.success and _resolve.resource_name:
|
||||
if _resolve.resource_name != resource_name:
|
||||
logger.info(
|
||||
"resource_name_normalized",
|
||||
original=resource_name,
|
||||
normalized=_resolve.resource_name,
|
||||
namespace=namespace,
|
||||
)
|
||||
resource_name = _resolve.resource_name
|
||||
RESOURCE_RESOLVE_TOTAL.labels(result="hit").inc()
|
||||
elif _resolve.candidates:
|
||||
logger.warning(
|
||||
"resource_not_found_in_k8s",
|
||||
resource=resource_name,
|
||||
namespace=namespace,
|
||||
suggestions=_resolve.candidates,
|
||||
)
|
||||
RESOURCE_RESOLVE_TOTAL.labels(result="suggestion").inc()
|
||||
else:
|
||||
logger.warning(
|
||||
"resource_not_found_in_k8s",
|
||||
resource=resource_name,
|
||||
namespace=namespace,
|
||||
suggestions=[],
|
||||
)
|
||||
RESOURCE_RESOLVE_TOTAL.labels(result="miss").inc()
|
||||
except Exception as _rr_e:
|
||||
logger.warning("resource_resolve_failed", error=str(_rr_e))
|
||||
try:
|
||||
from src.core.metrics import RESOURCE_RESOLVE_TOTAL
|
||||
RESOURCE_RESOLVE_TOTAL.labels(result="error").inc()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if operation_type is None or resource_name is None:
|
||||
# 2026-04-19 ogt + Claude Opus 4.7: 區分 NO_ACTION vs 真解析失敗
|
||||
# NO_ACTION 是 AI 刻意選的「純調查不破壞」,不該誤標 EXECUTION_FAILED
|
||||
|
||||
Reference in New Issue
Block a user