fix(flywheel): 三修 L5/L6 斷鏈 — RBAC 擴權 + 失敗原因入庫 + verifier 失敗時也跑
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 11m6s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 11m6s
2026-04-18 晚(台北時區) — ogt + Claude Opus 4.7 (1M)

全景飛輪診斷暴露 3 個真斷鏈:
- L5 執行 30d: EXECUTION_FAILED 216 / EXECUTION_SUCCESS 2 (失敗率 99%)
- L6 驗證 7d: verification_result 全 NULL (988 筆 evidence 都沒驗)
- 所有 rejection_reason / error_message 欄位全空(無法診斷)

根因: awoooi-executor ServiceAccount RBAC 不足,executor.py 每次 kubectl get nodes/HPA 都 Forbidden,連 evidence 都抓不到,後面 repair 全炸,verifier 因為 execution 沒 success 永遠不 trigger,evidence 驗證結果永遠 NULL。修一個 RBAC 解 3 個節點。

## P0.1 RBAC 擴權 (k8s/awoooi-prod/07-rbac.yaml)

新增 cluster-scope 讀權(僅 list/get/watch,零寫入):
- nodes + nodes/status (evidence gathering 必需)
- horizontalpodautoscalers (HPA 狀態)
- metrics.k8s.io: nodes + pods (resource metrics)
- statefulsets + daemonsets (完整 workload 視圖)

已 kubectl apply + 煙霧測試: kubectl get nodes 可跑。

## P0.2 失敗時必寫 rejection_reason (approval_db.py)

update_execution_status() 新增 error_message 參數,失敗時寫入 rejection_reason (截 2000 字) → 之後診斷有依據。
approval_execution.py 呼叫端同步更新,result.error 一路傳進 DB。

## P0.3 Verifier 失敗時也跑 (approval_execution.py)

原邏輯: verifier 只在 result.success=True 時呼叫 → 99% 失敗下永遠不跑。
新邏輯: 失敗 path 也 create_task 跑 verifier,action_taken 後綴加 ":FAILED" 標記。verifier 抓 post_state 寫 verification_result='failed' 回 incident_evidence。
L7 learning 從此有失敗樣本可學,playbook trust 負向 2x 衰減才真正生效。

預期效果:
- EXECUTION_FAILED 率 30d 內應從 99% 降到 <30%
- incident_evidence.verification_result NULL 率應從 100% 降到 <10%
- approval_records.rejection_reason 補齊率從 0% 到 100%

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -610,22 +610,32 @@ class ApprovalDBService:
|
|||||||
self,
|
self,
|
||||||
approval_id: UUID,
|
approval_id: UUID,
|
||||||
success: bool,
|
success: bool,
|
||||||
|
error_message: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
更新執行狀態
|
更新執行狀態
|
||||||
|
|
||||||
|
2026-04-18 ogt + Claude Opus 4.7: ADR-090 L5 斷鏈修復 — P0.2
|
||||||
|
失敗時必寫 rejection_reason,讓診斷不再黑盒
|
||||||
|
(之前 EXECUTION_FAILED 216 筆 reason 全空)
|
||||||
"""
|
"""
|
||||||
async with get_db_context() as db:
|
async with get_db_context() as db:
|
||||||
status = ApprovalStatus.EXECUTION_SUCCESS if success else ApprovalStatus.EXECUTION_FAILED
|
status = ApprovalStatus.EXECUTION_SUCCESS if success else ApprovalStatus.EXECUTION_FAILED
|
||||||
|
values: dict = {"status": status}
|
||||||
|
if not success and error_message:
|
||||||
|
# 截斷至合理長度,避免爆欄位
|
||||||
|
values["rejection_reason"] = str(error_message)[:2000]
|
||||||
await db.execute(
|
await db.execute(
|
||||||
update(ApprovalRecord)
|
update(ApprovalRecord)
|
||||||
.where(ApprovalRecord.id == str(approval_id))
|
.where(ApprovalRecord.id == str(approval_id))
|
||||||
.values(status=status)
|
.values(**values)
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"approval_execution_status_updated",
|
"approval_execution_status_updated",
|
||||||
id=str(approval_id),
|
id=str(approval_id),
|
||||||
success=success,
|
success=success,
|
||||||
|
has_error=bool(error_message),
|
||||||
)
|
)
|
||||||
|
|
||||||
async def update_incident_id(self, approval_id: UUID, incident_id: str) -> None:
|
async def update_incident_id(self, approval_id: UUID, incident_id: str) -> None:
|
||||||
|
|||||||
@@ -222,7 +222,12 @@ class ApprovalExecutionService:
|
|||||||
attempt += 1
|
attempt += 1
|
||||||
|
|
||||||
# Phase 5: 更新資料庫狀態
|
# Phase 5: 更新資料庫狀態
|
||||||
await service.update_execution_status(approval.id, success=result.success)
|
# 2026-04-18 ADR-090 L5 P0.2: 失敗時帶上 error_message,寫進 rejection_reason
|
||||||
|
await service.update_execution_status(
|
||||||
|
approval.id,
|
||||||
|
success=result.success,
|
||||||
|
error_message=None if result.success else (result.error or "(executor 未回傳錯誤)"),
|
||||||
|
)
|
||||||
|
|
||||||
# Update approval status based on result
|
# Update approval status based on result
|
||||||
total_attempts = attempt # attempt 在重試迴圈後為最終嘗試次數
|
total_attempts = attempt # attempt 在重試迴圈後為最終嘗試次數
|
||||||
@@ -389,6 +394,19 @@ class ApprovalExecutionService:
|
|||||||
approval_id=str(approval.id),
|
approval_id=str(approval.id),
|
||||||
timeout_sec=30.0,
|
timeout_sec=30.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 2026-04-18 ogt + Claude Opus 4.7: ADR-090 L6 斷鏈修復 — P0.3
|
||||||
|
# 失敗時也跑 verifier,把 verification_result='failed' 回寫 evidence
|
||||||
|
# 之前 988 筆 evidence 的 verification_result 全 NULL,因 verifier 只在 success 時跑
|
||||||
|
from src.core.feature_flags import aiops_flags
|
||||||
|
if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
|
||||||
|
asyncio.create_task(
|
||||||
|
self._run_post_execution_verify(
|
||||||
|
approval=approval,
|
||||||
|
action_taken=f"{operation_type.value}:{resource_name}:FAILED",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return False # K8s 執行失敗
|
return False # K8s 執行失敗
|
||||||
|
|
||||||
async def _push_execution_result_to_alert(
|
async def _push_execution_result_to_alert(
|
||||||
|
|||||||
@@ -70,6 +70,25 @@ rules:
|
|||||||
resources: ["deployments", "deployments/status", "replicasets"]
|
resources: ["deployments", "deployments/status", "replicasets"]
|
||||||
verbs: ["get", "list", "watch"]
|
verbs: ["get", "list", "watch"]
|
||||||
|
|
||||||
|
# 2026-04-18 Claude Opus 4.7: 補齊 L5/L6 斷鏈 — executor.py 需讀 nodes + HPA + metrics
|
||||||
|
# 根因 (P0.1): 99% EXECUTION_FAILED 源自 RBAC 不足(Forbidden),連 evidence gathering 都斷
|
||||||
|
# 2026-04-18 評估: P1 安全 — 純讀取,無寫入,cluster-scope 限定特定資源
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["nodes", "nodes/status"]
|
||||||
|
verbs: ["get", "list", "watch"]
|
||||||
|
|
||||||
|
- apiGroups: ["autoscaling"]
|
||||||
|
resources: ["horizontalpodautoscalers"]
|
||||||
|
verbs: ["get", "list", "watch"]
|
||||||
|
|
||||||
|
- apiGroups: ["metrics.k8s.io"]
|
||||||
|
resources: ["nodes", "pods"]
|
||||||
|
verbs: ["get", "list"]
|
||||||
|
|
||||||
|
- apiGroups: ["apps"]
|
||||||
|
resources: ["statefulsets", "daemonsets"]
|
||||||
|
verbs: ["get", "list", "watch"]
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# 寫入權限 (Write) - 僅限故障排除操作
|
# 寫入權限 (Write) - 僅限故障排除操作
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|||||||
Reference in New Issue
Block a user