Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 7m29s
統帥 2026-04-19 全景審計發現:
- automation_operation_log: 22 筆 (全部 drift_narrator),33 件/7d approval 動作 0 筆回灌
- incident_evidence.verification_result: 1212 筆 100% NULL,verifier 從未寫入
- 根因: _run_post_execution_verify 用 asyncio.create_task fire-and-forget,
Pod recycle 時 task 被殺,verification_result 永遠寫不進去
修復 (打通 verifier→learning→Playbook EWMA→finetune 全鏈):
approval_execution.py:
+ _log_aol_started: 主流程開始時 INSERT aol(playbook_executed, pending)
+ _log_aol_completed: 4 個 return 點 UPDATE aol 為 success/failed + duration + stderr
└ NO_ACTION / parse_fail / K8s 成功 / K8s 失敗 全部留痕
~ _run_post_execution_verify 兩處 (成功+失敗 path) 從 create_task 改 await + 60s timeout
+ 失敗時 stderr_feed_back 寫入 result.error → 解開 E6 stderr 回灌閉環
declarative_remediation.py:
~ _log_remediation_event task 加 named + add_done_callback,task 失敗時有 log
(原 fire-and-forget 0 筆寫入,現在可診斷為何 task 死掉)
預期效果:
- aol playbook_executed 即時可見 (33 件/7d 立刻有資料)
- incident_evidence.verification_result 開始累積 → finetune_exporter 7d cron 終於有料
- Playbook EWMA trust_score 開始動態變化
- stderr_feed_back 接通 → 失敗訊號回灌 retry/Playbook 負向強化
不影響:
- background_task 跑在背景,+60s 延遲不阻塞 API
- aol 寫入失敗只 logger.warning,不阻塞執行主流程
Refs: MASTER §3.1 L6×D1 (ADR-081 PostExecutionVerifier),
MASTER §3.4 D4 (ADR-083 學習閉環),
ADR-090 監控盲區治理 (2026-04-18 全景審計)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
336 lines
13 KiB
Python
336 lines
13 KiB
Python
"""
|
||
AWOOOI AIOps Phase 5 — Declarative Remediation(宣告式修復)
|
||
=============================================================
|
||
職責:將修復動作包裝為 DeclarativeSpec,依爆炸半徑分四級分控。
|
||
|
||
分級邏輯:
|
||
tier=auto (≤10) → 可自動執行,必要時 dry-run 確認
|
||
tier=human (11-50) → 送人工審核(ApprovalRequest),不自動執行
|
||
tier=dual (51-99) → 需雙人審核 + GitOps PR
|
||
tier=blocked (100) → HARD_RULES 永擋,永不執行
|
||
|
||
設計原則:
|
||
1. 主入口 evaluate() 是純函數,不呼叫任何 I/O
|
||
2. dry_run_required 由 AIOPS_P5_DRY_RUN_ENFORCED flag 控制
|
||
3. rollback_plan 自動從 kubectl 動作推導(不問 LLM,確保穩定性)
|
||
4. DeclarativeSpec 是不可變 dataclass(frozen=True)
|
||
|
||
ADR-086: Phase 5 Declarative 修復與 Blast Radius 分控
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 5 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
from dataclasses import dataclass
|
||
|
||
import structlog
|
||
|
||
from src.services.blast_radius_calculator import (
|
||
BlastRadiusResult,
|
||
get_blast_radius_calculator,
|
||
)
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Data Types
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass(frozen=True)
class DeclarativeSpec:
    """
    Immutable declarative remediation specification.

    Replaces the older raw kubectl string as the single output format of the
    decision layer. Downstream consumers (_auto_execute / GitOps PR /
    RollbackManager) read it to decide how the action should be carried out.
    """

    action: str                 # kubectl command (target/namespace already resolved)
    target_state: str           # human-readable description of the desired state
    blast_radius_score: int     # 0-100
    tier: str                   # "auto" / "human" / "dual" / "blocked"
    dry_run_required: bool      # whether a dry-run must precede execution
    constraints: list[str]      # safety constraints
    rollback_plan: str          # how to roll the action back
    namespace: str              # target namespace
    target: str                 # target resource
    blocked_reason: str | None  # why the action is permanently blocked (tier=blocked)
    blast_reason: str           # scoring rationale, kept for auditing

    @property
    def can_auto_execute(self) -> bool:
        """True only for tier "auto" with no blocking reason set."""
        if self.blocked_reason:
            return False
        return self.tier == "auto"

    @property
    def requires_gitops_pr(self) -> bool:
        """True when the spec is in the "dual" tier."""
        return self.tier == "dual"

    @property
    def requires_human_approval(self) -> bool:
        """True when the spec is in the "human" or "dual" tier."""
        return self.tier == "human" or self.tier == "dual"

    def to_dict(self) -> dict:
        """Return the spec's fields as a plain dict, in declaration order."""
        keys = (
            "action",
            "target_state",
            "blast_radius_score",
            "tier",
            "dry_run_required",
            "constraints",
            "rollback_plan",
            "namespace",
            "target",
            "blocked_reason",
            "blast_reason",
        )
        # Values are returned as-is (no copies), exactly like a literal dict would.
        return {key: getattr(self, key) for key in keys}
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Main Service
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class DeclarativeRemediation:
    """
    Declarative remediation evaluator.

    Usage:
        svc = DeclarativeRemediation()
        spec = svc.evaluate(
            action="kubectl rollout restart deployment/awoooi-api",
            target="awoooi-api",
            namespace="awoooi-prod",
        )
        if spec.can_auto_execute:
            # execute directly
    """

    # Strong references to in-flight remediation_events logging tasks.
    # The event loop only keeps weak references to tasks, so a task that is
    # referenced from nowhere else may be garbage-collected before it finishes.
    _pending_log_tasks: set = set()

    def evaluate(
        self,
        action: str,
        target: str = "",
        namespace: str = "awoooi-prod",
        description: str = "",
    ) -> DeclarativeSpec:
        """
        Evaluate a remediation action and produce a DeclarativeSpec.

        Besides building the spec, this schedules a best-effort background
        write of the evaluation to remediation_events when called from inside
        a running event loop; outside a loop the write is skipped.

        Args:
            action: kubectl command (placeholders already substituted)
            target: name of the target resource
            namespace: target namespace
            description: human-readable description of the action; when empty,
                a description is inferred from the action

        Returns:
            DeclarativeSpec (immutable, carries the tiering decision)
        """
        import asyncio as _a

        from src.core.feature_flags import aiops_flags

        # Compute the blast radius.
        calc = get_blast_radius_calculator()
        blast: BlastRadiusResult = calc.calculate(action, namespace=namespace, target=target)

        # Dry-run requirement (flag-controlled). Coerced to bool so the spec
        # field and the DB parameter are always a real boolean, even if the
        # flag holds some other truthy/falsy value.
        dry_run_required = bool(
            aiops_flags.AIOPS_P5_DRY_RUN_ENFORCED and blast.tier != "blocked"
        )

        # Safety constraints.
        constraints = _build_constraints(action, namespace, blast.score)

        # Rollback plan (derived from the action itself).
        rollback_plan = _infer_rollback_plan(action, target, namespace)

        # Target-state description.
        target_state = description or _infer_target_state(action, target)

        spec = DeclarativeSpec(
            action=action,
            target_state=target_state,
            blast_radius_score=blast.score,
            tier=blast.tier,
            dry_run_required=dry_run_required,
            constraints=constraints,
            rollback_plan=rollback_plan,
            namespace=namespace,
            target=target,
            blocked_reason=blast.blocked_reason,
            blast_reason=blast.reason,
        )

        logger.info(
            "declarative_spec_evaluated",
            tier=spec.tier,
            blast_radius=spec.blast_radius_score,
            can_auto=spec.can_auto_execute,
            action=action[:80],
        )

        # 2026-04-18 ADR-090-D: write to the remediation_events table
        # (MASTER §7.1 #6 KPI data source).
        # evaluate() is synchronous and cannot await, so the write runs as a
        # named background task with a done-callback that logs failures.
        #
        # Probe for a running loop BEFORE constructing the coroutine: building
        # it first and then failing in create_task() would leave a coroutine
        # that is never awaited (RuntimeWarning).
        try:
            _a.get_running_loop()
        except RuntimeError:
            # Not in an async context; skip the write quietly.
            logger.debug("log_remediation_event_no_event_loop", action=action[:80])
            return spec

        _task = _a.create_task(
            _log_remediation_event(spec, action, target, namespace),
            name=f"log_remediation:{action[:30]}",
        )

        # Keep the task alive until it completes (see _pending_log_tasks).
        pending = self._pending_log_tasks
        pending.add(_task)

        def _on_done(t: _a.Task) -> None:
            pending.discard(t)
            if t.cancelled():
                logger.warning("log_remediation_event_cancelled", action=action[:80])
                return
            exc = t.exception()
            if exc:
                logger.warning(
                    "log_remediation_event_failed",
                    action=action[:80],
                    error=str(exc),
                )

        _task.add_done_callback(_on_done)

        return spec
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
async def _log_remediation_event(
    spec: "DeclarativeSpec",
    action: str,
    target: str,
    namespace: str,
) -> None:
    """
    2026-04-18 ADR-090-D: write to the remediation_events table
    (MASTER §7.1 #6 KPI data source).

    Inserts one 'pending' row for a DeclarativeRemediation.evaluate() call.
    The later execution status is meant to be updated by
    approval_execution.py (future iteration).

    Best-effort: any exception is logged as a warning and swallowed, so this
    coroutine never raises to its caller.

    Args:
        spec: the evaluated spec (supplies tier, dry-run flag, blast radius)
        action: kubectl command; truncated to 200 characters for storage
        target: target resource name; stored as NULL when empty, else
            truncated to 100 characters
        namespace: target namespace; truncated to 50 characters
    """
    try:
        import json as _json

        from sqlalchemy import text as _sql
        from src.db.base import get_db_context

        # remediation_type: a GitOps PR requirement takes precedence, then
        # auto-executable specs are "declarative", everything else "imperative".
        if spec.requires_gitops_pr:
            _rt = "gitops_pr"
        elif spec.can_auto_execute:
            _rt = "declarative"
        else:
            _rt = "imperative"

        # NOTE(review): no explicit commit is issued here — presumably
        # get_db_context() commits on successful exit; confirm, otherwise the
        # INSERT is never persisted.
        async with get_db_context() as db:
            await db.execute(
                _sql("""
                    INSERT INTO remediation_events (
                        remediation_type, action_name, target_resource, namespace,
                        dry_run, status, blast_radius_score, executed_by,
                        metadata
                    ) VALUES (
                        :rt, :an, :tr, :ns,
                        :dr, 'pending', :br, 'ai_agent',
                        CAST(:md AS jsonb)
                    )
                """),
                {
                    "rt": _rt,
                    "an": action[:200],
                    "tr": target[:100] if target else None,
                    "ns": namespace[:50],
                    "dr": spec.dry_run_required,
                    "br": spec.blast_radius_score,
                    # Serialize with json.dumps so the value is always valid
                    # JSON, whatever characters the tier string contains.
                    "md": _json.dumps({"tier": spec.tier}),
                },
            )
    except Exception as _e:
        logger.warning("remediation_events_db_write_failed", error=str(_e))
|
||
|
||
|
||
def _build_constraints(action: str, namespace: str, score: int) -> list[str]:
|
||
"""依動作特性建立安全約束清單。"""
|
||
constraints: list[str] = []
|
||
|
||
if namespace != "awoooi-prod":
|
||
constraints.append(f"target_namespace={namespace}(非 prod 環境)")
|
||
|
||
if "scale" in action.lower():
|
||
# scale 操作:副本數不能為 0(防止停服)
|
||
m = re.search(r"--replicas=(\d+)", action)
|
||
if m:
|
||
replicas = int(m.group(1))
|
||
if replicas == 0:
|
||
constraints.append("scale_to_zero=true(停服風險)")
|
||
else:
|
||
constraints.append(f"min_replicas={max(1, replicas - 1)}(防過度縮減)")
|
||
|
||
if score >= 40:
|
||
constraints.append("pre_execute_snapshot_required(高衝擊需先存狀態)")
|
||
|
||
if "rollout undo" in action.lower():
|
||
constraints.append("verify_previous_revision_exists(確認有上一版本可回滾)")
|
||
|
||
return constraints
|
||
|
||
|
||
def _infer_rollback_plan(action: str, target: str, namespace: str) -> str:
|
||
"""從修復動作推導對應的回滾計畫(不呼叫 LLM,保持穩定性)。"""
|
||
a_lower = action.lower()
|
||
|
||
if "rollout restart" in a_lower:
|
||
return f"kubectl rollout undo deployment/{target} -n {namespace}"
|
||
if "rollout undo" in a_lower:
|
||
return f"kubectl rollout status deployment/{target} -n {namespace} && 手動確認回滾版本"
|
||
if "scale" in a_lower:
|
||
m = re.search(r"--replicas=(\d+)", action)
|
||
orig = int(m.group(1)) if m else 2
|
||
restore = max(1, orig)
|
||
return f"kubectl scale deployment/{target} --replicas={restore} -n {namespace}"
|
||
if "apply" in a_lower:
|
||
return f"kubectl rollout undo deployment/{target} -n {namespace}"
|
||
if "set image" in a_lower:
|
||
return f"kubectl rollout undo deployment/{target} -n {namespace}"
|
||
if "delete pod" in a_lower:
|
||
return "Pod 會由 ReplicaSet 自動重建;若 Deployment 異常請執行 rollout undo"
|
||
if "patch" in a_lower:
|
||
return f"kubectl patch + 手動還原原始 spec,或 kubectl rollout undo deployment/{target} -n {namespace}"
|
||
|
||
return f"kubectl rollout undo deployment/{target} -n {namespace}(通用回滾)"
|
||
|
||
|
||
def _infer_target_state(action: str, target: str) -> str:
|
||
"""從動作推導目標狀態描述。"""
|
||
a_lower = action.lower()
|
||
|
||
if "rollout restart" in a_lower:
|
||
return f"重新啟動 {target}(全 Pod 滾動重建)"
|
||
if "rollout undo" in a_lower:
|
||
return f"將 {target} 回滾至前一個部署版本"
|
||
if "scale" in a_lower:
|
||
m = re.search(r"--replicas=(\d+)", action)
|
||
n = m.group(1) if m else "?"
|
||
return f"調整 {target} 副本數至 {n}"
|
||
if "set image" in a_lower:
|
||
return f"更新 {target} 容器映像"
|
||
if "delete pod" in a_lower:
|
||
return f"刪除 {target} Pod(由 ReplicaSet 自動重建)"
|
||
if "apply" in a_lower:
|
||
return f"套用新配置到 {target}"
|
||
|
||
return action[:120]
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Lazily created module-level instance shared by all callers.
_service: DeclarativeRemediation | None = None


def get_declarative_remediation() -> DeclarativeRemediation:
    """Return the shared DeclarativeRemediation, creating it on first use."""
    global _service
    if _service is not None:
        return _service
    _service = DeclarativeRemediation()
    return _service
|