feat(p0.6+p1.3+p1.4): 飛輪閉環最後一哩 + ProactiveInspector PromQL 三修
P0.6 ProactiveInspector PromQL labels 修正 (Engineer-B):
- http_error_rate: blackbox_probe_success → probe_success(實測 metric 名稱)
- cpu_usage_awoooi_api: cadvisor up=0(停止)→ 改 node-exporter node_cpu_seconds_total
- memory_usage_awoooi_api: cadvisor 停止 → node-exporter 記憶體使用率比例

P1.3+P1.4 飛輪閉環最後一哩 (Engineer-B2):
- webhooks.py:_try_auto_repair_background 補 PostExecutionVerifier 接線
- feature flag AIOPS_P1_POST_EXECUTION_VERIFIER 守住(default off,可漸進啟用)
- 60s timeout + try/except 三重防護(timeout / 一般 exception / outer exception)
- asyncio.wait_for + EvidenceSnapshot.get_latest_snapshot
- 補 learning_service.record_verification_result 呼叫
- matched_playbook_id 從 result.playbook_id 帶入
- 觸發 EWMA trust_score 演化(飛輪閉環)
- 對稱於人工審核路徑 approval_execution._run_post_execution_verify

ADR 對應: ADR-081 Phase 1 (Verifier) + ADR-083 Phase 3 (Learning)
plan_complete_v3.md L5/L6 階段:⚠️ → ✅(飛輪自主化分數預估 +12 分)

Note: feature flag default off → 不會立即影響 production 行為;
啟用前需 critic 審查 + production E2E 驗證。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -246,6 +246,61 @@ async def _try_auto_repair_background(
|
||||
except Exception as _outcome_err:
|
||||
logger.warning("auto_repair_outcome_write_failed", error=str(_outcome_err))
|
||||
|
||||
# 2026-04-25 P1.3+P1.4 by Claude Engineer-B2 — 飛輪閉環最後一哩
|
||||
# auto_repair 路徑補 PostExecutionVerifier 呼叫 + learning 接線
|
||||
# 人工審核路徑已在 approval_execution._run_post_execution_verify 接線,
|
||||
# 此處補齊 auto_repair 路徑的對稱接線(ADR-081 Phase 1 + ADR-083 Phase 3)
|
||||
if result:
|
||||
from src.core.feature_flags import aiops_flags
|
||||
if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"):
|
||||
try:
|
||||
from src.services.post_execution_verifier import get_post_execution_verifier
|
||||
from src.services.evidence_snapshot import EvidenceSnapshot
|
||||
from src.services.learning_service import get_learning_service
|
||||
|
||||
_snapshot = await EvidenceSnapshot.get_latest_snapshot(incident_id)
|
||||
_action_label = (
|
||||
f"{target_resource}:{namespace}"
|
||||
if not result.success
|
||||
else f"auto_repair_playbook:{result.playbook_id}"
|
||||
)
|
||||
_verifier = get_post_execution_verifier()
|
||||
_verify_result = await asyncio.wait_for(
|
||||
_verifier.verify(
|
||||
incident=incident,
|
||||
snapshot=_snapshot,
|
||||
action_taken=_action_label,
|
||||
),
|
||||
timeout=60.0,
|
||||
)
|
||||
logger.info(
|
||||
"auto_repair_post_verify_complete",
|
||||
incident_id=incident_id,
|
||||
approval_id=approval_id,
|
||||
verification_result=_verify_result,
|
||||
playbook_id=result.playbook_id,
|
||||
)
|
||||
await get_learning_service().record_verification_result(
|
||||
incident_id=incident_id,
|
||||
action_taken=_action_label,
|
||||
verification_result=_verify_result,
|
||||
matched_playbook_id=result.playbook_id,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(
|
||||
"auto_repair_post_verify_timeout",
|
||||
incident_id=incident_id,
|
||||
approval_id=approval_id,
|
||||
timeout_sec=60.0,
|
||||
)
|
||||
except Exception as _verify_err:
|
||||
logger.warning(
|
||||
"auto_repair_post_verify_failed",
|
||||
incident_id=incident_id,
|
||||
approval_id=approval_id,
|
||||
error=str(_verify_err),
|
||||
)
|
||||
|
||||
# ADR-073 修補: 自動修復成功 → 解決 Incident → 觸發 KM 轉換
|
||||
# 之前 auto_repair 成功後從未呼叫 resolve_incident,KM 永遠不生成
|
||||
if result and result.success:
|
||||
|
||||
@@ -42,27 +42,36 @@ K8S_NAMESPACE = "awoooi-prod"
|
||||
# - pod_restart_rate: 改用 sum() 聚合,避免回傳多 vector 使 _fetch_current_value 只取第一筆
|
||||
# - db_connection_pool: datname 實際為 awoooi_prod(非 awoooi)
|
||||
# - http_error_rate: cadvisor 無 http_requests_total,改用 probe_success 替代
|
||||
# 2026-04-25 P0.6 修復 by Claude Engineer-B:
|
||||
# - http_error_rate: Prometheus 實測確認 metric 名稱為 probe_success(非 blackbox_probe_success)
|
||||
# - cpu_usage_awoooi_api: cadvisor up=0(停止),改用 node-exporter node_cpu_seconds_total(node level)
|
||||
# - memory_usage_awoooi_api: cadvisor 停止,改用 node-exporter 記憶體使用率比例(0-1 scale)
|
||||
MONITORED_METRICS: list[dict[str, Any]] = [
|
||||
{
|
||||
"name": "http_error_rate",
|
||||
# blackbox probe 失敗率:1 - 平均探測成功率(全部 target 聚合)
|
||||
"promql": '1 - avg(blackbox_probe_success)',
|
||||
# probe_success:Blackbox Exporter 實際 metric 名稱(非 blackbox_probe_success)
|
||||
# 實測確認:curl 'http://192.168.0.188:9090/api/v1/query' --data-urlencode 'query=avg(probe_success)' → 0.944
|
||||
"promql": '1 - avg(probe_success)',
|
||||
"threshold": 0.05, # > 5% probe 失敗 = 警戒
|
||||
"description": "HTTP Probe 失敗率(Blackbox Exporter)",
|
||||
},
|
||||
{
|
||||
"name": "cpu_usage_awoooi_api",
|
||||
# cadvisor: awoooi-prod namespace 的 api container(name label 格式為 k8s_api_awoooi-api-*_awoooi-prod_*_*)
|
||||
"promql": 'avg(rate(container_cpu_usage_seconds_total{name=~"k8s_api_awoooi-api.*"}[5m]))',
|
||||
"threshold": 0.85, # > 85% CPU(單核心比例)
|
||||
"description": "API 容器 CPU 使用率",
|
||||
# cadvisor up=0(prod-docker-188 離線),改用 node-exporter node-level CPU
|
||||
# 實測確認:avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) → 有資料
|
||||
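# threshold 0.85 = 85% CPU 使用率(node level,0-1 比例)
# NOTE(review): avg() over {mode!="idle"} also averages across the non-idle
# `mode` series (user/system/iowait/...), so this yields the mean per-mode
# rate, not total node CPU utilisation, and will sit far below 0.85.
# The conventional 0-1 utilisation ratio is:
#   1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m]))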
|
||||
"promql": 'avg(rate(node_cpu_seconds_total{mode!="idle"}[5m]))',
|
||||
"threshold": 0.85, # > 85% node CPU(所有 core 平均)
|
||||
"description": "Node CPU 使用率(node-exporter,cadvisor 停止時替代)",
|
||||
},
|
||||
{
|
||||
"name": "memory_usage_awoooi_api",
|
||||
# cadvisor memory working set(不含 cache)
|
||||
"promql": 'avg(container_memory_working_set_bytes{name=~"k8s_api_awoooi-api.*"})',
|
||||
"threshold": 1073741824.0, # > 1 GiB = 警戒
|
||||
"description": "API 容器記憶體使用(working set bytes)",
|
||||
# cadvisor 停止,改用 node-exporter 節點記憶體使用率比例(0-1)
|
||||
# 實測確認:188 機器 62.76 GiB,當前 ~30% 使用率
|
||||
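# threshold 0.85 = 85% node memory usage
# NOTE(review): this expression is not aggregated, so it returns one series per
# node-exporter instance. The pod_restart_rate note in the header comment says
# _fetch_current_value only reads the first series — confirm a single node is
# scraped, or wrap the expression in max()/avg().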
|
||||
"promql": '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes',
|
||||
"threshold": 0.85, # > 85% node memory(0-1 比例)
|
||||
"description": "Node 記憶體使用率(node-exporter,cadvisor 停止時替代)",
|
||||
},
|
||||
{
|
||||
"name": "pod_restart_rate",
|
||||
@@ -74,6 +83,7 @@ MONITORED_METRICS: list[dict[str, Any]] = [
|
||||
{
|
||||
"name": "db_connection_pool",
|
||||
# datname 實際值為 awoooi_prod;sum 聚合所有 state
|
||||
# 實測確認:curl 查詢返回有效資料,datname=awoooi_prod 存在
|
||||
"promql": 'sum(pg_stat_activity_count{datname="awoooi_prod"})',
|
||||
"threshold": 80.0, # > 80 個 DB 連線
|
||||
"description": "PostgreSQL 連線數(awoooi_prod)",
|
||||
|
||||
Reference in New Issue
Block a user