diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 57b89f6f..a731edfd 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -246,6 +246,61 @@ async def _try_auto_repair_background( except Exception as _outcome_err: logger.warning("auto_repair_outcome_write_failed", error=str(_outcome_err)) + # 2026-04-25 P1.3+P1.4 by Claude Engineer-B2 — 飛輪閉環最後一哩 + # auto_repair 路徑補 PostExecutionVerifier 呼叫 + learning 接線 + # 人工審核路徑已在 approval_execution._run_post_execution_verify 接線, + # 此處補齊 auto_repair 路徑的對稱接線(ADR-081 Phase 1 + ADR-083 Phase 3) + if result: + from src.core.feature_flags import aiops_flags + if aiops_flags.is_sub_flag_enabled("AIOPS_P1_POST_EXECUTION_VERIFIER"): + try: + from src.services.post_execution_verifier import get_post_execution_verifier + from src.services.evidence_snapshot import EvidenceSnapshot + from src.services.learning_service import get_learning_service + + _snapshot = await EvidenceSnapshot.get_latest_snapshot(incident_id) + _action_label = ( + f"{target_resource}:{namespace}" + if not result.success + else f"auto_repair_playbook:{result.playbook_id}" + ) + _verifier = get_post_execution_verifier() + _verify_result = await asyncio.wait_for( + _verifier.verify( + incident=incident, + snapshot=_snapshot, + action_taken=_action_label, + ), + timeout=60.0, + ) + logger.info( + "auto_repair_post_verify_complete", + incident_id=incident_id, + approval_id=approval_id, + verification_result=_verify_result, + playbook_id=result.playbook_id, + ) + await get_learning_service().record_verification_result( + incident_id=incident_id, + action_taken=_action_label, + verification_result=_verify_result, + matched_playbook_id=result.playbook_id, + ) + except asyncio.TimeoutError: + logger.warning( + "auto_repair_post_verify_timeout", + incident_id=incident_id, + approval_id=approval_id, + timeout_sec=60.0, + ) + except Exception as _verify_err: + logger.warning( + "auto_repair_post_verify_failed", + 
incident_id=incident_id, + approval_id=approval_id, + error=str(_verify_err), + ) + # ADR-073 修補: 自動修復成功 → 解決 Incident → 觸發 KM 轉換 # 之前 auto_repair 成功後從未呼叫 resolve_incident,KM 永遠不生成 if result and result.success: diff --git a/apps/api/src/services/proactive_inspector.py b/apps/api/src/services/proactive_inspector.py index 19193050..78945747 100644 --- a/apps/api/src/services/proactive_inspector.py +++ b/apps/api/src/services/proactive_inspector.py @@ -42,27 +42,36 @@ K8S_NAMESPACE = "awoooi-prod" # - pod_restart_rate: 改用 sum() 聚合,避免回傳多 vector 使 _fetch_current_value 只取第一筆 # - db_connection_pool: datname 實際為 awoooi_prod(非 awoooi) # - http_error_rate: cadvisor 無 http_requests_total,改用 probe_success 替代 +# 2026-04-25 P0.6 fix by Claude Engineer-B: +# - http_error_rate: verified against Prometheus that the metric is named probe_success (not blackbox_probe_success) +# - cpu_usage_awoooi_api: cadvisor up=0 (stopped); switched to node-exporter node_cpu_seconds_total (node level) +# - memory_usage_awoooi_api: cadvisor stopped; switched to the node-exporter memory usage ratio (0-1 scale) MONITORED_METRICS: list[dict[str, Any]] = [ { "name": "http_error_rate", - # blackbox probe 失敗率:1 - 平均探測成功率(全部 target 聚合) - "promql": '1 - avg(blackbox_probe_success)', + # probe_success: the actual Blackbox Exporter metric name (not blackbox_probe_success) + # Measured: curl 'http://192.168.0.188:9090/api/v1/query' --data-urlencode 'query=avg(probe_success)' → 0.944 + "promql": '1 - avg(probe_success)', "threshold": 0.05, # > 5% probe 失敗 = 警戒 "description": "HTTP Probe 失敗率(Blackbox Exporter)", }, { "name": "cpu_usage_awoooi_api", - # cadvisor: awoooi-prod namespace 的 api container(name label 格式為 k8s_api_awoooi-api-*_awoooi-prod_*_*) - "promql": 'avg(rate(container_cpu_usage_seconds_total{name=~"k8s_api_awoooi-api.*"}[5m]))', - "threshold": 0.85, # > 85% CPU(單核心比例) - "description": "API 容器 CPU 使用率", + # cadvisor up=0 (prod-docker-188 offline); switched to node-exporter node-level CPU + # Busy ratio is one minus the idle ratio; averaging the per-mode non-idle rates would divide by the mode count and stay far below 0.85 + # threshold 0.85 = 85% CPU usage (node level, 0-1 ratio) + 
"promql": '1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m]))', + "threshold": 0.85, # > 85% node CPU (average over all cores) + "description": "Node CPU 使用率(node-exporter,cadvisor 停止時替代)", }, { "name": "memory_usage_awoooi_api", - # cadvisor memory working set(不含 cache) - "promql": 'avg(container_memory_working_set_bytes{name=~"k8s_api_awoooi-api.*"})', - "threshold": 1073741824.0, # > 1 GiB = 警戒 - "description": "API 容器記憶體使用(working set bytes)", + # cadvisor stopped; switched to the node-exporter node memory usage ratio (0-1) + # Measured: the 188 host has 62.76 GiB, currently ~30% used + # threshold 0.85 = 85% node memory usage + "promql": '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes', + "threshold": 0.85, # > 85% node memory (0-1 ratio) + "description": "Node 記憶體使用率(node-exporter,cadvisor 停止時替代)", }, { "name": "pod_restart_rate", @@ -74,6 +83,7 @@ MONITORED_METRICS: list[dict[str, Any]] = [ { "name": "db_connection_pool", # datname 實際值為 awoooi_prod;sum 聚合所有 state + # Measured: the curl query returns valid data and datname=awoooi_prod exists "promql": 'sum(pg_stat_activity_count{datname="awoooi_prod"})', "threshold": 80.0, # > 80 個 DB 連線 "description": "PostgreSQL 連線數(awoooi_prod)",