From dedb12085baa809bb10a580a927daa19cdb94b1d Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 2 May 2026 23:44:03 +0800 Subject: [PATCH] chore(governance,watchdog): enrich alerts and enable prometheus multiproc --- apps/api/src/api/v1/webhooks.py | 52 ++++++-- apps/api/src/jobs/ai_slo_watchdog_job.py | 22 +-- apps/api/src/services/governance_agent.py | 156 +++++++++++++++++----- k8s/awoooi-dev/02-configmap.yaml | 1 + k8s/awoooi-dev/04-deployment-api.yaml | 9 ++ k8s/awoooi-prod/04-configmap.yaml | 1 + k8s/awoooi-prod/06-deployment-api.yaml | 6 + 7 files changed, 191 insertions(+), 56 deletions(-) diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index ef312579..983ece4a 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -1093,9 +1093,10 @@ async def receive_alert( "is_rule_based": False, "playbook_id": None, } + _cmd_cs1 = (analysis_result.kubectl_command or "").strip() approval_create = ApprovalRequestCreate( - action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}", - description=f"[AI: {ai_provider}] {analysis_result.description}", + action=(_cmd_cs1 or f"{analysis_result.action_title} | NO_ACTION"), + description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}", risk_level=risk_level, blast_radius=BlastRadius( affected_pods=blast.affected_pods, @@ -1179,6 +1180,13 @@ async def receive_alert( status=ApprovalStatus.APPROVED, risk_level=risk_level.value, matched_playbook_id=None, + metadata={ + **_approval_metadata_cs1, + "is_high_confidence": True, + "policy_reason": _shadow_result.reason.value + if hasattr(_shadow_result, "reason") + else "cs1_auto_confident_execution", + }, ) _cs1_auto_approval.id = approval.id @@ -1194,13 +1202,19 @@ async def receive_alert( error=str(_cs1_upd_err), ) - logger.info( - "llm_high_confidence_auto_executed", - approval_id=str(approval.id), - confidence=analysis_result.confidence, - exec_success=_cs1_exec_success, - action=_cs1_kubectl[:80], - ) + logger.info( + "llm_high_confidence_auto_executed", + approval_id=str(approval.id), + confidence=analysis_result.confidence, + exec_success=_cs1_exec_success, + action=_cs1_kubectl[:80], + is_high_confidence=True, + policy_reason=( + _shadow_result.reason.value + if hasattr(_shadow_result, "reason") + else "cs1_auto_confident_execution" + ), + ) except Exception as _cs1_auto_err: logger.warning( "llm_high_confidence_auto_execute_failed", @@ -1419,7 +1433,7 @@ async def _process_new_alert_background( rule_kubectl = str(rule_response.get("kubectl_command", "")).strip() rule_description = str(rule_response.get("description", message)) rule_action = ( - f"{rule_action_title} | {rule_kubectl}" + rule_kubectl if rule_kubectl else f"NO_ACTION - {rule_description[:120]}" ) @@ -1656,9 +1670,10 @@ async def _process_new_alert_background( "is_rule_based": False, "playbook_id": None, } + _cmd_cs3 = (analysis_result.kubectl_command or "").strip() approval_create = ApprovalRequestCreate( - action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}", - description=f"[AI: {ai_provider}] {analysis_result.description}", + action=(_cmd_cs3 or f"{analysis_result.action_title} | NO_ACTION"), + description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}", risk_level=risk_level, blast_radius=BlastRadius( affected_pods=blast.affected_pods if blast else 1, @@ -1719,6 +1734,13 @@ async def _process_new_alert_background( status=ApprovalStatus.APPROVED, risk_level=risk_level.value, matched_playbook_id=None, + metadata={ + **_approval_metadata_cs3, + "is_high_confidence": True, + "policy_reason": _shadow_result_cs3.reason.value + if hasattr(_shadow_result_cs3, "reason") + else "cs3_auto_confident_execution", + }, ) _cs3_executor = ApprovalExecutionService() _cs3_exec_success = await _cs3_executor.execute_approved_action(_cs3_auto_approval) @@ -1729,6 +1751,12 @@ async def _process_new_alert_background( confidence=analysis_result.confidence, success=_cs3_exec_success, provider=ai_provider, + is_high_confidence=True, + policy_reason=( + _shadow_result_cs3.reason.value + if hasattr(_shadow_result_cs3, "reason") + else "cs3_auto_confident_execution" + ), ) except Exception as _cs3_exec_err: logger.warning("cs3_llm_auto_execute_failed", error=str(_cs3_exec_err)) diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py index f8b235de..93dcdbd0 100644 --- a/apps/api/src/jobs/ai_slo_watchdog_job.py +++ b/apps/api/src/jobs/ai_slo_watchdog_job.py @@ -144,19 +144,19 @@ async def _check_once() -> None: await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1") # 發送 TYPE-8M Meta-System 告警 - diagnosis = " | ".join(violations) - incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}" - try: - from src.services.telegram_gateway import get_telegram_gateway - await get_telegram_gateway().send_meta_alert( - incident_id=incident_id, + diagnosis = " | ".join(violations) + incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}" + try: + from src.services.telegram_gateway import get_telegram_gateway + await get_telegram_gateway().send_meta_alert( + incident_id=incident_id, approval_id=str(uuid.uuid4()), alertname="AI 自健診異常", - alert_category="flywheel_health", - diagnosis=diagnosis, - severity_level="critical", - system_impact=f"{len(violations)} 項 KPI 異常(W-1~W-5),飛輪自動化能力可能降級", - ) + alert_category="flywheel_health", + diagnosis=diagnosis, + severity_level="critical", + system_impact=f"{len(violations)} 項 KPI 異常(W-1~W-6),飛輪自動化能力可能降級", + ) logger.warning( "ai_slo_watchdog_alert_sent", incident_id=incident_id, diff --git a/apps/api/src/services/governance_agent.py b/apps/api/src/services/governance_agent.py index a84802bc..267391f0 100644 --- a/apps/api/src/services/governance_agent.py +++ b/apps/api/src/services/governance_agent.py @@ -106,25 +106,42 @@ class GovernanceAgent: else: kept_ids.append(r.playbook_id) - if auto_deprecated_ids: - await db.commit() - logger.info( - "governance_trust_drift_auto_deprecated", - count=len(auto_deprecated_ids), - ids=auto_deprecated_ids[:10], - ) + if auto_deprecated_ids: + await db.commit() + logger.info( + "governance_trust_drift_auto_deprecated", + count=len(auto_deprecated_ids), + ids=auto_deprecated_ids[:10], + ) if drifted: + drift_ratio = len(drifted) / total if total > 0 else 0.0 await self._alert( "trust_drift", { - "drifted_count": len(drifted), - "total_playbooks": total, - "playbook_ids": kept_ids[:10], - "auto_deprecated_count": len(auto_deprecated_ids), - "auto_deprecated_ids": auto_deprecated_ids[:10], - "threshold": TRUST_DRIFT_THRESHOLD, - "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS, + "status": "warning", + "impact": { + "drifted_count": len(drifted), + "total_playbooks": total, + "drift_ratio": round(drift_ratio, 3), + "threshold": TRUST_DRIFT_THRESHOLD, + "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS, + }, + "remediation": { + "items": [ + "Auto-deprecate low-trust stale playbooks", + "Review candidate playbooks by impact scope and rollback if needed", + ], + "auto_deprecated_count": len(auto_deprecated_ids), + "auto_deprecated_ids": auto_deprecated_ids[:10], + }, + "actionable": { + "items": [ + "立即補齊 PLAYBOOK_SOURCE 與 playbook_metadata", + "必要時人工覆核 kept_ids 中的高風險 Playbook", + ], + "sample_playbook_ids": kept_ids[:10], + }, }, ) @@ -177,11 +194,27 @@ class GovernanceAgent: await self._alert( "knowledge_degradation", { - "stale_count": stale, - "total_count": total, - "stale_ratio": round(ratio, 3), - "threshold": KM_STALE_RATIO, - "stale_days": KM_STALE_DAYS, + "status": "warning", + "impact": { + "stale_count": stale, + "total_count": total, + "stale_ratio": round(ratio, 3), + "threshold": KM_STALE_RATIO, + "stale_days": KM_STALE_DAYS, + }, + "remediation": { + "items": [ + "啟動 KM 反查與自動補齊流程", + "關鍵服務告警自動同步到 KM 任務,補齊缺失條目", + ], + "next_action": "run_kb_growth_healthcheck", + }, + "actionable": { + "items": [ + "每日檢查 ANTI_PATTERN 更新結果", + "安排至少 2 位 owner 對 stale條目做快速人工審核", + ], + }, }, ) @@ -348,9 +381,11 @@ class GovernanceAgent: # 不可 fallback 0.0,否則必觸發 violated=True 噴假警報 if not result_list: results[name] = { + "name": name, + "status": "skipped", "error": "no_data", - "skipped": True, "reason": "prometheus_empty_result_metric_not_emitted", + "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設", } logger.warning( "governance_slo_no_data", @@ -365,9 +400,12 @@ class GovernanceAgent: violated = value < threshold results[name] = { + "name": name, + "status": "violated" if violated else "ok", "value": round(value, 4), "slo_target": target, "hard_red_line": threshold, + "gap": round(threshold - value, 4) if violated else round(value - target, 4), "violated": violated, } @@ -375,11 +413,27 @@ class GovernanceAgent: await self._alert( f"slo_{name}_violation", { - "slo_name": name, - "current_value": round(value, 4), - "hard_red_line": threshold, - "slo_target": target, - "gap": round(threshold - value, 4), + "status": "violation", + "impact": { + "name": name, + "value": round(value, 4), + "target": target, + "threshold": threshold, + "gap": round(threshold - value, 4), + }, + "remediation": { + "items": [ + "Pause auto-scaling or risky auto-fix tasks", + "Review evidence/decision traces and adjust policy thresholds", + ], + "next_action": "trigger_flywheel_safeguard", + }, + "actionable": { + "items": [ + "Check verifier lag and post-exec learning health", + "Run emergency incident audit on failed approvals", + ], + }, }, ) logger.warning( @@ -388,6 +442,12 @@ class GovernanceAgent: value=round(value, 4), hard_red_line=threshold, ) + elif value == 0 and threshold <= 0: + logger.warning( + "governance_slo_unexpected_zero", + slo=name, + value=round(value, 4), + ) else: logger.info( "governance_slo_ok", @@ -396,7 +456,12 @@ class GovernanceAgent: target=target, ) else: - results[name] = {"error": "prometheus_query_failed", "status": data.get("status")} + results[name] = { + "name": name, + "status": "error", + "error": "prometheus_query_failed", + "response_status": data.get("status"), + } logger.warning( "governance_slo_prometheus_error", slo=name, @@ -404,22 +469,30 @@ class GovernanceAgent: response_status=data.get("status"), ) except Exception as e: - results[name] = {"error": str(e)} + results[name] = { + "name": name, + "status": "error", + "error": str(e), + } logger.warning("governance_slo_check_error", slo=name, error=str(e)) # 2026-04-29 ogt + Claude Opus 4.7: critic M6 修 # 加聚合 _meta 區分「全 skipped」(metric 未 emit) vs「全 ok」(SLO 健康) # 防止 dashboard 把 no_data 當 pass 顯示 - violated_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("violated")) - skipped_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("skipped")) + violated_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "violated") + skipped_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "skipped") ok_count = sum( 1 for v in results.values() - if isinstance(v, dict) and not v.get("violated") and not v.get("skipped") and "error" not in v + if isinstance(v, dict) + and v.get("status") == "ok" ) + error_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "error") results["_meta"] = { "violated_count": violated_count, "skipped_count": skipped_count, "ok_count": ok_count, + "error_count": error_count, + "all_status": sorted({v.get("status") for v in results.values() if isinstance(v, dict)}), "all_skipped": skipped_count > 0 and ok_count == 0 and violated_count == 0, "status": ( "no_data" if (skipped_count > 0 and ok_count == 0 and violated_count == 0) @@ -496,9 +569,26 @@ class GovernanceAgent: await self._alert( "governance_slo_data_gap", { - "reason": "all_slo_metrics_not_emitted", - "skipped_count": slo_meta.get("skipped_count", 0), - "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設", + "status": "warning", + "impact": { + "reason": "all_slo_metrics_not_emitted", + "skipped_count": slo_meta.get("skipped_count", 0), + "all_slo_metrics_not_emitted": True, + }, + "remediation": { + "items": [ + "補齊 ADR-100 SLO emitter(automation_operation_log_total / post_execution_verification_total / km_entries_total)", + "設置 PROMETHEUS_MULTIPROC_DIR 並掛載可寫目錄(如 emptyDir)", + ], + "next_action": "run_adr100_slo_emit_playbook", + "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設", + }, + "actionable": { + "items": [ + "先確認所有 API Pod 是否有 PROMETHEUS_MULTIPROC_DIR 掛載", + "檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則", + ], + }, }, ) except Exception: diff --git a/k8s/awoooi-dev/02-configmap.yaml b/k8s/awoooi-dev/02-configmap.yaml index 10ddbd0e..0d156316 100644 --- a/k8s/awoooi-dev/02-configmap.yaml +++ b/k8s/awoooi-dev/02-configmap.yaml @@ -52,3 +52,4 @@ data: TG_GROUP_CUTOVER: "true" HERMES_NL_ENABLED: "false" ENABLE_12AGENT_CONSENSUS: "false" + PROMETHEUS_MULTIPROC_DIR: "/tmp/awoooi-prometheus-multiproc" diff --git a/k8s/awoooi-dev/04-deployment-api.yaml b/k8s/awoooi-dev/04-deployment-api.yaml index 3b29539a..e4db61ec 100644 --- a/k8s/awoooi-dev/04-deployment-api.yaml +++ b/k8s/awoooi-dev/04-deployment-api.yaml @@ -38,6 +38,12 @@ spec: name: awoooi-config - secretRef: name: awoooi-secrets + env: + - name: PROMETHEUS_MULTIPROC_DIR + value: "/tmp/awoooi-prometheus-multiproc" + volumeMounts: + - name: prometheus-multiproc + mountPath: /tmp/awoooi-prometheus-multiproc resources: requests: cpu: "100m" @@ -59,6 +65,9 @@ spec: initialDelaySeconds: 10 periodSeconds: 10 failureThreshold: 3 + volumes: + - name: prometheus-multiproc + emptyDir: {} --- apiVersion: v1 diff --git a/k8s/awoooi-prod/04-configmap.yaml b/k8s/awoooi-prod/04-configmap.yaml index c83c84c3..a9a83b45 100644 --- a/k8s/awoooi-prod/04-configmap.yaml +++ b/k8s/awoooi-prod/04-configmap.yaml @@ -132,6 +132,7 @@ data: SENTRY_MCP_ENABLED: "true" # Prometheus server 在 110:9090 (非 188) PROMETHEUS_URL: "http://192.168.0.110:9090" + PROMETHEUS_MULTIPROC_DIR: "/tmp/awoooi-prometheus-multiproc" # ============================================================================ # AIOps Phase 1-6 Feature Flags (2026-04-15 ogt: 全開,資料先全寫入 DB) diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml index b82a877c..a7b458c7 100644 --- a/k8s/awoooi-prod/06-deployment-api.yaml +++ b/k8s/awoooi-prod/06-deployment-api.yaml @@ -77,6 +77,8 @@ spec: value: "100" - name: AGENT_SOLVER_TIMEOUT_SEC value: "80" + - name: PROMETHEUS_MULTIPROC_DIR + value: "/tmp/awoooi-prometheus-multiproc" # 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用 volumeMounts: - name: repair-ssh-key @@ -104,6 +106,8 @@ spec: mountPath: /etc/ssh-mcp/known_hosts subPath: known_hosts readOnly: true + - name: prometheus-multiproc + mountPath: /tmp/awoooi-prometheus-multiproc resources: requests: cpu: "200m" @@ -169,6 +173,8 @@ spec: secretName: ssh-mcp-key defaultMode: 0400 optional: true + - name: prometheus-multiproc + emptyDir: {} --- apiVersion: v1