From dedb12085baa809bb10a580a927daa19cdb94b1d Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Sat, 2 May 2026 23:44:03 +0800
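Subject: [PATCH] chore(governance,watchdog): enrich alerts and enable
 prometheus multiproc

webhooks: store the bare kubectl command in the approval `action` field
(LLM results without a command fall back to "<title> | NO_ACTION"), move
the action title into `description`, and attach is_high_confidence /
policy_reason to auto-executed approvals and their log events.

watchdog: widen the KPI range named in the meta alert text from W-1~W-5
to W-1~W-6.

governance: restructure alert payloads into status / impact /
remediation / actionable sections, give every SLO result a `name` and a
`status` (ok / violated / skipped / error), and report error_count and
all_status in `_meta`.

k8s: set PROMETHEUS_MULTIPROC_DIR and mount an emptyDir at that path for
the dev and prod API deployments.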
---
 apps/api/src/api/v1/webhooks.py           |  52 ++++++--
 apps/api/src/jobs/ai_slo_watchdog_job.py  |  22 +--
 apps/api/src/services/governance_agent.py | 156 +++++++++++++++++-----
 k8s/awoooi-dev/02-configmap.yaml          |   1 +
 k8s/awoooi-dev/04-deployment-api.yaml     |   9 ++
 k8s/awoooi-prod/04-configmap.yaml         |   1 +
 k8s/awoooi-prod/06-deployment-api.yaml    |   6 +
 7 files changed, 191 insertions(+), 56 deletions(-)

diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py
index ef312579..983ece4a 100644
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -1093,9 +1093,10 @@ async def receive_alert(
                 "is_rule_based": False,
                 "playbook_id": None,
             }
+            _cmd_cs1 = (analysis_result.kubectl_command or "").strip()
             approval_create = ApprovalRequestCreate(
-                action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}",
-                description=f"[AI: {ai_provider}] {analysis_result.description}",
+                action=(_cmd_cs1 or f"{analysis_result.action_title} | NO_ACTION"),
+                description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
                 risk_level=risk_level,
                 blast_radius=BlastRadius(
                     affected_pods=blast.affected_pods,
@@ -1179,6 +1180,13 @@ async def receive_alert(
                         status=ApprovalStatus.APPROVED,
                         risk_level=risk_level.value,
                         matched_playbook_id=None,
+                        metadata={
+                            **_approval_metadata_cs1,
+                            "is_high_confidence": True,
+                            "policy_reason": _shadow_result.reason.value
+                            if hasattr(_shadow_result, "reason")
+                            else "cs1_auto_confident_execution",
+                        },
                     )
                     _cs1_auto_approval.id = approval.id
 
@@ -1194,13 +1202,19 @@ async def receive_alert(
                             error=str(_cs1_upd_err),
                         )
 
-                    logger.info(
-                        "llm_high_confidence_auto_executed",
-                        approval_id=str(approval.id),
-                        confidence=analysis_result.confidence,
-                        exec_success=_cs1_exec_success,
-                        action=_cs1_kubectl[:80],
-                    )
+                    logger.info(
+                        "llm_high_confidence_auto_executed",
+                        approval_id=str(approval.id),
+                        confidence=analysis_result.confidence,
+                        exec_success=_cs1_exec_success,
+                        action=_cs1_kubectl[:80],
+                        is_high_confidence=True,
+                        policy_reason=(
+                            _shadow_result.reason.value
+                            if hasattr(_shadow_result, "reason")
+                            else "cs1_auto_confident_execution"
+                        ),
+                    )
                 except Exception as _cs1_auto_err:
                     logger.warning(
                         "llm_high_confidence_auto_execute_failed",
@@ -1419,7 +1433,7 @@ async def _process_new_alert_background(
             rule_kubectl = str(rule_response.get("kubectl_command", "")).strip()
             rule_description = str(rule_response.get("description", message))
             rule_action = (
-                f"{rule_action_title} | {rule_kubectl}"
+                rule_kubectl
                 if rule_kubectl else
                 f"NO_ACTION - {rule_description[:120]}"
             )
@@ -1656,9 +1670,10 @@ async def _process_new_alert_background(
                 "is_rule_based": False,
                 "playbook_id": None,
             }
+            _cmd_cs3 = (analysis_result.kubectl_command or "").strip()
             approval_create = ApprovalRequestCreate(
-                action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}",
-                description=f"[AI: {ai_provider}] {analysis_result.description}",
+                action=(_cmd_cs3 or f"{analysis_result.action_title} | NO_ACTION"),
+                description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
                 risk_level=risk_level,
                 blast_radius=BlastRadius(
                     affected_pods=blast.affected_pods if blast else 1,
@@ -1719,6 +1734,13 @@ async def _process_new_alert_background(
                         status=ApprovalStatus.APPROVED,
                         risk_level=risk_level.value,
                         matched_playbook_id=None,
+                        metadata={
+                            **_approval_metadata_cs3,
+                            "is_high_confidence": True,
+                            "policy_reason": _shadow_result_cs3.reason.value
+                            if hasattr(_shadow_result_cs3, "reason")
+                            else "cs3_auto_confident_execution",
+                        },
                     )
                     _cs3_executor = ApprovalExecutionService()
                     _cs3_exec_success = await _cs3_executor.execute_approved_action(_cs3_auto_approval)
@@ -1729,6 +1751,12 @@ async def _process_new_alert_background(
                         confidence=analysis_result.confidence,
                         success=_cs3_exec_success,
                         provider=ai_provider,
+                        is_high_confidence=True,
+                        policy_reason=(
+                            _shadow_result_cs3.reason.value
+                            if hasattr(_shadow_result_cs3, "reason")
+                            else "cs3_auto_confident_execution"
+                        ),
                     )
                 except Exception as _cs3_exec_err:
                     logger.warning("cs3_llm_auto_execute_failed", error=str(_cs3_exec_err))
diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py
index f8b235de..93dcdbd0 100644
--- a/apps/api/src/jobs/ai_slo_watchdog_job.py
+++ b/apps/api/src/jobs/ai_slo_watchdog_job.py
@@ -144,19 +144,19 @@ async def _check_once() -> None:
     await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1")
 
     # 發送 TYPE-8M Meta-System 告警
-    diagnosis = " | ".join(violations)
-    incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
-    try:
-        from src.services.telegram_gateway import get_telegram_gateway
-        await get_telegram_gateway().send_meta_alert(
-            incident_id=incident_id,
+    diagnosis = " | ".join(violations)
+    incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
+    try:
+        from src.services.telegram_gateway import get_telegram_gateway
+        await get_telegram_gateway().send_meta_alert(
+            incident_id=incident_id,
             approval_id=str(uuid.uuid4()),
             alertname="AI 自健診異常",
-            alert_category="flywheel_health",
-            diagnosis=diagnosis,
-            severity_level="critical",
-            system_impact=f"{len(violations)} 項 KPI 異常（W-1~W-5），飛輪自動化能力可能降級",
-        )
+            alert_category="flywheel_health",
+            diagnosis=diagnosis,
+            severity_level="critical",
+            system_impact=f"{len(violations)} 項 KPI 異常（W-1~W-6），飛輪自動化能力可能降級",
+        )
         logger.warning(
             "ai_slo_watchdog_alert_sent",
             incident_id=incident_id,
diff --git a/apps/api/src/services/governance_agent.py b/apps/api/src/services/governance_agent.py
index a84802bc..267391f0 100644
--- a/apps/api/src/services/governance_agent.py
+++ b/apps/api/src/services/governance_agent.py
@@ -106,25 +106,42 @@ class GovernanceAgent:
                 else:
                     kept_ids.append(r.playbook_id)
 
-            if auto_deprecated_ids:
-                await db.commit()
-                logger.info(
-                    "governance_trust_drift_auto_deprecated",
-                    count=len(auto_deprecated_ids),
-                    ids=auto_deprecated_ids[:10],
-                )
+            if auto_deprecated_ids:
+                await db.commit()
+                logger.info(
+                    "governance_trust_drift_auto_deprecated",
+                    count=len(auto_deprecated_ids),
+                    ids=auto_deprecated_ids[:10],
+                )
 
         if drifted:
+            drift_ratio = len(drifted) / total if total > 0 else 0.0
             await self._alert(
                 "trust_drift",
                 {
-                    "drifted_count": len(drifted),
-                    "total_playbooks": total,
-                    "playbook_ids": kept_ids[:10],
-                    "auto_deprecated_count": len(auto_deprecated_ids),
-                    "auto_deprecated_ids": auto_deprecated_ids[:10],
-                    "threshold": TRUST_DRIFT_THRESHOLD,
-                    "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
+                    "status": "warning",
+                    "impact": {
+                        "drifted_count": len(drifted),
+                        "total_playbooks": total,
+                        "drift_ratio": round(drift_ratio, 3),
+                        "threshold": TRUST_DRIFT_THRESHOLD,
+                        "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
+                    },
+                    "remediation": {
+                        "items": [
+                            "Auto-deprecate low-trust stale playbooks",
+                            "Review candidate playbooks by impact scope and rollback if needed",
+                        ],
+                        "auto_deprecated_count": len(auto_deprecated_ids),
+                        "auto_deprecated_ids": auto_deprecated_ids[:10],
+                    },
+                    "actionable": {
+                        "items": [
+                            "立即補齊 PLAYBOOK_SOURCE 與 playbook_metadata",
+                            "必要時人工覆核 kept_ids 中的高風險 Playbook",
+                        ],
+                        "sample_playbook_ids": kept_ids[:10],
+                    },
                 },
             )
 
@@ -177,11 +194,27 @@ class GovernanceAgent:
             await self._alert(
                 "knowledge_degradation",
                 {
-                    "stale_count": stale,
-                    "total_count": total,
-                    "stale_ratio": round(ratio, 3),
-                    "threshold": KM_STALE_RATIO,
-                    "stale_days": KM_STALE_DAYS,
+                    "status": "warning",
+                    "impact": {
+                        "stale_count": stale,
+                        "total_count": total,
+                        "stale_ratio": round(ratio, 3),
+                        "threshold": KM_STALE_RATIO,
+                        "stale_days": KM_STALE_DAYS,
+                    },
+                    "remediation": {
+                        "items": [
+                            "啟動 KM 反查與自動補齊流程",
+                            "關鍵服務告警自動同步到 KM 任務，補齊缺失條目",
+                        ],
+                        "next_action": "run_kb_growth_healthcheck",
+                    },
+                    "actionable": {
+                        "items": [
+                            "每日檢查 ANTI_PATTERN 更新結果",
+                            "安排至少 2 位 owner 對 stale條目做快速人工審核",
+                        ],
+                    },
                 },
             )
 
@@ -348,9 +381,11 @@ class GovernanceAgent:
                         # 不可 fallback 0.0，否則必觸發 violated=True 噴假警報
                         if not result_list:
                             results[name] = {
+                                "name": name,
+                                "status": "skipped",
                                 "error": "no_data",
-                                "skipped": True,
                                 "reason": "prometheus_empty_result_metric_not_emitted",
+                                "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
                             }
                             logger.warning(
                                 "governance_slo_no_data",
@@ -365,9 +400,12 @@ class GovernanceAgent:
                         violated = value < threshold
 
                         results[name] = {
+                            "name": name,
+                            "status": "violated" if violated else "ok",
                             "value": round(value, 4),
                             "slo_target": target,
                             "hard_red_line": threshold,
+                            "gap": round(threshold - value, 4) if violated else round(value - target, 4),
                             "violated": violated,
                         }
 
@@ -375,11 +413,27 @@ class GovernanceAgent:
                             await self._alert(
                                 f"slo_{name}_violation",
                                 {
-                                    "slo_name": name,
-                                    "current_value": round(value, 4),
-                                    "hard_red_line": threshold,
-                                    "slo_target": target,
-                                    "gap": round(threshold - value, 4),
+                                    "status": "violation",
+                                    "impact": {
+                                        "name": name,
+                                        "value": round(value, 4),
+                                        "target": target,
+                                        "threshold": threshold,
+                                        "gap": round(threshold - value, 4),
+                                    },
+                                    "remediation": {
+                                        "items": [
+                                            "Pause auto-scaling or risky auto-fix tasks",
+                                            "Review evidence/decision traces and adjust policy thresholds",
+                                        ],
+                                        "next_action": "trigger_flywheel_safeguard",
+                                    },
+                                    "actionable": {
+                                        "items": [
+                                            "Check verifier lag and post-exec learning health",
+                                            "Run emergency incident audit on failed approvals",
+                                        ],
+                                    },
                                 },
                             )
                             logger.warning(
@@ -388,6 +442,12 @@ class GovernanceAgent:
                                 value=round(value, 4),
                                 hard_red_line=threshold,
                             )
+                        elif value == 0 and threshold <= 0:
+                            logger.warning(
+                                "governance_slo_unexpected_zero",
+                                slo=name,
+                                value=round(value, 4),
+                            )
                         else:
                             logger.info(
                                 "governance_slo_ok",
@@ -396,7 +456,12 @@ class GovernanceAgent:
                                 target=target,
                             )
                     else:
-                        results[name] = {"error": "prometheus_query_failed", "status": data.get("status")}
+                        results[name] = {
+                            "name": name,
+                            "status": "error",
+                            "error": "prometheus_query_failed",
+                            "response_status": data.get("status"),
+                        }
                         logger.warning(
                             "governance_slo_prometheus_error",
                             slo=name,
@@ -404,22 +469,30 @@ class GovernanceAgent:
                             response_status=data.get("status"),
                         )
                 except Exception as e:
-                    results[name] = {"error": str(e)}
+                    results[name] = {
+                        "name": name,
+                        "status": "error",
+                        "error": str(e),
+                    }
                     logger.warning("governance_slo_check_error", slo=name, error=str(e))
 
         # 2026-04-29 ogt + Claude Opus 4.7: critic M6 修
         # 加聚合 _meta 區分「全 skipped」(metric 未 emit) vs「全 ok」(SLO 健康)
         # 防止 dashboard 把 no_data 當 pass 顯示
-        violated_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("violated"))
-        skipped_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("skipped"))
+        violated_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "violated")
+        skipped_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "skipped")
         ok_count = sum(
             1 for v in results.values()
-            if isinstance(v, dict) and not v.get("violated") and not v.get("skipped") and "error" not in v
+            if isinstance(v, dict)
+            and v.get("status") == "ok"
         )
+        error_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "error")
         results["_meta"] = {
             "violated_count": violated_count,
             "skipped_count": skipped_count,
             "ok_count": ok_count,
+            "error_count": error_count,
+            "all_status": sorted({v.get("status") for v in results.values() if isinstance(v, dict)}),
             "all_skipped": skipped_count > 0 and ok_count == 0 and violated_count == 0,
             "status": (
                 "no_data" if (skipped_count > 0 and ok_count == 0 and violated_count == 0)
@@ -496,9 +569,26 @@ class GovernanceAgent:
                 await self._alert(
                     "governance_slo_data_gap",
                     {
-                        "reason": "all_slo_metrics_not_emitted",
-                        "skipped_count": slo_meta.get("skipped_count", 0),
-                        "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
+                        "status": "warning",
+                        "impact": {
+                            "reason": "all_slo_metrics_not_emitted",
+                            "skipped_count": slo_meta.get("skipped_count", 0),
+                            "all_slo_metrics_not_emitted": True,
+                        },
+                        "remediation": {
+                            "items": [
+                                "補齊 ADR-100 SLO emitter（automation_operation_log_total / post_execution_verification_total / km_entries_total）",
+                                "設置 PROMETHEUS_MULTIPROC_DIR 並掛載可寫目錄（如 emptyDir）",
+                            ],
+                            "next_action": "run_adr100_slo_emit_playbook",
+                            "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
+                        },
+                        "actionable": {
+                            "items": [
+                                "先確認所有 API Pod 是否有 PROMETHEUS_MULTIPROC_DIR 掛載",
+                                "檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
+                            ],
+                        },
                     },
                 )
             except Exception:
diff --git a/k8s/awoooi-dev/02-configmap.yaml b/k8s/awoooi-dev/02-configmap.yaml
index 10ddbd0e..0d156316 100644
--- a/k8s/awoooi-dev/02-configmap.yaml
+++ b/k8s/awoooi-dev/02-configmap.yaml
@@ -52,3 +52,4 @@ data:
   TG_GROUP_CUTOVER: "true"
   HERMES_NL_ENABLED: "false"
   ENABLE_12AGENT_CONSENSUS: "false"
+  PROMETHEUS_MULTIPROC_DIR: "/tmp/awoooi-prometheus-multiproc"
diff --git a/k8s/awoooi-dev/04-deployment-api.yaml b/k8s/awoooi-dev/04-deployment-api.yaml
index 3b29539a..e4db61ec 100644
--- a/k8s/awoooi-dev/04-deployment-api.yaml
+++ b/k8s/awoooi-dev/04-deployment-api.yaml
@@ -38,6 +38,12 @@ spec:
                 name: awoooi-config
             - secretRef:
                 name: awoooi-secrets
+          env:
+            - name: PROMETHEUS_MULTIPROC_DIR
+              value: "/tmp/awoooi-prometheus-multiproc"
+          volumeMounts:
+            - name: prometheus-multiproc
+              mountPath: /tmp/awoooi-prometheus-multiproc
           resources:
             requests:
               cpu: "100m"
@@ -59,6 +65,9 @@ spec:
             initialDelaySeconds: 10
             periodSeconds: 10
             failureThreshold: 3
+      volumes:
+        - name: prometheus-multiproc
+          emptyDir: {}
 
 ---
 apiVersion: v1
diff --git a/k8s/awoooi-prod/04-configmap.yaml b/k8s/awoooi-prod/04-configmap.yaml
index c83c84c3..a9a83b45 100644
--- a/k8s/awoooi-prod/04-configmap.yaml
+++ b/k8s/awoooi-prod/04-configmap.yaml
@@ -132,6 +132,7 @@ data:
   SENTRY_MCP_ENABLED: "true"
   # Prometheus server 在 110:9090 (非 188)
   PROMETHEUS_URL: "http://192.168.0.110:9090"
+  PROMETHEUS_MULTIPROC_DIR: "/tmp/awoooi-prometheus-multiproc"
 
   # ============================================================================
   # AIOps Phase 1-6 Feature Flags (2026-04-15 ogt: 全開，資料先全寫入 DB)
diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml
index b82a877c..a7b458c7 100644
--- a/k8s/awoooi-prod/06-deployment-api.yaml
+++ b/k8s/awoooi-prod/06-deployment-api.yaml
@@ -77,6 +77,8 @@ spec:
               value: "100"
             - name: AGENT_SOLVER_TIMEOUT_SEC
               value: "80"
+            - name: PROMETHEUS_MULTIPROC_DIR
+              value: "/tmp/awoooi-prometheus-multiproc"
           # 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用
           volumeMounts:
             - name: repair-ssh-key
@@ -104,6 +106,8 @@ spec:
               mountPath: /etc/ssh-mcp/known_hosts
               subPath: known_hosts
               readOnly: true
+            - name: prometheus-multiproc
+              mountPath: /tmp/awoooi-prometheus-multiproc
           resources:
             requests:
               cpu: "200m"
@@ -169,6 +173,8 @@ spec:
             secretName: ssh-mcp-key
             defaultMode: 0400
             optional: true
+        - name: prometheus-multiproc
+          emptyDir: {}
 
 ---
 apiVersion: v1