chore(governance,watchdog): enrich alerts and enable prometheus multiproc

2026-05-02 23:44:03 +08:00
parent b371edb70c
commit dedb12085b
7 changed files with 191 additions and 56 deletions
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -1093,9 +1093,10 @@ async def receive_alert(
                "is_rule_based": False,
                "playbook_id": None,
            }
+            _cmd_cs1 = (analysis_result.kubectl_command or "").strip()
            approval_create = ApprovalRequestCreate(
-                action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}",
-                description=f"[AI: {ai_provider}] {analysis_result.description}",
+                action=(_cmd_cs1 or f"{analysis_result.action_title} | NO_ACTION"),
+                description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
                risk_level=risk_level,
                blast_radius=BlastRadius(
                    affected_pods=blast.affected_pods,
@@ -1179,6 +1180,13 @@ async def receive_alert(
                        status=ApprovalStatus.APPROVED,
                        risk_level=risk_level.value,
                        matched_playbook_id=None,
+                        metadata={
+                            **_approval_metadata_cs1,
+                            "is_high_confidence": True,
+                            "policy_reason": _shadow_result.reason.value
+                            if hasattr(_shadow_result, "reason")
+                            else "cs1_auto_confident_execution",
+                        },
                    )
                    _cs1_auto_approval.id = approval.id

@@ -1194,13 +1202,19 @@ async def receive_alert(
                            error=str(_cs1_upd_err),
                        )

-                    logger.info(
-                        "llm_high_confidence_auto_executed",
-                        approval_id=str(approval.id),
-                        confidence=analysis_result.confidence,
-                        exec_success=_cs1_exec_success,
-                        action=_cs1_kubectl[:80],
-                    )
+                        logger.info(
+                            "llm_high_confidence_auto_executed",
+                            approval_id=str(approval.id),
+                            confidence=analysis_result.confidence,
+                            exec_success=_cs1_exec_success,
+                            action=_cs1_kubectl[:80],
+                            is_high_confidence=True,
+                            policy_reason=(
+                                _shadow_result.reason.value
+                                if hasattr(_shadow_result, "reason")
+                                else "cs1_auto_confident_execution"
+                            ),
+                        )
                except Exception as _cs1_auto_err:
                    logger.warning(
                        "llm_high_confidence_auto_execute_failed",
@@ -1419,7 +1433,7 @@ async def _process_new_alert_background(
            rule_kubectl = str(rule_response.get("kubectl_command", "")).strip()
            rule_description = str(rule_response.get("description", message))
            rule_action = (
-                f"{rule_action_title} | {rule_kubectl}"
+                rule_kubectl
                if rule_kubectl else
                f"NO_ACTION - {rule_description[:120]}"
            )
@@ -1656,9 +1670,10 @@ async def _process_new_alert_background(
                "is_rule_based": False,
                "playbook_id": None,
            }
+            _cmd_cs3 = (analysis_result.kubectl_command or "").strip()
            approval_create = ApprovalRequestCreate(
-                action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}",
-                description=f"[AI: {ai_provider}] {analysis_result.description}",
+                action=(_cmd_cs3 or f"{analysis_result.action_title} | NO_ACTION"),
+                description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
                risk_level=risk_level,
                blast_radius=BlastRadius(
                    affected_pods=blast.affected_pods if blast else 1,
@@ -1719,6 +1734,13 @@ async def _process_new_alert_background(
                        status=ApprovalStatus.APPROVED,
                        risk_level=risk_level.value,
                        matched_playbook_id=None,
+                        metadata={
+                            **_approval_metadata_cs3,
+                            "is_high_confidence": True,
+                            "policy_reason": _shadow_result_cs3.reason.value
+                            if hasattr(_shadow_result_cs3, "reason")
+                            else "cs3_auto_confident_execution",
+                        },
                    )
                    _cs3_executor = ApprovalExecutionService()
                    _cs3_exec_success = await _cs3_executor.execute_approved_action(_cs3_auto_approval)
@@ -1729,6 +1751,12 @@ async def _process_new_alert_background(
                        confidence=analysis_result.confidence,
                        success=_cs3_exec_success,
                        provider=ai_provider,
+                        is_high_confidence=True,
+                        policy_reason=(
+                            _shadow_result_cs3.reason.value
+                            if hasattr(_shadow_result_cs3, "reason")
+                            else "cs3_auto_confident_execution"
+                        ),
                    )
                except Exception as _cs3_exec_err:
                    logger.warning("cs3_llm_auto_execute_failed", error=str(_cs3_exec_err))
--- a/apps/api/src/jobs/ai_slo_watchdog_job.py
+++ b/apps/api/src/jobs/ai_slo_watchdog_job.py
@@ -144,19 +144,19 @@ async def _check_once() -> None:
    await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1")

    # 發送 TYPE-8M Meta-System 告警
-    diagnosis = " | ".join(violations)
-    incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
-    try:
-        from src.services.telegram_gateway import get_telegram_gateway
-        await get_telegram_gateway().send_meta_alert(
-            incident_id=incident_id,
+        diagnosis = " | ".join(violations)
+        incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
+        try:
+            from src.services.telegram_gateway import get_telegram_gateway
+            await get_telegram_gateway().send_meta_alert(
+                incident_id=incident_id,
            approval_id=str(uuid.uuid4()),
            alertname="AI 自健診異常",
-            alert_category="flywheel_health",
-            diagnosis=diagnosis,
-            severity_level="critical",
-            system_impact=f"{len(violations)} 項 KPI 異常（W-1~W-5），飛輪自動化能力可能降級",
-        )
+                alert_category="flywheel_health",
+                diagnosis=diagnosis,
+                severity_level="critical",
+                system_impact=f"{len(violations)} 項 KPI 異常（W-1~W-6），飛輪自動化能力可能降級",
+            )
        logger.warning(
            "ai_slo_watchdog_alert_sent",
            incident_id=incident_id,
--- a/apps/api/src/services/governance_agent.py
+++ b/apps/api/src/services/governance_agent.py
@@ -106,25 +106,42 @@ class GovernanceAgent:
                else:
                    kept_ids.append(r.playbook_id)

-            if auto_deprecated_ids:
-                await db.commit()
-                logger.info(
-                    "governance_trust_drift_auto_deprecated",
-                    count=len(auto_deprecated_ids),
-                    ids=auto_deprecated_ids[:10],
-                )
+        if auto_deprecated_ids:
+            await db.commit()
+            logger.info(
+                "governance_trust_drift_auto_deprecated",
+                count=len(auto_deprecated_ids),
+                ids=auto_deprecated_ids[:10],
+            )

        if drifted:
+            drift_ratio = len(drifted) / total if total > 0 else 0.0
            await self._alert(
                "trust_drift",
                {
-                    "drifted_count": len(drifted),
-                    "total_playbooks": total,
-                    "playbook_ids": kept_ids[:10],
-                    "auto_deprecated_count": len(auto_deprecated_ids),
-                    "auto_deprecated_ids": auto_deprecated_ids[:10],
-                    "threshold": TRUST_DRIFT_THRESHOLD,
-                    "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
+                    "status": "warning",
+                    "impact": {
+                        "drifted_count": len(drifted),
+                        "total_playbooks": total,
+                        "drift_ratio": round(drift_ratio, 3),
+                        "threshold": TRUST_DRIFT_THRESHOLD,
+                        "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
+                    },
+                    "remediation": {
+                        "items": [
+                            "Auto-deprecate low-trust stale playbooks",
+                            "Review candidate playbooks by impact scope and rollback if needed",
+                        ],
+                        "auto_deprecated_count": len(auto_deprecated_ids),
+                        "auto_deprecated_ids": auto_deprecated_ids[:10],
+                    },
+                    "actionable": {
+                        "items": [
+                            "立即補齊 PLAYBOOK_SOURCE 與 playbook_metadata",
+                            "必要時人工覆核 kept_ids 中的高風險 Playbook",
+                        ],
+                        "sample_playbook_ids": kept_ids[:10],
+                    },
                },
            )

@@ -177,11 +194,27 @@ class GovernanceAgent:
            await self._alert(
                "knowledge_degradation",
                {
-                    "stale_count": stale,
-                    "total_count": total,
-                    "stale_ratio": round(ratio, 3),
-                    "threshold": KM_STALE_RATIO,
-                    "stale_days": KM_STALE_DAYS,
+                    "status": "warning",
+                    "impact": {
+                        "stale_count": stale,
+                        "total_count": total,
+                        "stale_ratio": round(ratio, 3),
+                        "threshold": KM_STALE_RATIO,
+                        "stale_days": KM_STALE_DAYS,
+                    },
+                    "remediation": {
+                        "items": [
+                            "啟動 KM 反查與自動補齊流程",
+                            "關鍵服務告警自動同步到 KM 任務，補齊缺失條目",
+                        ],
+                        "next_action": "run_kb_growth_healthcheck",
+                    },
+                    "actionable": {
+                        "items": [
+                            "每日檢查 ANTI_PATTERN 更新結果",
+                            "安排至少 2 位 owner 對 stale條目做快速人工審核",
+                        ],
+                    },
                },
            )

@@ -348,9 +381,11 @@ class GovernanceAgent:
                        # 不可 fallback 0.0，否則必觸發 violated=True 噴假警報
                        if not result_list:
                            results[name] = {
+                                "name": name,
+                                "status": "skipped",
                                "error": "no_data",
-                                "skipped": True,
                                "reason": "prometheus_empty_result_metric_not_emitted",
+                                "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
                            }
                            logger.warning(
                                "governance_slo_no_data",
@@ -365,9 +400,12 @@ class GovernanceAgent:
                        violated = value < threshold

                        results[name] = {
+                            "name": name,
+                            "status": "violated" if violated else "ok",
                            "value": round(value, 4),
                            "slo_target": target,
                            "hard_red_line": threshold,
+                            "gap": round(threshold - value, 4) if violated else round(value - target, 4),
                            "violated": violated,
                        }

@@ -375,11 +413,27 @@ class GovernanceAgent:
                            await self._alert(
                                f"slo_{name}_violation",
                                {
-                                    "slo_name": name,
-                                    "current_value": round(value, 4),
-                                    "hard_red_line": threshold,
-                                    "slo_target": target,
-                                    "gap": round(threshold - value, 4),
+                                    "status": "violation",
+                                    "impact": {
+                                        "name": name,
+                                        "value": round(value, 4),
+                                        "target": target,
+                                        "threshold": threshold,
+                                        "gap": round(threshold - value, 4),
+                                    },
+                                    "remediation": {
+                                        "items": [
+                                            "Pause auto-scaling or risky auto-fix tasks",
+                                            "Review evidence/decision traces and adjust policy thresholds",
+                                        ],
+                                        "next_action": "trigger_flywheel_safeguard",
+                                    },
+                                    "actionable": {
+                                        "items": [
+                                            "Check verifier lag and post-exec learning health",
+                                            "Run emergency incident audit on failed approvals",
+                                        ],
+                                    },
                                },
                            )
                            logger.warning(
@@ -388,6 +442,12 @@ class GovernanceAgent:
                                value=round(value, 4),
                                hard_red_line=threshold,
                            )
+                        elif value == 0 and threshold <= 0:
+                            logger.warning(
+                                "governance_slo_unexpected_zero",
+                                slo=name,
+                                value=round(value, 4),
+                            )
                        else:
                            logger.info(
                                "governance_slo_ok",
@@ -396,7 +456,12 @@ class GovernanceAgent:
                                target=target,
                            )
                    else:
-                        results[name] = {"error": "prometheus_query_failed", "status": data.get("status")}
+                        results[name] = {
+                            "name": name,
+                            "status": "error",
+                            "error": "prometheus_query_failed",
+                            "response_status": data.get("status"),
+                        }
                        logger.warning(
                            "governance_slo_prometheus_error",
                            slo=name,
@@ -404,22 +469,30 @@ class GovernanceAgent:
                            response_status=data.get("status"),
                        )
                except Exception as e:
-                    results[name] = {"error": str(e)}
+                    results[name] = {
+                        "name": name,
+                        "status": "error",
+                        "error": str(e),
+                    }
                    logger.warning("governance_slo_check_error", slo=name, error=str(e))

        # 2026-04-29 ogt + Claude Opus 4.7: critic M6 修
        # 加聚合 _meta 區分「全 skipped」(metric 未 emit) vs「全 ok」(SLO 健康)
        # 防止 dashboard 把 no_data 當 pass 顯示
-        violated_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("violated"))
-        skipped_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("skipped"))
+        violated_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "violated")
+        skipped_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "skipped")
        ok_count = sum(
            1 for v in results.values()
-            if isinstance(v, dict) and not v.get("violated") and not v.get("skipped") and "error" not in v
+            if isinstance(v, dict)
+            and v.get("status") == "ok"
        )
+        error_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "error")
        results["_meta"] = {
            "violated_count": violated_count,
            "skipped_count": skipped_count,
            "ok_count": ok_count,
+            "error_count": error_count,
+            "all_status": sorted({v.get("status") for v in results.values() if isinstance(v, dict)}),
            "all_skipped": skipped_count > 0 and ok_count == 0 and violated_count == 0,
            "status": (
                "no_data" if (skipped_count > 0 and ok_count == 0 and violated_count == 0)
@@ -496,9 +569,26 @@ class GovernanceAgent:
                await self._alert(
                    "governance_slo_data_gap",
                    {
-                        "reason": "all_slo_metrics_not_emitted",
-                        "skipped_count": slo_meta.get("skipped_count", 0),
-                        "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
+                        "status": "warning",
+                        "impact": {
+                            "reason": "all_slo_metrics_not_emitted",
+                            "skipped_count": slo_meta.get("skipped_count", 0),
+                            "all_slo_metrics_not_emitted": True,
+                        },
+                        "remediation": {
+                            "items": [
+                                "補齊 ADR-100 SLO emitter（automation_operation_log_total / post_execution_verification_total / km_entries_total）",
+                                "設置 PROMETHEUS_MULTIPROC_DIR 並掛載可寫目錄（如 emptyDir）",
+                            ],
+                            "next_action": "run_adr100_slo_emit_playbook",
+                            "hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
+                        },
+                        "actionable": {
+                            "items": [
+                                "先確認所有 API Pod 是否有 PROMETHEUS_MULTIPROC_DIR 掛載",
+                                "檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
+                            ],
+                        },
                    },
                )
            except Exception:
--- a/k8s/awoooi-dev/02-configmap.yaml
+++ b/k8s/awoooi-dev/02-configmap.yaml
@@ -52,3 +52,4 @@ data:
  TG_GROUP_CUTOVER: "true"
  HERMES_NL_ENABLED: "false"
  ENABLE_12AGENT_CONSENSUS: "false"
+  PROMETHEUS_MULTIPROC_DIR: "/tmp/awoooi-prometheus-multiproc"
--- a/k8s/awoooi-dev/04-deployment-api.yaml
+++ b/k8s/awoooi-dev/04-deployment-api.yaml
@@ -38,6 +38,12 @@ spec:
                name: awoooi-config
            - secretRef:
                name: awoooi-secrets
+          env:
+            - name: PROMETHEUS_MULTIPROC_DIR
+              value: "/tmp/awoooi-prometheus-multiproc"
+          volumeMounts:
+            - name: prometheus-multiproc
+              mountPath: /tmp/awoooi-prometheus-multiproc
          resources:
            requests:
              cpu: "100m"
@@ -59,6 +65,9 @@ spec:
            initialDelaySeconds: 10
            periodSeconds: 10
            failureThreshold: 3
+      volumes:
+        - name: prometheus-multiproc
+          emptyDir: {}

 ---
 apiVersion: v1
--- a/k8s/awoooi-prod/04-configmap.yaml
+++ b/k8s/awoooi-prod/04-configmap.yaml
@@ -132,6 +132,7 @@ data:
  SENTRY_MCP_ENABLED: "true"
  # Prometheus server 在 110:9090 (非 188)
  PROMETHEUS_URL: "http://192.168.0.110:9090"
+  PROMETHEUS_MULTIPROC_DIR: "/tmp/awoooi-prometheus-multiproc"

  # ============================================================================
  # AIOps Phase 1-6 Feature Flags (2026-04-15 ogt: 全開，資料先全寫入 DB)
--- a/k8s/awoooi-prod/06-deployment-api.yaml
+++ b/k8s/awoooi-prod/06-deployment-api.yaml
@@ -77,6 +77,8 @@ spec:
              value: "100"
            - name: AGENT_SOLVER_TIMEOUT_SEC
              value: "80"
+            - name: PROMETHEUS_MULTIPROC_DIR
+              value: "/tmp/awoooi-prometheus-multiproc"
          # 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用
          volumeMounts:
            - name: repair-ssh-key
@@ -104,6 +106,8 @@ spec:
              mountPath: /etc/ssh-mcp/known_hosts
              subPath: known_hosts
              readOnly: true
+            - name: prometheus-multiproc
+              mountPath: /tmp/awoooi-prometheus-multiproc
          resources:
            requests:
              cpu: "200m"
@@ -169,6 +173,8 @@ spec:
            secretName: ssh-mcp-key
            defaultMode: 0400
            optional: true
+        - name: prometheus-multiproc
+          emptyDir: {}

 ---
 apiVersion: v1