chore(governance,watchdog): enrich alerts and enable prometheus multiproc
Some checks failed
CD Pipeline / tests (push) Failing after 1m22s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 43s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 57s

This commit is contained in:
Your Name
2026-05-02 23:44:03 +08:00
parent b371edb70c
commit dedb12085b
7 changed files with 191 additions and 56 deletions

View File

@@ -1093,9 +1093,10 @@ async def receive_alert(
"is_rule_based": False,
"playbook_id": None,
}
_cmd_cs1 = (analysis_result.kubectl_command or "").strip()
approval_create = ApprovalRequestCreate(
action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}",
description=f"[AI: {ai_provider}] {analysis_result.description}",
action=(_cmd_cs1 or f"{analysis_result.action_title} | NO_ACTION"),
description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
risk_level=risk_level,
blast_radius=BlastRadius(
affected_pods=blast.affected_pods,
@@ -1179,6 +1180,13 @@ async def receive_alert(
status=ApprovalStatus.APPROVED,
risk_level=risk_level.value,
matched_playbook_id=None,
metadata={
**_approval_metadata_cs1,
"is_high_confidence": True,
"policy_reason": _shadow_result.reason.value
if hasattr(_shadow_result, "reason")
else "cs1_auto_confident_execution",
},
)
_cs1_auto_approval.id = approval.id
@@ -1194,13 +1202,19 @@ async def receive_alert(
error=str(_cs1_upd_err),
)
logger.info(
"llm_high_confidence_auto_executed",
approval_id=str(approval.id),
confidence=analysis_result.confidence,
exec_success=_cs1_exec_success,
action=_cs1_kubectl[:80],
)
logger.info(
"llm_high_confidence_auto_executed",
approval_id=str(approval.id),
confidence=analysis_result.confidence,
exec_success=_cs1_exec_success,
action=_cs1_kubectl[:80],
is_high_confidence=True,
policy_reason=(
_shadow_result.reason.value
if hasattr(_shadow_result, "reason")
else "cs1_auto_confident_execution"
),
)
except Exception as _cs1_auto_err:
logger.warning(
"llm_high_confidence_auto_execute_failed",
@@ -1419,7 +1433,7 @@ async def _process_new_alert_background(
rule_kubectl = str(rule_response.get("kubectl_command", "")).strip()
rule_description = str(rule_response.get("description", message))
rule_action = (
f"{rule_action_title} | {rule_kubectl}"
rule_kubectl
if rule_kubectl else
f"NO_ACTION - {rule_description[:120]}"
)
@@ -1656,9 +1670,10 @@ async def _process_new_alert_background(
"is_rule_based": False,
"playbook_id": None,
}
_cmd_cs3 = (analysis_result.kubectl_command or "").strip()
approval_create = ApprovalRequestCreate(
action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}",
description=f"[AI: {ai_provider}] {analysis_result.description}",
action=(_cmd_cs3 or f"{analysis_result.action_title} | NO_ACTION"),
description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
risk_level=risk_level,
blast_radius=BlastRadius(
affected_pods=blast.affected_pods if blast else 1,
@@ -1719,6 +1734,13 @@ async def _process_new_alert_background(
status=ApprovalStatus.APPROVED,
risk_level=risk_level.value,
matched_playbook_id=None,
metadata={
**_approval_metadata_cs3,
"is_high_confidence": True,
"policy_reason": _shadow_result_cs3.reason.value
if hasattr(_shadow_result_cs3, "reason")
else "cs3_auto_confident_execution",
},
)
_cs3_executor = ApprovalExecutionService()
_cs3_exec_success = await _cs3_executor.execute_approved_action(_cs3_auto_approval)
@@ -1729,6 +1751,12 @@ async def _process_new_alert_background(
confidence=analysis_result.confidence,
success=_cs3_exec_success,
provider=ai_provider,
is_high_confidence=True,
policy_reason=(
_shadow_result_cs3.reason.value
if hasattr(_shadow_result_cs3, "reason")
else "cs3_auto_confident_execution"
),
)
except Exception as _cs3_exec_err:
logger.warning("cs3_llm_auto_execute_failed", error=str(_cs3_exec_err))

View File

@@ -144,19 +144,19 @@ async def _check_once() -> None:
await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1")
# 發送 TYPE-8M Meta-System 告警
diagnosis = " | ".join(violations)
incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
try:
from src.services.telegram_gateway import get_telegram_gateway
await get_telegram_gateway().send_meta_alert(
incident_id=incident_id,
diagnosis = " | ".join(violations)
incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
try:
from src.services.telegram_gateway import get_telegram_gateway
await get_telegram_gateway().send_meta_alert(
incident_id=incident_id,
approval_id=str(uuid.uuid4()),
alertname="AI 自健診異常",
alert_category="flywheel_health",
diagnosis=diagnosis,
severity_level="critical",
system_impact=f"{len(violations)} 項 KPI 異常(W-1~W-5),飛輪自動化能力可能降級",
)
alert_category="flywheel_health",
diagnosis=diagnosis,
severity_level="critical",
system_impact=f"{len(violations)} 項 KPI 異常(W-1~W-6),飛輪自動化能力可能降級",
)
logger.warning(
"ai_slo_watchdog_alert_sent",
incident_id=incident_id,

View File

@@ -106,25 +106,42 @@ class GovernanceAgent:
else:
kept_ids.append(r.playbook_id)
if auto_deprecated_ids:
await db.commit()
logger.info(
"governance_trust_drift_auto_deprecated",
count=len(auto_deprecated_ids),
ids=auto_deprecated_ids[:10],
)
if auto_deprecated_ids:
await db.commit()
logger.info(
"governance_trust_drift_auto_deprecated",
count=len(auto_deprecated_ids),
ids=auto_deprecated_ids[:10],
)
if drifted:
drift_ratio = len(drifted) / total if total > 0 else 0.0
await self._alert(
"trust_drift",
{
"drifted_count": len(drifted),
"total_playbooks": total,
"playbook_ids": kept_ids[:10],
"auto_deprecated_count": len(auto_deprecated_ids),
"auto_deprecated_ids": auto_deprecated_ids[:10],
"threshold": TRUST_DRIFT_THRESHOLD,
"auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
"status": "warning",
"impact": {
"drifted_count": len(drifted),
"total_playbooks": total,
"drift_ratio": round(drift_ratio, 3),
"threshold": TRUST_DRIFT_THRESHOLD,
"auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
},
"remediation": {
"items": [
"Auto-deprecate low-trust stale playbooks",
"Review candidate playbooks by impact scope and rollback if needed",
],
"auto_deprecated_count": len(auto_deprecated_ids),
"auto_deprecated_ids": auto_deprecated_ids[:10],
},
"actionable": {
"items": [
"立即補齊 PLAYBOOK_SOURCE 與 playbook_metadata",
"必要時人工覆核 kept_ids 中的高風險 Playbook",
],
"sample_playbook_ids": kept_ids[:10],
},
},
)
@@ -177,11 +194,27 @@ class GovernanceAgent:
await self._alert(
"knowledge_degradation",
{
"stale_count": stale,
"total_count": total,
"stale_ratio": round(ratio, 3),
"threshold": KM_STALE_RATIO,
"stale_days": KM_STALE_DAYS,
"status": "warning",
"impact": {
"stale_count": stale,
"total_count": total,
"stale_ratio": round(ratio, 3),
"threshold": KM_STALE_RATIO,
"stale_days": KM_STALE_DAYS,
},
"remediation": {
"items": [
"啟動 KM 反查與自動補齊流程",
"關鍵服務告警自動同步到 KM 任務,補齊缺失條目",
],
"next_action": "run_kb_growth_healthcheck",
},
"actionable": {
"items": [
"每日檢查 ANTI_PATTERN 更新結果",
"安排至少 2 位 owner 對 stale條目做快速人工審核",
],
},
},
)
@@ -348,9 +381,11 @@ class GovernanceAgent:
# 不可 fallback 0.0,否則必觸發 violated=True 噴假警報
if not result_list:
results[name] = {
"name": name,
"status": "skipped",
"error": "no_data",
"skipped": True,
"reason": "prometheus_empty_result_metric_not_emitted",
"hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
}
logger.warning(
"governance_slo_no_data",
@@ -365,9 +400,12 @@ class GovernanceAgent:
violated = value < threshold
results[name] = {
"name": name,
"status": "violated" if violated else "ok",
"value": round(value, 4),
"slo_target": target,
"hard_red_line": threshold,
"gap": round(threshold - value, 4) if violated else round(value - target, 4),
"violated": violated,
}
@@ -375,11 +413,27 @@ class GovernanceAgent:
await self._alert(
f"slo_{name}_violation",
{
"slo_name": name,
"current_value": round(value, 4),
"hard_red_line": threshold,
"slo_target": target,
"gap": round(threshold - value, 4),
"status": "violation",
"impact": {
"name": name,
"value": round(value, 4),
"target": target,
"threshold": threshold,
"gap": round(threshold - value, 4),
},
"remediation": {
"items": [
"Pause auto-scaling or risky auto-fix tasks",
"Review evidence/decision traces and adjust policy thresholds",
],
"next_action": "trigger_flywheel_safeguard",
},
"actionable": {
"items": [
"Check verifier lag and post-exec learning health",
"Run emergency incident audit on failed approvals",
],
},
},
)
logger.warning(
@@ -388,6 +442,12 @@ class GovernanceAgent:
value=round(value, 4),
hard_red_line=threshold,
)
elif value == 0 and threshold <= 0:
logger.warning(
"governance_slo_unexpected_zero",
slo=name,
value=round(value, 4),
)
else:
logger.info(
"governance_slo_ok",
@@ -396,7 +456,12 @@ class GovernanceAgent:
target=target,
)
else:
results[name] = {"error": "prometheus_query_failed", "status": data.get("status")}
results[name] = {
"name": name,
"status": "error",
"error": "prometheus_query_failed",
"response_status": data.get("status"),
}
logger.warning(
"governance_slo_prometheus_error",
slo=name,
@@ -404,22 +469,30 @@ class GovernanceAgent:
response_status=data.get("status"),
)
except Exception as e:
results[name] = {"error": str(e)}
results[name] = {
"name": name,
"status": "error",
"error": str(e),
}
logger.warning("governance_slo_check_error", slo=name, error=str(e))
# 2026-04-29 ogt + Claude Opus 4.7: critic M6 修
# 加聚合 _meta 區分「全 skipped」(metric 未 emit) vs「全 ok」(SLO 健康)
# 防止 dashboard 把 no_data 當 pass 顯示
violated_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("violated"))
skipped_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("skipped"))
violated_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "violated")
skipped_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "skipped")
ok_count = sum(
1 for v in results.values()
if isinstance(v, dict) and not v.get("violated") and not v.get("skipped") and "error" not in v
if isinstance(v, dict)
and v.get("status") == "ok"
)
error_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "error")
results["_meta"] = {
"violated_count": violated_count,
"skipped_count": skipped_count,
"ok_count": ok_count,
"error_count": error_count,
"all_status": sorted({v.get("status") for v in results.values() if isinstance(v, dict)}),
"all_skipped": skipped_count > 0 and ok_count == 0 and violated_count == 0,
"status": (
"no_data" if (skipped_count > 0 and ok_count == 0 and violated_count == 0)
@@ -496,9 +569,26 @@ class GovernanceAgent:
await self._alert(
"governance_slo_data_gap",
{
"reason": "all_slo_metrics_not_emitted",
"skipped_count": slo_meta.get("skipped_count", 0),
"hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
"status": "warning",
"impact": {
"reason": "all_slo_metrics_not_emitted",
"skipped_count": slo_meta.get("skipped_count", 0),
"all_slo_metrics_not_emitted": True,
},
"remediation": {
"items": [
"補齊 ADR-100 SLO emitter(automation_operation_log_total / post_execution_verification_total / km_entries_total)",
"設置 PROMETHEUS_MULTIPROC_DIR 並掛載可寫目錄(如 emptyDir)",
],
"next_action": "run_adr100_slo_emit_playbook",
"hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
},
"actionable": {
"items": [
"先確認所有 API Pod 是否有 PROMETHEUS_MULTIPROC_DIR 掛載",
"檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
],
},
},
)
except Exception:

View File

@@ -52,3 +52,4 @@ data:
TG_GROUP_CUTOVER: "true"
HERMES_NL_ENABLED: "false"
ENABLE_12AGENT_CONSENSUS: "false"
PROMETHEUS_MULTIPROC_DIR: "/tmp/awoooi-prometheus-multiproc"

View File

@@ -38,6 +38,12 @@ spec:
name: awoooi-config
- secretRef:
name: awoooi-secrets
env:
- name: PROMETHEUS_MULTIPROC_DIR
value: "/tmp/awoooi-prometheus-multiproc"
volumeMounts:
- name: prometheus-multiproc
mountPath: /tmp/awoooi-prometheus-multiproc
resources:
requests:
cpu: "100m"
@@ -59,6 +65,9 @@ spec:
initialDelaySeconds: 10
periodSeconds: 10
failureThreshold: 3
volumes:
- name: prometheus-multiproc
emptyDir: {}
---
apiVersion: v1

View File

@@ -132,6 +132,7 @@ data:
SENTRY_MCP_ENABLED: "true"
# Prometheus server 在 110:9090 (非 188)
PROMETHEUS_URL: "http://192.168.0.110:9090"
PROMETHEUS_MULTIPROC_DIR: "/tmp/awoooi-prometheus-multiproc"
# ============================================================================
# AIOps Phase 1-6 Feature Flags (2026-04-15 ogt: 全開,資料先全寫入 DB)

View File

@@ -77,6 +77,8 @@ spec:
value: "100"
- name: AGENT_SOLVER_TIMEOUT_SEC
value: "80"
- name: PROMETHEUS_MULTIPROC_DIR
value: "/tmp/awoooi-prometheus-multiproc"
# 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用
volumeMounts:
- name: repair-ssh-key
@@ -104,6 +106,8 @@ spec:
mountPath: /etc/ssh-mcp/known_hosts
subPath: known_hosts
readOnly: true
- name: prometheus-multiproc
mountPath: /tmp/awoooi-prometheus-multiproc
resources:
requests:
cpu: "200m"
@@ -169,6 +173,8 @@ spec:
secretName: ssh-mcp-key
defaultMode: 0400
optional: true
- name: prometheus-multiproc
emptyDir: {}
---
apiVersion: v1