chore(governance,watchdog): enrich alerts and enable prometheus multiproc
Some checks failed
CD Pipeline / tests (push) Failing after 1m22s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 43s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 57s
Some checks failed
CD Pipeline / tests (push) Failing after 1m22s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 43s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 57s
This commit is contained in:
@@ -1093,9 +1093,10 @@ async def receive_alert(
|
||||
"is_rule_based": False,
|
||||
"playbook_id": None,
|
||||
}
|
||||
_cmd_cs1 = (analysis_result.kubectl_command or "").strip()
|
||||
approval_create = ApprovalRequestCreate(
|
||||
action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}",
|
||||
description=f"[AI: {ai_provider}] {analysis_result.description}",
|
||||
action=(_cmd_cs1 or f"{analysis_result.action_title} | NO_ACTION"),
|
||||
description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
|
||||
risk_level=risk_level,
|
||||
blast_radius=BlastRadius(
|
||||
affected_pods=blast.affected_pods,
|
||||
@@ -1179,6 +1180,13 @@ async def receive_alert(
|
||||
status=ApprovalStatus.APPROVED,
|
||||
risk_level=risk_level.value,
|
||||
matched_playbook_id=None,
|
||||
metadata={
|
||||
**_approval_metadata_cs1,
|
||||
"is_high_confidence": True,
|
||||
"policy_reason": _shadow_result.reason.value
|
||||
if hasattr(_shadow_result, "reason")
|
||||
else "cs1_auto_confident_execution",
|
||||
},
|
||||
)
|
||||
_cs1_auto_approval.id = approval.id
|
||||
|
||||
@@ -1194,13 +1202,19 @@ async def receive_alert(
|
||||
error=str(_cs1_upd_err),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"llm_high_confidence_auto_executed",
|
||||
approval_id=str(approval.id),
|
||||
confidence=analysis_result.confidence,
|
||||
exec_success=_cs1_exec_success,
|
||||
action=_cs1_kubectl[:80],
|
||||
)
|
||||
logger.info(
|
||||
"llm_high_confidence_auto_executed",
|
||||
approval_id=str(approval.id),
|
||||
confidence=analysis_result.confidence,
|
||||
exec_success=_cs1_exec_success,
|
||||
action=_cs1_kubectl[:80],
|
||||
is_high_confidence=True,
|
||||
policy_reason=(
|
||||
_shadow_result.reason.value
|
||||
if hasattr(_shadow_result, "reason")
|
||||
else "cs1_auto_confident_execution"
|
||||
),
|
||||
)
|
||||
except Exception as _cs1_auto_err:
|
||||
logger.warning(
|
||||
"llm_high_confidence_auto_execute_failed",
|
||||
@@ -1419,7 +1433,7 @@ async def _process_new_alert_background(
|
||||
rule_kubectl = str(rule_response.get("kubectl_command", "")).strip()
|
||||
rule_description = str(rule_response.get("description", message))
|
||||
rule_action = (
|
||||
f"{rule_action_title} | {rule_kubectl}"
|
||||
rule_kubectl
|
||||
if rule_kubectl else
|
||||
f"NO_ACTION - {rule_description[:120]}"
|
||||
)
|
||||
@@ -1656,9 +1670,10 @@ async def _process_new_alert_background(
|
||||
"is_rule_based": False,
|
||||
"playbook_id": None,
|
||||
}
|
||||
_cmd_cs3 = (analysis_result.kubectl_command or "").strip()
|
||||
approval_create = ApprovalRequestCreate(
|
||||
action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}",
|
||||
description=f"[AI: {ai_provider}] {analysis_result.description}",
|
||||
action=(_cmd_cs3 or f"{analysis_result.action_title} | NO_ACTION"),
|
||||
description=f"[AI: {ai_provider}] {analysis_result.action_title} | {analysis_result.description}",
|
||||
risk_level=risk_level,
|
||||
blast_radius=BlastRadius(
|
||||
affected_pods=blast.affected_pods if blast else 1,
|
||||
@@ -1719,6 +1734,13 @@ async def _process_new_alert_background(
|
||||
status=ApprovalStatus.APPROVED,
|
||||
risk_level=risk_level.value,
|
||||
matched_playbook_id=None,
|
||||
metadata={
|
||||
**_approval_metadata_cs3,
|
||||
"is_high_confidence": True,
|
||||
"policy_reason": _shadow_result_cs3.reason.value
|
||||
if hasattr(_shadow_result_cs3, "reason")
|
||||
else "cs3_auto_confident_execution",
|
||||
},
|
||||
)
|
||||
_cs3_executor = ApprovalExecutionService()
|
||||
_cs3_exec_success = await _cs3_executor.execute_approved_action(_cs3_auto_approval)
|
||||
@@ -1729,6 +1751,12 @@ async def _process_new_alert_background(
|
||||
confidence=analysis_result.confidence,
|
||||
success=_cs3_exec_success,
|
||||
provider=ai_provider,
|
||||
is_high_confidence=True,
|
||||
policy_reason=(
|
||||
_shadow_result_cs3.reason.value
|
||||
if hasattr(_shadow_result_cs3, "reason")
|
||||
else "cs3_auto_confident_execution"
|
||||
),
|
||||
)
|
||||
except Exception as _cs3_exec_err:
|
||||
logger.warning("cs3_llm_auto_execute_failed", error=str(_cs3_exec_err))
|
||||
|
||||
@@ -144,19 +144,19 @@ async def _check_once() -> None:
|
||||
await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1")
|
||||
|
||||
# 發送 TYPE-8M Meta-System 告警
|
||||
diagnosis = " | ".join(violations)
|
||||
incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
|
||||
try:
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
await get_telegram_gateway().send_meta_alert(
|
||||
incident_id=incident_id,
|
||||
diagnosis = " | ".join(violations)
|
||||
incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
|
||||
try:
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
await get_telegram_gateway().send_meta_alert(
|
||||
incident_id=incident_id,
|
||||
approval_id=str(uuid.uuid4()),
|
||||
alertname="AI 自健診異常",
|
||||
alert_category="flywheel_health",
|
||||
diagnosis=diagnosis,
|
||||
severity_level="critical",
|
||||
system_impact=f"{len(violations)} 項 KPI 異常(W-1~W-5),飛輪自動化能力可能降級",
|
||||
)
|
||||
alert_category="flywheel_health",
|
||||
diagnosis=diagnosis,
|
||||
severity_level="critical",
|
||||
system_impact=f"{len(violations)} 項 KPI 異常(W-1~W-6),飛輪自動化能力可能降級",
|
||||
)
|
||||
logger.warning(
|
||||
"ai_slo_watchdog_alert_sent",
|
||||
incident_id=incident_id,
|
||||
|
||||
@@ -106,25 +106,42 @@ class GovernanceAgent:
|
||||
else:
|
||||
kept_ids.append(r.playbook_id)
|
||||
|
||||
if auto_deprecated_ids:
|
||||
await db.commit()
|
||||
logger.info(
|
||||
"governance_trust_drift_auto_deprecated",
|
||||
count=len(auto_deprecated_ids),
|
||||
ids=auto_deprecated_ids[:10],
|
||||
)
|
||||
if auto_deprecated_ids:
|
||||
await db.commit()
|
||||
logger.info(
|
||||
"governance_trust_drift_auto_deprecated",
|
||||
count=len(auto_deprecated_ids),
|
||||
ids=auto_deprecated_ids[:10],
|
||||
)
|
||||
|
||||
if drifted:
|
||||
drift_ratio = len(drifted) / total if total > 0 else 0.0
|
||||
await self._alert(
|
||||
"trust_drift",
|
||||
{
|
||||
"drifted_count": len(drifted),
|
||||
"total_playbooks": total,
|
||||
"playbook_ids": kept_ids[:10],
|
||||
"auto_deprecated_count": len(auto_deprecated_ids),
|
||||
"auto_deprecated_ids": auto_deprecated_ids[:10],
|
||||
"threshold": TRUST_DRIFT_THRESHOLD,
|
||||
"auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
|
||||
"status": "warning",
|
||||
"impact": {
|
||||
"drifted_count": len(drifted),
|
||||
"total_playbooks": total,
|
||||
"drift_ratio": round(drift_ratio, 3),
|
||||
"threshold": TRUST_DRIFT_THRESHOLD,
|
||||
"auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
|
||||
},
|
||||
"remediation": {
|
||||
"items": [
|
||||
"Auto-deprecate low-trust stale playbooks",
|
||||
"Review candidate playbooks by impact scope and rollback if needed",
|
||||
],
|
||||
"auto_deprecated_count": len(auto_deprecated_ids),
|
||||
"auto_deprecated_ids": auto_deprecated_ids[:10],
|
||||
},
|
||||
"actionable": {
|
||||
"items": [
|
||||
"立即補齊 PLAYBOOK_SOURCE 與 playbook_metadata",
|
||||
"必要時人工覆核 kept_ids 中的高風險 Playbook",
|
||||
],
|
||||
"sample_playbook_ids": kept_ids[:10],
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
@@ -177,11 +194,27 @@ class GovernanceAgent:
|
||||
await self._alert(
|
||||
"knowledge_degradation",
|
||||
{
|
||||
"stale_count": stale,
|
||||
"total_count": total,
|
||||
"stale_ratio": round(ratio, 3),
|
||||
"threshold": KM_STALE_RATIO,
|
||||
"stale_days": KM_STALE_DAYS,
|
||||
"status": "warning",
|
||||
"impact": {
|
||||
"stale_count": stale,
|
||||
"total_count": total,
|
||||
"stale_ratio": round(ratio, 3),
|
||||
"threshold": KM_STALE_RATIO,
|
||||
"stale_days": KM_STALE_DAYS,
|
||||
},
|
||||
"remediation": {
|
||||
"items": [
|
||||
"啟動 KM 反查與自動補齊流程",
|
||||
"關鍵服務告警自動同步到 KM 任務,補齊缺失條目",
|
||||
],
|
||||
"next_action": "run_kb_growth_healthcheck",
|
||||
},
|
||||
"actionable": {
|
||||
"items": [
|
||||
"每日檢查 ANTI_PATTERN 更新結果",
|
||||
"安排至少 2 位 owner 對 stale條目做快速人工審核",
|
||||
],
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
@@ -348,9 +381,11 @@ class GovernanceAgent:
|
||||
# 不可 fallback 0.0,否則必觸發 violated=True 噴假警報
|
||||
if not result_list:
|
||||
results[name] = {
|
||||
"name": name,
|
||||
"status": "skipped",
|
||||
"error": "no_data",
|
||||
"skipped": True,
|
||||
"reason": "prometheus_empty_result_metric_not_emitted",
|
||||
"hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
|
||||
}
|
||||
logger.warning(
|
||||
"governance_slo_no_data",
|
||||
@@ -365,9 +400,12 @@ class GovernanceAgent:
|
||||
violated = value < threshold
|
||||
|
||||
results[name] = {
|
||||
"name": name,
|
||||
"status": "violated" if violated else "ok",
|
||||
"value": round(value, 4),
|
||||
"slo_target": target,
|
||||
"hard_red_line": threshold,
|
||||
"gap": round(threshold - value, 4) if violated else round(value - target, 4),
|
||||
"violated": violated,
|
||||
}
|
||||
|
||||
@@ -375,11 +413,27 @@ class GovernanceAgent:
|
||||
await self._alert(
|
||||
f"slo_{name}_violation",
|
||||
{
|
||||
"slo_name": name,
|
||||
"current_value": round(value, 4),
|
||||
"hard_red_line": threshold,
|
||||
"slo_target": target,
|
||||
"gap": round(threshold - value, 4),
|
||||
"status": "violation",
|
||||
"impact": {
|
||||
"name": name,
|
||||
"value": round(value, 4),
|
||||
"target": target,
|
||||
"threshold": threshold,
|
||||
"gap": round(threshold - value, 4),
|
||||
},
|
||||
"remediation": {
|
||||
"items": [
|
||||
"Pause auto-scaling or risky auto-fix tasks",
|
||||
"Review evidence/decision traces and adjust policy thresholds",
|
||||
],
|
||||
"next_action": "trigger_flywheel_safeguard",
|
||||
},
|
||||
"actionable": {
|
||||
"items": [
|
||||
"Check verifier lag and post-exec learning health",
|
||||
"Run emergency incident audit on failed approvals",
|
||||
],
|
||||
},
|
||||
},
|
||||
)
|
||||
logger.warning(
|
||||
@@ -388,6 +442,12 @@ class GovernanceAgent:
|
||||
value=round(value, 4),
|
||||
hard_red_line=threshold,
|
||||
)
|
||||
elif value == 0 and threshold <= 0:
|
||||
logger.warning(
|
||||
"governance_slo_unexpected_zero",
|
||||
slo=name,
|
||||
value=round(value, 4),
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
"governance_slo_ok",
|
||||
@@ -396,7 +456,12 @@ class GovernanceAgent:
|
||||
target=target,
|
||||
)
|
||||
else:
|
||||
results[name] = {"error": "prometheus_query_failed", "status": data.get("status")}
|
||||
results[name] = {
|
||||
"name": name,
|
||||
"status": "error",
|
||||
"error": "prometheus_query_failed",
|
||||
"response_status": data.get("status"),
|
||||
}
|
||||
logger.warning(
|
||||
"governance_slo_prometheus_error",
|
||||
slo=name,
|
||||
@@ -404,22 +469,30 @@ class GovernanceAgent:
|
||||
response_status=data.get("status"),
|
||||
)
|
||||
except Exception as e:
|
||||
results[name] = {"error": str(e)}
|
||||
results[name] = {
|
||||
"name": name,
|
||||
"status": "error",
|
||||
"error": str(e),
|
||||
}
|
||||
logger.warning("governance_slo_check_error", slo=name, error=str(e))
|
||||
|
||||
# 2026-04-29 ogt + Claude Opus 4.7: critic M6 修
|
||||
# 加聚合 _meta 區分「全 skipped」(metric 未 emit) vs「全 ok」(SLO 健康)
|
||||
# 防止 dashboard 把 no_data 當 pass 顯示
|
||||
violated_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("violated"))
|
||||
skipped_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("skipped"))
|
||||
violated_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "violated")
|
||||
skipped_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "skipped")
|
||||
ok_count = sum(
|
||||
1 for v in results.values()
|
||||
if isinstance(v, dict) and not v.get("violated") and not v.get("skipped") and "error" not in v
|
||||
if isinstance(v, dict)
|
||||
and v.get("status") == "ok"
|
||||
)
|
||||
error_count = sum(1 for v in results.values() if isinstance(v, dict) and v.get("status") == "error")
|
||||
results["_meta"] = {
|
||||
"violated_count": violated_count,
|
||||
"skipped_count": skipped_count,
|
||||
"ok_count": ok_count,
|
||||
"error_count": error_count,
|
||||
"all_status": sorted({v.get("status") for v in results.values() if isinstance(v, dict)}),
|
||||
"all_skipped": skipped_count > 0 and ok_count == 0 and violated_count == 0,
|
||||
"status": (
|
||||
"no_data" if (skipped_count > 0 and ok_count == 0 and violated_count == 0)
|
||||
@@ -496,9 +569,26 @@ class GovernanceAgent:
|
||||
await self._alert(
|
||||
"governance_slo_data_gap",
|
||||
{
|
||||
"reason": "all_slo_metrics_not_emitted",
|
||||
"skipped_count": slo_meta.get("skipped_count", 0),
|
||||
"hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
|
||||
"status": "warning",
|
||||
"impact": {
|
||||
"reason": "all_slo_metrics_not_emitted",
|
||||
"skipped_count": slo_meta.get("skipped_count", 0),
|
||||
"all_slo_metrics_not_emitted": True,
|
||||
},
|
||||
"remediation": {
|
||||
"items": [
|
||||
"補齊 ADR-100 SLO emitter(automation_operation_log_total / post_execution_verification_total / km_entries_total)",
|
||||
"設置 PROMETHEUS_MULTIPROC_DIR 並掛載可寫目錄(如 emptyDir)",
|
||||
],
|
||||
"next_action": "run_adr100_slo_emit_playbook",
|
||||
"hint": "ADR-100 emitter 未實作或 PROMETHEUS_MULTIPROC_DIR 未設",
|
||||
},
|
||||
"actionable": {
|
||||
"items": [
|
||||
"先確認所有 API Pod 是否有 PROMETHEUS_MULTIPROC_DIR 掛載",
|
||||
"檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
|
||||
],
|
||||
},
|
||||
},
|
||||
)
|
||||
except Exception:
|
||||
|
||||
@@ -52,3 +52,4 @@ data:
|
||||
TG_GROUP_CUTOVER: "true"
|
||||
HERMES_NL_ENABLED: "false"
|
||||
ENABLE_12AGENT_CONSENSUS: "false"
|
||||
PROMETHEUS_MULTIPROC_DIR: "/tmp/awoooi-prometheus-multiproc"
|
||||
|
||||
@@ -38,6 +38,12 @@ spec:
|
||||
name: awoooi-config
|
||||
- secretRef:
|
||||
name: awoooi-secrets
|
||||
env:
|
||||
- name: PROMETHEUS_MULTIPROC_DIR
|
||||
value: "/tmp/awoooi-prometheus-multiproc"
|
||||
volumeMounts:
|
||||
- name: prometheus-multiproc
|
||||
mountPath: /tmp/awoooi-prometheus-multiproc
|
||||
resources:
|
||||
requests:
|
||||
cpu: "100m"
|
||||
@@ -59,6 +65,9 @@ spec:
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
failureThreshold: 3
|
||||
volumes:
|
||||
- name: prometheus-multiproc
|
||||
emptyDir: {}
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
|
||||
@@ -132,6 +132,7 @@ data:
|
||||
SENTRY_MCP_ENABLED: "true"
|
||||
# Prometheus server 在 110:9090 (非 188)
|
||||
PROMETHEUS_URL: "http://192.168.0.110:9090"
|
||||
PROMETHEUS_MULTIPROC_DIR: "/tmp/awoooi-prometheus-multiproc"
|
||||
|
||||
# ============================================================================
|
||||
# AIOps Phase 1-6 Feature Flags (2026-04-15 ogt: 全開,資料先全寫入 DB)
|
||||
|
||||
@@ -77,6 +77,8 @@ spec:
|
||||
value: "100"
|
||||
- name: AGENT_SOLVER_TIMEOUT_SEC
|
||||
value: "80"
|
||||
- name: PROMETHEUS_MULTIPROC_DIR
|
||||
value: "/tmp/awoooi-prometheus-multiproc"
|
||||
# 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用
|
||||
volumeMounts:
|
||||
- name: repair-ssh-key
|
||||
@@ -104,6 +106,8 @@ spec:
|
||||
mountPath: /etc/ssh-mcp/known_hosts
|
||||
subPath: known_hosts
|
||||
readOnly: true
|
||||
- name: prometheus-multiproc
|
||||
mountPath: /tmp/awoooi-prometheus-multiproc
|
||||
resources:
|
||||
requests:
|
||||
cpu: "200m"
|
||||
@@ -169,6 +173,8 @@ spec:
|
||||
secretName: ssh-mcp-key
|
||||
defaultMode: 0400
|
||||
optional: true
|
||||
- name: prometheus-multiproc
|
||||
emptyDir: {}
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
|
||||
Reference in New Issue
Block a user