diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py index 2eeba58d..d0c382c1 100644 --- a/apps/api/src/jobs/ai_slo_watchdog_job.py +++ b/apps/api/src/jobs/ai_slo_watchdog_job.py @@ -20,6 +20,7 @@ ADR-092 B3 (2026-04-24 ogt + Claude Sonnet 4.6 Asia/Taipei): from __future__ import annotations import asyncio +import time import uuid from datetime import UTC, datetime, timedelta @@ -40,6 +41,17 @@ _TG_SILENCE_THRESHOLD = 2 # PENDING telegram_message_id IS NULL 告警門檻 _FLYWHEEL_SUCCESS_MIN = 0.30 # 執行成功率下限 _STUCK_ANALYSIS_THRESHOLD = 3 # Agent Debate 失敗導致卡住的告警門檻 +# 2026-05-03 ogt + Claude Opus 4.7 — feedback_silencing_alerts_recurring_violation +# 啟動寬限期:30 分鐘內可 skip「資料還沒到」噪音;超過寬限期仍空 = 真資料管線斷,必須告警 +# 不可單獨用 skip 吞告警 — 一定要配對打「初始化期過、資料應該來但沒來」新告警 +_INIT_GRACE_SEC = 1800 +_PROCESS_START = time.monotonic() + + +def _grace_active() -> bool: + """啟動 30 分鐘內為寬限期;超過後資料缺失必須告警""" + return (time.monotonic() - _PROCESS_START) < _INIT_GRACE_SEC + async def run_ai_slo_watchdog_loop() -> None: """ @@ -82,29 +94,51 @@ async def _check_once() -> None: except Exception as e: logger.warning("watchdog_w2_tg_silence_check_failed", error=str(e)) - # W-3: 飛輪執行成功率過低 - # 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 修復(fresh deploy 假告警) - # execution_success_rate=None 代表樣本不足(total_exec < FLYWHEEL_MIN_SAMPLE), - # 跳過本次 W-3 檢查,避免每次 restart / fresh deploy 必噴「飛輪成功率 0%」假告警 + # W-3a: 飛輪執行成功率過低(有樣本但低於門檻) + # W-3b: 啟動寬限期過後仍無樣本 = 飛輪資料管線斷流(rate=None > 30min) + # 2026-05-03 ogt + Claude Opus 4.7(亞太)— feedback_silencing_alerts_recurring_violation + # 2026-05-02 的 W-3 修復用 `rate is None: skip` 把告警吞了,違反「禁消音化解法」鐵律。 + # 修正:分流 — 啟動 30 分鐘內 skip(避免 fresh deploy 噪音),超過寬限期仍 None + # 改打「資料管線無流量」告警,補回故障可見性。 try: from src.services.flywheel_stats_service import FlywheelStatsService metrics = await FlywheelStatsService().compute() if metrics and metrics.execution_success_rate is None: - logger.debug("watchdog_w3_skipped_insufficient_sample", reason="execution_sample_below_min") + if 
_grace_active(): + logger.debug( + "watchdog_w3_init_grace_skip", + reason="execution_sample_below_min", + uptime_sec=int(time.monotonic() - _PROCESS_START), + ) + else: + violations.append( + "飛輪執行成功率資料管線無流量(uptime > 30min 仍無樣本)" + ) elif metrics and metrics.execution_success_rate < _FLYWHEEL_SUCCESS_MIN: - violations.append(f"飛輪執行成功率 {metrics.execution_success_rate:.1%} < {_FLYWHEEL_SUCCESS_MIN:.0%}") + violations.append( + f"飛輪執行成功率 {metrics.execution_success_rate:.1%} < {_FLYWHEEL_SUCCESS_MIN:.0%}" + ) except Exception as e: logger.warning("watchdog_w3_flywheel_check_failed", error=str(e)) - # W-4: 無 APPROVED Playbook(自動修復鏈路斷裂) - # 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 4 修復(全封存初始化誤報) - # 原邏輯:approved==0 即告警,未排除「playbooks 表本身為空」的初始化 / migration 場景 - # 修法:先查 total count,total==0 表示表初始化中 → skip 並 log; - # total>0 且 approved==0 才是真正的「全封存」斷鏈告警 + # W-4a: 無 APPROVED Playbook(total > 0 但 approved=0,evolver 全封存自動修復斷鏈) + # W-4b: 啟動寬限期過後 playbooks 表仍空(migration 沒跑 / 表被清空) + # 2026-05-03 ogt + Claude Opus 4.7(亞太)— feedback_silencing_alerts_recurring_violation + # 2026-05-02 的 W-4 修復用 `total==0: skip` 把告警吞了,違反同樣的鐵律。 + # 修正:分流 — 啟動 30 分鐘內 skip,超過寬限期仍 0 改打「Playbook 表初始化失敗」告警。 try: approved_count, total_playbook_count = await _count_approved_playbooks() if total_playbook_count == 0: - logger.info("watchdog_w4_skipped_empty_table", reason="playbook_table_empty_likely_initializing") + if _grace_active(): + logger.info( + "watchdog_w4_init_grace_skip", + reason="playbook_table_empty_likely_initializing", + uptime_sec=int(time.monotonic() - _PROCESS_START), + ) + else: + violations.append( + "Playbook 表為空 — 初始化失敗或表被清空(uptime > 30min 仍 0 筆)" + ) elif approved_count == 0: violations.append("無 APPROVED Playbook — 自動修復鏈路斷裂(evolver 可能全部封存)") except Exception as e: @@ -123,20 +157,23 @@ async def _check_once() -> None: except Exception as e: logger.warning("watchdog_w5_stuck_analysis_check_failed", error=str(e)) - # W-6: Trust Drift 偵測(Playbook 信任度分布偏態) - # P2.6 接入 2026-04-24 
ogt + Claude Sonnet 4.6 - # trust_drift_detector 是孤立服務,此處首次接入 watchdog 自動觸發 + # W-6: Trust Drift 偵測(Playbook 信任度漂移) + # 2026-05-02 ogt + Claude Sonnet 4.6(亞太): 整併雙寫路徑 + # 原行為:呼叫 trust_drift_detector.run() 直接寫 event_type=trust_drift 到 PG + # governance_agent.check_trust_drift() 每 1h 也寫同一 event_type → 雙寫 + # 整併:改呼叫 governance_agent.check_trust_drift() 為唯一 source-of-truth + # W-6 watchdog 仍每 15 分鐘執行(感知器),violations 計數用於 meta-alert 觸發 + # PG 寫入由 governance_agent._alert() 統一處理,避免雙寫 try: - from src.services.trust_drift_detector import get_trust_drift_detector - dist = await get_trust_drift_detector().run() - if dist.drift_detected: - drift_labels = { - "optimism_bias": "盲目樂觀 — PostExecutionVerifier 可能失效或 RAG 資料污染", - "confidence_collapse": "學習鎖死 — EWMA 計算異常或所有執行誤判失敗", - } - label = drift_labels.get(dist.drift_type or "", dist.drift_type or "未知") + from src.services.governance_agent import get_governance_agent + trust_result = await get_governance_agent().check_trust_drift() + if trust_result.get("drifted", 0) > 0: + drifted = trust_result["drifted"] + auto_deprecated = trust_result.get("auto_deprecated", 0) + kept = trust_result.get("kept", 0) violations.append( - f"Trust Drift 偵測到 {label}(高分 {dist.high_ratio:.0%} / 低分 {dist.low_ratio:.0%},共 {dist.total} 個 Playbook)" + f"Trust Drift 偵測到 {drifted} 個 Playbook 信任度低落" + f"(auto-deprecated: {auto_deprecated},待人工審核: {kept})" ) except Exception as e: logger.warning("watchdog_w6_trust_drift_check_failed", error=str(e)) diff --git a/k8s/monitoring/flywheel-alerts.yaml b/k8s/monitoring/flywheel-alerts.yaml index 49343ee9..0f0efd47 100644 --- a/k8s/monitoring/flywheel-alerts.yaml +++ b/k8s/monitoring/flywheel-alerts.yaml @@ -42,7 +42,11 @@ spec: description: "Playbook 數量持續 1 小時為 0,飛輪學習節點完全失效。" runbook: "執行 scripts/cold_start_playbooks.py 冷啟動" - # P0: 執行成功率極低 + # P0: 執行成功率極低(有資料但低於門檻) + # 2026-05-03 ogt + Claude Opus 4.7(亞太)— anti-silencing 補配對告警 + # 新版 flywheel_stats_service 樣本不足會 emit NaN(這是一般 NaN 樣本而非 staleness marker;PromQL 中與 NaN 的大小比較恆為 false) + # 
故此規則只在「有資料、值低於 0.1」時觸發,不會被 NaN 誤觸; + # 真正的「資料管線斷流」由下方 FlywheelExecutionRateMissing 補打。 - alert: FlywheelExecutionSuccessLow expr: awoooi_flywheel_execution_success_rate < 0.1 for: 2h @@ -55,6 +59,22 @@ spec: description: "執行成功率 {{ $value | humanizePercentage }},低於健康基線 10%。" runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態" + # P0: 飛輪執行率資料管線斷流(NaN sentinel + 30 分鐘無資料) + # 2026-05-03 ogt + Claude Opus 4.7(亞太)— feedback_silencing_alerts_recurring_violation + # 配對 FlywheelExecutionSuccessLow:當指標連續 30 分鐘為 NaN(樣本不足 sentinel) + # 即代表「資料應該來但沒來」,watchdog W-3b 也會打同一情境,雙保險。 + - alert: FlywheelExecutionRateMissing + expr: absent(awoooi_flywheel_execution_success_rate) or (awoooi_flywheel_execution_success_rate != awoooi_flywheel_execution_success_rate) + for: 30m + labels: + severity: warning + alert_category: infrastructure + notification_type: TYPE-3 + annotations: + summary: "飛輪執行率指標連續 30 分鐘無資料" + description: "execution_success_rate 連續 30 分鐘為 NaN 或不存在,代表 Redis playbook 統計斷流(資料管線壞 / Redis flush / FlywheelStatsService 異常)。" + runbook: "1) 檢查 Redis playbook:* keys 是否存在 2) 檢查 FlywheelStatsService 日誌 3) /metrics endpoint 直接拉看 NaN 來源" + # P0: KM 大量未向量化 → RAG 無法使用歷史案例 - alert: FlywheelKMVectorizationLow expr: awoooi_flywheel_km_unvectorized_count > 10 diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index ed665e09..dc6f9acc 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -765,6 +765,8 @@ groups: summary: "飛輪 Playbook 數量為零,AI 修復完全依賴 LLM" description: "Redis 中無任何已批准 Playbook,自動修復能力大幅降低" runbook: "執行 scripts/cold_start_playbooks.py 冷啟動" + # 2026-05-03 ogt + Claude Opus 4.7(亞太)— anti-silencing 補配對告警 + # NaN sentinel 不會被 < 0.1 誤觸;下方 FlywheelExecutionRateMissing 補「無資料」獨立告警 - alert: FlywheelExecutionSuccessLow expr: awoooi_flywheel_execution_success_rate < 0.1 for: 2h @@ -779,6 +781,20 @@ groups: summary: "飛輪執行成功率 {{ $value | humanizePercentage }} 低於 10%" description: "連續 2 小時執行成功率不足 10%,Playbook 可能已過時" 
runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態" + - alert: FlywheelExecutionRateMissing + expr: absent(awoooi_flywheel_execution_success_rate) or (awoooi_flywheel_execution_success_rate != awoooi_flywheel_execution_success_rate) + for: 30m + labels: + severity: warning + layer: k8s + team: aiops + auto_repair: "false" + alert_category: flywheel_health + notification_type: TYPE-8M + annotations: + summary: "飛輪執行率指標 30 分鐘無資料" + description: "execution_success_rate 連續 30 分鐘為 NaN 或不存在,Redis playbook 統計斷流(資料管線壞 / Redis flush / FlywheelStatsService 異常)" + runbook: "1) 檢查 Redis playbook:* keys 2) 檢查 FlywheelStatsService 日誌 3) curl /metrics 直接拉看 NaN 來源" - alert: FlywheelKMVectorizationLow expr: awoooi_flywheel_km_unvectorized_count > 10 for: 30m diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index da18aff2..20e9dea6 100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -771,6 +771,8 @@ groups: summary: "飛輪 Playbook 數量為零,AI 修復完全依賴 LLM" description: "Redis 中無任何已批准 Playbook,自動修復能力大幅降低" runbook: "執行 scripts/cold_start_playbooks.py 冷啟動" + # 2026-05-03 ogt + Claude Opus 4.7(亞太)— anti-silencing 補配對告警 + # NaN sentinel 不會被 < 0.1 誤觸;下方 FlywheelExecutionRateMissing 補「無資料」獨立告警 - alert: FlywheelExecutionSuccessLow expr: awoooi_flywheel_execution_success_rate < 0.1 for: 2h @@ -785,6 +787,20 @@ groups: summary: "飛輪執行成功率 {{ $value | humanizePercentage }} 低於 10%" description: "連續 2 小時執行成功率不足 10%,Playbook 可能已過時" runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態" + - alert: FlywheelExecutionRateMissing + expr: absent(awoooi_flywheel_execution_success_rate) or (awoooi_flywheel_execution_success_rate != awoooi_flywheel_execution_success_rate) + for: 30m + labels: + severity: warning + layer: k8s + team: aiops + auto_repair: "false" + alert_category: flywheel_health + notification_type: TYPE-8M + annotations: + summary: "飛輪執行率指標 30 分鐘無資料" + description: "execution_success_rate 連續 30 分鐘為 NaN 或不存在,Redis playbook 統計斷流(資料管線壞 / Redis flush / 
FlywheelStatsService 異常)" + runbook: "1) 檢查 Redis playbook:* keys 2) 檢查 FlywheelStatsService 日誌 3) curl /metrics 直接拉看 NaN 來源" - alert: FlywheelKMVectorizationLow expr: awoooi_flywheel_km_unvectorized_count > 10 for: 30m