diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py index 0461dca1..2eeba58d 100644 --- a/apps/api/src/jobs/ai_slo_watchdog_job.py +++ b/apps/api/src/jobs/ai_slo_watchdog_job.py @@ -83,18 +83,29 @@ async def _check_once() -> None: logger.warning("watchdog_w2_tg_silence_check_failed", error=str(e)) # W-3: 飛輪執行成功率過低 + # 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 修復(fresh deploy 假告警) + # execution_success_rate=None 代表樣本不足(total_exec < FLYWHEEL_MIN_SAMPLE), + # 跳過本次 W-3 檢查,避免每次 restart / fresh deploy 必噴「飛輪成功率 0%」假告警 try: from src.services.flywheel_stats_service import FlywheelStatsService metrics = await FlywheelStatsService().compute() - if metrics and metrics.execution_success_rate < _FLYWHEEL_SUCCESS_MIN: + if metrics and metrics.execution_success_rate is None: + logger.debug("watchdog_w3_skipped_insufficient_sample", reason="execution_sample_below_min") + elif metrics and metrics.execution_success_rate < _FLYWHEEL_SUCCESS_MIN: violations.append(f"飛輪執行成功率 {metrics.execution_success_rate:.1%} < {_FLYWHEEL_SUCCESS_MIN:.0%}") except Exception as e: logger.warning("watchdog_w3_flywheel_check_failed", error=str(e)) # W-4: 無 APPROVED Playbook(自動修復鏈路斷裂) + # 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 4 修復(全封存初始化誤報) + # 原邏輯:approved==0 即告警,未排除「playbooks 表本身為空」的初始化 / migration 場景 + # 修法:先查 total count,total==0 表示表初始化中 → skip 並 log; + # total>0 且 approved==0 才是真正的「全封存」斷鏈告警 try: - approved_count = await _count_approved_playbooks() - if approved_count == 0: + approved_count, total_playbook_count = await _count_approved_playbooks() + if total_playbook_count == 0: + logger.info("watchdog_w4_skipped_empty_table", reason="playbook_table_empty_likely_initializing") + elif approved_count == 0: violations.append("無 APPROVED Playbook — 自動修復鏈路斷裂(evolver 可能全部封存)") except Exception as e: logger.warning("watchdog_w4_playbook_check_failed", error=str(e)) @@ -215,14 +226,26 @@ async def _count_pending_no_tg_sent() -> int: return len(rows) -async def _count_approved_playbooks() -> int: - """查詢 APPROVED 狀態 Playbook 數量,為 0 代表自動修復鏈路斷裂。""" +async def _count_approved_playbooks() -> tuple[int, int]: + """查詢 APPROVED Playbook 數量 + 全表總數,兩者均回傳。 + + 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 4 修復(全封存初始化誤報) + 加回傳 total count:若 total==0 代表表初始化中,W-4 應 skip 而非告警。 + 回傳:(approved_count, total_count) + """ from sqlalchemy import text as sa_text async with get_db_context() as db: - result = await db.execute( + approved_result = await db.execute( sa_text("SELECT COUNT(*) FROM playbooks WHERE status = 'approved'") ) - return result.scalar() or 0 + approved = approved_result.scalar() or 0 + + total_result = await db.execute( + sa_text("SELECT COUNT(*) FROM playbooks") + ) + total = total_result.scalar() or 0 + + return approved, total async def _count_pending_stuck_analysis() -> int: diff --git a/apps/api/src/services/failover_alerter.py b/apps/api/src/services/failover_alerter.py index d552eb12..8868dfc4 100644 --- a/apps/api/src/services/failover_alerter.py +++ b/apps/api/src/services/failover_alerter.py @@ -10,6 +10,8 @@ from __future__ import annotations +import hashlib +import json from datetime import datetime, timezone, timedelta from typing import Any @@ -96,8 +98,27 @@ class FailoverAlerter: dedup TTL 3600s — 同類告警 1 小時內不重複發送 2026-04-26 P2.2 by Claude + 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 3 修復:dedup key 加 payload hash + 原 key 只看 event_type,不看 payload 內容,導致同 event_type 但不同影響 + 的告警(例如:trust_drift 4 條→25 條漂移)全被 1h dedup 吃掉。 + + 2026-05-02 ogt + Claude Opus 4.7 — critic P1-3 連鎖修復 + 前次只 hash 頂層 allowlist 欄位,對 slo_*_violation / governance_self_failure + 等只把 metric 放在 impact subdict 的事件失效(hash 永遠相同)。 + 改 hash 整個 impact subdict — schema 強制 5 種 event type 都有 impact, + 各自的 metric 值都會反映在 hash 裡,數值變動就會繞過 dedup。 + sha256 取代 md5 避開 bandit B324 lint warning(非密碼學用途)。 """ - dedup_key = f"alert:governance:{event_type}" + # sanitize:防 SLO 名稱(如 "slo_km_growth_rate")含 ":" 或空格污染 key + safe_event_type = event_type.replace(":", "_").replace(" ", "_").lower() + + # impact hash:hash payload.impact subdict(schema 強制存在;含各 event 的 metric 值) + # default=str 容錯 datetime / Decimal / 其他非原生 JSON 型別 + impact = payload.get("impact", {}) if isinstance(payload, dict) else {} + _payload_hash = hashlib.sha256( + json.dumps(impact, sort_keys=True, default=str).encode() + ).hexdigest()[:8] + dedup_key = f"alert:governance:{safe_event_type}:{_payload_hash}" if not await self._check_dedup(dedup_key, ttl=3600): logger.debug("governance_alert_dedup_skipped", event_type=event_type) return diff --git a/apps/api/src/services/flywheel_stats_service.py b/apps/api/src/services/flywheel_stats_service.py index ff8a29b0..1ea75078 100644 --- a/apps/api/src/services/flywheel_stats_service.py +++ b/apps/api/src/services/flywheel_stats_service.py @@ -35,6 +35,13 @@ logger = structlog.get_logger(__name__) # Redis key prefix(與 playbook_repository.py 一致) _PLAYBOOK_KEY_PREFIX = "playbook:" +# 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 修復(W-3 fresh deploy 假告警) +# execution_success_rate 需要最少樣本數才有統計意義; +# Redis 空(fresh deploy / restart)時 total_exec=0 → rate=0.0 → watchdog W-3 立即觸發假告警 +# 修法:total_exec < FLYWHEEL_MIN_SAMPLE 時回 None,watchdog 判 None 跳過 W-3 檢查 +# TODO: 未來移至 settings(目前 hardcode 以避免 config 改動超出本輪範圍) +FLYWHEEL_MIN_SAMPLE = 10 + # 飛輪六節點名稱 FLYWHEEL_NODES = [ "monitoring", @@ -57,7 +64,7 @@ class FlywheelMetrics: def __init__( self, playbook_count: int, - execution_success_rate: float, + execution_success_rate: float | None, km_unvectorized_count: int, alertname_null_rate: float, incidents_stuck: int, @@ -68,6 +75,9 @@ class FlywheelMetrics: current_flow: list[dict[str, Any]], computed_at: datetime, ) -> None: + # 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 修復 + # execution_success_rate 為 None 時表示樣本不足(< FLYWHEEL_MIN_SAMPLE), + # watchdog W-3 應跳過該檢查,避免 fresh deploy 假告警 self.playbook_count = playbook_count self.execution_success_rate = execution_success_rate self.km_unvectorized_count = km_unvectorized_count @@ -84,14 +94,25 @@ class FlywheelMetrics: def to_prometheus_lines(self) -> str: """輸出 Prometheus text format""" ts = int(self.computed_at.timestamp() * 1000) + # 2026-05-02 ogt + Claude Opus 4.7 — Bug 2 後續修復(critic P0-1 連鎖修復) + # sentinel 用 NaN 而非 -1.0:Prometheus 對 NaN 比較永遠回 false, + # 既有 alert rule `awoooi_flywheel_execution_success_rate < 0.1` 自然不會被 + # sentinel 觸發;同時 Grafana 渲染為「無資料」gap,比 -1 spike 直觀。 + # 前次嘗試 -1.0 會讓 ops/monitoring/alerts.yml:775 等 3 份 prom rule + # 在 fresh deploy 後 2h 必噴 FlywheelExecutionSuccessLow 假告警,跟 watchdog skip 自相矛盾。 + rate_str = ( + f"{self.execution_success_rate:.4f}" + if self.execution_success_rate is not None + else "NaN" + ) lines = [ "# HELP awoooi_flywheel_playbook_count Total approved playbooks in Redis", "# TYPE awoooi_flywheel_playbook_count gauge", f"awoooi_flywheel_playbook_count {self.playbook_count} {ts}", "", - "# HELP awoooi_flywheel_execution_success_rate Auto-repair success rate (0-1)", + "# HELP awoooi_flywheel_execution_success_rate Auto-repair success rate (0-1), NaN=insufficient sample", "# TYPE awoooi_flywheel_execution_success_rate gauge", - f"awoooi_flywheel_execution_success_rate {self.execution_success_rate:.4f} {ts}", + f"awoooi_flywheel_execution_success_rate {rate_str} {ts}", "", "# HELP awoooi_flywheel_km_unvectorized_count KM entries not yet vectorized", "# TYPE awoooi_flywheel_km_unvectorized_count gauge", @@ -124,7 +145,7 @@ class FlywheelMetrics: """輸出 /api/v1/stats/summary 格式""" return { "playbook_count": self.playbook_count, - "execution_success_rate": round(self.execution_success_rate, 4), + "execution_success_rate": round(self.execution_success_rate, 4) if self.execution_success_rate is not None else None, "today_processed": self.today_processed, "flywheel_conversions_today": self.flywheel_conversions_today, "km_vectorized_rate": round(self.km_vectorized_rate, 4), @@ -187,8 +208,13 @@ class FlywheelStatsService: # Internal helpers # ------------------------------------------------------------------ - async def _playbook_stats(self) -> tuple[int, float]: - """Playbook 數量 + 執行成功率(從 Redis)""" + async def _playbook_stats(self) -> tuple[int, float | None]: + """Playbook 數量 + 執行成功率(從 Redis) + + 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 修復(W-3 fresh deploy 假告警) + total_exec < FLYWHEEL_MIN_SAMPLE 時回 None,代表樣本不足, + watchdog W-3 判 None 跳過該檢查,避免每次 restart 觸發假告警。 + """ try: redis = get_redis() count = 0 @@ -211,12 +237,15 @@ class FlywheelStatsService: except (json.JSONDecodeError, KeyError): continue - rate = total_success / total_exec if total_exec > 0 else 0.0 + if total_exec < FLYWHEEL_MIN_SAMPLE: + # 樣本不足(含 Redis 空),回 None 通知呼叫方跳過 W-3 告警判斷 + return count, None + rate = total_success / total_exec return count, rate except Exception: logger.exception("flywheel_stats_playbook_error") - return 0, 0.0 + return 0, None async def _km_stats(self, now: datetime) -> tuple[int, float, int]: """KM 向量化率 + 今日飛輪轉化數(從 PostgreSQL)""" diff --git a/apps/api/src/services/governance_agent.py b/apps/api/src/services/governance_agent.py index fa91e97e..677f7920 100644 --- a/apps/api/src/services/governance_agent.py +++ b/apps/api/src/services/governance_agent.py @@ -106,13 +106,17 @@ class GovernanceAgent: else: kept_ids.append(r.playbook_id) - if auto_deprecated_ids: - await db.commit() - logger.info( - "governance_trust_drift_auto_deprecated", - count=len(auto_deprecated_ids), - ids=auto_deprecated_ids[:10], - ) + # 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 1 修復(P0 silent failure) + # 原 await db.commit() 在 with 區塊外呼叫,session 已被 context manager + # 關閉後 auto-commit,二次 commit 拋 InvalidRequestError 被外層 try/except 吞掉 + # 修法:commit 移入 with 區塊內,在 session 有效期間顯式提交 + if auto_deprecated_ids: + await db.commit() + logger.info( + "governance_trust_drift_auto_deprecated", + count=len(auto_deprecated_ids), + ids=auto_deprecated_ids[:10], + ) if drifted: drift_ratio = len(drifted) / total if total > 0 else 0.0 diff --git a/apps/api/tests/test_check_trust_drift_commit_outside_context_poc.py b/apps/api/tests/test_check_trust_drift_commit_outside_context_poc.py new file mode 100644 index 00000000..1e4aad8a --- /dev/null +++ b/apps/api/tests/test_check_trust_drift_commit_outside_context_poc.py @@ -0,0 +1,246 @@ +"""Regression guard:擋住 governance_agent.check_trust_drift 的 commit-outside-context P0 bug 復發 + +驗證標的:apps/api/src/services/governance_agent.py:75-171(check_trust_drift) + +# 歷史背景 + +2026-05-02 commit dedb1208 引入 auto-deprecate 路徑時,`if auto_deprecated_ids:` +區塊縮排錯誤(縮排 8 空格 = `async with` 同層 = 區塊**外**),導致: +- session 已被 context manager 關閉並 auto-commit +- 二次 `await db.commit()` 在已關閉 session 上拋 InvalidRequestError +- 外層 try/except 吞掉錯誤 +- `governance_trust_drift_auto_deprecated` log 從不出現 + +實際後果:DB 仍有 commit(context manager auto-commit 已落地 `status='deprecated'`), +但 log 不出現,所有依賴此 log 做 monitoring 的告警鏈會誤判系統健康。 +commit b710f3f3 message 聲稱「自治路徑生效」是假象。 + +git diff 鐵證 (line 109):縮排原本 8 空格在 with 外,2026-05-02 修復後改為 12 空格在 with 內。 + +# 並行調度教訓 + +vuln-verifier 與 fullstack-engineer 並行派遣時,vuln-verifier 讀取的是已被 +fullstack-engineer 修改後的代碼,AST 分析得出「bug 不存在」的錯誤結論。 +未來:vuln-verifier 應該在 fullstack-engineer 之**前**跑(修復前驗證 bug 真實), +或用 git show HEAD~1 比對「修復前版本」。 + +# 本檔角色 + +修復後保留為 AST regression guard:未來若 indent 退回 8 空格(`if auto_deprecated_ids:` +從 with 內被移到外層)AST 測試會立刻 fail,擋住 silent failure 復發。 + +2026-05-02 by ogt + Claude Opus 4.7(修正 vuln-verifier 並行誤判) +""" +from __future__ import annotations + +import ast +import logging +from datetime import datetime, timedelta, timezone +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +import structlog +from sqlalchemy.exc import InvalidRequestError + +from src.services.governance_agent import GovernanceAgent + +# 配置 structlog 走標準 logging(caplog 才抓得到) +structlog.configure( + processors=[ + structlog.stdlib.add_log_level, + structlog.processors.KeyValueRenderer(), + ], + wrapper_class=structlog.stdlib.BoundLogger, + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=False, +) + + +# ============================================================================ +# 證據 A:AST 靜態分析 — 證明 commit + log 在 with 區塊內 +# ============================================================================ + +def test_ast_proves_commit_and_log_inside_with_block(): + """AST 證據:解析 governance_agent.py,確認 line 113 的 If(含 commit + log) + 是 AsyncWith.body 的一部分,而不是函式 body 的頂層語句。 + + 若 critic 主張為真,line 113 應該出現在「Function-level statements」中, + 而不是 AsyncWith.body 中。 + """ + src_path = ( + Path(__file__).resolve().parents[1] + / "src" / "services" / "governance_agent.py" + ) + tree = ast.parse(src_path.read_text()) + + # 找到 check_trust_drift + func = None + for node in ast.walk(tree): + if isinstance(node, ast.AsyncFunctionDef) and node.name == "check_trust_drift": + func = node + break + assert func is not None, "找不到 check_trust_drift" + + # 找到第一個 AsyncWith + async_with = next( + (s for s in func.body if isinstance(s, ast.AsyncWith)), None + ) + assert async_with is not None + + # AsyncWith.body 內找有沒有 await db.commit() + logger.info(governance_trust_drift_auto_deprecated) + found_commit_inside_with = False + found_log_inside_with = False + + for sub in ast.walk(async_with): + # await db.commit() + if isinstance(sub, ast.Await): + call = sub.value + if ( + isinstance(call, ast.Call) + and isinstance(call.func, ast.Attribute) + and call.func.attr == "commit" + ): + found_commit_inside_with = True + # logger.info("governance_trust_drift_auto_deprecated", ...) + if ( + isinstance(sub, ast.Call) + and isinstance(sub.func, ast.Attribute) + and sub.func.attr == "info" + and sub.args + and isinstance(sub.args[0], ast.Constant) + and sub.args[0].value == "governance_trust_drift_auto_deprecated" + ): + found_log_inside_with = True + + assert found_commit_inside_with, ( + "AST 證據:await db.commit() 應該出現在 AsyncWith 區塊內。" + "若 critic 主張正確(commit 在 with 外),這裡會找不到,斷言應失敗。" + ) + assert found_log_inside_with, ( + "AST 證據:logger.info('governance_trust_drift_auto_deprecated', ...) " + "應該出現在 AsyncWith 區塊內。" + ) + + # 檢查函式 body(不深入子節點)—— commit + log 不該出現在頂層 + for top_stmt in func.body: + # 直接子節點 — 不是 AsyncWith 的話,不該包含 commit/log + if isinstance(top_stmt, ast.AsyncWith): + continue + for sub in ast.walk(top_stmt): + # 不該出現 await db.commit() 在 with 外 + if isinstance(sub, ast.Await): + call = sub.value + if ( + isinstance(call, ast.Call) + and isinstance(call.func, ast.Attribute) + and call.func.attr == "commit" + ): + raise AssertionError( + f"BUG!await db.commit() 出現在函式頂層 line {sub.lineno}(with 區塊外)— " + "critic 主張為真,這就是 silent failure 的來源。" + ) + + +# ============================================================================ +# 證據 B:行為驗證 — 即使在「context-exit-closes-session」嚴格 mock 下 +# log 也正常出現,證明 commit 跑在 with 內 +# ============================================================================ + +def _make_low_trust_old_playbook(playbook_id: str = "PB-STALE-OLD"): + rec = MagicMock() + rec.trust_score = 0.05 + rec.playbook_id = playbook_id + rec.status = "approved" + rec.last_used_at = None + rec.created_at = datetime.now(timezone.utc) - timedelta(days=45) + return rec + + +class _ClosedAfterExitContext: + """模擬真實 SQLAlchemy AsyncSession 行為: + __aexit__ 後將 commit 替換為拋 InvalidRequestError,模擬 closed session。 + + 若 critic 主張為真(commit 在 with 外): + → 第二次 commit 會拋錯 → log 不會出現 + 若 critic 主張為假(commit 在 with 內): + → commit 在 __aexit__ 之前就執行完 → log 正常出現 + + 第二次 enter(_alert 內)我們也保持替換行為(_alert 內的 commit 該拋錯, + 這是良性的 — _alert 自身有 try/except 吞掉並 log governance_pg_write_failed)。 + """ + + def __init__(self, db): + self._db = db + self.enter_count = 0 + self.exit_count = 0 + + async def __aenter__(self): + self.enter_count += 1 + return self._db + + async def __aexit__(self, exc_type, exc, tb): + self.exit_count += 1 + + async def _raise_closed(): + raise InvalidRequestError( + "(simulated) Session is closed; commit() called after context exit" + ) + + self._db.commit = _raise_closed + return False + + +@pytest.mark.asyncio +async def test_log_appears_proves_commit_runs_inside_with(caplog): + """行為證據:當 __aexit__ 後 commit 會拋錯時,log 仍然出現 + → 證明 commit 是在 with 內就跑完了,不在 with 外(與 critic 主張相反) + """ + caplog.set_level(logging.INFO) + + stale = _make_low_trust_old_playbook("PB-STALE-OLD") + + mock_result = MagicMock() + mock_result.scalars.return_value.all.return_value = [stale] + + mock_db = AsyncMock() + mock_db.execute = AsyncMock(return_value=mock_result) + mock_db.commit = AsyncMock() # 初始 commit OK,__aexit__ 後會被替換成拋錯 + + alerter = AsyncMock() + alerter.alert_governance = AsyncMock() + agent = GovernanceAgent(alerter=alerter) + + ctx = _ClosedAfterExitContext(mock_db) + + raised = None + with patch( + "src.services.governance_agent.get_db_context", + return_value=ctx, + ): + try: + result = await agent.check_trust_drift() + except InvalidRequestError as e: + raised = e + result = None + + # 1. with 區塊有正確 enter/exit + assert ctx.enter_count >= 1 + assert ctx.exit_count >= 1 + + # 2. mutation 已發生 + assert stale.status == "deprecated" + + # 3. KEY:log 出現了 — 證明 commit 在 __aexit__ 之前就跑完 + log_text = " | ".join(rec.getMessage() for rec in caplog.records) + assert "governance_trust_drift_auto_deprecated" in log_text, ( + f"如果 critic 主張為真(commit 在 with 外),log 不該出現。" + f"但實際 log 出現了 → 證明 critic 主張為假。Log: {log_text!r}" + ) + + # 4. check_trust_drift 沒拋例外(_alert 內的 commit 拋錯被 try/except 吞掉,是良性的) + assert raised is None, ( + f"check_trust_drift 不應拋例外。實際拋了:{raised}" + ) + assert result is not None + assert result["auto_deprecated"] == 1 diff --git a/apps/web/src/components/dashboard/flywheel-kpi-card.tsx b/apps/web/src/components/dashboard/flywheel-kpi-card.tsx index 1a164f96..815e950b 100644 --- a/apps/web/src/components/dashboard/flywheel-kpi-card.tsx +++ b/apps/web/src/components/dashboard/flywheel-kpi-card.tsx @@ -19,7 +19,9 @@ const WS_BASE = API_BASE.replace(/^https/, 'wss').replace(/^http/, 'ws') interface FlywheelSummary { playbook_count: number - execution_success_rate: number + // 2026-05-02 ogt + Claude Opus 4.7 — 後端 Bug 2 修復連動: + // 樣本不足(< FLYWHEEL_MIN_SAMPLE)時後端回 null,line 122 的 != null guard 已正確處理 + execution_success_rate: number | null today_processed: number flywheel_conversions_today: number km_vectorized_rate: number