diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 983ece4a..015e0938 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -35,6 +35,7 @@ from src.core.config import settings from src.core.constants import is_cicd_alertname, is_heartbeat_alertname from src.services.alert_rule_engine import get_incident_type, match_rule from src.services.action_parser import is_safe_kubectl_action +from src.services.security_interceptor import check_webhook_nonce # P0-06: nonce dedup via Service 層 from src.core.logging import get_logger from src.core.metrics import record_alert_chain_success @@ -648,6 +649,8 @@ class HMACVerificationError(Exception): async def verify_webhook_signature( request: Request, x_signature_256: str | None = Header(None, alias="X-Signature-256"), + x_webhook_timestamp: str | None = Header(None, alias="X-Webhook-Timestamp"), + x_webhook_nonce: str | None = Header(None, alias="X-Webhook-Nonce"), ) -> bool: """ 驗證 Webhook 請求的 HMAC-SHA256 簽章 @@ -657,6 +660,11 @@ async def verify_webhook_signature( - 簽章格式: sha256= - 使用 WEBHOOK_HMAC_SECRET 進行驗證 + ADR-116 Replay 防護(向後相容): + - X-Webhook-Timestamp: Unix epoch 秒,若提供則驗證 ±300 秒範圍 + - X-Webhook-Nonce: 隨機字串,若提供則用 Redis NX 去重(TTL=600s) + - 兩個 Header 均可選;過渡期不提供時僅記錄 warning + 安全鐵律 (Fail-Closed): - 生產環境: HMAC Secret 未設定 → 直接拒絕 (不可跳過) - 開發環境: 可跳過驗證 (僅供本地測試) @@ -664,6 +672,8 @@ async def verify_webhook_signature( Args: request: FastAPI Request 物件 x_signature_256: X-Signature-256 Header 值 + x_webhook_timestamp: X-Webhook-Timestamp Header 值(Unix epoch 秒,可選) + x_webhook_nonce: X-Webhook-Nonce Header 值(隨機字串,可選) Returns: bool: 驗證是否通過 @@ -671,6 +681,8 @@ async def verify_webhook_signature( Raises: HMACVerificationError: 簽章驗證失敗 """ + import time as _time + # ========================================================================== # Fail-Closed 安全策略 (CISO 要求) # ========================================================================== @@ -725,6 +737,54 @@ async def verify_webhook_signature( raise HMACVerificationError("Invalid signature") logger.info("hmac_verification_success") + + # ========================================================================== + # ADR-116: Replay 防護(向後相容,HMAC 驗證成功後才執行) + # ========================================================================== + + # --- Timestamp 驗證(±300 秒) --- + if x_webhook_timestamp is not None: + try: + req_ts = int(x_webhook_timestamp) + now_ts = int(_time.time()) + skew = abs(now_ts - req_ts) + if skew > 300: + logger.warning( + "webhook_timestamp_out_of_window", + request_ts=req_ts, + server_ts=now_ts, + skew_seconds=skew, + ) + raise HMACVerificationError( + f"Timestamp out of acceptable window (skew={skew}s > 300s)" + ) + except ValueError: + logger.warning( + "webhook_timestamp_invalid_format", + raw_value=x_webhook_timestamp, + ) + raise HMACVerificationError("X-Webhook-Timestamp must be a Unix epoch integer") + else: + # 過渡期:沒有提供 Timestamp 則記錄 warning 但允許通過 + logger.warning( + "webhook_replay_protection_missing", + header="X-Webhook-Timestamp", + note="transition period: request allowed but sender should add replay headers", + ) + + # --- Nonce 去重(透過 security_interceptor.check_webhook_nonce,fail open) --- + if x_webhook_nonce is not None: + valid = await check_webhook_nonce(x_webhook_nonce) + if not valid: + raise HMACVerificationError("Nonce replay detected") + else: + # 過渡期:沒有提供 Nonce 則記錄 warning 但允許通過 + logger.warning( + "webhook_replay_protection_missing", + header="X-Webhook-Nonce", + note="transition period: request allowed but sender should add replay headers", + ) + return True diff --git a/apps/api/src/core/config.py b/apps/api/src/core/config.py index b5de10d0..25ce767e 100644 --- a/apps/api/src/core/config.py +++ b/apps/api/src/core/config.py @@ -703,6 +703,13 @@ class Settings(BaseSettings): default="", description="HMAC secret for webhook signature verification", ) + # ADR-116 P0-05: Callback Nonce 防偽造 HMAC Secret + # 2026-05-04 Claude Sonnet 4.6 (ADR-116): 附加至 callback nonce 末尾的 HMAC-SHA256[:16] + # 空字串 → 過渡期跳過驗證並記錄 warning + CALLBACK_HMAC_SECRET: str = Field( + default="", + description="ADR-116: HMAC secret for callback nonce anti-forgery (HMAC-SHA256 appended to nonce)", + ) # 2026-04-24 Claude Sonnet 4.6 (ADR-094): Telegram Webhook Secret Token # 與 setWebhook API 呼叫時的 secret_token 相同;空字串 → dev 環境跳過驗證 TELEGRAM_WEBHOOK_SECRET: str = Field( diff --git a/apps/api/src/services/consensus_engine.py b/apps/api/src/services/consensus_engine.py index 7199bfdd..295a7fbd 100644 --- a/apps/api/src/services/consensus_engine.py +++ b/apps/api/src/services/consensus_engine.py @@ -337,10 +337,36 @@ class PerformanceAgent(ExpertAgent): # Consensus Engine # ============================================================================= -CONSENSUS_PREFIX = "consensus:" +# P0-12 修正 2026-05-04 ogt + Claude Sonnet 4.6: +# 舊格式(無 project 前綴):consensus:{consensus_id} +# 新格式(含 project 前綴):{project_id}:consensus:{consensus_id} +# 遷移策略:Phase A 雙寫 + fallback 讀舊 key,待全部遷移後移除 fallback +CONSENSUS_PREFIX = "consensus:" # 舊格式前綴(讀 fallback 用) +PLATFORM_INTERNAL = "__platform__" # project_id 不可得時的 sentinel namespace CONSENSUS_TTL = 3600 # 1 小時 +def _consensus_key(consensus_id: str, project_id: str | None) -> str: + """ + 建構 consensus Redis key(含 project_id namespace) + + Args: + consensus_id: 共識 ID(如 CON-20260504-ABCD1234) + project_id: 租戶 project ID;若為 None 則使用 __platform__ sentinel + + Returns: + 新格式 key:{project_id}:consensus:{consensus_id} + 或 fallback:__platform__:consensus:{consensus_id} + """ + ns = project_id if project_id else PLATFORM_INTERNAL + return f"{ns}:consensus:{consensus_id}" + + +def _consensus_legacy_key(consensus_id: str) -> str: + """舊格式 key(Phase A fallback 讀取用)""" + return f"{CONSENSUS_PREFIX}{consensus_id}" + + class ConsensusEngine: """ 共識引擎 - Phase 9.4 核心 @@ -526,6 +552,7 @@ class ConsensusEngine: consensus_score: float, recommended_action_type: str, dissenting: list[str], + project_id: str | None = None, ) -> ConsensusResult: """ 產生最終決策 @@ -578,8 +605,8 @@ class ConsensusEngine: dissenting_opinions=dissenting, ) - # 儲存到 Redis - await self._save_consensus(result) + # 儲存到 Redis(含 project_id namespace) + await self._save_consensus(result, project_id=project_id) logger.info( "consensus_generated", @@ -595,6 +622,7 @@ class ConsensusEngine: self, incident: Incident, timeout_sec: float = 30.0, + project_id: str | None = None, ) -> ConsensusResult: """ 執行完整的共識流程 @@ -603,6 +631,11 @@ class ConsensusEngine: 1. 收集意見 2. 計算共識 3. 產生決策 + + Args: + incident: 要分析的事件 + timeout_sec: 超時秒數 + project_id: 租戶 project ID,用於 Redis key namespace 隔離(P0-12) """ # Step 1: 收集意見 opinions = await self.gather_opinions(incident, timeout_sec) @@ -610,32 +643,43 @@ class ConsensusEngine: # Step 2: 計算共識 consensus_score, recommended_action, dissenting = self.calculate_consensus(opinions) - # Step 3: 產生決策 + # Step 3: 產生決策(傳入 project_id 供 Redis key namespace 隔離) result = await self.generate_final_decision( incident=incident, opinions=opinions, consensus_score=consensus_score, recommended_action_type=recommended_action, dissenting=dissenting, + project_id=project_id, ) return result - async def _save_consensus(self, result: ConsensusResult) -> None: + async def _save_consensus( + self, + result: ConsensusResult, + project_id: str | None = None, + ) -> None: """儲存共識結果到 Redis(熱快取)+ PG(永久記錄) 2026-04-26 P2-DB-Fix by Claude — db-expert P0 三修(P0.2): 補 PG 寫入 agent_sessions,符合 ADR-085 鐵律 Redis TTL 到期不再造成共識記憶消失 + + P0-12 修正 2026-05-04 ogt + Claude Sonnet 4.6: + Phase A 雙寫:新 key(含 project_id 前綴)+ 舊 key(向後相容) + 待全部遷移完成後移除舊 key 寫入 """ - # 1. 既有 Redis 寫(熱快取,保留) redis_client = get_redis() - key = f"{CONSENSUS_PREFIX}{result.consensus_id}" - await redis_client.set( - key, - json.dumps(result.to_dict()), - ex=CONSENSUS_TTL, - ) + payload = json.dumps(result.to_dict()) + + # 1a. 新 key(含 project namespace)— Phase A 主要 key + new_key = _consensus_key(result.consensus_id, project_id) + await redis_client.set(new_key, payload, ex=CONSENSUS_TTL) + + # 1b. 舊 key(無 project 前綴)— Phase A fallback,向後相容 + legacy_key = _consensus_legacy_key(result.consensus_id) + await redis_client.set(legacy_key, payload, ex=CONSENSUS_TTL) # 2. 補 PG 永久寫入(ADR-085 鐵律 — 失敗不阻斷主流程) try: @@ -691,12 +735,34 @@ class ConsensusEngine: consensus_id=result.consensus_id, ) - async def get_consensus(self, consensus_id: str) -> ConsensusResult | None: - """取得共識結果""" - redis_client = get_redis() - key = f"{CONSENSUS_PREFIX}{consensus_id}" + async def get_consensus( + self, + consensus_id: str, + project_id: str | None = None, + ) -> ConsensusResult | None: + """取得共識結果 + + P0-12 修正 2026-05-04 ogt + Claude Sonnet 4.6: + Phase A 雙讀:先讀新 key(含 project 前綴),若 miss 再 fallback 舊 key + """ + redis_client = get_redis() + + # 先嘗試新格式 key(含 project namespace) + new_key = _consensus_key(consensus_id, project_id) + data = await redis_client.get(new_key) + + if not data: + # Phase A fallback:讀舊格式 key(無 project 前綴) + legacy_key = _consensus_legacy_key(consensus_id) + data = await redis_client.get(legacy_key) + if data: + logger.info( + "consensus_legacy_key_hit", + consensus_id=consensus_id, + project_id=project_id, + note="Phase A fallback 命中,建議觸發資料遷移", + ) - data = await redis_client.get(key) if data: return ConsensusResult.from_dict(json.loads(data)) return None diff --git a/apps/api/src/services/ollama_auto_recovery.py b/apps/api/src/services/ollama_auto_recovery.py index 4b7865ae..d4b218a3 100644 --- a/apps/api/src/services/ollama_auto_recovery.py +++ b/apps/api/src/services/ollama_auto_recovery.py @@ -225,16 +225,25 @@ class OllamaAutoRecoveryService: # ------------------------------------------------------------------------- # Redis 持久化(跨重啟恢復) # 2026-04-25 critic-fix Part2 H5+H6 by Claude Engineer-C2 + # 2026-05-04 P0-11 Phase A 雙寫遷移(ADR-110/ADR-118): + # 舊 key "ollama:current_primary" → 新 key "platform:ollama:current_primary" + # Ollama 是 platform_resource(非 tenant 資源),加 platform: 前綴明確分類 + # Phase A(30 天):同時寫入新舊 key,讀取以新 key 為主,舊 key 作 fallback + # Phase C(~2026-06-04):停止寫入舊 key,刪除舊 key # ------------------------------------------------------------------------- - _REDIS_PRIMARY_KEY = "ollama:current_primary" + _REDIS_PRIMARY_KEY = "platform:ollama:current_primary" # 新 key(Phase A 起生效) + _REDIS_PRIMARY_KEY_LEGACY = "ollama:current_primary" # 舊 key(Phase C 前持續雙寫) async def _persist_primary(self, primary: str) -> None: - """持久化 current_primary 到 Redis(無 TTL,跨重啟恢復)""" + """持久化 current_primary 到 Redis(無 TTL,跨重啟恢復) + Phase A 雙寫:同時寫入新舊 key,確保舊版 Pod 滾動期間不失效。 + """ try: from src.core.redis_client import get_redis redis = get_redis() await redis.set(self._REDIS_PRIMARY_KEY, primary) + await redis.set(self._REDIS_PRIMARY_KEY_LEGACY, primary) # Phase A 雙寫 except Exception as e: logger.warning( "ollama_auto_recovery_persist_failed", @@ -243,11 +252,22 @@ class OllamaAutoRecoveryService: ) async def _load_primary(self) -> str: - """從 Redis 載入 current_primary(找不到時預設 "ollama")""" + """從 Redis 載入 current_primary(找不到時預設 "ollama") + Phase A 讀取:優先讀新 key,不存在時 fallback 舊 key。 + """ try: from src.core.redis_client import get_redis redis = get_redis() val = await redis.get(self._REDIS_PRIMARY_KEY) + if not val: + # Phase A fallback:新 key 尚未有值(舊版 Pod 只寫了舊 key) + val = await redis.get(self._REDIS_PRIMARY_KEY_LEGACY) + if val: + logger.info( + "ollama_auto_recovery_loaded_from_legacy_key", + service="ollama_auto_recovery", + note="Phase A fallback,預計 2026-06-04 移除舊 key", + ) if val: decoded = val.decode() if isinstance(val, bytes) else val logger.info( diff --git a/apps/api/src/services/security_interceptor.py b/apps/api/src/services/security_interceptor.py index a5cd9ecd..65280a14 100644 --- a/apps/api/src/services/security_interceptor.py +++ b/apps/api/src/services/security_interceptor.py @@ -14,6 +14,8 @@ Features: - 過期的 Nonce 自動清除 """ +import hashlib +import hmac import time from dataclasses import dataclass from typing import Protocol, runtime_checkable @@ -134,6 +136,34 @@ class NonceStore: logger.debug("nonce_cleanup", removed_count=len(expired)) +async def check_webhook_nonce(nonce: str, ttl: int = 600) -> bool: + """ + Webhook replay 防護:用 Redis NX 記錄 nonce(TTL=600s),重複使用回傳 False。 + + Service 層 helper,供 Router 層(webhooks.py)呼叫;禁止 Router 直接用 get_redis。 + Redis 不可用時 fail open(回傳 True + 記錄 warning)。 + + P0-06 修正(ADR-116,2026-05-04 ogt + Claude Sonnet 4.6) + """ + from src.core.redis_client import get_redis + nonce_key = f"webhook:nonce:{nonce}" + try: + redis = get_redis() + stored = await redis.set(nonce_key, "1", nx=True, ex=ttl) + if not stored: + logger.warning("webhook_nonce_replay_detected", nonce_prefix=nonce[:16] + "...") + return False + logger.debug("webhook_nonce_registered", nonce_key=nonce_key) + return True + except Exception as exc: + logger.warning( + "webhook_nonce_redis_unavailable", + error=str(exc), + note="fail open: request allowed despite Redis unavailability", + ) + return True + + # ============================================================================= # Telegram Security Interceptor # ============================================================================= @@ -478,13 +508,30 @@ class TelegramSecurityInterceptor: # Not a valid UUID (e.g. legacy format) — use as-is, may exceed limit but won't crash short_id = approval_id - nonce = f"{action}:{short_id}:{timestamp}:{random_part}" + nonce_body = f"{action}:{short_id}:{timestamp}:{random_part}" + + # ADR-116 P0-05: 附加 HMAC-SHA256[:16] 防偽造 + # 2026-05-04 Claude Sonnet 4.6 (ADR-116): 若 CALLBACK_HMAC_SECRET 未設定則 warning + 降級 + if settings.CALLBACK_HMAC_SECRET: + hmac_hex = hmac.new( + settings.CALLBACK_HMAC_SECRET.encode(), + nonce_body.encode(), + hashlib.sha256, + ).hexdigest() + nonce = f"{nonce_body}:{hmac_hex[:16]}" + else: + logger.warning( + "callback_hmac_secret_missing", + note="CALLBACK_HMAC_SECRET not configured; nonce generated without HMAC (transition mode)", + ) + nonce = nonce_body logger.debug( "callback_nonce_generated", approval_id=approval_id, action=action, nonce_len=len(nonce.encode()), + hmac_appended=bool(settings.CALLBACK_HMAC_SECRET), ) return nonce @@ -517,7 +564,32 @@ class TelegramSecurityInterceptor: "is_info_action": True, } - if len(parts) != 4: + # ADR-116 P0-05: 支援 5-part 格式(含 HMAC) + # 2026-05-04 Claude Sonnet 4.6 (ADR-116): {action}:{short_id}:{ts}:{rand}:{hmac16} + if len(parts) == 5: + # 5-part:驗證 HMAC,然後還原成 4-part 格式繼續解析 + embedded_hmac = parts[4] + nonce_body = ":".join(parts[:4]) + if settings.CALLBACK_HMAC_SECRET: + expected_hmac = hmac.new( + settings.CALLBACK_HMAC_SECRET.encode(), + nonce_body.encode(), + hashlib.sha256, + ).hexdigest()[:16] + if not hmac.compare_digest(embedded_hmac, expected_hmac): + logger.warning( + "callback_nonce_hmac_mismatch", + nonce_prefix=callback_data[:20] + "...", + ) + raise ValueError(f"Callback nonce HMAC verification failed") + else: + logger.warning( + "callback_hmac_secret_missing", + note="CALLBACK_HMAC_SECRET not configured; skipping nonce HMAC verification (transition mode)", + ) + # 以 4-part nonce_body 繼續解析(以下邏輯共用) + parts = parts[:4] + elif len(parts) != 4: raise ValueError(f"Invalid callback_data format: {callback_data}") import base64, uuid as _uuid diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index e121ba02..e92672c8 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,24 @@ --- +## 2026-05-04 | AwoooP Phase 2 初批 P0 修正 + Phase 1.7 Tests(commit 14bf86a4) + +### 修正 +- **P0-08 telemetry.py**:`_validate_endpoint()` 移除硬碼 IP assert → `OTEL_ALLOWED_ENDPOINTS` / `OTEL_FORBIDDEN_ENDPOINTS` config-driven;EwoooC 可覆寫 +- **P0-13 mcp_bridge.py**:5 處 `"awoooi-prod"` hardcode → `settings.AWOOOI_K8S_NAMESPACE`;config.py 新增此欄位 +- **P1-24 decision_manager.py**:`f"telegram_silence:{target}"` → `SILENCE_KEY_PREFIX` 從 telegram_gateway import,消除雙重定義 +- **Phase 1 Task 1.7**:新增 `tests/integration/test_awooop_phase1_schema.py`(31 test cases:revision 不可變性 / VIEW draft 隔離 / active_pointer_guard / RLS fail-closed / outbox FK) + +### 下一步(Phase 2 剩餘 P0) +- P0-05: security_interceptor.py nonce 重設計(PoC 確認漏洞) +- P0-06: webhooks.py replay 防護 +- P0-11: ollama:current_primary 加 platform: 前綴(Redis 雙寫遷移 Batch A) +- P0-12: consensus_engine.py CONSENSUS_PREFIX 加 project 前綴 +- P1-16: nl_gateway.py hermes Redis key 加 project 前綴 +- P1-17: anomaly_counter.py per-project 改造 + +--- + ## 2026-05-04 | AwoooP Phase 1 Critic 修正(4 Critical) critic review 發現的 4 個 Critical + 3 個 Major 問題全部修正: