From 3cb091f5987aaec971e57d1b29bdc67da52832b6 Mon Sep 17 00:00:00 2001 From: OoO Date: Wed, 13 May 2026 10:17:54 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A8=98=E9=8C=84=20Observability=20fail-safe?= =?UTF-8?q?=20=E5=8D=80=E5=A1=8A=E5=A4=B1=E6=95=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../claude_inventory_validation_20260513.md | 2 ++ routes/admin_observability_routes.py | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/memory/claude_inventory_validation_20260513.md b/docs/memory/claude_inventory_validation_20260513.md index 0743e4b..ff870fa 100644 --- a/docs/memory/claude_inventory_validation_20260513.md +++ b/docs/memory/claude_inventory_validation_20260513.md @@ -17,6 +17,7 @@ - Elephant Alpha short-circuit:`log_ai_call` 遙測失敗仍不阻擋省成本 return,但已改為 warning + stack。 - Claude cost throttle:成本節流檢查失敗仍維持 Claude 可用,但已改為 warning + stack,避免成本保護失效無跡可查。 - `ai_call_logger` caller registry:registry 匯入失敗仍不阻擋 LLM 遙測,但已改為 warning + stack。 +- Observability route:promotion review RAG 相似查詢、PPT audit history 缺表、host health probe 寫入、MCP 24h summary 缺表等 fail-safe 區塊已改成 debug/warning log,不再完全靜默。 ## 已驗證為已修或過期 @@ -55,3 +56,4 @@ - `f49413e` 記錄 EA short-circuit 遙測失敗 - `0a75d11` 記錄 Claude 成本節流檢查失敗 - `5625032` 記錄 AI caller registry 匯入失敗 +- `1757837` 記錄 Observability fail-safe 區塊失敗 diff --git a/routes/admin_observability_routes.py b/routes/admin_observability_routes.py index 39c7ad2..b1cfe77 100644 --- a/routes/admin_observability_routes.py +++ b/routes/admin_observability_routes.py @@ -17,6 +17,7 @@ Operation Ollama-First v5.0 / Phase 27 — Admin Observability Dashboard - 不暴露 secret / prompt 原文 """ +import logging from datetime import datetime, timedelta from flask import Blueprint, render_template, request, jsonify from sqlalchemy import text as sa_text @@ -24,6 +25,8 @@ from sqlalchemy import text as sa_text from auth import login_required, get_current_user from database.manager import get_session +logger = logging.getLogger(__name__) + admin_observability_bp = Blueprint( 'admin_observability', @@ -1289,9 +1292,13 @@ def promotion_review_list(): for h in rag_result.hits[:3] ] except Exception: - pass # 單筆 RAG 失敗不影響其餘 + logger.debug( + "Promotion review similar-insight lookup failed for episode_id=%s", + ep.get('id'), + exc_info=True, + ) except Exception: - pass # rag_service import 失敗(feature flag OFF)→ 略過 + logger.debug("Promotion review RAG service unavailable; skipping similar insights", exc_info=True) # Phase 47 K-4: 蒸餾池 status 分布(30d) ep_distribution = session.execute( @@ -2192,7 +2199,7 @@ def ppt_audit_history(): finally: session.close() except Exception: - pass # 表可能尚未 migration,失敗安全 + logger.debug("PPT audit history table unavailable; rendering empty audit history", exc_info=True) # PPT vision 啟用狀態 try: @@ -2374,7 +2381,7 @@ def host_health_dashboard(): finally: _session.close() except Exception: - pass # DB 寫入失敗不影響頁面顯示 + logger.warning("Failed to persist host health probe records", exc_info=True) # MCP server 健康 mcp_status = {} @@ -2520,7 +2527,7 @@ def host_health_dashboard(): finally: _session2.close() except Exception: - pass # 表可能尚未 migration,失敗安全 + logger.debug("MCP calls table unavailable; rendering empty MCP 24h summary", exc_info=True) # Phase 47 K-1: incidents + heal_logs 詳細列表 + playbooks 排行 + backup + embed queue recent_incidents = []