diff --git a/migrations/029_create_host_health_probes.sql b/migrations/029_create_host_health_probes.sql
new file mode 100644
index 0000000..5340644
--- /dev/null
+++ b/migrations/029_create_host_health_probes.sql
@@ -0,0 +1,48 @@
+-- =============================================================================
+-- Migration 029: host_health_probes — health history for the three Ollama hosts
+-- Operation Ollama-First v5.0 — Phase 38
+-- Date: 2026-05-04 (Taipei)
+-- Page: /observability/host_health
+-- =============================================================================
+-- Background:
+--   The host_health page used to HTTP-probe /api/tags on all three hosts on every refresh,
+--   with no history: no trends, and no way to check "was GCP down yesterday?".
+--   This migration adds a table with one row per probe, kept for 30 days (cron cleanup).
+--
+-- Write points:
+--   1. routes/admin_observability_routes.py::host_health_dashboard writes on every render
+--   2. scheduler.py adds a background probe every 5 minutes (recorded even when nobody opens the page)
+--
+-- Index design:
+--   - (probed_at DESC): latest-N queries
+--   - (host_label, probed_at DESC): per-host trend over the past 24h
+-- =============================================================================
+
+CREATE TABLE IF NOT EXISTS host_health_probes (
+    id              BIGSERIAL PRIMARY KEY,
+    probed_at       TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    host_label      VARCHAR(64) NOT NULL,            -- 'Primary (GCP)' / 'Secondary (GCP)' / 'Fallback (111)'
+    host_url        VARCHAR(256) NOT NULL,           -- e.g. http://34.143.170.20:11434
+    healthy         BOOLEAN NOT NULL,
+    unhealthy_mark  BOOLEAN NOT NULL DEFAULT FALSE,  -- value of _is_unhealthy(host)
+    models_count    INTEGER DEFAULT 0,               -- models listed by /api/tags (the dashboard writer caps the list at 15)
+    response_ms     INTEGER,                         -- HTTP probe duration (ms)
+    error_msg       TEXT,                            -- failure text; the dashboard writer stores "HTTP <status>" or the exception type + str(e)[:200]
+
+    CONSTRAINT chk_host_label_029
+        CHECK (host_label IN ('Primary (GCP)', 'Secondary (GCP)', 'Fallback (111)'))
+);
+
+CREATE INDEX IF NOT EXISTS idx_host_health_probes_at
+    ON host_health_probes (probed_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_host_health_probes_label_at
+    ON host_health_probes (host_label, probed_at DESC);
+
+-- Retention cleanup (keep 30 days) — run by the scheduler daily at 03:00:
+--   DELETE FROM host_health_probes WHERE probed_at < NOW() - INTERVAL '30 days';
+
+COMMENT ON TABLE host_health_probes IS
+    '三主機 Ollama 健康歷史;每次 host_health 頁面 render 或 scheduler 5min cron 寫入';
+COMMENT ON COLUMN host_health_probes.host_label IS
+    'services/ollama_service.py::get_host_label() 對應標籤';
diff --git a/migrations/030_create_ppt_audit_results.sql b/migrations/030_create_ppt_audit_results.sql
new file mode 100644
index 0000000..4ea907a
--- /dev/null
+++ b/migrations/030_create_ppt_audit_results.sql
@@ -0,0 +1,57 @@
+-- =============================================================================
+-- Migration 030: ppt_audit_results — persisted history of PPT visual audits
+-- Operation Ollama-First v5.0 — Phase 38
+-- Date: 2026-05-04 (Taipei)
+-- Page: /observability/ppt_audit_history
+-- =============================================================================
+-- Background:
+--   The ppt_audit_history page only listed files via os.listdir(reports/),
+--   so the conclusions produced by PPT_VISION minicpm-v (issues_found, confidence) were lost.
+--   This migration adds a table that every finished audit writes to, enabling:
+--     1. the observability page to show audit results, not just that a file exists
+--     2. trend analysis (PPT pass rate over the past 30 days? common issue types?)
+--     3. Telegram push de-duplication (same file + same issue is not re-pushed within 7 days)
+--
+-- Write points:
+--   1. services/ppt_vision_service.py::check_ppt_file writes after the minicpm-v run
+--   2. scheduler.py daily 22:00 cron batch-writes after auditing the day's PPTs
+--
+-- Index design:
+--   - (audited_at DESC): latest audits
+--   - (pptx_filename): repeated audits of the same file
+--   - (audited_at DESC) WHERE audit_status = 'failed': failed-only listing (partial index)
+-- =============================================================================
+
+CREATE TABLE IF NOT EXISTS ppt_audit_results (
+    id              BIGSERIAL PRIMARY KEY,
+    audited_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    pptx_filename   VARCHAR(256) NOT NULL,
+    pptx_size_kb    INTEGER,
+    pptx_mtime      TIMESTAMPTZ,                     -- the file's own mtime (tells apart regenerated files with the same name)
+    vision_enabled  BOOLEAN NOT NULL,                -- PPT_VISION_ENABLED state at audit time
+    audit_status    VARCHAR(32) NOT NULL,            -- 'passed' / 'failed' / 'skipped' / 'error' / 'pending'
+    issues_count    INTEGER DEFAULT 0,
+    issues_found    JSONB,                           -- per-slide issues array written by ppt_vision_service
+    confidence      NUMERIC(4,3),                    -- 0-1 minicpm-v confidence
+    duration_ms     INTEGER,                         -- audit duration
+    error_msg       TEXT,                            -- error text when the audit failed
+    reviewer_notes  TEXT,                            -- manual notes (can be added later via the admin UI)
+
+    CONSTRAINT chk_audit_status_030
+        CHECK (audit_status IN ('passed', 'failed', 'skipped', 'error', 'pending'))
+);
+
+CREATE INDEX IF NOT EXISTS idx_ppt_audit_at
+    ON ppt_audit_results (audited_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ppt_audit_filename
+    ON ppt_audit_results (pptx_filename);
+
+CREATE INDEX IF NOT EXISTS idx_ppt_audit_failed
+    ON ppt_audit_results (audited_at DESC)
+    WHERE audit_status = 'failed';
+
+COMMENT ON TABLE ppt_audit_results IS
+    'PPT 視覺審核結果歷史;by services/ppt_vision_service.py minicpm-v 推論';
+COMMENT ON COLUMN ppt_audit_results.issues_found IS
+    'JSONB 陣列:[{slide, issues}];slide 為 1 起算的投影片序號,issues 為該頁 check_image 回傳的 issues_found';
diff --git a/routes/admin_observability_routes.py b/routes/admin_observability_routes.py
index 07caf07..ead4ef6 100644
--- a/routes/admin_observability_routes.py
+++ b/routes/admin_observability_routes.py
@@ -371,11 +371,12 @@ def budget_update(budget_id: int):
 @admin_observability_bp.route('/ppt_audit_history')
 @login_required
 def ppt_audit_history():
-    """掃 reports/ 目錄列近 7 日 .pptx 檔 + 即時跑 audit(如已啟用)"""
+    """掃 reports/ 目錄列近 7 日 .pptx 檔 + 從 ppt_audit_results 表讀 
audit 歷史(Phase 38)""" import os import time reports_dir = 'reports' files = [] + audit_records = [] error = None try: @@ -405,6 +406,37 @@ def ppt_audit_history(): except Exception as e: error = f'{type(e).__name__}: {str(e)[:200]}' + # Phase 38:讀過去 7 日 audit 歷史 + try: + session = get_session() + try: + audit_rows = session.execute( + sa_text(""" + SELECT audited_at, pptx_filename, audit_status, + issues_count, confidence, duration_ms, error_msg + FROM ppt_audit_results + WHERE audited_at >= NOW() - INTERVAL '7 days' + ORDER BY audited_at DESC + LIMIT 100 + """), + ).fetchall() + audit_records = [ + { + 'audited_at': r[0].strftime('%Y-%m-%d %H:%M'), + 'pptx_filename': r[1], + 'audit_status': r[2], + 'issues_count': int(r[3] or 0), + 'confidence': float(r[4] or 0), + 'duration_ms': int(r[5] or 0), + 'error_msg': r[6], + } + for r in audit_rows + ] + finally: + session.close() + except Exception: + pass # 表可能尚未 migration,失敗安全 + # PPT vision 啟用狀態 try: from services.ppt_vision_service import is_ppt_vision_enabled @@ -416,6 +448,7 @@ def ppt_audit_history(): 'admin/ppt_audit_history.html', active_page='obs_ppt_audit', files=files, + audit_records=audit_records, vision_enabled=vision_enabled, error=error, ) @@ -428,8 +461,10 @@ def ppt_audit_history(): @admin_observability_bp.route('/host_health') @login_required def host_health_dashboard(): - """三主機 Ollama + 4 個 MCP server 即時健康""" + """三主機 Ollama + 4 個 MCP server 即時健康(同時寫入 host_health_probes 留歷史)""" + import time as _time ollama_hosts = [] + probe_records = [] # 收集本次 probe 結果以批次寫 DB try: from services.ollama_service import ( OLLAMA_HOST_PRIMARY, OLLAMA_HOST_SECONDARY, OLLAMA_HOST_FALLBACK, @@ -443,6 +478,8 @@ def host_health_dashboard(): ]: entry = {'label': label, 'host': host, 'healthy': False, 'unhealthy_mark': _is_unhealthy(host), 'models': []} + t0 = _time.monotonic() + err = None try: resp = _r.get(f"{host.rstrip('/')}/api/tags", timeout=3) if resp.status_code == 200: @@ -450,12 +487,44 @@ def 
host_health_dashboard(): entry['models'] = [ m.get('name', '') for m in resp.json().get('models', []) ][:15] - except Exception: - pass + else: + err = f"HTTP {resp.status_code}" + except Exception as e: + err = f"{type(e).__name__}: {str(e)[:200]}" + response_ms = int((_time.monotonic() - t0) * 1000) + probe_records.append({ + 'host_label': label, 'host_url': host, 'healthy': entry['healthy'], + 'unhealthy_mark': entry['unhealthy_mark'], + 'models_count': len(entry['models']), 'response_ms': response_ms, + 'error_msg': err, + }) ollama_hosts.append(entry) except Exception: pass + # Phase 38:寫入 host_health_probes 留歷史(失敗安全,不擋頁面渲染) + if probe_records: + try: + _session = get_session() + try: + for rec in probe_records: + _session.execute( + sa_text(""" + INSERT INTO host_health_probes + (host_label, host_url, healthy, unhealthy_mark, + models_count, response_ms, error_msg) + VALUES + (:host_label, :host_url, :healthy, :unhealthy_mark, + :models_count, :response_ms, :error_msg) + """), + rec, + ) + _session.commit() + finally: + _session.close() + except Exception: + pass # DB 寫入失敗不影響頁面顯示 + # MCP server 健康 mcp_status = {} try: @@ -472,10 +541,45 @@ def host_health_dashboard(): except Exception: pass + # Phase 38:讀過去 24h 三主機健康歷史(給趨勢卡片) + health_history = [] + try: + _session2 = get_session() + try: + history_rows = _session2.execute( + sa_text(""" + SELECT host_label, + COUNT(*) FILTER (WHERE healthy) AS up_count, + COUNT(*) FILTER (WHERE NOT healthy) AS down_count, + COALESCE(AVG(response_ms) FILTER (WHERE healthy), 0) AS avg_ms, + COUNT(*) AS total + FROM host_health_probes + WHERE probed_at >= NOW() - INTERVAL '24 hours' + GROUP BY host_label + ORDER BY host_label + """), + ).fetchall() + health_history = [ + { + 'host_label': r[0], + 'up_count': int(r[1] or 0), + 'down_count': int(r[2] or 0), + 'avg_ms': int(r[3] or 0), + 'total': int(r[4] or 0), + 'uptime_pct': (float(r[1] or 0) / float(r[4]) * 100) if r[4] else 0, + } + for r in history_rows + ] + finally: + 
_session2.close() + except Exception: + pass # 表可能尚未 migration,失敗安全 + return render_template( 'admin/host_health.html', active_page='obs_host_health', ollama_hosts=ollama_hosts, mcp_status=mcp_status, throttle_state=throttle_state, + health_history=health_history, ) diff --git a/services/code_review_pipeline_service.py b/services/code_review_pipeline_service.py index 3b57a28..5374c77 100644 --- a/services/code_review_pipeline_service.py +++ b/services/code_review_pipeline_service.py @@ -204,7 +204,7 @@ class CodeReviewPipeline: # ── Step 2:Hermes 掃描 ─────────────────────────────────────────────────── def _hermes_scan(self, files: Dict[str, str]) -> List[Dict]: - """直呼內網 Ollama(http://192.168.0.188:11434),免認證""" + """走 resolve_ollama_host() 三主機級聯:GCP-A → GCP-B → 111(ADR-027 Phase 2)""" try: import requests as _req diff --git a/services/ppt_vision_service.py b/services/ppt_vision_service.py index b822439..5a0819a 100644 --- a/services/ppt_vision_service.py +++ b/services/ppt_vision_service.py @@ -149,11 +149,15 @@ class PPTVisionService: return result # 2. 
對前 N 張跑 check_image
+        import time as _time
+        t0 = _time.monotonic()
+        confidences = []
         for idx, png in enumerate(png_files[:max_slides]):
             try:
                 vr = self.check_image(png)
                 if vr.success:
                     result['slides_checked'] += 1
+                    confidences.append(vr.confidence)
                     if vr.issues_found:
                         result['total_issues'] += len(vr.issues_found)
                         result['issues_by_slide'].append((idx + 1, vr.issues_found))
@@ -161,8 +165,106 @@ class PPTVisionService:
                 logger.warning(f"[PPTVision] slide {idx+1} check failed: {exc}")
 
         result['success'] = result['slides_checked'] > 0
+        duration_ms = int((_time.monotonic() - t0) * 1000)
+
+        # Phase 38:寫入 ppt_audit_results 留歷史(失敗安全)
+        try:
+            self._persist_audit_result(
+                pptx_path=pptx_path,
+                result=result,
+                avg_confidence=(sum(confidences) / len(confidences)) if confidences else 0.0,
+                duration_ms=duration_ms,
+            )
+        except Exception as e:
+            logger.warning(f"[PPTVision] persist audit result failed: {e}")
+
         return result
 
+    def _persist_audit_result(self, pptx_path: str, result: Dict[str, Any],
+                              avg_confidence: float, duration_ms: int) -> None:
+        """Phase 38: insert one row into ppt_audit_results for a finished audit.
+
+        Args:
+            pptx_path: Path of the audited .pptx; only its basename is stored.
+            result: Audit summary dict; the keys read here are 'error',
+                'total_issues', 'success' and 'issues_by_slide'.
+            avg_confidence: Confidence value to store (rounded to 3 decimals).
+            duration_ms: Audit duration in milliseconds, stored as given.
+
+        Raises:
+            Exception: Serialization and DB errors propagate; the caller wraps
+                this call in try/except and only logs a warning.
+        """
+        import json as _json
+        import os
+        import stat as _stat
+        from datetime import datetime as _dt, timezone as _tz
+        from sqlalchemy import text as _sa_text
+        from database.manager import get_session
+
+        # Infer audit_status from the result dict.
+        err = result.get('error')
+        if err:
+            if 'libreoffice not installed' in err or 'PPT_VISION_ENABLED' in err:
+                status = 'skipped'
+            else:
+                status = 'error'
+        elif result.get('total_issues', 0) > 0:
+            status = 'failed'
+        elif result.get('success'):
+            status = 'passed'
+        else:
+            status = 'error'
+
+        # Serialize the per-slide issues for the JSONB column.
+        issues_json = _json.dumps([
+            {'slide': slide_num, 'issues': issues}
+            for slide_num, issues in result.get('issues_by_slide', [])
+        ], ensure_ascii=False)
+
+        # Stat once; a missing, unreadable or non-regular file stores NULLs.
+        size_kb = None
+        mtime = None
+        try:
+            info = os.stat(pptx_path)
+        except OSError:
+            info = None
+        if info is not None and _stat.S_ISREG(info.st_mode):
+            # pptx_size_kb is INTEGER and pptx_mtime is TIMESTAMPTZ: send a whole
+            # number of KB and a timezone-aware datetime (a naive local datetime
+            # would be read in the DB session's time zone).
+            size_kb = round(info.st_size / 1024)
+            mtime = _dt.fromtimestamp(info.st_mtime, tz=_tz.utc)
+
+        session = get_session()
+        try:
+            session.execute(
+                _sa_text("""
+                    INSERT INTO ppt_audit_results
+                        (pptx_filename, pptx_size_kb, pptx_mtime, vision_enabled,
+                         audit_status, issues_count, issues_found, confidence,
+                         duration_ms, error_msg)
+                    VALUES
+                        (:fname, :sz, :mt, :ve, :st, :ic, CAST(:if AS JSONB),
+                         :cf, :du, :em)
+                """),
+                {
+                    'fname': os.path.basename(pptx_path),
+                    'sz': size_kb,
+                    'mt': mtime,
+                    've': True,  # original author's note: reaching here implies vision is enabled
+                    'st': status,
+                    'ic': result.get('total_issues', 0),
+                    'if': issues_json,
+                    'cf': round(avg_confidence, 3),
+                    'du': duration_ms,
+                    'em': err,
+                },
+            )
+            session.commit()
+        finally:
+            session.close()
+
     def check_image(self, image_path: str) -> VisionResult:
         """檢查單張 PPT 截圖。
 
diff --git a/templates/admin/host_health.html b/templates/admin/host_health.html
index e410355..0ed8c5c 100644
--- a/templates/admin/host_health.html
+++ b/templates/admin/host_health.html
@@ -112,8 +112,47 @@ + + {% if health_history %} +
+
過去 24 小時健康趨勢 + 資料來源:host_health_probes(每次刷新自動寫入) +
+
+ + + + + + + + + + + + + {% for h in health_history %} + + + + + + + + + {% endfor %} + +
角色總探針次數正常次數離線次數在線率平均回應 ms
{{ h.host_label }}{{ h.total }}{{ h.up_count }}{{ h.down_count }} + + {{ "%.1f"|format(h.uptime_pct) }}% + + {{ h.avg_ms }}
+
+
+ {% endif %} +

- Operation Ollama-First v5.0 / Phase 29 — 主機健康監控 + Operation Ollama-First v5.0 / Phase 38 — 主機健康監控(含 24h 歷史)

{% endblock %} diff --git a/templates/admin/ppt_audit_history.html b/templates/admin/ppt_audit_history.html index 09d1744..abb01aa 100644 --- a/templates/admin/ppt_audit_history.html +++ b/templates/admin/ppt_audit_history.html @@ -19,28 +19,75 @@ {% endif %} - - - - - - - - - {% for f in files %} - - - - - - - {% else %} - - {% endfor %} - -
檔名大小 (KB)修改時間動作
{{ f.name }}{{ f.size_kb }}{{ f.mtime }} - 由 audit cron 22:00 自動執行 -
過去 7 日無 PPT 生成
+
+
過去 7 日 PPT 檔案
+
+ + + + + + + + + {% for f in files %} + + + + + + + {% else %} + + {% endfor %} + +
檔名大小 (KB)修改時間動作
{{ f.name }}{{ f.size_kb }}{{ f.mtime }} + 由 audit cron 22:00 自動執行 +
過去 7 日無 PPT 生成
+
+
+ +
+
視覺審核歷史紀錄(最近 100 筆)
+
+ + + + + + + + + + {% for r in audit_records %} + + + + + + + + + + {% else %} + + {% endfor %} + +
審核時間檔名結果問題數信心度耗時 ms錯誤訊息
{{ r.audited_at }}{{ r.pptx_filename }} + {% if r.audit_status == 'passed' %} + 通過 + {% elif r.audit_status == 'failed' %} + 有問題 + {% elif r.audit_status == 'skipped' %} + 跳過 + {% elif r.audit_status == 'error' %} + 錯誤 + {% else %} + {{ r.audit_status }} + {% endif %} + {{ r.issues_count }}{{ "%.2f"|format(r.confidence) }}{{ r.duration_ms }}{{ (r.error_msg or '')[:80] }}
尚無審核紀錄(migration 030 跑過後即會累積)
+
+

審核結果:有問題才推 Telegram(避免靜默無問題洗版)。