diff --git a/routes/admin_observability_routes.py b/routes/admin_observability_routes.py index b434d95..768a786 100644 --- a/routes/admin_observability_routes.py +++ b/routes/admin_observability_routes.py @@ -340,12 +340,78 @@ def quality_trend_dashboard(): key=lambda kv: kv[1].get('avg_score', 5), ) + # Phase 40 D-6: learning_episodes 各 status 分布(蒸餾池飽和度) + episode_distribution = {} + try: + session = get_session() + try: + rows = session.execute( + sa_text(""" + SELECT promotion_status, COUNT(*) AS cnt + FROM learning_episodes + WHERE created_at >= NOW() - INTERVAL ':days days' + GROUP BY promotion_status + """).bindparams(days=days), + ).fetchall() if False else session.execute( + sa_text(f""" + SELECT promotion_status, COUNT(*) AS cnt + FROM learning_episodes + WHERE created_at >= NOW() - INTERVAL '{int(days)} days' + GROUP BY promotion_status + """), + ).fetchall() + episode_distribution = {r[0]: int(r[1] or 0) for r in rows} + finally: + session.close() + except Exception: + pass + + # Phase 40 D-6: 對最差 3 名 caller 跑 RAG 找根因建議 + rag_root_causes = [] + try: + from services.rag_service import rag_service + worst_3 = sorted_trends[:3] if len(sorted_trends) >= 3 else sorted_trends + for caller, info in worst_3: + if info.get('avg_score', 5) < 3.0 and info.get('total_feedback', 0) >= 3: + try: + q = ( + f"caller {caller} 反饋分數低 平均 " + f"{info.get('avg_score', 0):.1f}/5 應採取什麼根因排查" + ) + rag_result = rag_service.query( + text=q, + caller='admin_quality_trend', + top_k=2, + threshold=0.6, + ) + if rag_result.hits: + rag_root_causes.append({ + 'caller': caller, + 'avg_score': info.get('avg_score', 0), + 'feedback_n': info.get('total_feedback', 0), + 'hits': [ + { + 'id': h.get('id'), + 'insight_type': h.get('insight_type'), + 'content': (h.get('content') or '')[:200], + 'similarity': round(float(h.get('similarity', 0)), 3), + } + for h in rag_result.hits[:2] + ], + }) + except Exception: + pass + except Exception: + pass + return render_template( 
# Upper bound (seconds) for each git invocation so that a hung git process
# cannot pin the request worker indefinitely.
_L2_GIT_TIMEOUT_SECONDS = 10


def _l2_json_body():
    """Return the request's JSON object, or ``{}`` when there is none.

    ``request.json`` raises on a wrong Content-Type or malformed body, which
    the route-level ``except Exception`` would report as a 500.  Parsing
    silently and rejecting non-object payloads lets each route answer with
    its own 400 validation message instead.
    """
    payload = request.get_json(silent=True)
    return payload if isinstance(payload, dict) else {}


def _l2_clean_str(value):
    """Return ``value`` stripped when it is a string, otherwise ``''``."""
    return value.strip() if isinstance(value, str) else ''


def _l2_error_response(exc):
    """Build the 500 JSON envelope shared by the L2 trigger routes."""
    return jsonify({
        'ok': False,
        'error': f'{type(exc).__name__}: {str(exc)[:200]}',
    }), 500


def _l2_heal_response(result, default_message):
    """Serialize a heal result object into the JSON envelope.

    The attributes are read defensively with ``getattr`` because the result
    type is declared outside this file.
    """
    return jsonify({
        'ok': bool(getattr(result, 'success', False)),
        'action': getattr(result, 'action', None),
        'message': getattr(result, 'message', '') or default_message,
    })


@admin_observability_bp.route('/ai_calls/trigger_code_review', methods=['POST'])
@login_required
def ai_calls_trigger_code_review():
    """Phase 40 D-7 (L2 automation): start a Code Review Pipeline run.

    Resolves the HEAD commit and the files it changed, then runs
    ``CodeReviewPipeline.run`` in a daemon thread so the request returns
    immediately.  Per the original author's note, the pipeline has five steps
    (read -> hermes_scan -> openclaw_summary -> ea_decision -> nemoton_act)
    and notifies via Telegram when done.

    Returns:
        JSON ``{'ok': True, 'pipeline_id', 'commit_sha', ...}`` on success,
        400 when the commit has no changed files, 500 on any other failure
        (git missing, git timeout, import error, ...).
    """
    try:
        import subprocess
        import threading
        from services.code_review_pipeline_service import CodeReviewPipeline

        # NOTE(review): git runs in the process working directory; this
        # presumes the app is started from inside the repository -- confirm.
        commit_sha = subprocess.check_output(
            ['git', 'rev-parse', 'HEAD'],
            stderr=subprocess.DEVNULL,
            timeout=_L2_GIT_TIMEOUT_SECONDS,
        ).decode().strip()
        # --root makes an initial (parentless) commit list its files instead
        # of producing empty output.
        diff_output = subprocess.check_output(
            ['git', 'diff-tree', '--no-commit-id', '--name-only', '-r',
             '--root', commit_sha],
            stderr=subprocess.DEVNULL,
            timeout=_L2_GIT_TIMEOUT_SECONDS,
        ).decode()
        changed = [name for name in diff_output.splitlines() if name]

        if not changed:
            return jsonify({'ok': False, 'error': '最新 commit 無變更檔案'}), 400

        pipeline = CodeReviewPipeline(
            commit_sha=commit_sha,
            changed_files=changed,
            branch='main',
            deploy_type='manual_observability',
        )
        # Daemon thread: the review must not block the response nor keep the
        # process alive on shutdown.
        threading.Thread(target=pipeline.run, daemon=True).start()
        return jsonify({
            'ok': True,
            'pipeline_id': pipeline.pipeline_id,
            'commit_sha': commit_sha[:8],
            'changed_files_count': len(changed),
            'message': f'已觸發 Code Review (pipeline_id={pipeline.pipeline_id}) 在背景執行,'
                       f'5 step 完成後會推 Telegram 通知。',
        })
    except Exception as e:
        return _l2_error_response(e)


@admin_observability_bp.route('/ppt_audit/trigger_aider_heal', methods=['POST'])
@login_required
def ppt_audit_trigger_aider_heal():
    """Phase 40 D-8 (L2 automation): dispatch AiderHeal for a failed PPT audit.

    Expects a JSON object with ``error_msg`` (required) and ``pptx_filename``
    (optional).  Builds a fix context targeting ``services/ppt_generator.py``
    and passes it to ``execute_code_fix``.  Per the original author's note
    (not verifiable from this function), a successful fix is pushed to main
    and triggers CD.

    Returns:
        JSON envelope with ``ok`` / ``action`` / ``message``; 400 when
        ``error_msg`` is missing or empty; 500 on unexpected failure.
    """
    try:
        from services.aider_heal_executor import execute_code_fix

        data = _l2_json_body()
        error_msg = _l2_clean_str(data.get('error_msg'))
        pptx_filename = _l2_clean_str(data.get('pptx_filename'))
        if not error_msg:
            return jsonify({'ok': False, 'error': '需提供 error_msg'}), 400

        # Context handed to AiderHeal; the message is capped at 500 chars.
        context = {
            'error_type': 'ppt_vision_audit_failure',
            'error_message': error_msg[:500],
            'target_file': 'services/ppt_generator.py',
            'pptx_filename': pptx_filename,
            'triggered_by': 'admin_observability',
        }
        result = execute_code_fix(context)
        return _l2_heal_response(result, '已派出 AiderHeal')
    except Exception as e:
        return _l2_error_response(e)


@admin_observability_bp.route('/host_health/trigger_autoheal', methods=['POST'])
@login_required
def host_health_trigger_autoheal():
    """Phase 40 D-9 (L2 automation): trigger AutoHeal for an unhealthy host.

    Expects a JSON object with ``host_label``.  The label is resolved through
    a fixed whitelist of configured Ollama hosts -- arbitrary URLs are never
    accepted (SSRF guard) -- and AutoHeal is only dispatched when the host is
    currently marked unhealthy.

    Returns:
        JSON envelope with ``ok`` / ``action`` / ``message``; 400 for an
        unknown label or a host not marked unhealthy; 500 on unexpected
        failure.
    """
    try:
        data = _l2_json_body()
        host_label = _l2_clean_str(data.get('host_label'))
        from services.auto_heal_service import auto_heal_service
        from services.ollama_service import (
            OLLAMA_HOST_FALLBACK,
            OLLAMA_HOST_PRIMARY,
            OLLAMA_HOST_SECONDARY,
            _is_unhealthy,
        )

        # Whitelist: UI label -> configured host URL.
        host_map = {
            'Primary (GCP)': OLLAMA_HOST_PRIMARY,
            'Secondary (GCP)': OLLAMA_HOST_SECONDARY,
            'Fallback (111)': OLLAMA_HOST_FALLBACK,
        }
        host_url = host_map.get(host_label)
        if not host_url:
            return jsonify({'ok': False, 'error': f'未知 host_label: {host_label}'}), 400

        if not _is_unhealthy(host_url):
            return jsonify({
                'ok': False,
                'error': f'{host_label} 目前未標記異常,無需 AutoHeal',
            }), 400

        result = auto_heal_service.handle_exception(
            error_type='ollama_unhealthy',
            context={
                'host_label': host_label,
                'host_url': host_url,
                'error_message': f'Ollama host {host_label} ({host_url}) marked unhealthy',
                'triggered_by': 'admin_observability',
            },
        )
        return _l2_heal_response(result, 'AutoHeal 已派遣')
    except Exception as e:
        return _l2_error_response(e)
{{ error }}
{% endif %} + +
+ + + 看到錯誤率飆高?一鍵觸發 Hermes→OpenClaw→EA→NemoTron 5 步審查最新 commit, + 在背景執行,完成後 Telegram 通知。 + +
+
@@ -172,7 +183,24 @@

- Operation Ollama-First v5.0 / Phase 29 — AI 呼叫總覽 + Operation Ollama-First v5.0 / Phase 40 — AI 呼叫總覽(含 RAG/MCP 編排矩陣)

+ + {% endblock %} diff --git a/templates/admin/host_health.html b/templates/admin/host_health.html index 7cb4852..1dd9645 100644 --- a/templates/admin/host_health.html +++ b/templates/admin/host_health.html @@ -14,7 +14,7 @@
- + {% for h in ollama_hosts %} @@ -41,6 +41,16 @@ {% endfor %} {% if not h.models %}無 / 未連線{% endif %} + {% endfor %} @@ -258,7 +268,29 @@ {% endif %}

- Operation Ollama-First v5.0 / Phase 38 — 主機健康監控(含 24h 歷史) + Operation Ollama-First v5.0 / Phase 40 — 主機健康監控(含 24h 歷史 / MCP / AIOps / AutoHeal L2)

+ + {% endblock %} diff --git a/templates/admin/ppt_audit_history.html b/templates/admin/ppt_audit_history.html index abb01aa..c6ae3d6 100644 --- a/templates/admin/ppt_audit_history.html +++ b/templates/admin/ppt_audit_history.html @@ -55,7 +55,7 @@ - + @@ -80,9 +80,17 @@ + {% else %} - + {% endfor %}
角色主機HTTP 健康異常標記已載入模型
角色主機HTTP 健康異常標記已載入模型動作
+ {% if h.unhealthy_mark or not h.healthy %} + + {% else %} + + {% endif %} +
審核時間檔名結果 問題數信心度耗時 ms錯誤訊息耗時 ms錯誤訊息動作
{{ "%.2f"|format(r.confidence) }} {{ r.duration_ms }} {{ (r.error_msg or '')[:80] }} + {% if r.audit_status in ('failed', 'error') %} + + {% endif %} +
尚無審核紀錄(migration 030 跑過後即會累積)
尚無審核紀錄(migration 030 跑過後即會累積)
@@ -96,7 +104,28 @@

- Operation Ollama-First v5.0 / Phase 29 — PPT 視覺審核歷史 + Operation Ollama-First v5.0 / Phase 40 — PPT 視覺審核歷史(含 AiderHeal L2)

+ + {% endblock %} diff --git a/templates/admin/quality_trend.html b/templates/admin/quality_trend.html index 5d33533..b85f092 100644 --- a/templates/admin/quality_trend.html +++ b/templates/admin/quality_trend.html @@ -23,6 +23,63 @@
+ {% if episode_distribution %} +
+
蒸餾池狀態(learning_episodes 過去 {{ days }} 日) + 資料來源:learning_episodes — 展現 RAG 學習鏈路飽和度 +
+
+
+ {% for status, cnt in episode_distribution.items() %} +
+
+ + {% if status == 'pending' %} 待處理 + {% elif status == 'awaiting_review' %} 待審核 + {% elif status == 'approved' %} 已晉升 + {% elif status == 'rejected_quality' %} 品質拒 + {% elif status == 'rejected_hallucination' %} 幻覺拒 + {% elif status == 'rejected_duplicate' %} 重複拒 + {% elif status == 'rejected_human' %} 人工拒 + {% elif status == 'expired' %} 已過期 + {% else %}{{ status }}{% endif %} + + {{ cnt }} +
+
+ {% endfor %} +
+
+
+ {% endif %} + + {% if rag_root_causes %} +
+
+ RAG 自動根因建議 + — 對最差 3 名 caller 自動從 ai_insights 召回相似案例 +
+
+ {% for rc in rag_root_causes %} +
+ {{ rc.caller }} + {{ "%.2f"|format(rc.avg_score) }}/5 + {{ rc.feedback_n }} 筆反饋 +
    + {% for h in rc.hits %} +
  • + {{ h.insight_type }} + 相似度 {{ "%.2f"|format(h.similarity) }} + {{ h.content }}{% if h.content|length >= 200 %}…{% endif %} +
  • + {% endfor %} +
+
+ {% endfor %} +
+
+ {% endif %} + {% if recommendations %}
智能建議