diff --git a/routes/admin_observability_routes.py b/routes/admin_observability_routes.py index b434d95..768a786 100644 --- a/routes/admin_observability_routes.py +++ b/routes/admin_observability_routes.py @@ -340,12 +340,78 @@ def quality_trend_dashboard(): key=lambda kv: kv[1].get('avg_score', 5), ) + # Phase 40 D-6: learning_episodes 各 status 分布(蒸餾池飽和度) + episode_distribution = {} + try: + session = get_session() + try: + rows = session.execute( + sa_text(""" + SELECT promotion_status, COUNT(*) AS cnt + FROM learning_episodes + WHERE created_at >= NOW() - INTERVAL ':days days' + GROUP BY promotion_status + """).bindparams(days=days), + ).fetchall() if False else session.execute( + sa_text(f""" + SELECT promotion_status, COUNT(*) AS cnt + FROM learning_episodes + WHERE created_at >= NOW() - INTERVAL '{int(days)} days' + GROUP BY promotion_status + """), + ).fetchall() + episode_distribution = {r[0]: int(r[1] or 0) for r in rows} + finally: + session.close() + except Exception: + pass + + # Phase 40 D-6: 對最差 3 名 caller 跑 RAG 找根因建議 + rag_root_causes = [] + try: + from services.rag_service import rag_service + worst_3 = sorted_trends[:3] if len(sorted_trends) >= 3 else sorted_trends + for caller, info in worst_3: + if info.get('avg_score', 5) < 3.0 and info.get('total_feedback', 0) >= 3: + try: + q = ( + f"caller {caller} 反饋分數低 平均 " + f"{info.get('avg_score', 0):.1f}/5 應採取什麼根因排查" + ) + rag_result = rag_service.query( + text=q, + caller='admin_quality_trend', + top_k=2, + threshold=0.6, + ) + if rag_result.hits: + rag_root_causes.append({ + 'caller': caller, + 'avg_score': info.get('avg_score', 0), + 'feedback_n': info.get('total_feedback', 0), + 'hits': [ + { + 'id': h.get('id'), + 'insight_type': h.get('insight_type'), + 'content': (h.get('content') or '')[:200], + 'similarity': round(float(h.get('similarity', 0)), 3), + } + for h in rag_result.hits[:2] + ], + }) + except Exception: + pass + except Exception: + pass + return render_template( 
def _json_body_as_dict():
    """Return the request's JSON body as a dict, or ``{}`` when unusable.

    ``silent=True`` makes a missing, malformed or wrongly-typed body come back
    as ``None`` instead of raising, so the handlers below can answer with
    their own 400 response rather than falling into the generic 500 branch.
    JSON payloads that are not objects (lists, strings, numbers) are treated
    as empty as well, because the handlers only ever read keys from them.
    """
    payload = request.get_json(silent=True)
    return payload if isinstance(payload, dict) else {}


def _stripped_str(value):
    """Return *value* with surrounding whitespace removed, or ``''``.

    Anything that is not a ``str`` (``None``, numbers, nested JSON) is
    treated as "not provided" so validation fails with a 400 instead of an
    ``AttributeError`` on ``.strip()``.
    """
    return value.strip() if isinstance(value, str) else ''


def _trigger_failure(exc, action):
    """Log *exc* and build the JSON 500 response shared by the trigger routes.

    Args:
        exc: The exception caught by the route handler.
        action: Short label of the trigger, used only in the log record.

    Returns:
        A ``(response, 500)`` tuple whose JSON body carries ``ok=False`` and
        the exception type plus the first 200 characters of its message.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    logging.getLogger(__name__).error('%s failed', action, exc_info=exc)
    return jsonify({'ok': False, 'error': f'{type(exc).__name__}: {str(exc)[:200]}'}), 500


@admin_observability_bp.route('/ai_calls/trigger_code_review', methods=['POST'])
@login_required
def ai_calls_trigger_code_review():
    """Phase 40 D-7 (L2 automation): trigger the Code Review Pipeline.

    Purpose: when an admin sees a caller's error rate spike on the
    observability console, one click starts the 5-step pipeline
    (read→hermes_scan→openclaw_summary→ea_decision→nemoton_act) on a daemon
    thread to review the files changed by the latest commit and look for a
    possible regression.

    Returns:
        JSON with ``ok``, ``pipeline_id``, the first 8 characters of the
        commit SHA, ``changed_files_count`` and a user-facing ``message``.
        Responds 400 when the latest commit lists no changed files and 500
        (with the exception summary) on any other failure.
    """
    try:
        import subprocess
        import threading
        from services.code_review_pipeline_service import CodeReviewPipeline

        # Bound every git call so a hung git process cannot pin the worker.
        git_timeout = 10

        def run_git(*args):
            """Run ``git <args>`` and return its stripped, decoded stdout."""
            return subprocess.check_output(
                ['git', *args],
                stderr=subprocess.DEVNULL,
                timeout=git_timeout,
            ).decode().strip()

        # Latest commit and the files it touched.
        commit_sha = run_git('rev-parse', 'HEAD')
        # NOTE(review): diff-tree presumably prints nothing for a merge commit
        # unless -m/-c is passed, so a merge HEAD lands in the 400 branch
        # below — confirm that is the intended behaviour.
        diff_output = run_git(
            'diff-tree', '--no-commit-id', '--name-only', '-r', commit_sha,
        )
        changed = [path for path in diff_output.splitlines() if path]

        if not changed:
            return jsonify({'ok': False, 'error': '最新 commit 無變更檔案'}), 400

        pipeline = CodeReviewPipeline(
            commit_sha=commit_sha,
            changed_files=changed,
            branch='main',
            deploy_type='manual_observability',
        )
        # Fire-and-forget: the response does not wait for the pipeline.
        threading.Thread(target=pipeline.run, daemon=True).start()
        return jsonify({
            'ok': True,
            'pipeline_id': pipeline.pipeline_id,
            'commit_sha': commit_sha[:8],
            'changed_files_count': len(changed),
            'message': f'已觸發 Code Review (pipeline_id={pipeline.pipeline_id}) 在背景執行,'
                       f'5 step 完成後會推 Telegram 通知。',
        })
    except Exception as e:
        return _trigger_failure(e, 'trigger_code_review')


@admin_observability_bp.route('/ppt_audit/trigger_aider_heal', methods=['POST'])
@login_required
def ppt_audit_trigger_aider_heal():
    """Phase 40 D-8 (L2 automation): trigger AiderHeal for a failed PPT audit.

    Purpose: when an admin sees the PPT vision audit failing repeatedly, one
    click asks AiderHeal to repair ``services/ppt_generator.py``. According
    to the original note the result is pushed to main, which triggers the CD
    deployment.

    Request JSON:
        error_msg: Required, non-empty string; truncated to 500 characters
            before it is handed to AiderHeal.
        pptx_filename: Optional string forwarded in the context.

    Returns:
        JSON with ``ok``, ``action`` and ``message`` read from the executor's
        result. Responds 400 when ``error_msg`` is missing or not a string,
        and 500 (with the exception summary) on any other failure.
    """
    try:
        from services.aider_heal_executor import execute_code_fix

        data = _json_body_as_dict()
        error_msg = _stripped_str(data.get('error_msg'))
        pptx_filename = _stripped_str(data.get('pptx_filename'))
        if not error_msg:
            return jsonify({'ok': False, 'error': '需提供 error_msg'}), 400

        # Context handed to AiderHeal.
        context = {
            'error_type': 'ppt_vision_audit_failure',
            'error_message': error_msg[:500],
            'target_file': 'services/ppt_generator.py',
            'pptx_filename': pptx_filename,
            'triggered_by': 'admin_observability',
        }
        result = execute_code_fix(context)
        return jsonify({
            'ok': bool(getattr(result, 'success', False)),
            'action': getattr(result, 'action', None),
            'message': getattr(result, 'message', '') or '已派出 AiderHeal',
        })
    except Exception as e:
        return _trigger_failure(e, 'trigger_aider_heal')


@admin_observability_bp.route('/host_health/trigger_autoheal', methods=['POST'])
@login_required
def host_health_trigger_autoheal():
    """Phase 40 D-9 (L2 automation): trigger the AutoHeal playbook for a host.

    Purpose: when an admin sees an Ollama host flagged unhealthy, one click
    dispatches AutoHeal (ADR-013) to run the matching playbook
    (DOCKER_RESTART / SSH_CMD / ALERT_ONLY).

    Safety: only hosts currently flagged unhealthy can be targeted, and the
    host is chosen by label from a fixed whitelist — arbitrary host URLs are
    never accepted (SSRF protection).

    Request JSON:
        host_label: One of the whitelist labels defined below.

    Returns:
        JSON with ``ok``, ``action`` and ``message`` read from the AutoHeal
        result. Responds 400 for an unknown label or a host that is not
        flagged unhealthy, and 500 (with the exception summary) otherwise.
    """
    try:
        from services.auto_heal_service import auto_heal_service
        from services.ollama_service import (
            OLLAMA_HOST_FALLBACK,
            OLLAMA_HOST_PRIMARY,
            OLLAMA_HOST_SECONDARY,
            _is_unhealthy,
        )

        host_label = _stripped_str(_json_body_as_dict().get('host_label'))

        # Whitelist: label shown in the console -> configured host URL.
        host_map = {
            'Primary (GCP)': OLLAMA_HOST_PRIMARY,
            'Secondary (GCP)': OLLAMA_HOST_SECONDARY,
            'Fallback (111)': OLLAMA_HOST_FALLBACK,
        }
        host_url = host_map.get(host_label)
        if not host_url:
            return jsonify({'ok': False, 'error': f'未知 host_label: {host_label}'}), 400

        if not _is_unhealthy(host_url):
            return jsonify({
                'ok': False,
                'error': f'{host_label} 目前未標記異常,無需 AutoHeal',
            }), 400

        result = auto_heal_service.handle_exception(
            error_type='ollama_unhealthy',
            context={
                'host_label': host_label,
                'host_url': host_url,
                'error_message': f'Ollama host {host_label} ({host_url}) marked unhealthy',
                'triggered_by': 'admin_observability',
            },
        )
        return jsonify({
            'ok': bool(getattr(result, 'success', False)),
            'action': getattr(result, 'action', None),
            'message': getattr(result, 'message', '') or 'AutoHeal 已派遣',
        })
    except Exception as e:
        return _trigger_failure(e, 'trigger_autoheal')
{{ rc.caller }}
+ {{ "%.2f"|format(rc.avg_score) }}/5
+ {{ rc.feedback_n }} 筆反饋
+