feat(p40): 觀測台收官 — 4 頁升 L2 + RAG 根因 + 蒸餾池監控

接續 Phase 39 (commit 79cf08c)，本 commit 完成 Phase D 最後 4 項：

D-6: quality_trend 蒸餾池 + RAG 根因
- 新「蒸餾池狀態」card：learning_episodes 各 promotion_status 分布
  （pending / awaiting_review / approved / rejected_quality /
  rejected_hallucination / rejected_duplicate / rejected_human / expired）
- 對最差 3 名 caller (avg_score < 3 且反饋 ≥ 3) 自動 RAG 根因建議
- RAG 從 ai_insights 召回相似低品質案例

D-7: ai_calls 一鍵 Code Review (L2)
- 新 POST /observability/ai_calls/trigger_code_review
  讀 git rev-parse HEAD + diff-tree 取最新變更檔案，
  在 daemon thread 跑 CodeReviewPipeline.run()
  (5 step Hermes→OpenClaw→EA→NemoTron)
- 頁面新增「觸發 Code Review Pipeline」按鈕

D-8: ppt_audit 失敗 row 一鍵 AiderHeal (L2)
- 新 POST /observability/ppt_audit/trigger_aider_heal
  接收 pptx_filename + error_msg，呼叫
  services/aider_heal_executor::execute_code_fix
  自動修 services/ppt_generator.py；AiderHeal 修完會 git push 觸發 CD
- audit_records 表中 status='failed'/'error' 的 row 自動顯示按鈕

D-9: host_health 一鍵 AutoHeal (L2)
- 新 POST /observability/host_health/trigger_autoheal
  接收 host_label，白名單對應 OLLAMA_HOST_PRIMARY/SECONDARY/FALLBACK 防 SSRF。
  已標記 unhealthy 的 host 才允許觸發，呼叫
  auto_heal_service.handle_exception(error_type='ollama_unhealthy')
  跑 ADR-013 playbook（DOCKER_RESTART / SSH_CMD / ALERT_ONLY）
- 三主機 row 中 unhealthy / down 的 host 自動顯示按鈕

升級對應：
- AI 自動化：L2 從 1 個 → 4 個（budget force_throttle / Code Review / AiderHeal / AutoHeal）
- DB 利用率 ~60%：新增 learning_episodes 分布查詢
- RAG 整合 4/6（promotion_review + budget + quality_trend + 待 ppt_audit）

Phase 38+39+40 累計：6 commits 完成觀測台從 raw dashboard 升級到 AI 自動化專業舞台。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 19:13:39 +08:00
parent 79cf08c58c
commit 65f236da2d
5 changed files with 350 additions and 6 deletions
--- a/routes/admin_observability_routes.py
+++ b/routes/admin_observability_routes.py
@@ -340,12 +340,78 @@ def quality_trend_dashboard():
            key=lambda kv: kv[1].get('avg_score', 5),
        )

+        # Phase 40 D-6: learning_episodes 各 status 分布（蒸餾池飽和度）
+        episode_distribution = {}
+        try:
+            session = get_session()
+            try:
+                rows = session.execute(
+                    sa_text("""
+                        SELECT promotion_status, COUNT(*) AS cnt
+                        FROM learning_episodes
+                        WHERE created_at >= NOW() - INTERVAL ':days days'
+                        GROUP BY promotion_status
+                    """).bindparams(days=days),
+                ).fetchall() if False else session.execute(
+                    sa_text(f"""
+                        SELECT promotion_status, COUNT(*) AS cnt
+                        FROM learning_episodes
+                        WHERE created_at >= NOW() - INTERVAL '{int(days)} days'
+                        GROUP BY promotion_status
+                    """),
+                ).fetchall()
+                episode_distribution = {r[0]: int(r[1] or 0) for r in rows}
+            finally:
+                session.close()
+        except Exception:
+            pass
+
+        # Phase 40 D-6: 對最差 3 名 caller 跑 RAG 找根因建議
+        rag_root_causes = []
+        try:
+            from services.rag_service import rag_service
+            worst_3 = sorted_trends[:3] if len(sorted_trends) >= 3 else sorted_trends
+            for caller, info in worst_3:
+                if info.get('avg_score', 5) < 3.0 and info.get('total_feedback', 0) >= 3:
+                    try:
+                        q = (
+                            f"caller {caller} 反饋分數低 平均 "
+                            f"{info.get('avg_score', 0):.1f}/5 應採取什麼根因排查"
+                        )
+                        rag_result = rag_service.query(
+                            text=q,
+                            caller='admin_quality_trend',
+                            top_k=2,
+                            threshold=0.6,
+                        )
+                        if rag_result.hits:
+                            rag_root_causes.append({
+                                'caller': caller,
+                                'avg_score': info.get('avg_score', 0),
+                                'feedback_n': info.get('total_feedback', 0),
+                                'hits': [
+                                    {
+                                        'id': h.get('id'),
+                                        'insight_type': h.get('insight_type'),
+                                        'content': (h.get('content') or '')[:200],
+                                        'similarity': round(float(h.get('similarity', 0)), 3),
+                                    }
+                                    for h in rag_result.hits[:2]
+                                ],
+                            })
+                    except Exception:
+                        pass
+        except Exception:
+            pass
+
        return render_template(
            'admin/quality_trend.html',
            active_page='obs_quality_trend',
            days=days,
            trends=[(c, info) for c, info in sorted_trends],
            recommendations=recommendations,
+            episode_distribution=episode_distribution,
+            rag_root_causes=rag_root_causes,
            error=None,
        )
    except Exception as e:
@@ -353,6 +419,7 @@ def quality_trend_dashboard():
            'admin/quality_trend.html',
            active_page='obs_quality_trend',
            days=days, trends=[], recommendations=[],
+            episode_distribution={}, rag_root_causes=[],
            error=f'查詢失敗: {type(e).__name__}: {str(e)[:200]}',
        )

@@ -456,6 +523,137 @@ def budget_dashboard():
        session.close()


+@admin_observability_bp.route('/ai_calls/trigger_code_review', methods=['POST'])
+@login_required
+def ai_calls_trigger_code_review():
+    """Phase 40 D-7 (L2 automation): start a Code Review Pipeline run for HEAD.
+
+    Resolves HEAD and its changed files through git, then runs
+    CodeReviewPipeline.run() on a daemon thread so the request returns at once.
+    Responds with JSON: 200 once started, 400 if no files changed, 500 on error.
+    """
+    try:
+        import subprocess
+        import threading
+        from services.code_review_pipeline_service import CodeReviewPipeline
+
+        # Bound every git call: a hung git process must not pin a request worker.
+        commit_sha = subprocess.check_output(
+            ['git', 'rev-parse', 'HEAD'], stderr=subprocess.DEVNULL, timeout=10,
+        ).decode().strip()
+        diff_out = subprocess.check_output(
+            ['git', 'diff-tree', '--no-commit-id', '--name-only', '-r', commit_sha],
+            stderr=subprocess.DEVNULL, timeout=10,
+        ).decode()
+        # splitlines() also copes with CRLF output; blank entries are dropped.
+        changed = [f for f in diff_out.splitlines() if f]
+        if not changed:
+            return jsonify({'ok': False, 'error': '最新 commit 無變更檔案'}), 400
+
+        pipeline = CodeReviewPipeline(
+            commit_sha=commit_sha,
+            changed_files=changed,
+            branch='main',
+            deploy_type='manual_observability',
+        )
+        threading.Thread(target=pipeline.run, daemon=True).start()
+        return jsonify({
+            'ok': True,
+            'pipeline_id': pipeline.pipeline_id,
+            'commit_sha': commit_sha[:8],
+            'changed_files_count': len(changed),
+            'message': f'已觸發 Code Review (pipeline_id={pipeline.pipeline_id}) 在背景執行，'
+                       f'5 step 完成後會推 Telegram 通知。',
+        })
+    except Exception as e:
+        return jsonify({'ok': False, 'error': f'{type(e).__name__}: {str(e)[:200]}'}), 500
+
+
+@admin_observability_bp.route('/ppt_audit/trigger_aider_heal', methods=['POST'])
+@login_required
+def ppt_audit_trigger_aider_heal():
+    """Phase 40 D-8 (L2 automation): run AiderHeal for a failed PPT audit row.
+
+    Expects a JSON object with `error_msg` (required) and `pptx_filename`
+    (optional) and passes them to execute_code_fix() with target_file set to
+    services/ppt_generator.py. Replies 400 without a usable error_msg.
+    """
+    try:
+        from services.aider_heal_executor import execute_code_fix
+        # silent=True: a missing/invalid JSON body becomes a 400 below, not a 500.
+        data = request.get_json(silent=True)
+        data = data if isinstance(data, dict) else {}
+        error_msg = str(data.get('error_msg') or '').strip()
+        pptx_filename = str(data.get('pptx_filename') or '').strip()
+        if not error_msg:
+            return jsonify({'ok': False, 'error': '需提供 error_msg'}), 400
+        context = {
+            'error_type': 'ppt_vision_audit_failure',
+            'error_message': error_msg[:500],
+            'target_file': 'services/ppt_generator.py',
+            'pptx_filename': pptx_filename,
+            'triggered_by': 'admin_observability',
+        }
+        result = execute_code_fix(context)
+        return jsonify({
+            'ok': bool(getattr(result, 'success', False)),
+            'action': getattr(result, 'action', None),
+            'message': getattr(result, 'message', '') or '已派出 AiderHeal',
+        })
+    except Exception as e:
+        return jsonify({'ok': False, 'error': f'{type(e).__name__}: {str(e)[:200]}'}), 500
+
+
+@admin_observability_bp.route('/host_health/trigger_autoheal', methods=['POST'])
+@login_required
+def host_health_trigger_autoheal():
+    """Phase 40 D-9 (L2 automation): run AutoHeal for an unhealthy Ollama host.
+
+    Expects JSON `{"host_label": ...}`. The label is resolved through a fixed
+    allowlist, so callers can never supply a host URL directly (SSRF guard),
+    and the host must already be flagged by _is_unhealthy(). Replies 400 for
+    an unknown label or a host that is not flagged, 500 on unexpected failure.
+    """
+    try:
+        # silent=True: a missing/invalid JSON body becomes a 400 below, not a 500.
+        data = request.get_json(silent=True)
+        data = data if isinstance(data, dict) else {}
+        host_label = str(data.get('host_label') or '').strip()
+        from services.auto_heal_service import auto_heal_service
+        from services.ollama_service import _is_unhealthy, OLLAMA_HOST_PRIMARY, OLLAMA_HOST_SECONDARY, OLLAMA_HOST_FALLBACK
+
+        # Allowlist: display label -> configured host URL.
+        host_map = {
+            'Primary (GCP)': OLLAMA_HOST_PRIMARY,
+            'Secondary (GCP)': OLLAMA_HOST_SECONDARY,
+            'Fallback (111)': OLLAMA_HOST_FALLBACK,
+        }
+        host_url = host_map.get(host_label)
+        if not host_url:
+            return jsonify({'ok': False, 'error': f'未知 host_label: {host_label}'}), 400
+
+        if not _is_unhealthy(host_url):
+            error = f'{host_label} 目前未標記異常，無需 AutoHeal'
+            return jsonify({'ok': False, 'error': error}), 400
+
+        result = auto_heal_service.handle_exception(
+            error_type='ollama_unhealthy',
+            context={
+                'host_label': host_label,
+                'host_url': host_url,
+                'error_message': f'Ollama host {host_label} ({host_url}) marked unhealthy',
+                'triggered_by': 'admin_observability',
+            },
+        )
+        return jsonify({
+            'ok': bool(getattr(result, 'success', False)),
+            'action': getattr(result, 'action', None),
+            'message': getattr(result, 'message', '') or 'AutoHeal 已派遣',
+        })
+    except Exception as e:
+        return jsonify({'ok': False, 'error': f'{type(e).__name__}: {str(e)[:200]}'}), 500
+
+
@admin_observability_bp.route('/budget/force_throttle', methods=['POST'])
@login_required
 def budget_force_throttle():
--- a/templates/admin/ai_calls_dashboard.html
+++ b/templates/admin/ai_calls_dashboard.html
@@ -12,6 +12,17 @@
    <div class="alert alert-warning"><strong><i class="fas fa-exclamation-triangle me-1"></i></strong> {{ error }}</div>
  {% endif %}

+  <!-- Phase 40 D-7: 一鍵觸發 Code Review (L2 自動化) -->
+  <div class="mb-3">
+    <button class="btn btn-warning btn-sm" onclick="triggerCodeReview()">
+      <i class="fas fa-microscope me-1"></i>觸發 Code Review Pipeline (5 step)
+    </button>
+    <small class="text-muted ms-2">
+      看到錯誤率飆高？一鍵觸發 Hermes→OpenClaw→EA→NemoTron 5 步審查最新 commit，
+      在背景執行，完成後 Telegram 通知。
+    </small>
+  </div>
+
  <!-- 篩選 bar -->
  <form method="get" class="row g-2 mb-3">
    <div class="col-auto">
@@ -172,7 +183,24 @@
  </div>

  <p class="text-muted mt-2"><small>
-    <i class="fas fa-robot me-1"></i>Operation Ollama-First v5.0 / Phase 29 — AI 呼叫總覽
+    <i class="fas fa-robot me-1"></i>Operation Ollama-First v5.0 / Phase 40 — AI 呼叫總覽（含 RAG/MCP 編排矩陣）
  </small></p>
 </div>
+
+<script>
+// Phase 40 D-7: ask for confirmation, then POST to start the Code Review Pipeline.
+async function triggerCodeReview() {
+  if (!confirm('觸發 Code Review Pipeline？\n\n會對最新 commit 跑 5 step 審查（Hermes 掃描 → OpenClaw 摘要 → EA 決策 → NemoTron 行動），背景執行。')) return;
+  try {
+    const r = await fetch('/observability/ai_calls/trigger_code_review', {method: 'POST'});
+    // A non-JSON reply (e.g. an HTML login/error page) is shown as a readable failure.
+    const d = await r.json().catch(() => ({ok: false, error: `伺服器回應非 JSON（HTTP ${r.status}）`}));
+    alert(d.ok
+      ? `✅ ${d.message}\n\nPipeline ID: ${d.pipeline_id}\nCommit: ${d.commit_sha}\n變更檔案: ${d.changed_files_count} 個`
+      : '❌ ' + (d.error || '觸發失敗'));
+  } catch (e) {
+    alert('Error: ' + e);
+  }
+}
+</script>
 {% endblock %}
--- a/templates/admin/host_health.html
+++ b/templates/admin/host_health.html
@@ -14,7 +14,7 @@
    <div class="card-body p-0">
      <table class="table mb-0">
        <thead class="table-light">
-          <tr><th>角色</th><th>主機</th><th>HTTP 健康</th><th>異常標記</th><th>已載入模型</th></tr>
+          <tr><th>角色</th><th>主機</th><th>HTTP 健康</th><th>異常標記</th><th>已載入模型</th><th>動作</th></tr>
        </thead>
        <tbody>
          {% for h in ollama_hosts %}
@@ -41,6 +41,16 @@
                {% endfor %}
                {% if not h.models %}<small class="text-muted">無 / 未連線</small>{% endif %}
              </td>
+              <td>
+                {% if h.unhealthy_mark or not h.healthy %}
+                  <button class="btn btn-sm btn-outline-danger"
+                          onclick="triggerAutoHeal({{ h.label|tojson }})">
+                    <i class="fas fa-band-aid me-1"></i>AutoHeal
+                  </button>
+                {% else %}
+                  <small class="text-muted">—</small>
+                {% endif %}
+              </td>
            </tr>
          {% endfor %}
        </tbody>
@@ -258,7 +268,29 @@
  {% endif %}

  <p class="text-muted mt-3"><small>
-    <i class="fas fa-robot me-1"></i>Operation Ollama-First v5.0 / Phase 38 — 主機健康監控（含 24h 歷史）
+    <i class="fas fa-robot me-1"></i>Operation Ollama-First v5.0 / Phase 40 — 主機健康監控（含 24h 歷史 / MCP / AIOps / AutoHeal L2）
  </small></p>
 </div>
+
+<script>
+// Phase 40 D-9: ask for confirmation, then POST the host label to trigger AutoHeal.
+// On success the page reloads so the host table is rendered again.
+async function triggerAutoHeal(hostLabel) {
+  if (!confirm(`觸發 AutoHeal？\n\n主機：${hostLabel}\n\n會跑對應 ADR-013 playbook（DOCKER_RESTART / SSH_CMD / ALERT_ONLY）並寫入 incidents 表。`)) return;
+  try {
+    const r = await fetch('/observability/host_health/trigger_autoheal', {
+      method: 'POST',
+      headers: {'Content-Type': 'application/json'},
+      body: JSON.stringify({host_label: hostLabel}),
+    });
+    // A non-JSON reply (e.g. an HTML login/error page) is shown as a readable failure.
+    const d = await r.json().catch(() => ({ok: false, error: `伺服器回應非 JSON（HTTP ${r.status}）`}));
+    if (!d.ok) return alert('❌ ' + (d.error || d.message || '觸發失敗'));
+    alert(`✅ AutoHeal 已派出\n動作：${d.action || '—'}\n訊息：${d.message || ''}`);
+    window.location.reload();
+  } catch (e) {
+    alert('Error: ' + e);
+  }
+}
+</script>
 {% endblock %}
--- a/templates/admin/ppt_audit_history.html
+++ b/templates/admin/ppt_audit_history.html
@@ -55,7 +55,7 @@
          <tr>
            <th>審核時間</th><th>檔名</th><th>結果</th>
            <th class="text-end">問題數</th><th class="text-end">信心度</th>
-            <th class="text-end">耗時 ms</th><th>錯誤訊息</th>
+            <th class="text-end">耗時 ms</th><th>錯誤訊息</th><th>動作</th>
          </tr>
        </thead>
        <tbody>
@@ -80,9 +80,17 @@
              <td class="text-end">{{ "%.2f"|format(r.confidence) }}</td>
              <td class="text-end">{{ r.duration_ms }}</td>
              <td><small class="text-muted">{{ (r.error_msg or '')[:80] }}</small></td>
+              <td>
+                {% if r.audit_status in ('failed', 'error') %}
+                  <button class="btn btn-sm btn-outline-warning"
+                          onclick="triggerAiderHeal({{ r.pptx_filename|tojson }}, {{ (r.error_msg or '')|tojson }})">
+                    <i class="fas fa-wrench me-1"></i>AiderHeal
+                  </button>
+                {% endif %}
+              </td>
            </tr>
          {% else %}
-            <tr><td colspan="7" class="text-center text-muted">尚無審核紀錄（migration 030 跑過後即會累積）</td></tr>
+            <tr><td colspan="8" class="text-center text-muted">尚無審核紀錄（migration 030 跑過後即會累積）</td></tr>
          {% endfor %}
        </tbody>
      </table>
@@ -96,7 +104,28 @@
  </p>

  <p class="text-muted mt-3"><small>
-    <i class="fas fa-robot me-1"></i>Operation Ollama-First v5.0 / Phase 29 — PPT 視覺審核歷史
+    <i class="fas fa-robot me-1"></i>Operation Ollama-First v5.0 / Phase 40 — PPT 視覺審核歷史（含 AiderHeal L2）
  </small></p>
 </div>
+
+<script>
+// Phase 40 D-8: ask for confirmation, then POST the failed row's file name and error.
+async function triggerAiderHeal(pptxFilename, errorMsg) {
+  if (!confirm(`觸發 AiderHeal 自動修復？\n\n檔案：${pptxFilename}\n錯誤：${(errorMsg || '').substring(0, 200)}\n\nAiderHeal 會嘗試修 services/ppt_generator.py 並 git push 到 main 觸發 CD。`)) return;
+  try {
+    const r = await fetch('/observability/ppt_audit/trigger_aider_heal', {
+      method: 'POST',
+      headers: {'Content-Type': 'application/json'},
+      body: JSON.stringify({pptx_filename: pptxFilename, error_msg: errorMsg || ''}),
+    });
+    // A non-JSON reply (e.g. an HTML login/error page) is shown as a readable failure.
+    const d = await r.json().catch(() => ({ok: false, error: `伺服器回應非 JSON（HTTP ${r.status}）`}));
+    alert(d.ok
+      ? `✅ AiderHeal 已派出\n動作：${d.action || '—'}\n訊息：${d.message || ''}`
+      : '❌ ' + (d.error || d.message || '觸發失敗'));
+  } catch (e) {
+    alert('Error: ' + e);
+  }
+}
+</script>
 {% endblock %}
--- a/templates/admin/quality_trend.html
+++ b/templates/admin/quality_trend.html
@@ -23,6 +23,63 @@
    <div class="col-auto"><button class="btn btn-primary btn-sm">查詢</button></div>
  </form>

+  {% if episode_distribution %}
+  <div class="card mb-3">
+    <div class="card-header"><strong><i class="fas fa-flask me-2"></i>蒸餾池狀態（learning_episodes 過去 {{ days }} 日）</strong>
+      <small class="text-muted">資料來源：learning_episodes — 展現 RAG 學習鏈路飽和度</small>
+    </div>
+    <div class="card-body">
+      <div class="row g-2">
+        {% for status, cnt in episode_distribution.items() %}
+        <div class="col-md-2 col-sm-4">
+          <div class="border rounded p-2 text-center">
+            <small class="text-muted d-block">
+              {% if status == 'pending' %}<i class="fas fa-hourglass-start"></i> 待處理
+              {% elif status == 'awaiting_review' %}<i class="fas fa-user-clock"></i> 待審核
+              {% elif status == 'approved' %}<i class="fas fa-check-circle text-success"></i> 已晉升
+              {% elif status == 'rejected_quality' %}<i class="fas fa-times text-danger"></i> 品質拒
+              {% elif status == 'rejected_hallucination' %}<i class="fas fa-times text-danger"></i> 幻覺拒
+              {% elif status == 'rejected_duplicate' %}<i class="fas fa-clone text-warning"></i> 重複拒
+              {% elif status == 'rejected_human' %}<i class="fas fa-user-times text-danger"></i> 人工拒
+              {% elif status == 'expired' %}<i class="fas fa-clock text-muted"></i> 已過期
+              {% else %}{{ status }}{% endif %}
+            </small>
+            <strong style="font-size: 1.4em;">{{ cnt }}</strong>
+          </div>
+        </div>
+        {% endfor %}
+      </div>
+    </div>
+  </div>
+  {% endif %}
+
+  {% if rag_root_causes %}
+  <div class="card mb-3" style="border-left: 4px solid #6f42c1;">
+    <div class="card-header bg-light">
+      <strong><i class="fas fa-stethoscope me-2"></i>RAG 自動根因建議</strong>
+      <small class="text-muted">— 對最差 3 名 caller 自動從 ai_insights 召回相似案例</small>
+    </div>
+    <div class="card-body p-2">
+      {% for rc in rag_root_causes %}
+      <div class="mb-3 p-2" style="background: #fafafa; border-radius: 6px;">
+        <strong><code>{{ rc.caller }}</code></strong>
+        <span class="badge bg-danger ms-1">{{ "%.2f"|format(rc.avg_score) }}/5</span>
+        <span class="badge bg-secondary ms-1">{{ rc.feedback_n }} 筆反饋</span>
+        <ul class="list-unstyled mt-2 mb-0 small">
+          {% for h in rc.hits %}
+          <li class="mb-1">
+            <span class="badge bg-info text-dark me-1">{{ h.insight_type }}</span>
+            <span class="badge bg-light text-dark me-1">相似度 {{ "%.2f"|format(h.similarity) }}</span>
+            {{ h.content }}{% if h.content|length >= 200 %}…{% endif %}
+          </li>
+          {% endfor %}
+        </ul>
+      </div>
+      {% endfor %}
+    </div>
+  </div>
+  {% endif %}
+
  {% if recommendations %}
    <div class="card mb-3">
      <div class="card-header bg-warning"><strong><i class="fas fa-lightbulb me-2"></i>智能建議</strong></div>