Files
ewoooc/templates/admin/host_health.html
OoO b21b40cae2
All checks were successful
CD Pipeline / deploy (push) Successful in 1m2s
fix(observability): soften frontend error copy
2026-05-05 21:58:49 +08:00

185 lines
20 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{% extends "ewoooc_base.html" %}
{#
  Infrastructure health dashboard template.

  Fills the `title` and `ewooo_content` blocks of ewoooc_base.html.

  Context variables read by this template:
    ollama_hosts, health_history, aiops_summary, mcp_status, mcp_24h,
    playbook_ranking, throttle_state, embed_queue_pending, embed_queue_failed,
    recent_incidents, recent_heals, backup_history.

  The inline script at the bottom POSTs to
    /observability/playbooks/toggle/<id>
    /observability/host_health/trigger_autoheal

  NOTE(review): the --obs-* CSS custom properties and the obs-chart-frame
  classes are not defined in this file; presumably they come from the base
  template or a shared stylesheet - confirm.
#}
{% block title %}基礎設施生命線{% endblock %}
{% block ewooo_content %}
<style>
  /* Page-local styles. Rule order is significant: later rules override earlier ones. */

  /* ---- Shared card chrome ---- */
  .runtime-hero,
  .runtime-panel,
  .runtime-lane,
  .runtime-table-shell {
    border: 1px solid var(--obs-line);
    border-radius: 26px;
    background: var(--obs-card);
    box-shadow: 0 16px 38px rgba(70, 46, 28, 0.08);
  }

  /* ---- Hero banner (replaces the shared card background with layered gradients) ---- */
  .runtime-hero {
    padding: clamp(1.2rem, 2.4vw, 2rem);
    background:
      radial-gradient(circle at 12% 15%, rgba(201, 100, 66, 0.18), transparent 24rem),
      radial-gradient(circle at 88% 6%, rgba(79, 111, 143, 0.14), transparent 22rem),
      linear-gradient(135deg, rgba(255, 248, 239, 0.98), rgba(255, 255, 255, 0.72));
  }
  .runtime-kicker {
    color: var(--obs-accent);
    font-size: .76rem;
    letter-spacing: .13em;
    text-transform: uppercase;
    font-weight: 800;
  }
  .runtime-title {
    margin: .45rem 0 .25rem;
    font-family: 'Noto Sans TC', 'Inter', sans-serif;
    font-size: var(--obs-title-size);
    letter-spacing: -.055em;
    line-height: .98;
  }
  .runtime-subtitle {
    color: var(--obs-muted);
    max-width: 820px;
    line-height: 1.7;
  }

  /* ---- Headline signal tiles ---- */
  .runtime-command {
    display: grid;
    grid-template-columns: repeat(4, minmax(0, 1fr));
    gap: .8rem;
    margin-top: 1.2rem;
  }
  .runtime-signal {
    padding: .95rem;
    border: 1px solid var(--obs-line);
    border-radius: 20px;
    background: rgba(255, 255, 255, .62);
  }
  .runtime-label {
    color: var(--obs-muted);
    font-size: .72rem;
    letter-spacing: .1em;
    text-transform: uppercase;
  }
  .runtime-value {
    display: block;
    margin-top: .28rem;
    font-size: var(--obs-value-size);
    font-weight: 880;
    letter-spacing: -.045em;
  }

  /* ---- Two-column layout and panels ---- */
  .runtime-main {
    display: grid;
    grid-template-columns: minmax(0, 1.22fr) minmax(330px, .78fr);
    gap: 1rem;
    margin-top: 1rem;
  }
  .runtime-stack {
    display: grid;
    gap: 1rem;
  }
  .runtime-panel-head {
    display: flex;
    justify-content: space-between;
    align-items: flex-start;
    gap: 1rem;
    padding: 1.05rem 1.1rem .2rem;
  }
  .runtime-panel-title {
    margin: .15rem 0 0;
    font-size: 1.1rem;
    font-weight: 850;
    letter-spacing: -.025em;
  }
  .runtime-panel-body {
    padding: 1rem 1.1rem 1.1rem;
  }

  /* ---- Host lanes ---- */
  .host-grid {
    display: grid;
    gap: .75rem;
  }
  .host-lane {
    display: grid;
    grid-template-columns: minmax(0, 1fr) auto;
    gap: .85rem;
    align-items: center;
    padding: .95rem;
    border: 1px solid var(--obs-line);
    border-radius: 20px;
    background: rgba(255, 255, 255, .62);
  }
  /* The state modifiers only thicken and recolor the left edge. */
  .host-lane.is-up {
    border-left: 5px solid var(--obs-green);
  }
  .host-lane.is-down {
    border-left: 5px solid var(--obs-red);
  }
  .host-lane-top {
    display: flex;
    justify-content: space-between;
    gap: .75rem;
    align-items: baseline;
  }
  .host-name {
    font-weight: 850;
  }
  .host-url {
    color: var(--obs-muted);
    font-size: .8rem;
    word-break: break-all;
  }
  .model-cloud {
    display: flex;
    flex-wrap: wrap;
    gap: .35rem;
    margin-top: .55rem;
  }
  .model-chip {
    padding: .24rem .48rem;
    border-radius: 999px;
    background: rgba(79, 111, 143, .12);
    color: var(--obs-blue);
    font-size: .72rem;
  }

  /* ---- Mini stat tiles ---- */
  .runtime-mini-grid {
    display: grid;
    grid-template-columns: repeat(2, minmax(0, 1fr));
    gap: .7rem;
  }
  .runtime-mini {
    padding: .85rem;
    border: 1px solid var(--obs-line);
    border-radius: 18px;
    background: rgba(255, 255, 255, .58);
  }
  .runtime-mini strong {
    display: block;
    margin-top: .24rem;
    font-size: 1.45rem;
    letter-spacing: -.04em;
  }

  /* ---- Table cards ---- */
  .runtime-table-shell {
    overflow: hidden;
    margin-top: 1rem;
  }
  .runtime-table-title {
    display: flex;
    justify-content: space-between;
    gap: 1rem;
    padding: 1rem 1.1rem .4rem;
  }
  .runtime-table-title h3 {
    margin: 0;
    font-size: 1.05rem;
    font-weight: 850;
  }
  .runtime-actions {
    display: flex;
    flex-wrap: wrap;
    gap: .45rem;
    justify-content: flex-end;
  }

  /* ---- Status colors ---- */
  .status-good {
    color: var(--obs-green);
  }
  .status-warn {
    color: var(--obs-amber);
  }
  .status-bad {
    color: var(--obs-red);
  }
  .status-blue {
    color: var(--obs-blue);
  }

  /* ---- Responsive breakpoints ---- */
  @media (max-width: 1100px) {
    .runtime-command {
      grid-template-columns: repeat(2, minmax(0, 1fr));
    }
    .runtime-main {
      grid-template-columns: 1fr;
    }
  }
  @media (max-width: 720px) {
    .runtime-command,
    .runtime-mini-grid {
      grid-template-columns: 1fr;
    }
    .host-lane {
      grid-template-columns: 1fr;
    }
  }
</style>
{#
  Headline counters used by the hero tiles and panel badges further down.
  Each stays a namespace object so later markup can keep reading `.count`.
    down.count             - entries of ollama_hosts whose `healthy` is falsy
    active_playbooks.count - entries of playbook_ranking whose `is_active` is truthy
    throttled.count        - values of throttle_state whose `throttled` is truthy
#}
{% set down = namespace(count=ollama_hosts|rejectattr('healthy')|list|length) %}
{% set active_playbooks = namespace(count=playbook_ranking|selectattr('is_active')|list|length) %}
{% set throttled = namespace(count=throttle_state.values()|selectattr('throttled')|list|length) %}
<div class="container-fluid mt-3">
{#
  Hero banner with four headline signal tiles.
  Reads down.count / throttled.count (set earlier in this template),
  ollama_hosts, and aiops_summary.
  When aiops_summary is falsy the two AIOps tiles show a neutral (muted) dash
  rather than a good/bad color, since there is no value to judge.
#}
<section class="runtime-hero">
  <div class="runtime-kicker"><i class="fas fa-heartbeat me-1" aria-hidden="true"></i> Infrastructure Lifeline · Ollama / MCP / AIOps</div>
  <h1 class="runtime-title">基礎設施生命線</h1>
  <p class="runtime-subtitle">這頁是 AI 中樞的底盤監控:三主機 Ollama 級聯、MCP 工具層、成本節流與 ADR-013 AutoHeal 閉環。先看能不能活,再看要不要修。</p>
  <div class="runtime-command">
    <div class="runtime-signal">
      <div class="runtime-label">Ollama Down</div>
      <span class="runtime-value {% if down.count > 0 %}status-bad{% else %}status-good{% endif %}">{{ down.count }}</span>
      <small class="text-muted">{{ ollama_hosts|length }} 台即時 probe</small>
    </div>
    <div class="runtime-signal">
      <div class="runtime-label">AIOps Open</div>
      <span class="runtime-value {% if not aiops_summary %}text-muted{% elif aiops_summary.incidents_open > 0 %}status-bad{% else %}status-good{% endif %}">{{ aiops_summary.incidents_open if aiops_summary else '—' }}</span>
      <small class="text-muted">7 日 incident 未解決</small>
    </div>
    <div class="runtime-signal">
      <div class="runtime-label">Heal Rate</div>
      <span class="runtime-value {% if not aiops_summary %}text-muted{% elif aiops_summary.heal_success_rate >= 80 %}status-good{% elif aiops_summary.heal_success_rate >= 50 %}status-warn{% else %}status-bad{% endif %}">{{ "%.0f"|format(aiops_summary.heal_success_rate) if aiops_summary else '—' }}{% if aiops_summary %}%{% endif %}</span>
      <small class="text-muted">ADR-013 自癒成功率</small>
    </div>
    <div class="runtime-signal">
      <div class="runtime-label">Throttled</div>
      <span class="runtime-value {% if throttled.count > 0 %}status-warn{% else %}status-good{% endif %}">{{ throttled.count }}</span>
      <small class="text-muted">成本節流供應商</small>
    </div>
  </div>
</section>
<section class="runtime-main">
<div class="runtime-stack">
{#
  Host cascade panel: one lane per entry in ollama_hosts.
  Each lane shows label, host URL, model chips and a health badge. Hosts that
  are unhealthy or carry `unhealthy_mark` get a manual AutoHeal button, which
  calls triggerAutoHeal() from the inline script at the bottom of this template.
#}
<article class="runtime-panel">
  <div class="runtime-panel-head">
    <div>
      <div class="runtime-label">Host Cascade</div>
      <h2 class="runtime-panel-title">Ollama 三主機</h2>
    </div>
    <span class="badge {% if down.count > 0 %}bg-danger{% else %}bg-success{% endif %}">{{ '需要處理' if down.count > 0 else '全部在線' }}</span>
  </div>
  <div class="runtime-panel-body">
    <div class="host-grid">
      {% for h in ollama_hosts %}
      <div class="host-lane {% if h.healthy %}is-up{% else %}is-down{% endif %}">
        <div>
          <div class="host-lane-top">
            <span class="host-name">{{ h.label }}</span>
            {% if h.healthy %}<span class="badge bg-success">HTTP 正常</span>{% else %}<span class="badge bg-danger">離線</span>{% endif %}
          </div>
          <div class="host-url"><code>{{ h.host }}</code></div>
          <div class="model-cloud">
            {% for m in h.models %}<span class="model-chip">{{ m }}</span>{% endfor %}
            {% if not h.models %}<span class="text-muted small">無模型資料 / 未連線</span>{% endif %}
          </div>
        </div>
        <div class="runtime-actions">
          {% if h.unhealthy_mark %}<span class="badge bg-warning">30 秒異常標記</span>{% endif %}
          {% if h.unhealthy_mark or not h.healthy %}
          {#
            `tojson` renders a double-quoted JS string and escapes single quotes,
            so the attribute must be single-quoted. Inside a double-quoted
            attribute the first `"` of the JSON string ends the attribute and
            the handler becomes the broken fragment `triggerAutoHeal(`.
          #}
          <button type="button" class="btn btn-sm btn-outline-danger" onclick='triggerAutoHeal({{ h.label|tojson }})'><i class="fas fa-band-aid me-1" aria-hidden="true"></i>AutoHeal</button>
          {% endif %}
        </div>
      </div>
      {% endfor %}
    </div>
  </div>
</article>
{# Probe history table; the whole card is omitted when health_history is empty. #}
{% if health_history %}
<article class="runtime-table-shell">
  <div class="runtime-table-title">
    <div>
      <div class="runtime-label">24h Probe History</div>
      <h3>健康趨勢摘要</h3>
    </div>
  </div>
  <div class="table-responsive">
    <table class="table mb-0">
      <thead class="table-light">
        <tr>
          <th>角色</th>
          <th class="text-end">總探針</th>
          <th class="text-end">正常</th>
          <th class="text-end">離線</th>
          <th class="text-end">在線率</th>
          <th class="text-end">平均 ms</th>
        </tr>
      </thead>
      <tbody>
        {% for row in health_history %}
        <tr>
          <td><strong>{{ row.host_label }}</strong></td>
          <td class="text-end">{{ row.total }}</td>
          <td class="text-end status-good">{{ row.up_count }}</td>
          <td class="text-end status-bad">{{ row.down_count }}</td>
          {# Uptime color bands: >= 99 good, >= 90 warn, otherwise bad. #}
          <td class="text-end"><strong class="{% if row.uptime_pct >= 99 %}status-good{% elif row.uptime_pct >= 90 %}status-warn{% else %}status-bad{% endif %}">{{ "%.1f"|format(row.uptime_pct) }}%</strong></td>
          <td class="text-end">{{ row.avg_ms }}</td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
  </div>
</article>
{% endif %}
</div>
<aside class="runtime-stack">
{#
  AIOps summary panel; rendered only when aiops_summary is truthy.
  Shows four mini stats and, when heal_sparkline is non-empty, the canvas that
  the inline script at the bottom of this template turns into a Chart.js line chart.
#}
{% if aiops_summary %}
<article class="runtime-panel">
  <div class="runtime-panel-head">
    <div>
      <div class="runtime-label">AIOps Loop</div>
      <h2 class="runtime-panel-title">自癒閉環 7 日</h2>
    </div>
  </div>
  <div class="runtime-panel-body">
    <div class="runtime-mini-grid">
      <div class="runtime-mini"><span class="runtime-label">事件總數</span><strong>{{ aiops_summary.incidents_total }}</strong></div>
      <div class="runtime-mini"><span class="runtime-label">未解決</span><strong class="{% if aiops_summary.incidents_open > 0 %}status-bad{% else %}status-good{% endif %}">{{ aiops_summary.incidents_open }}</strong></div>
      <div class="runtime-mini"><span class="runtime-label">P0/P1</span><strong class="{% if (aiops_summary.incidents_p0 + aiops_summary.incidents_p1) > 0 %}status-bad{% else %}status-good{% endif %}">{{ aiops_summary.incidents_p0 + aiops_summary.incidents_p1 }}</strong></div>
      <div class="runtime-mini"><span class="runtime-label">平均耗時</span><strong>{{ aiops_summary.heals_avg_ms }}ms</strong></div>
    </div>
    {% if aiops_summary.heal_sparkline %}
    {#
      All classes live in ONE class attribute. With two `class` attributes the
      HTML parser keeps only the first, so the chart-frame classes were dropped.
      NOTE(review): the chart is created with maintainAspectRatio: false, so it
      presumably relies on obs-chart-frame for its height - confirm.
    #}
    <div class="mt-3 obs-chart-frame obs-chart-frame-slim"><canvas id="healSparkline"></canvas></div>
    {% endif %}
  </div>
</article>
{% endif %}
{#
  MCP / budget panel with four mini stats:
  MCP server count, summed 24h call count, active/total playbooks,
  and the embed queue as pending/failed.
#}
<article class="runtime-panel">
  <div class="runtime-panel-head">
    <div>
      <div class="runtime-label">MCP / Budget</div>
      <h2 class="runtime-panel-title">工具層與節流</h2>
    </div>
  </div>
  <div class="runtime-panel-body">
    <div class="runtime-mini-grid">
      <div class="runtime-mini">
        <span class="runtime-label">MCP Servers</span>
        <strong>{{ mcp_status|length }}</strong>
      </div>
      <div class="runtime-mini">
        <span class="runtime-label">MCP 24h Calls</span>
        <strong>{{ "{:,}".format(mcp_24h|sum(attribute='total_calls')) if mcp_24h else 0 }}</strong>
      </div>
      <div class="runtime-mini">
        <span class="runtime-label">Playbooks</span>
        <strong>{{ active_playbooks.count }}/{{ playbook_ranking|length }}</strong>
      </div>
      <div class="runtime-mini">
        <span class="runtime-label">Embed Queue</span>
        {# Failures take precedence over pending items when choosing the color. #}
        <strong class="{% if embed_queue_failed > 0 %}status-bad{% elif embed_queue_pending > 0 %}status-warn{% else %}status-good{% endif %}">{{ embed_queue_pending }}/{{ embed_queue_failed }}</strong>
      </div>
    </div>
  </div>
</article>
</aside>
</section>
{# Per-provider cost throttle table; omitted when throttle_state is empty. #}
{% if throttle_state %}
<section class="runtime-table-shell">
  <div class="runtime-table-title">
    <div>
      <div class="runtime-label">Cost Throttle</div>
      <h3>成本節流狀態</h3>
    </div>
  </div>
  <div class="table-responsive">
    <table class="table mb-0">
      <thead class="table-light">
        <tr>
          <th>供應商</th>
          <th>已花費</th>
          <th>預算</th>
          <th>月底推估</th>
          <th>使用率</th>
          <th>狀態</th>
        </tr>
      </thead>
      <tbody>
        {% for vendor, state in throttle_state.items() %}
        <tr>
          <td><code>{{ vendor }}</code></td>
          <td>${{ "%.2f"|format(state.spent) }}</td>
          <td>${{ "%.2f"|format(state.budget) }}</td>
          <td>${{ "%.2f"|format(state.projected) }}</td>
          {# `ratio` is multiplied by 100 here to display a percentage. #}
          <td>{{ "%.0f"|format(state.ratio * 100) }}%</td>
          <td>{% if state.throttled %}<span class="badge bg-danger">已節流</span>{% else %}<span class="badge bg-success">正常</span>{% endif %}</td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
  </div>
</section>
{% endif %}
{# Per-server MCP workload table; omitted when mcp_24h is empty. #}
{% if mcp_24h %}
<section class="runtime-table-shell">
  <div class="runtime-table-title">
    <div>
      <div class="runtime-label">MCP Workload</div>
      <h3>MCP 服務 24h 工作量</h3>
    </div>
  </div>
  <div class="table-responsive">
    <table class="table mb-0">
      <thead class="table-light">
        <tr>
          <th>服務</th>
          <th class="text-end">呼叫</th>
          <th class="text-end">成功率</th>
          <th class="text-end">快取</th>
          <th class="text-end">Tools</th>
          <th class="text-end">平均</th>
          <th class="text-end">成本</th>
        </tr>
      </thead>
      <tbody>
        {% for svc in mcp_24h %}
        <tr>
          <td><code>{{ svc.server }}</code></td>
          <td class="text-end">{{ "{:,}".format(svc.total_calls) }}</td>
          {# Success-rate color bands: >= 95 good, >= 80 warn, otherwise bad. #}
          <td class="text-end"><strong class="{% if svc.success_rate >= 95 %}status-good{% elif svc.success_rate >= 80 %}status-warn{% else %}status-bad{% endif %}">{{ "%.1f"|format(svc.success_rate) }}%</strong></td>
          <td class="text-end">{{ "%.1f"|format(svc.cache_rate) }}%</td>
          <td class="text-end">{{ svc.tools_used }}</td>
          <td class="text-end">{{ svc.avg_ms }} ms</td>
          <td class="text-end">${{ "%.4f"|format(svc.total_cost) }}</td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
  </div>
</section>
{% endif %}
<section class="runtime-main">
<div class="runtime-stack">
{# Recent incidents card: a table of recent_incidents, or an empty-state note. #}
<article class="runtime-table-shell">
  <div class="runtime-table-title">
    <div>
      <div class="runtime-label">Incidents</div>
      <h3>最近 10 筆事件</h3>
    </div>
  </div>
  <div class="table-responsive">
    {% if recent_incidents %}
    <table class="table table-sm mb-0">
      <thead class="table-light">
        <tr>
          <th>時間</th>
          <th>任務</th>
          <th>錯誤</th>
          <th>等級</th>
          <th>狀態</th>
          <th>訊息</th>
        </tr>
      </thead>
      <tbody>
        {% for incident in recent_incidents %}
        <tr>
          <td><small>{{ incident.created_at }}</small></td>
          <td><code>{{ incident.task_name }}</code></td>
          <td><span class="badge bg-secondary">{{ incident.error_type }}</span></td>
          {# Severity badge: P0/P1 danger, P2 warning, anything else info. #}
          <td><span class="badge {% if incident.severity in ('P0','P1') %}bg-danger{% elif incident.severity == 'P2' %}bg-warning{% else %}bg-info{% endif %}">{{ incident.severity }}</span></td>
          <td>{{ incident.status }}</td>
          <td><small class="text-muted">{{ incident.error_message }}</small></td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
    {% else %}
    <div class="text-muted text-center p-3 small">尚無 incident 紀錄</div>
    {% endif %}
  </div>
</article>
{# Recent heal log card: a table of recent_heals, or an empty-state note. #}
<article class="runtime-table-shell">
  <div class="runtime-table-title">
    <div>
      <div class="runtime-label">Heal Logs</div>
      <h3>最近 10 筆自癒</h3>
    </div>
  </div>
  <div class="table-responsive">
    {% if recent_heals %}
    <table class="table table-sm mb-0">
      <thead class="table-light">
        <tr>
          <th>時間</th>
          <th>動作</th>
          <th>結果</th>
          <th class="text-end">耗時</th>
          <th>細節</th>
        </tr>
      </thead>
      <tbody>
        {% for heal in recent_heals %}
        <tr>
          <td><small>{{ heal.created_at }}</small></td>
          <td><span class="badge bg-info">{{ heal.action_type or '—' }}</span></td>
          {# 'success' and 'failed' get localized badges; any other result is shown verbatim. #}
          <td>{% if heal.result == 'success' %}<span class="badge bg-success">成功</span>{% elif heal.result == 'failed' %}<span class="badge bg-danger">失敗</span>{% else %}<span class="badge bg-secondary">{{ heal.result }}</span>{% endif %}</td>
          <td class="text-end">{{ heal.duration_ms }} ms</td>
          <td><small class="text-muted">{{ heal.action_detail }}</small></td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
    {% else %}
    <div class="text-muted text-center p-3 small">尚無 heal log</div>
    {% endif %}
  </div>
</article>
</div>
<aside class="runtime-stack">
{#
  AutoHeal playbook card: one row per entry in playbook_ranking with its
  success rate, active state and a toggle button that calls togglePlaybook()
  from the inline script at the bottom of this template.
#}
<article class="runtime-table-shell">
  <div class="runtime-table-title">
    <div>
      <div class="runtime-label">Playbooks</div>
      <h3>AutoHeal Playbook</h3>
    </div>
  </div>
  <div class="table-responsive">
    {% if playbook_ranking %}
    <table class="table table-sm mb-0">
      <thead class="table-light">
        <tr>
          <th>名稱</th>
          <th>成功率</th>
          <th>狀態</th>
          <th>切換</th>
        </tr>
      </thead>
      <tbody>
        {% for p in playbook_ranking %}
        <tr>
          <td><strong>{{ p.name }}</strong><br><small class="text-muted"><code>{{ p.error_type }}</code> · {{ p.action_type }}</small></td>
          {# With no recorded runs, show the same dash placeholder used elsewhere on the page instead of an empty cell. #}
          <td>{% if (p.success + p.fail) > 0 %}<strong>{{ "%.0f"|format(p.success_rate) }}%</strong>{% else %}<span class="text-muted">—</span>{% endif %}</td>
          <td>{% if p.is_active %}<span class="badge bg-success">啟用</span>{% else %}<span class="badge bg-secondary">停用</span>{% endif %}</td>
          {#
            `tojson` renders a double-quoted JS string and escapes single quotes,
            so the attribute must be single-quoted. Inside a double-quoted
            attribute the first `"` of the JSON string ends the attribute and
            the click handler is left syntactically broken.
          #}
          <td><button type="button" class="btn btn-sm btn-outline-secondary" onclick='togglePlaybook({{ p.id }}, {{ p.name|tojson }})'>切換</button></td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
    {% else %}
    <div class="text-muted text-center p-3 small">尚無 playbook 資料</div>
    {% endif %}
  </div>
</article>
{# Backup history card: a table of backup_history, or an empty-state note. #}
<article class="runtime-table-shell">
  <div class="runtime-table-title">
    <div>
      <div class="runtime-label">Backup</div>
      <h3>備份歷史 7 日</h3>
    </div>
  </div>
  <div class="table-responsive">
    {% if backup_history %}
    <table class="table table-sm mb-0">
      <thead class="table-light">
        <tr>
          <th>時間</th>
          <th>狀態</th>
          <th class="text-end">MB</th>
        </tr>
      </thead>
      <tbody>
        {% for backup in backup_history %}
        <tr>
          <td><small>{{ backup.created_at }}</small></td>
          {# Any status other than 'success' is shown verbatim in a danger badge. #}
          <td>{% if backup.status == 'success' %}<span class="badge bg-success">成功</span>{% else %}<span class="badge bg-danger">{{ backup.status }}</span>{% endif %}</td>
          <td class="text-end">{{ backup.size_mb }}</td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
    {% else %}
    <div class="text-muted text-center p-3 small">過去 7 日無備份紀錄</div>
    {% endif %}
  </div>
</article>
</aside>
</section>
{# Warning banner shown only while the embed queue has pending or failed items. #}
{% if embed_queue_pending > 0 or embed_queue_failed > 0 %}
<div class="alert alert-warning mt-3">
  <strong>Embedding 重試佇列:</strong>待處理 {{ embed_queue_pending }} 筆 · 失敗 {{ embed_queue_failed }} 筆
</div>
{% endif %}
{# Page footer tagline. #}
<p class="text-muted mt-3">
  <small><i class="fas fa-robot me-1"></i>Operation Ollama-First v5.0 — 基礎設施生命線</small>
</p>
</div>
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
<script>
(function() {
const data = {{ aiops_summary.heal_sparkline | default([]) | tojson }};
const el = document.getElementById('healSparkline');
if (!el || !data.length) return;
new Chart(el, { type: 'line', data: { labels: data.map(d => d.date), datasets: [{ data: data.map(d => d.rate), borderColor: '#c96442', backgroundColor: 'rgba(201,100,66,.14)', borderWidth: 2, fill: true, tension: .35, pointRadius: 2 }] }, options: { responsive: true, maintainAspectRatio: false, plugins: { legend: { display: false } }, scales: { x: { display: false }, y: { min: 0, max: 100, ticks: { callback: v => v + '%' } } } } });
})();
/**
 * Ask the operator to confirm, then POST to the playbook toggle endpoint.
 *
 * On a response whose JSON has a truthy `ok`, shows `message` and reloads the
 * page; otherwise shows `error` (or a fallback). Network or JSON-parse
 * failures are logged with console.warn and reported with a generic alert.
 *
 * @param {*} id   Playbook id, interpolated into the request URL.
 * @param {*} name Playbook name, shown in the confirmation prompt.
 */
async function togglePlaybook(id, name) {
  const confirmed = confirm(`切換 Playbook 「${name}」狀態?`);
  if (!confirmed) {
    return;
  }

  try {
    const response = await fetch(`/observability/playbooks/toggle/${id}`, {method: 'POST'});
    const payload = await response.json();

    if (!payload.ok) {
      alert('❌ ' + (payload.error || '切換失敗'));
      return;
    }

    alert(`${payload.message}`);
    window.location.reload();
  } catch (e) {
    console.warn('playbook_toggle_failed', e);
    alert('操作暫時無法完成,請稍後再試或查看系統日誌。');
  }
}
/**
 * Ask the operator to confirm, then POST a JSON body `{host_label}` to the
 * AutoHeal trigger endpoint.
 *
 * On a response whose JSON has a truthy `ok`, shows `action` / `message` and
 * reloads the page; otherwise shows `error`, `message`, or a fallback.
 * Network or JSON-parse failures are logged with console.warn and reported
 * with a generic alert.
 *
 * @param {*} hostLabel Host label sent as `host_label` and shown in the prompt.
 */
async function triggerAutoHeal(hostLabel) {
  const confirmed = confirm(`觸發 AutoHeal\n\n主機:${hostLabel}`);
  if (!confirmed) {
    return;
  }

  try {
    const requestInit = {
      method: 'POST',
      headers: {'Content-Type': 'application/json'},
      body: JSON.stringify({host_label: hostLabel})
    };
    const response = await fetch('/observability/host_health/trigger_autoheal', requestInit);
    const payload = await response.json();

    if (!payload.ok) {
      alert('❌ ' + (payload.error || payload.message || '觸發失敗'));
      return;
    }

    alert(`✅ AutoHeal 已派出\n動作:${payload.action || '—'}\n訊息:${payload.message || ''}`);
    window.location.reload();
  } catch (e) {
    console.warn('host_autoheal_failed', e);
    alert('操作暫時無法完成,請稍後再試或查看系統日誌。');
  }
}
</script>
{% endblock %}