diff --git a/apps/api/src/api/v1/gitea_webhook.py b/apps/api/src/api/v1/gitea_webhook.py
index fcc7bd52..9356e840 100644
--- a/apps/api/src/api/v1/gitea_webhook.py
+++ b/apps/api/src/api/v1/gitea_webhook.py
@@ -538,7 +538,7 @@ async def handle_workflow_run(
"alert_name": "GiteaCIPipelineFailed",
"severity": "warning",
"source": "gitea",
- "fingerprint": f"gitea-ci-{repo}-{branch}-{sha_short}",
+ "fingerprint": f"gitea-ci-{repo}-{branch}",
"labels": {
"alertname": "GiteaCIPipelineFailed",
"severity": "warning",
diff --git a/apps/api/src/services/heartbeat_report_service.py b/apps/api/src/services/heartbeat_report_service.py
index 94910df4..b936525d 100644
--- a/apps/api/src/services/heartbeat_report_service.py
+++ b/apps/api/src/services/heartbeat_report_service.py
@@ -444,18 +444,18 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
lines.append("🤖 AI 服務")
ollama_probe = report.ai_services.get("ollama", ProbeResult(False, "❌ 無回應"))
latency_str = f" ({ollama_probe.latency_ms:.0f}ms)" if ollama_probe.latency_ms else ""
- lines.append(f"Ollama {ollama_probe.status}{latency_str}")
+ lines.append(f" Ollama: {ollama_probe.status}{latency_str}")
# 各模型狀態(縮排顯示)
for model, loaded in report.ollama_models.items():
icon = "✅" if loaded else "❌"
short = model.split(":")[0]
- lines.append(f" {icon} {html.escape(short)}")
+ lines.append(f" {icon} {html.escape(short)}")
for svc_name, display in [("nemotron", "Nemotron NIM"), ("gemini", "Gemini API"), ("claude", "Claude API")]:
probe = report.ai_services.get(svc_name, ProbeResult(False, "❌ 無回應"))
latency_str = f" ({probe.latency_ms:.0f}ms)" if probe.latency_ms else ""
- lines.append(f"{display} {probe.status}{latency_str}")
+ lines.append(f" {display:<18}{probe.status}{latency_str}")
lines.append("")
@@ -469,24 +469,24 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
}
for key, display in mcp_display.items():
probe = report.mcp_providers.get(key, ProbeResult(False, "❌ 無回應"))
- lines.append(f"{display} {probe.status}")
+ lines.append(f" {display:<18}{probe.status}")
lines.append("")
# --- 飛輪狀態 ---
fw = report.flywheel
lines.append("🔄 飛輪狀態(24h)")
- lines.append(f"Playbooks: {fw.playbook_count} 個")
+ lines.append(f" Playbooks: {fw.playbook_count} 個")
if fw.attempt_24h > 0:
rate = int(fw.success_24h / fw.attempt_24h * 100)
- lines.append(f"今日修復: {fw.success_24h}/{fw.attempt_24h} 次 ({rate}%)")
+ lines.append(f" 今日修復: {fw.success_24h}/{fw.attempt_24h} 次 ({rate}%)")
else:
- lines.append("今日修復: 0 次")
+ lines.append(f" 今日修復: 0 次")
if fw.km_total > 0:
vec_rate = int(fw.km_vectorized / fw.km_total * 100)
- lines.append(f"KM 向量化: {fw.km_vectorized}/{fw.km_total} ({vec_rate}%)")
+ lines.append(f" KM 向量化: {fw.km_vectorized}/{fw.km_total} ({vec_rate}%)")
if fw.last_learning_at:
- lines.append(f"最後學習固化: {fw.last_learning_at.strftime('%H:%M')}")
+ lines.append(f" 最後學習固化: {fw.last_learning_at.strftime('%H:%M')}")
lines.append("")
@@ -494,15 +494,15 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
lines.append("🚀 基礎設施")
argocd = report.infra.get("argocd_sync", ProbeResult(False, "❌ 無回應"))
velero = report.infra.get("velero", ProbeResult(False, "❌ 無回應"))
- lines.append(f"ArgoCD: {argocd.status}")
- lines.append(f"Velero 備份: {velero.status}")
+ lines.append(f" ArgoCD: {argocd.status}")
+ lines.append(f" Velero 備份: {velero.status}")
# --- Warnings ---
if report.warnings:
lines.append("")
lines.append(f"⚠️ 需關注({len(report.warnings)} 項)")
for w in report.warnings:
- lines.append(f"- {html.escape(w)}")
+ lines.append(f" - {html.escape(w)}")
else:
lines.append("")
lines.append("✅ 全部正常")
diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py
index db752758..ea726f38 100644
--- a/apps/api/src/services/incident_service.py
+++ b/apps/api/src/services/incident_service.py
@@ -126,10 +126,6 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
return "config_drift", "TYPE-4D"
if severity in ("info", "none"):
return "info", "TYPE-1"
- # backup/heartbeat 關鍵字只有 severity=info/none 才是純資訊
- # severity=warning/critical(例如 VeleroBackupFailed, HostBackupFailed)→ 繼續走 prefix 規則
- if severity in ("info", "none") and any(kw in alertname_lower for kw in ("backup", "heartbeat")):
- return "backup", "TYPE-1"
# Watchdog/Heartbeat 永遠是 TYPE-1(Alertmanager 心跳)
if "watchdog" in alertname_lower or alertname in ("Heartbeat",):
return "backup", "TYPE-1"
diff --git a/scripts/cron_backup_restore_test.sh b/scripts/cron_backup_restore_test.sh
index 9d68666c..ccb806c1 100755
--- a/scripts/cron_backup_restore_test.sh
+++ b/scripts/cron_backup_restore_test.sh
@@ -30,7 +30,9 @@ velero restore create \
2>&1 || EXIT_CODE=$?
# --- 寫入 textfile metric ---
-TS=$(date +%s%3N)
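+# NOTE: the Prometheus textfile collector rejects 13-digit millisecond timestamps, so TS is taken in whole seconds (10 digits).
+# Do not attach a per-sample timestamp to the metric lines; the scrape time is applied when node-exporter is scraped (TODO confirm TS is only written as a sample value below).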
+TS=$(date +%s)
mkdir -p "$(dirname "${TEXTFILE}")"
if [ "${EXIT_CODE}" -eq 0 ]; then
@@ -38,10 +40,10 @@ if [ "${EXIT_CODE}" -eq 0 ]; then
cat > "${TEXTFILE}" < "${TEXTFILE}" <