diff --git a/apps/api/src/api/v1/gitea_webhook.py b/apps/api/src/api/v1/gitea_webhook.py index fcc7bd52..9356e840 100644 --- a/apps/api/src/api/v1/gitea_webhook.py +++ b/apps/api/src/api/v1/gitea_webhook.py @@ -538,7 +538,7 @@ async def handle_workflow_run( "alert_name": "GiteaCIPipelineFailed", "severity": "warning", "source": "gitea", - "fingerprint": f"gitea-ci-{repo}-{branch}-{sha_short}", + "fingerprint": f"gitea-ci-{repo}-{branch}", "labels": { "alertname": "GiteaCIPipelineFailed", "severity": "warning", diff --git a/apps/api/src/services/heartbeat_report_service.py b/apps/api/src/services/heartbeat_report_service.py index 94910df4..b936525d 100644 --- a/apps/api/src/services/heartbeat_report_service.py +++ b/apps/api/src/services/heartbeat_report_service.py @@ -444,18 +444,18 @@ def report_to_telegram_html(report: HeartbeatReport) -> str: lines.append("🤖 AI 服務") ollama_probe = report.ai_services.get("ollama", ProbeResult(False, "❌ 無回應")) latency_str = f" ({ollama_probe.latency_ms:.0f}ms)" if ollama_probe.latency_ms else "" - lines.append(f"Ollama {ollama_probe.status}{latency_str}") + lines.append(f" Ollama: {ollama_probe.status}{latency_str}") # 各模型狀態(縮排顯示) for model, loaded in report.ollama_models.items(): icon = "✅" if loaded else "❌" short = model.split(":")[0] - lines.append(f" {icon} {html.escape(short)}") + lines.append(f" {icon} {html.escape(short)}") for svc_name, display in [("nemotron", "Nemotron NIM"), ("gemini", "Gemini API"), ("claude", "Claude API")]: probe = report.ai_services.get(svc_name, ProbeResult(False, "❌ 無回應")) latency_str = f" ({probe.latency_ms:.0f}ms)" if probe.latency_ms else "" - lines.append(f"{display} {probe.status}{latency_str}") + lines.append(f" {display:<18}{probe.status}{latency_str}") lines.append("") @@ -469,24 +469,24 @@ def report_to_telegram_html(report: HeartbeatReport) -> str: } for key, display in mcp_display.items(): probe = report.mcp_providers.get(key, ProbeResult(False, "❌ 無回應")) - lines.append(f"{display} {probe.status}") + lines.append(f" {display:<18}{probe.status}") lines.append("") # --- 飛輪狀態 --- fw = report.flywheel lines.append("🔄 飛輪狀態(24h)") - lines.append(f"Playbooks: {fw.playbook_count} 個") + lines.append(f" Playbooks: {fw.playbook_count} 個") if fw.attempt_24h > 0: rate = int(fw.success_24h / fw.attempt_24h * 100) - lines.append(f"今日修復: {fw.success_24h}/{fw.attempt_24h} 次 ({rate}%)") + lines.append(f" 今日修復: {fw.success_24h}/{fw.attempt_24h} 次 ({rate}%)") else: - lines.append("今日修復: 0 次") + lines.append(f" 今日修復: 0 次") if fw.km_total > 0: vec_rate = int(fw.km_vectorized / fw.km_total * 100) - lines.append(f"KM 向量化: {fw.km_vectorized}/{fw.km_total} ({vec_rate}%)") + lines.append(f" KM 向量化: {fw.km_vectorized}/{fw.km_total} ({vec_rate}%)") if fw.last_learning_at: - lines.append(f"最後學習固化: {fw.last_learning_at.strftime('%H:%M')}") + lines.append(f" 最後學習固化: {fw.last_learning_at.strftime('%H:%M')}") lines.append("") @@ -494,15 +494,15 @@ def report_to_telegram_html(report: HeartbeatReport) -> str: lines.append("🚀 基礎設施") argocd = report.infra.get("argocd_sync", ProbeResult(False, "❌ 無回應")) velero = report.infra.get("velero", ProbeResult(False, "❌ 無回應")) - lines.append(f"ArgoCD: {argocd.status}") - lines.append(f"Velero 備份: {velero.status}") + lines.append(f" ArgoCD: {argocd.status}") + lines.append(f" Velero 備份: {velero.status}") # --- Warnings --- if report.warnings: lines.append("") lines.append(f"⚠️ 需關注({len(report.warnings)} 項)") for w in report.warnings: - lines.append(f"- {html.escape(w)}") + lines.append(f" - {html.escape(w)}") else: lines.append("") lines.append("✅ 全部正常") diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index db752758..ea726f38 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -126,10 +126,6 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No return "config_drift", "TYPE-4D" if severity in ("info", "none"): return "info", "TYPE-1" - # backup/heartbeat 關鍵字只有 severity=info/none 才是純資訊 - # severity=warning/critical(例如 VeleroBackupFailed, HostBackupFailed)→ 繼續走 prefix 規則 - if severity in ("info", "none") and any(kw in alertname_lower for kw in ("backup", "heartbeat")): - return "backup", "TYPE-1" # Watchdog/Heartbeat 永遠是 TYPE-1(Alertmanager 心跳) if "watchdog" in alertname_lower or alertname in ("Heartbeat",): return "backup", "TYPE-1" diff --git a/scripts/cron_backup_restore_test.sh b/scripts/cron_backup_restore_test.sh index 9d68666c..ccb806c1 100755 --- a/scripts/cron_backup_restore_test.sh +++ b/scripts/cron_backup_restore_test.sh @@ -30,7 +30,9 @@ velero restore create \ 2>&1 || EXIT_CODE=$? # --- 寫入 textfile metric --- -TS=$(date +%s%3N) +# 注意:Prometheus textfile collector 不接受毫秒時間戳(13位),只接受秒(10位) +# 直接省略 timestamp,由 node-exporter scrape 時自動補上 +TS=$(date +%s) mkdir -p "$(dirname "${TEXTFILE}")" if [ "${EXIT_CODE}" -eq 0 ]; then @@ -38,10 +40,10 @@ if [ "${EXIT_CODE}" -eq 0 ]; then cat > "${TEXTFILE}" < "${TEXTFILE}" <