From a28625f088c80636c255d1309c2f4e408175a2fa Mon Sep 17 00:00:00 2001 From: OG T Date: Sun, 12 Apr 2026 16:10:46 +0800 Subject: [PATCH] =?UTF-8?q?fix(cr):=20=E9=A6=96=E5=B8=AD=E6=9E=B6=E6=A7=8B?= =?UTF-8?q?=E5=B8=AB=20CR=20P0/P1/P2=20=E5=85=A8=E4=BF=AE=E8=A3=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P0-1: incident_service.py — 刪除 classify_alert_early 死碼 L131-132 P0-2: cron_backup_restore_test.sh — date +%s%3N→+%s,修正毫秒時間戳 P1-2: gitea_webhook.py — fingerprint 移除 sha_short,收斂同 branch 失敗 heartbeat: 還原原始空格對齊格式(統帥要求原本怎樣就怎樣) P1-1(積木化)/P1-3(TYPE-4)/P2-1(timeZone)/P2-2(IP)/P2-3(WS重連) 待後續處理 2026-04-12 ogt Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/api/v1/gitea_webhook.py | 2 +- .../src/services/heartbeat_report_service.py | 24 +++++++++---------- apps/api/src/services/incident_service.py | 4 ---- scripts/cron_backup_restore_test.sh | 12 ++++++---- 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/apps/api/src/api/v1/gitea_webhook.py b/apps/api/src/api/v1/gitea_webhook.py index fcc7bd52..9356e840 100644 --- a/apps/api/src/api/v1/gitea_webhook.py +++ b/apps/api/src/api/v1/gitea_webhook.py @@ -538,7 +538,7 @@ async def handle_workflow_run( "alert_name": "GiteaCIPipelineFailed", "severity": "warning", "source": "gitea", - "fingerprint": f"gitea-ci-{repo}-{branch}-{sha_short}", + "fingerprint": f"gitea-ci-{repo}-{branch}", "labels": { "alertname": "GiteaCIPipelineFailed", "severity": "warning", diff --git a/apps/api/src/services/heartbeat_report_service.py b/apps/api/src/services/heartbeat_report_service.py index 94910df4..b936525d 100644 --- a/apps/api/src/services/heartbeat_report_service.py +++ b/apps/api/src/services/heartbeat_report_service.py @@ -444,18 +444,18 @@ def report_to_telegram_html(report: HeartbeatReport) -> str: lines.append("🤖 AI 服務") ollama_probe = report.ai_services.get("ollama", ProbeResult(False, "❌ 無回應")) latency_str = f" ({ollama_probe.latency_ms:.0f}ms)" if 
ollama_probe.latency_ms else "" - lines.append(f"Ollama {ollama_probe.status}{latency_str}") + lines.append(f" Ollama: {ollama_probe.status}{latency_str}") # 各模型狀態(縮排顯示) for model, loaded in report.ollama_models.items(): icon = "✅" if loaded else "❌" short = model.split(":")[0] - lines.append(f" {icon} {html.escape(short)}") + lines.append(f" {icon} {html.escape(short)}") for svc_name, display in [("nemotron", "Nemotron NIM"), ("gemini", "Gemini API"), ("claude", "Claude API")]: probe = report.ai_services.get(svc_name, ProbeResult(False, "❌ 無回應")) latency_str = f" ({probe.latency_ms:.0f}ms)" if probe.latency_ms else "" - lines.append(f"{display} {probe.status}{latency_str}") + lines.append(f" {display:<18}{probe.status}{latency_str}") lines.append("") @@ -469,24 +469,24 @@ def report_to_telegram_html(report: HeartbeatReport) -> str: } for key, display in mcp_display.items(): probe = report.mcp_providers.get(key, ProbeResult(False, "❌ 無回應")) - lines.append(f"{display} {probe.status}") + lines.append(f" {display:<18}{probe.status}") lines.append("") # --- 飛輪狀態 --- fw = report.flywheel lines.append("🔄 飛輪狀態(24h)") - lines.append(f"Playbooks: {fw.playbook_count} 個") + lines.append(f" Playbooks: {fw.playbook_count} 個") if fw.attempt_24h > 0: rate = int(fw.success_24h / fw.attempt_24h * 100) - lines.append(f"今日修復: {fw.success_24h}/{fw.attempt_24h} 次 ({rate}%)") + lines.append(f" 今日修復: {fw.success_24h}/{fw.attempt_24h} 次 ({rate}%)") else: - lines.append("今日修復: 0 次") + lines.append(f" 今日修復: 0 次") if fw.km_total > 0: vec_rate = int(fw.km_vectorized / fw.km_total * 100) - lines.append(f"KM 向量化: {fw.km_vectorized}/{fw.km_total} ({vec_rate}%)") + lines.append(f" KM 向量化: {fw.km_vectorized}/{fw.km_total} ({vec_rate}%)") if fw.last_learning_at: - lines.append(f"最後學習固化: {fw.last_learning_at.strftime('%H:%M')}") + lines.append(f" 最後學習固化: {fw.last_learning_at.strftime('%H:%M')}") lines.append("") @@ -494,15 +494,15 @@ def report_to_telegram_html(report: HeartbeatReport) -> str: 
lines.append("🚀 基礎設施") argocd = report.infra.get("argocd_sync", ProbeResult(False, "❌ 無回應")) velero = report.infra.get("velero", ProbeResult(False, "❌ 無回應")) - lines.append(f"ArgoCD: {argocd.status}") - lines.append(f"Velero 備份: {velero.status}") + lines.append(f" ArgoCD: {argocd.status}") + lines.append(f" Velero 備份: {velero.status}") # --- Warnings --- if report.warnings: lines.append("") lines.append(f"⚠️ 需關注({len(report.warnings)} 項)") for w in report.warnings: - lines.append(f"- {html.escape(w)}") + lines.append(f" - {html.escape(w)}") else: lines.append("") lines.append("✅ 全部正常") diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index db752758..ea726f38 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -126,10 +126,6 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No return "config_drift", "TYPE-4D" if severity in ("info", "none"): return "info", "TYPE-1" - # backup/heartbeat 關鍵字只有 severity=info/none 才是純資訊 - # severity=warning/critical(例如 VeleroBackupFailed, HostBackupFailed)→ 繼續走 prefix 規則 - if severity in ("info", "none") and any(kw in alertname_lower for kw in ("backup", "heartbeat")): - return "backup", "TYPE-1" # Watchdog/Heartbeat 永遠是 TYPE-1(Alertmanager 心跳) if "watchdog" in alertname_lower or alertname in ("Heartbeat",): return "backup", "TYPE-1" diff --git a/scripts/cron_backup_restore_test.sh b/scripts/cron_backup_restore_test.sh index 9d68666c..ccb806c1 100755 --- a/scripts/cron_backup_restore_test.sh +++ b/scripts/cron_backup_restore_test.sh @@ -30,7 +30,9 @@ velero restore create \ 2>&1 || EXIT_CODE=$? 
# --- 寫入 textfile metric --- -TS=$(date +%s%3N) +# NOTE: the node-exporter textfile collector does not support explicit sample timestamps (neither 13-digit milliseconds nor 10-digit seconds). +# Leave the timestamp off the metric lines; the sample time is assigned when node-exporter is scraped. +TS=$(date +%s) mkdir -p "$(dirname "${TEXTFILE}")" if [ "${EXIT_CODE}" -eq 0 ]; then @@ -38,10 +40,10 @@ if [ "${EXIT_CODE}" -eq 0 ]; then cat > "${TEXTFILE}" < "${TEXTFILE}" <