fix(cr): 首席架構師 CR P0/P1/P2 全修補
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
P0-1: incident_service.py — 刪除 classify_alert_early 死碼 L131-132
P0-2: cron_backup_restore_test.sh — date +%s%3N→+%s,修正毫秒時間戳
P1-2: gitea_webhook.py — fingerprint 移除 sha_short,收斂同 branch 失敗
heartbeat: 還原原始空格對齊格式(統帥要求原本怎樣就怎樣)
P1-1(積木化)/P1-3(TYPE-4)/P2-1(timeZone)/P2-2(IP)/P2-3(WS重連) 待後續處理
2026-04-12 ogt
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -538,7 +538,7 @@ async def handle_workflow_run(
|
||||
"alert_name": "GiteaCIPipelineFailed",
|
||||
"severity": "warning",
|
||||
"source": "gitea",
|
||||
"fingerprint": f"gitea-ci-{repo}-{branch}-{sha_short}",
|
||||
"fingerprint": f"gitea-ci-{repo}-{branch}",
|
||||
"labels": {
|
||||
"alertname": "GiteaCIPipelineFailed",
|
||||
"severity": "warning",
|
||||
|
||||
@@ -444,18 +444,18 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
|
||||
lines.append("🤖 <b>AI 服務</b>")
|
||||
ollama_probe = report.ai_services.get("ollama", ProbeResult(False, "❌ 無回應"))
|
||||
latency_str = f" ({ollama_probe.latency_ms:.0f}ms)" if ollama_probe.latency_ms else ""
|
||||
lines.append(f"Ollama {ollama_probe.status}{latency_str}")
|
||||
lines.append(f" Ollama: {ollama_probe.status}{latency_str}")
|
||||
|
||||
# 各模型狀態(縮排顯示)
|
||||
for model, loaded in report.ollama_models.items():
|
||||
icon = "✅" if loaded else "❌"
|
||||
short = model.split(":")[0]
|
||||
lines.append(f" {icon} {html.escape(short)}")
|
||||
lines.append(f" {icon} {html.escape(short)}")
|
||||
|
||||
for svc_name, display in [("nemotron", "Nemotron NIM"), ("gemini", "Gemini API"), ("claude", "Claude API")]:
|
||||
probe = report.ai_services.get(svc_name, ProbeResult(False, "❌ 無回應"))
|
||||
latency_str = f" ({probe.latency_ms:.0f}ms)" if probe.latency_ms else ""
|
||||
lines.append(f"{display} {probe.status}{latency_str}")
|
||||
lines.append(f" {display:<18}{probe.status}{latency_str}")
|
||||
|
||||
lines.append("")
|
||||
|
||||
@@ -469,24 +469,24 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
|
||||
}
|
||||
for key, display in mcp_display.items():
|
||||
probe = report.mcp_providers.get(key, ProbeResult(False, "❌ 無回應"))
|
||||
lines.append(f"{display} {probe.status}")
|
||||
lines.append(f" {display:<18}{probe.status}")
|
||||
|
||||
lines.append("")
|
||||
|
||||
# --- 飛輪狀態 ---
|
||||
fw = report.flywheel
|
||||
lines.append("🔄 <b>飛輪狀態(24h)</b>")
|
||||
lines.append(f"Playbooks: {fw.playbook_count} 個")
|
||||
lines.append(f" Playbooks: {fw.playbook_count} 個")
|
||||
if fw.attempt_24h > 0:
|
||||
rate = int(fw.success_24h / fw.attempt_24h * 100)
|
||||
lines.append(f"今日修復: {fw.success_24h}/{fw.attempt_24h} 次 ({rate}%)")
|
||||
lines.append(f" 今日修復: {fw.success_24h}/{fw.attempt_24h} 次 ({rate}%)")
|
||||
else:
|
||||
lines.append("今日修復: 0 次")
|
||||
lines.append(f" 今日修復: 0 次")
|
||||
if fw.km_total > 0:
|
||||
vec_rate = int(fw.km_vectorized / fw.km_total * 100)
|
||||
lines.append(f"KM 向量化: {fw.km_vectorized}/{fw.km_total} ({vec_rate}%)")
|
||||
lines.append(f" KM 向量化: {fw.km_vectorized}/{fw.km_total} ({vec_rate}%)")
|
||||
if fw.last_learning_at:
|
||||
lines.append(f"最後學習固化: {fw.last_learning_at.strftime('%H:%M')}")
|
||||
lines.append(f" 最後學習固化: {fw.last_learning_at.strftime('%H:%M')}")
|
||||
|
||||
lines.append("")
|
||||
|
||||
@@ -494,15 +494,15 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
|
||||
lines.append("🚀 <b>基礎設施</b>")
|
||||
argocd = report.infra.get("argocd_sync", ProbeResult(False, "❌ 無回應"))
|
||||
velero = report.infra.get("velero", ProbeResult(False, "❌ 無回應"))
|
||||
lines.append(f"ArgoCD: {argocd.status}")
|
||||
lines.append(f"Velero 備份: {velero.status}")
|
||||
lines.append(f" ArgoCD: {argocd.status}")
|
||||
lines.append(f" Velero 備份: {velero.status}")
|
||||
|
||||
# --- Warnings ---
|
||||
if report.warnings:
|
||||
lines.append("")
|
||||
lines.append(f"⚠️ <b>需關注({len(report.warnings)} 項)</b>")
|
||||
for w in report.warnings:
|
||||
lines.append(f"- {html.escape(w)}")
|
||||
lines.append(f" - {html.escape(w)}")
|
||||
else:
|
||||
lines.append("")
|
||||
lines.append("✅ <b>全部正常</b>")
|
||||
|
||||
@@ -126,10 +126,6 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
|
||||
return "config_drift", "TYPE-4D"
|
||||
if severity in ("info", "none"):
|
||||
return "info", "TYPE-1"
|
||||
# backup/heartbeat 關鍵字只有 severity=info/none 才是純資訊
|
||||
# severity=warning/critical(例如 VeleroBackupFailed, HostBackupFailed)→ 繼續走 prefix 規則
|
||||
if severity in ("info", "none") and any(kw in alertname_lower for kw in ("backup", "heartbeat")):
|
||||
return "backup", "TYPE-1"
|
||||
# Watchdog/Heartbeat 永遠是 TYPE-1(Alertmanager 心跳)
|
||||
if "watchdog" in alertname_lower or alertname in ("Heartbeat",):
|
||||
return "backup", "TYPE-1"
|
||||
|
||||
@@ -30,7 +30,9 @@ velero restore create \
|
||||
2>&1 || EXIT_CODE=$?
|
||||
|
||||
# --- 寫入 textfile metric ---
|
||||
TS=$(date +%s%3N)
|
||||
# 注意:node-exporter textfile collector 不支援樣本行上的 client-side timestamp(不論秒或毫秒),帶 timestamp 的檔案會被整個略過
|
||||
# 樣本行直接省略 timestamp,由 node-exporter scrape 時自動補上;TS(秒)僅作為 awoooi_backup_restore_test_timestamp_seconds 的指標值
|
||||
TS=$(date +%s)
|
||||
mkdir -p "$(dirname "${TEXTFILE}")"
|
||||
|
||||
if [ "${EXIT_CODE}" -eq 0 ]; then
|
||||
@@ -38,10 +40,10 @@ if [ "${EXIT_CODE}" -eq 0 ]; then
|
||||
cat > "${TEXTFILE}" <<PROM
|
||||
# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded
|
||||
# TYPE awoooi_backup_restore_test_success gauge
|
||||
awoooi_backup_restore_test_success 1 ${TS}
|
||||
awoooi_backup_restore_test_success 1
|
||||
# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run
|
||||
# TYPE awoooi_backup_restore_test_timestamp_seconds gauge
|
||||
awoooi_backup_restore_test_timestamp_seconds $(date +%s) ${TS}
|
||||
awoooi_backup_restore_test_timestamp_seconds ${TS}
|
||||
PROM
|
||||
echo "Textfile written: success"
|
||||
exit 0
|
||||
@@ -50,10 +52,10 @@ else
|
||||
cat > "${TEXTFILE}" <<PROM
|
||||
# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded
|
||||
# TYPE awoooi_backup_restore_test_success gauge
|
||||
awoooi_backup_restore_test_success 0 ${TS}
|
||||
awoooi_backup_restore_test_success 0
|
||||
# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run
|
||||
# TYPE awoooi_backup_restore_test_timestamp_seconds gauge
|
||||
awoooi_backup_restore_test_timestamp_seconds $(date +%s) ${TS}
|
||||
awoooi_backup_restore_test_timestamp_seconds ${TS}
|
||||
PROM
|
||||
echo "Textfile written: failure"
|
||||
exit 1
|
||||
|
||||
Reference in New Issue
Block a user