fix(cr): 首席架構師 CR 修補 — P0 全數修復、P1-2 修復,其餘 P1/P2 待後續處理
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

P0-1: incident_service.py — 刪除 classify_alert_early 死碼 L131-132
P0-2: cron_backup_restore_test.sh — date +%s%3N→+%s,修正毫秒時間戳
P1-2: gitea_webhook.py — fingerprint 移除 sha_short,收斂同 branch 失敗
heartbeat: 還原原始空格對齊格式(統帥要求原本怎樣就怎樣)

P1-1(積木化)/P1-3(TYPE-4)/P2-1(timeZone)/P2-2(IP)/P2-3(WS重連) 待後續處理

2026-04-12 ogt
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-12 16:10:46 +08:00
parent d72c7d5ac4
commit a28625f088
4 changed files with 20 additions and 22 deletions

View File

@@ -538,7 +538,7 @@ async def handle_workflow_run(
"alert_name": "GiteaCIPipelineFailed",
"severity": "warning",
"source": "gitea",
"fingerprint": f"gitea-ci-{repo}-{branch}-{sha_short}",
"fingerprint": f"gitea-ci-{repo}-{branch}",
"labels": {
"alertname": "GiteaCIPipelineFailed",
"severity": "warning",

View File

@@ -444,18 +444,18 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
lines.append("🤖 <b>AI 服務</b>")
ollama_probe = report.ai_services.get("ollama", ProbeResult(False, "❌ 無回應"))
latency_str = f" ({ollama_probe.latency_ms:.0f}ms)" if ollama_probe.latency_ms else ""
lines.append(f"Ollama {ollama_probe.status}{latency_str}")
lines.append(f" Ollama: {ollama_probe.status}{latency_str}")
# 各模型狀態(縮排顯示)
for model, loaded in report.ollama_models.items():
icon = "" if loaded else ""
short = model.split(":")[0]
lines.append(f" {icon} {html.escape(short)}")
lines.append(f" {icon} {html.escape(short)}")
for svc_name, display in [("nemotron", "Nemotron NIM"), ("gemini", "Gemini API"), ("claude", "Claude API")]:
probe = report.ai_services.get(svc_name, ProbeResult(False, "❌ 無回應"))
latency_str = f" ({probe.latency_ms:.0f}ms)" if probe.latency_ms else ""
lines.append(f"{display} {probe.status}{latency_str}")
lines.append(f" {display:<18}{probe.status}{latency_str}")
lines.append("")
@@ -469,24 +469,24 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
}
for key, display in mcp_display.items():
probe = report.mcp_providers.get(key, ProbeResult(False, "❌ 無回應"))
lines.append(f"{display} {probe.status}")
lines.append(f" {display:<18}{probe.status}")
lines.append("")
# --- 飛輪狀態 ---
fw = report.flywheel
lines.append("🔄 <b>飛輪狀態24h</b>")
lines.append(f"Playbooks: {fw.playbook_count}")
lines.append(f" Playbooks: {fw.playbook_count}")
if fw.attempt_24h > 0:
rate = int(fw.success_24h / fw.attempt_24h * 100)
lines.append(f"今日修復: {fw.success_24h}/{fw.attempt_24h} 次 ({rate}%)")
lines.append(f" 今日修復: {fw.success_24h}/{fw.attempt_24h} 次 ({rate}%)")
else:
lines.append("今日修復: 0 次")
lines.append(f" 今日修復: 0 次")
if fw.km_total > 0:
vec_rate = int(fw.km_vectorized / fw.km_total * 100)
lines.append(f"KM 向量化: {fw.km_vectorized}/{fw.km_total} ({vec_rate}%)")
lines.append(f" KM 向量化: {fw.km_vectorized}/{fw.km_total} ({vec_rate}%)")
if fw.last_learning_at:
lines.append(f"最後學習固化: {fw.last_learning_at.strftime('%H:%M')}")
lines.append(f" 最後學習固化: {fw.last_learning_at.strftime('%H:%M')}")
lines.append("")
@@ -494,15 +494,15 @@ def report_to_telegram_html(report: HeartbeatReport) -> str:
lines.append("🚀 <b>基礎設施</b>")
argocd = report.infra.get("argocd_sync", ProbeResult(False, "❌ 無回應"))
velero = report.infra.get("velero", ProbeResult(False, "❌ 無回應"))
lines.append(f"ArgoCD: {argocd.status}")
lines.append(f"Velero 備份: {velero.status}")
lines.append(f" ArgoCD: {argocd.status}")
lines.append(f" Velero 備份: {velero.status}")
# --- Warnings ---
if report.warnings:
lines.append("")
lines.append(f"⚠️ <b>需關注({len(report.warnings)} 項)</b>")
for w in report.warnings:
lines.append(f"- {html.escape(w)}")
lines.append(f" - {html.escape(w)}")
else:
lines.append("")
lines.append("✅ <b>全部正常</b>")

View File

@@ -126,10 +126,6 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No
return "config_drift", "TYPE-4D"
if severity in ("info", "none"):
return "info", "TYPE-1"
# backup/heartbeat 關鍵字:只有 severity=info/none 才是純資訊
# severity=warning/critical(例如 VeleroBackupFailed, HostBackupFailed)→ 繼續走 prefix 規則
if severity in ("info", "none") and any(kw in alertname_lower for kw in ("backup", "heartbeat")):
return "backup", "TYPE-1"
# Watchdog/Heartbeat 永遠是 TYPE-1(Alertmanager 心跳)
if "watchdog" in alertname_lower or alertname in ("Heartbeat",):
return "backup", "TYPE-1"

View File

@@ -30,7 +30,9 @@ velero restore create \
2>&1 || EXIT_CODE=$?
# --- 寫入 textfile metric ---
TS=$(date +%s%3N)
# 注意:Prometheus node-exporter textfile collector 不支援樣本行附帶 timestamp(毫秒 13 位或秒 10 位皆不接受)
# 因此直接省略 timestamp,由 node-exporter scrape 時自動補上;TS 僅作為 timestamp_seconds 指標的值(秒)
TS=$(date +%s)
mkdir -p "$(dirname "${TEXTFILE}")"
if [ "${EXIT_CODE}" -eq 0 ]; then
@@ -38,10 +40,10 @@ if [ "${EXIT_CODE}" -eq 0 ]; then
cat > "${TEXTFILE}" <<PROM
# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded
# TYPE awoooi_backup_restore_test_success gauge
awoooi_backup_restore_test_success 1 ${TS}
awoooi_backup_restore_test_success 1
# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run
# TYPE awoooi_backup_restore_test_timestamp_seconds gauge
awoooi_backup_restore_test_timestamp_seconds $(date +%s) ${TS}
awoooi_backup_restore_test_timestamp_seconds ${TS}
PROM
echo "Textfile written: success"
exit 0
@@ -50,10 +52,10 @@ else
cat > "${TEXTFILE}" <<PROM
# HELP awoooi_backup_restore_test_success 1 = last backup restore dry-run succeeded
# TYPE awoooi_backup_restore_test_success gauge
awoooi_backup_restore_test_success 0 ${TS}
awoooi_backup_restore_test_success 0
# HELP awoooi_backup_restore_test_timestamp_seconds Unix timestamp of last test run
# TYPE awoooi_backup_restore_test_timestamp_seconds gauge
awoooi_backup_restore_test_timestamp_seconds $(date +%s) ${TS}
awoooi_backup_restore_test_timestamp_seconds ${TS}
PROM
echo "Textfile written: failure"
exit 1