fix(monitoring): fix false positive NoAlertsReceived2Hours by filtering only alertmanager source
This commit is contained in:
@@ -88,7 +88,7 @@ spec:
|
||||
# -----------------------------------------------------------------
|
||||
- alert: NoAlertsReceived2Hours
|
||||
expr: |
|
||||
time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
|
||||
time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
@@ -15,6 +15,39 @@
|
||||
|
||||
groups:
|
||||
|
||||
# =========================================================================
|
||||
# Full-stack recovery scorecard recording rules
|
||||
# =========================================================================
|
||||
- name: full_stack_recovery_scorecard_rules
|
||||
interval: 60s
|
||||
rules:
|
||||
# Recording rule for host "110" / scope "110_120_121_188": the product of four
# 0/1 comparisons joined on (host, scope) -- last result "green" equals 1,
# warn gates equal 0, blocked gates equal 0, and the last green timestamp is
# less than 3600 seconds behind time().
- expr: |
    sum without(result) (
    awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} == bool 1
    )
    * on(host,scope) (
    awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} == bool 0
    )
    * on(host,scope) (
    awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} == bool 0
    )
    * on(host,scope) (
    (time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"}) < bool 3600
    )
  record: awoooi_recovery_core_ready
|
||||
|
||||
# Recording rule for host "110": the product, joined on host, of three 0/1
# comparisons -- max of "offsite configured == 1", max of "offsite fresh == 1",
# and min of "credential escrow fresh == 1" (min, so every escrow series must
# be 1 for the product to be 1).
- expr: |
    max by(host) (
    awoooi_backup_offsite_configured{host="110"} == bool 1
    )
    * on(host) max by(host) (
    awoooi_backup_offsite_fresh{host="110"} == bool 1
    )
    * on(host) min by(host) (
    awoooi_backup_credential_escrow_fresh{host="110"} == bool 1
    )
  record: awoooi_recovery_dr_offsite_ready
|
||||
|
||||
# =========================================================================
|
||||
# 主機層告警 (host_alerts)
|
||||
# =========================================================================
|
||||
@@ -41,7 +74,7 @@ groups:
|
||||
severity: warning
|
||||
layer: systemd-188
|
||||
team: ops
|
||||
auto_repair: "true"
|
||||
auto_repair: "false"
|
||||
# MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP 路由標籤
|
||||
mcp_provider: "ssh_host"
|
||||
host_type: "bare_metal"
|
||||
@@ -167,7 +200,7 @@ groups:
|
||||
description: "過去 24 小時有備份失敗"
|
||||
|
||||
- alert: VeleroBackupNotRun
|
||||
expr: time() - velero_backup_last_successful_timestamp > 86400
|
||||
expr: max by(host, namespace) (awoooi_velero_latest_completed_backup_fresh{host="110",namespace="velero"}) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -177,7 +210,7 @@ groups:
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "Velero 超過 24 小時未成功備份"
|
||||
description: "最後一次成功備份超過 24 小時"
|
||||
description: "backup health exporter 顯示 latest Completed Velero backup 超過 25 小時或不存在。"
|
||||
|
||||
# Sprint C-2 Host rsync 備份告警 (2026-04-11 Claude Sonnet 4.6)
|
||||
# backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
|
||||
@@ -507,7 +540,7 @@ groups:
|
||||
description: "Sentry 錯誤可能無法正確處理"
|
||||
|
||||
- alert: NoAlertsReceived2Hours
|
||||
expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
|
||||
expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -1013,10 +1046,10 @@ groups:
|
||||
# 備份還原告警 (awoooi_backup_restore) — 從主機補回 2026-04-12
|
||||
# =========================================================================
|
||||
- name: awoooi_backup_restore
|
||||
interval: 1h
|
||||
interval: 1m
|
||||
rules:
|
||||
- alert: BackupRestoreTestFailed
|
||||
expr: awoooi_backup_restore_test_success == 0
|
||||
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_failed_jobs{host="110",namespace="velero",cronjob="backup-restore-test"}) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -1025,11 +1058,37 @@ groups:
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "備份還原 dry-run 測試失敗"
|
||||
description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。"
|
||||
runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"
|
||||
description: "velero namespace 中保留了失敗的 backup-restore-test Job,備份可能無法還原。立即人工驗證備份狀態。"
|
||||
runbook: "先找最新 Completed Velero backup,再執行 restore dry-run;禁止在 production namespace 做真還原"
|
||||
|
||||
# Warning once the awoooi_velero_restore_test_cron_present series for
# host 110 / namespace velero / cronjob backup-restore-test has been absent
# for 30 minutes.
- alert: BackupRestoreTestMissing
  for: 30m
  expr: >-
    absent(awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"})
  annotations:
    summary: "備份還原 dry-run 監控指標缺失"
    description: "Prometheus 沒有收到 awoooi_velero_restore_test_cron_present;110 backup health exporter 或 120 kubectl 查詢可能失效。"
    runbook: "檢查 110 backup_health.prom、SSH 110→120、以及 velero namespace 的 backup-restore-test CronJob"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    notification_type: TYPE-3
    severity: warning
|
||||
|
||||
# Critical when the max of awoooi_velero_restore_test_cron_present per
# (host, namespace, cronjob) for the backup-restore-test cronjob equals 0
# for 15 minutes.
- alert: BackupRestoreTestCronMissing
  for: 15m
  expr: >-
    max by(host, namespace, cronjob) (awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
  annotations:
    summary: "備份還原 dry-run CronJob 缺失"
    description: "velero namespace 找不到 backup-restore-test CronJob;備份可還原性沒有定期驗證。"
    runbook: "kubectl apply k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml 與 16-cronjob-backup-restore-test.yaml"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    notification_type: TYPE-3
    severity: critical
|
||||
|
||||
- alert: BackupRestoreTestStale
|
||||
expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
|
||||
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_last_success_fresh{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -1038,9 +1097,375 @@ groups:
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "備份還原測試超過 8 天未執行"
|
||||
description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
|
||||
description: "backup-restore-test CronJob 沒有 8 天內成功紀錄;週排程 CronJob 可能失效。"
|
||||
runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
|
||||
|
||||
# =========================================================================
|
||||
# Host / service / config backup health
|
||||
# =========================================================================
|
||||
- name: full_stack_backup_health_alerts
|
||||
interval: 1m
|
||||
rules:
|
||||
# Warning once no awoooi_backup_health_monitor_up series with host="110"
# has existed for 20 minutes.
- alert: BackupHealthMonitorMissing110
  for: 20m
  expr: >-
    absent(awoooi_backup_health_monitor_up{host="110"})
  annotations:
    summary: "110 備份健康指標缺失"
    description: "110 沒有輸出 backup_health.prom,無法確認資料庫、設定檔與服務備份是否新鮮。"
    runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-health-monitor
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning once no awoooi_backup_health_monitor_up series with host="188"
# has existed for 20 minutes.
- alert: BackupHealthMonitorMissing188
  for: 20m
  expr: >-
    absent(awoooi_backup_health_monitor_up{host="188"})
  annotations:
    summary: "188 備份健康指標缺失"
    description: "188 沒有輸出 backup_health.prom,無法確認 110 rsync 與 momo PostgreSQL 備份是否新鮮。"
    runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-health-monitor
    host: "188"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning when awoooi_backup_health_last_run_timestamp on host 110 or 188
# is more than 1800 seconds behind time() for 10 minutes.
- alert: BackupHealthMonitorStale
  for: 10m
  expr: >-
    time() - awoooi_backup_health_last_run_timestamp{host=~"110|188"} > 1800
  annotations:
    summary: "{{ $labels.host }} 備份健康 exporter 超過 30 分鐘未更新"
    description: "backup health textfile exporter stale,備份狀態不可觀測。"
    runbook: "SSH 主機檢查 cron、/tmp/awoooi-backup-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-health-monitor
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Critical when awoooi_backup_job_configured equals 0 on host 110 or 188
# for 15 minutes.
- alert: BackupExpectedJobMissing
  for: 15m
  expr: >-
    awoooi_backup_job_configured{host=~"110|188"} == 0
  annotations:
    summary: "{{ $labels.host }} 備份排程缺失:{{ $labels.exported_job }}"
    description: "預期備份 cron/config 不存在;下一次重開機後資料可能沒有可用還原點。"
    runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的備份章節補回 cron,先 dry-run 再執行"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-cron
    layer: host-backup
    notification_type: TYPE-3
    severity: critical
    team: ops
|
||||
|
||||
# Warning when awoooi_backup_cron_active_duplicate_count on host 110 is
# above 0 for 15 minutes; the description reports the metric value.
- alert: BackupScheduleDuplicateActiveEntries
  for: 15m
  expr: >-
    awoooi_backup_cron_active_duplicate_count{host="110"} > 0
  annotations:
    summary: "110 備份 crontab 有重複 active entries"
    description: "110 crontab 目前有 {{ $value }} 個 exact duplicate active entry;可能造成 offsite sync、verifier 或 status job 重複執行。"
    runbook: "SSH 110 執行 `crontab -l | awk 'NF && $0 !~ /^#/ {count[$0]++} END {for (line in count) if (count[line] > 1) print count[line], line}'`,只移除重複 active entry,不要刪除未理解的備份排程。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-cron
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning when awoooi_backup_cron_singular_entry_ok on host 110 equals 0
# for 15 minutes.
# The expression filters on "== 0", so $value is the ok-flag (always 0 while
# firing), not a count of cron entries. The description therefore reports the
# flag by name instead of printing $value as "count", which would always read
# count=0 even when the real problem is duplicate entries.
- alert: BackupScheduleSingletonMismatch
  expr: awoooi_backup_cron_singular_entry_ok{host="110"} == 0
  for: 15m
  labels:
    severity: warning
    layer: host-backup
    component: backup-cron
    host: "110"
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "false"
  annotations:
    summary: "110 備份排程單一入口異常:{{ $labels.entry }}"
    description: "{{ $labels.entry }} 應該剛好只有一個 active cron entry;目前 awoooi_backup_cron_singular_entry_ok=0(不是剛好一個),可能造成排程缺失或重複執行。"
    runbook: "用 Ansible `110-devops.yml --tags backup_jobs` 收斂排程,並用 `scripts/reboot-recovery/reboot-recovery-readiness-audit.sh --live --no-color` 驗證。"
|
||||
|
||||
# Critical when awoooi_backup_script_present equals 0 on host 110 or 188
# for 15 minutes.
- alert: BackupScriptMissing
  for: 15m
  expr: >-
    awoooi_backup_script_present{host=~"110|188"} == 0
  annotations:
    summary: "{{ $labels.host }} 備份腳本缺失:{{ $labels.script }}"
    description: "備份排程可能存在,但實際腳本不存在或路徑漂移。"
    runbook: "從 repo 部署對應 scripts/backup 或 scripts/ops 腳本,確認權限 0755"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-script
    layer: host-backup
    notification_type: TYPE-3
    severity: critical
    team: ops
|
||||
|
||||
# Critical when awoooi_backup_job_fresh equals 0 on host 110 or 188 for
# 15 minutes. Annotations interpolate the series' exported_job,
# max_age_hours, source and target labels.
- alert: BackupJobStale
  for: 15m
  expr: >-
    awoooi_backup_job_fresh{host=~"110|188"} == 0
  annotations:
    summary: "{{ $labels.host }} 備份過舊:{{ $labels.exported_job }}"
    description: "{{ $labels.exported_job }} 最新成功證據超過 {{ $labels.max_age_hours }} 小時或不存在;來源 {{ $labels.source }},目標 {{ $labels.target }}。"
    runbook: "先檢查備份 log 與磁碟空間,再手動執行對應備份;禁止直接刪除舊備份或 production 資料"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-freshness
    layer: host-backup
    notification_type: TYPE-3
    severity: critical
    team: ops
|
||||
|
||||
# Warning when awoooi_backup_last_run_failed_count for exported_job
# "backup_all" on host 110 is above 0 for 10 minutes; the summary reports
# the metric value.
- alert: BackupAggregateRunFailed
  for: 10m
  expr: >-
    awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"} > 0
  annotations:
    summary: "110 全服務備份最近一次有 {{ $value }} 個失敗項目"
    description: "backup-all.sh 最近一次 aggregate run 仍有失敗;即使個別 DB 備份已手動補跑,也要重跑 aggregate backup 清除紅燈。"
    runbook: "SSH 110 檢查 /backup/logs/cron.log 與 /backup/logs/backup.log,修正後執行 /backup/scripts/backup-all.sh"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-all
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning when an awoooi_backup_config_capture_ok series labelled
# critical="true" on host 110 equals 0 for 10 minutes.
- alert: BackupConfigCapturePartial
  for: 10m
  expr: >-
    awoooi_backup_config_capture_ok{host="110",critical="true"} == 0
  annotations:
    summary: "110 設定檔備份缺少關鍵目標:{{ $labels.target }}"
    description: "configs restic snapshot 雖可能存在,但最新設定檔備份未成功捕捉 {{ $labels.target }};source={{ $labels.source }}。這會影響下一次冷啟動或災難還原的設定還原完整性。"
    runbook: "先修復對應主機或 K8s API 可達性,再執行 /backup/scripts/backup-configs.sh,確認 awoooi_backup_config_capture_ok 回到 1,最後補跑 Google Drive/rclone offsite sync。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-config-capture
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning when awoooi_backup_config_capture_status_timestamp for host 110
# is either absent or more than 172800 seconds behind time(), for 30 minutes.
- alert: BackupConfigCaptureStatusStale
  for: 30m
  expr: >-
    absent(awoooi_backup_config_capture_status_timestamp{host="110"}) or (time() - awoooi_backup_config_capture_status_timestamp{host="110"} > 172800)
  annotations:
    summary: "110 設定檔備份覆蓋率狀態缺失或過舊"
    description: "backup-configs.sh 沒有新鮮的 capture status;無法判斷 110/120/121/188/K8s 設定檔是否真的被最新 snapshot 捕捉。"
    runbook: "部署新版 /backup/scripts/backup-configs.sh 與 /home/wooo/scripts/backup-health-textfile-exporter.py,執行 /backup/scripts/backup-configs.sh 後刷新 textfile exporter。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-config-capture
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Critical when awoooi_backup_integrity_fresh with scope="restic_check" on
# host 110 is either absent or equal to 0, for 30 minutes.
- alert: BackupIntegrityCheckMissingOrFailed
  for: 30m
  expr: >-
    absent(awoooi_backup_integrity_fresh{host="110",scope="restic_check"}) or awoooi_backup_integrity_fresh{host="110",scope="restic_check"} == 0
  annotations:
    summary: "110 備份倉庫完整性檢查缺失或失敗"
    description: "每週 restic check 沒有成功證據,或有 repo 檢查失敗;目前不能假設備份可讀。"
    runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode check`,先看 /backup/logs/backup-integrity.log;禁止刪 repo 或 prune 直到確認原因"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-integrity
    host: "110"
    layer: host-backup
    notification_type: TYPE-3
    severity: critical
    team: ops
|
||||
|
||||
# Warning when awoooi_backup_integrity_fresh with scope="restore_drill" on
# host 110 is either absent or equal to 0, for 30 minutes.
- alert: BackupRestoreDrillMissingOrFailed
  for: 30m
  expr: >-
    absent(awoooi_backup_integrity_fresh{host="110",scope="restore_drill"}) or awoooi_backup_integrity_fresh{host="110",scope="restore_drill"} == 0
  annotations:
    summary: "110 備份抽樣還原演練缺失或失敗"
    description: "每月 restore drill 沒有成功證據,備份雖可能新鮮,但尚未驗證可讀取還原。"
    runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode restore-drill`;只允許還原到隔離暫存目錄,不得覆蓋 production"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-restore-drill
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning when the per-host sum of awoooi_backup_offsite_configured on
# host 110 equals 0 for 1 minute.
- alert: BackupOffsiteCopyNotConfigured
  for: 1m
  expr: >-
    sum by(host) (awoooi_backup_offsite_configured{host="110"}) == 0
  annotations:
    summary: "110 尚未配置離機備份 provider"
    description: "backup health exporter 未偵測到 Google Drive/rclone 或其他 offsite provider 配置;本地 restic 全綠仍不等於異地可恢復。"
    runbook: "在 110 以 `/backup/scripts/configure-offsite-rclone.sh --interactive` 建立 Google Drive remote,產生 `/backup/offsite/*last_success` 證據;不得把 provider token 寫入 repo、Telegram 或 Prometheus label。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-offsite
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning, held for 2 hours, when on host 110 the offsite "configured" sum is
# above 0 and the offsite "fresh" sum equals 0, and additionally either the
# full-sync-enabled sum equals 0 or the newest full-sync-enabled timestamp is
# more than 30 * 3600 seconds behind time().
- alert: BackupOffsiteCopyStale
  for: 2h
  expr: |
    (
    (sum by(host) (awoooi_backup_offsite_configured{host="110"}) > 0)
    and
    (sum by(host) (awoooi_backup_offsite_fresh{host="110"}) == 0)
    )
    and
    (
    (sum by(host) (awoooi_backup_offsite_full_sync_enabled{host="110"}) == 0)
    or
    ((time() - max by(host) (awoooi_backup_offsite_full_sync_enabled_timestamp{host="110"})) > 30 * 3600)
    )
  annotations:
    summary: "110 離機備份超過 48 小時未成功"
    description: "已偵測到 offsite provider 配置,但沒有新鮮成功標記;本地備份可能無法抵抗整台 110 遺失。"
    runbook: "SSH 110 檢查 Google Drive/rclone 同步 log 與 `/backup/offsite/*last_success`;full sync 需在 enable marker 與低負載門檻成立後由 `/backup/scripts/sync-offsite-backups.sh --mode sync` 鏡像本地 latest-only repo。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-offsite
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning, held for 15 minutes, when on host 110 either
# awoooi_backup_retention_latest_only or (for provider "rclone")
# awoooi_backup_retention_offsite_delete_old_enabled is absent or not
# equal to 1.
- alert: BackupRetentionPolicyNotLatestOnly
  for: 15m
  expr: |
    absent(awoooi_backup_retention_latest_only{host="110"})
    or
    awoooi_backup_retention_latest_only{host="110"} != 1
    or
    absent(awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"})
    or
    awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"} != 1
  annotations:
    summary: "110 備份保留策略不是 latest-only"
    description: "operator 要求所有備份只保留最新一份;本地 restic 必須 keep-last=1,Google Drive/rclone 必須在成功 mirror 後刪除遠端舊檔。"
    runbook: "檢查 `/backup/scripts/common.sh` 的 BACKUP_RETENTION_MODE=latest、KEEP_LAST=1 與 OFFSITE_SYNC_DELETE_OLD=1,刷新 backup-health textfile;必要時在備份成功後執行 `/backup/scripts/enforce-latest-only-retention.sh`。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-retention
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning when awoooi_backup_job_snapshot_count with type="restic" on
# host 110 is above 1 for 30 minutes; the description reports the count.
- alert: BackupSnapshotRetentionExceeded
  for: 30m
  expr: >-
    awoooi_backup_job_snapshot_count{host="110",type="restic"} > 1
  annotations:
    summary: "110 備份 repo {{ $labels.exported_job }} 保留超過 1 份 snapshot"
    description: "{{ $labels.exported_job }} 目前有 {{ $value }} 份 restic snapshot;latest-only 策略要求每個 repo 全域只保留最新 1 份。"
    runbook: "SSH 110 執行 `/backup/scripts/enforce-latest-only-retention.sh`;若仍未收斂,確認 `common.sh` 使用 `restic forget --group-by \"\" --keep-last 1 --prune`,避免 restic 依 path/tag 分組保留多份。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-retention
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning, held for 30 minutes, when the rclone offsite "fresh" series on
# host 110 equals 1 and no awoooi_backup_offsite_remote_verify_ok series
# equal to 1 matches it on (host, provider).
- alert: BackupOffsiteFullVerifyFailed
  for: 30m
  expr: |
    awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1
    unless on(host, provider)
    (awoooi_backup_offsite_remote_verify_ok{host="110",provider="rclone"} == 1)
  annotations:
    summary: "110 Google Drive full sync 完成但遠端驗證未通過"
    description: "full offsite marker 已 fresh,但 verify-offsite-full-sync.sh 沒有證明 13 個 Google Drive repo 都可列出且符合 latest-only。"
    runbook: "SSH 110 執行 `/backup/scripts/verify-offsite-full-sync.sh --write-textfile --no-color`,檢查 `/backup/logs/offsite-full-sync-verify.log` 與 `/home/wooo/node_exporter_textfiles/offsite_full_sync_verify.prom`。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-offsite
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning, held for 30 minutes, when a rclone
# awoooi_backup_offsite_remote_snapshot_count series on host 110 is above 1
# and the matching (host, provider) offsite "fresh" series equals 1.
- alert: BackupOffsiteRemoteSnapshotRetentionExceeded
  for: 30m
  expr: |
    (awoooi_backup_offsite_remote_snapshot_count{host="110",provider="rclone"} > 1)
    and on(host, provider)
    (awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1)
  annotations:
    summary: "Google Drive repo {{ $labels.repo }} 保留超過 1 份 snapshot"
    description: "{{ $labels.repo }} 在 Google Drive/rclone 遠端目前有 {{ $value }} 份 snapshot;latest-only 策略要求遠端也只保留最新一份。"
    runbook: "確認 110 `/backup/scripts/sync-offsite-backups.sh --mode sync` 使用 `rclone sync`、`OFFSITE_SYNC_DELETE_OLD=1`、`RCLONE_DRIVE_USE_TRASH=false`,再於低峰重新執行 full sync 與 verifier。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: backup-retention
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning when an awoooi_backup_credential_escrow_fresh series on host 110
# equals 0 for 1 minute; annotations interpolate the series' item label.
- alert: BackupCredentialEscrowEvidenceMissing
  for: 1m
  expr: >-
    awoooi_backup_credential_escrow_fresh{host="110"} == 0
  annotations:
    summary: "備份憑證金庫證據缺失或過期:{{ $labels.item }}"
    description: "{{ $labels.item }} 沒有 31 天內人工驗證證據;重建時可能找不到 restic/offsite/break-glass/DNS/OAuth 復原材料。"
    runbook: "在密碼管理器或離線加密金庫完成雙人覆核後,只建立不含 secret 的 `/backup/escrow-evidence/{{ $labels.item }}.last_verified` 時間戳證據。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: credential-escrow
    host: "110"
    layer: host-backup
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# =========================================================================
|
||||
# 基礎設施詳細告警 (awoooi_infrastructure_detailed) — 從主機補回 2026-04-12
|
||||
# =========================================================================
|
||||
@@ -1323,3 +1748,284 @@ groups:
|
||||
summary: "Prometheus ({{ $labels.instance }}) 停擺"
|
||||
description: "Prometheus 自己停擺 → 所有其他告警失效"
|
||||
runbook: "SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus"
|
||||
|
||||
# =========================================================================
|
||||
# Full-stack cold-start recovery gate
|
||||
# =========================================================================
|
||||
- name: cold_start_recovery_alerts
|
||||
rules:
|
||||
# Critical, held for 10 minutes, when for host 110 any of these holds: the
# drift guard last-run timestamp is absent, that timestamp is more than
# 900 seconds behind time(), the missing-required count is above 0, or the
# current-matches-canonical flag equals 0.
- alert: PrometheusRuleDriftGuardFailed
  for: 10m
  expr: |
    absent(awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"})
    or
    (time() - max by(host) (awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"}) > 900)
    or
    (awoooi_prometheus_rule_drift_guard_missing_required_count{host="110"} > 0)
    or
    (awoooi_prometheus_rule_drift_guard_current_matches_canonical{host="110"} == 0)
  annotations:
    summary: "Prometheus 規則漂移防護失效"
    description: "110 Prometheus rule drift guard 沒有新鮮成功指標、required rules 缺失,或 active alerts.yml 不等於 canonical rules。"
    runbook: "執行 `bash scripts/ops/deploy-alerts.sh` 重新部署 canonical rules 與 drift guard,等待 1-2 個 Prometheus evaluation cycle 後重跑 readiness audit。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: prometheus-rule-drift-guard
    host: "110"
    layer: systemd-110
    notification_type: TYPE-3
    severity: critical
    team: ops
|
||||
|
||||
# Warning when awoooi_prometheus_rule_drift_guard_repaired on host 110 is
# above 0 for 1 minute.
- alert: PrometheusRuleDriftAutoRepaired
  for: 1m
  expr: >-
    awoooi_prometheus_rule_drift_guard_repaired{host="110"} > 0
  annotations:
    summary: "Prometheus 規則漂移已被自動修復"
    description: "110 drift guard 最近一次偵測到 active Prometheus rules 漂移,已回復 canonical rules 並 reload Prometheus。"
    runbook: "檢查 `/home/wooo/logs/prometheus-rule-drift-guard.log` 與 `/home/wooo/monitoring/alerts.yml.guard.bak.*`,找出誰覆寫了 active rules。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: prometheus-rule-drift-guard
    host: "110"
    layer: systemd-110
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning once the awoooi_cold_start_monitor_up series for host 110 /
# scope 110_120_121_188 has been absent for 15 minutes.
- alert: ColdStartMonitorMissing
  for: 15m
  expr: >-
    absent(awoooi_cold_start_monitor_up{host="110",scope="110_120_121_188"})
  annotations:
    summary: "Cold-start monitor textfile metric missing"
    description: "110 沒有輸出 awoooi_cold_start_monitor_up;重開機恢復 gate 目前不可觀測。"
    runbook: "執行 scripts/reboot-recovery/install-cold-start-monitor-110.sh,確認 /home/wooo/node_exporter_textfiles/cold_start_recovery.prom"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: cold-start-monitor
    layer: systemd-110
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning when awoooi_cold_start_last_run_timestamp for host 110 /
# scope 110_120_121_188 is more than 900 seconds behind time() for
# 10 minutes; the description renders the elapsed seconds as a duration.
- alert: ColdStartMonitorStale
  for: 10m
  expr: >-
    time() - awoooi_cold_start_last_run_timestamp{host="110",scope="110_120_121_188"} > 900
  annotations:
    summary: "Cold-start monitor stale"
    description: "cold-start monitor 超過 15 分鐘沒有更新,距離上次執行 {{ $value | humanizeDuration }}。"
    runbook: "SSH 110 檢查 crontab、/tmp/awoooi-cold-start-monitor.cron.log、cold-start-last.log"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: cold-start-monitor
    layer: systemd-110
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Critical when awoooi_cold_start_blocked_gates for host 110 /
# scope 110_120_121_188 is above 0 for 5 minutes; the description reports
# the gate count.
- alert: ColdStartRecoveryBlocked
  for: 5m
  expr: >-
    awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} > 0
  annotations:
    summary: "Full-stack cold-start recovery BLOCKED"
    description: "cold-start gate 有 {{ $value }} 個 BLOCKED gate。AI 修復需保持 observe-only,先處理第一個 blocked gate。"
    runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log;依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的 P0→P2 順序修復"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: cold-start-gate
    layer: full-stack
    notification_type: TYPE-3
    severity: critical
    team: ops
|
||||
|
||||
# Critical when the awoooi_cold_start_blocker_reason series with
# reason="k3s_node_filesystem_error" and target="120" is above 0 for
# 5 minutes. The rule sets both host and target_host labels to "120".
- alert: K3sNodeFilesystemErrorGateBlocked
  for: 5m
  expr: >-
    awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="k3s_node_filesystem_error",target="120"} > 0
  annotations:
    summary: "120 K3s 節點 filesystem error 阻擋重開機放行"
    description: "cold-start log 偵測到 120 Node event 仍有 EXT4/I/O/deleted inode 類錯誤;即使 Pod Running、網站 200,也不可宣告下一次重開機安全。"
    runbook: "查看 110 `/home/wooo/reboot-recovery/cold-start-last.log`,執行 `scripts/reboot-recovery/120-fsck-maintenance-checklist.sh` 做維護前只讀檢查;維護窗內用 console/rescue 對 120 root LV 執行 fsck,禁止 online fsck。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: node-filesystem
    host: "120"
    layer: k3s
    notification_type: TYPE-3
    severity: critical
    target_host: "120"
    team: ops
|
||||
|
||||
# Critical when the awoooi_cold_start_blocker_reason series with
# reason="host_unreachable" and target="120" is above 0 for 3 minutes.
# The rule sets both host and target_host labels to "120".
- alert: ColdStartHost120Unreachable
  for: 3m
  expr: >-
    awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="host_unreachable",target="120"} > 0
  annotations:
    summary: "120 主機不可達,Full-stack cold-start 已阻擋"
    description: "110 cold-start monitor 無法 ping/SSH 192.168.0.120;目前只能由 121/VIP 撐住 K3s,不能宣告所有主機重開機恢復完成。"
    runbook: "查看 120 console。若停在 initramfs/manual fsck,先對 root LV 做離線 fsck;若主機關機或網卡異常,先恢復電源/網路,再重跑 full-stack cold-start gate。禁止從自動修復直接重啟其他服務掩蓋主機離線。"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: host-reachability
    host: "120"
    layer: host
    notification_type: TYPE-3
    severity: critical
    target_host: "120"
    team: ops
|
||||
|
||||
# Warning when awoooi_cold_start_warn_gates for host 110 /
# scope 110_120_121_188 is above 0 for 15 minutes; the description reports
# the gate count.
- alert: ColdStartRecoveryDegraded
  for: 15m
  expr: >-
    awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} > 0
  annotations:
    summary: "Full-stack cold-start recovery DEGRADED"
    description: "cold-start gate 有 {{ $value }} 個 WARN gate;核心可用但不應放行 runner/CD/AI auto-repair full execution。"
    runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log,修到 PASS/WARN/BLOCKED = green"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: cold-start-gate
    layer: full-stack
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning when awoooi_cold_start_last_green_timestamp for host 110 /
# scope 110_120_121_188 is more than 3600 seconds behind time() for
# 15 minutes; the description renders the elapsed seconds as a duration.
- alert: ColdStartLastGreenTooOld
  for: 15m
  expr: >-
    time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"} > 3600
  annotations:
    summary: "Full-stack cold-start gate has not been GREEN recently"
    description: "距離上次 GREEN 已超過 {{ $value | humanizeDuration }};需要確認 110/120/121/188 與排程/網站 gate。"
    runbook: "執行 SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: cold-start-gate
    layer: full-stack
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# =========================================================================
|
||||
# Host storage health / dirty reboot evidence
|
||||
# =========================================================================
|
||||
- name: host_storage_health_alerts
|
||||
rules:
|
||||
# Warning once no awoooi_host_storage_monitor_up series with host="110"
# has existed for 15 minutes.
- alert: Host110StorageHealthMonitorMissing
  for: 15m
  expr: >-
    absent(awoooi_host_storage_monitor_up{host="110"})
  annotations:
    summary: "110 storage health textfile metric missing"
    description: "110 沒有輸出 storage_health.prom;dirty reboot、root read-only 與 fsck 證據目前不可觀測。"
    runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py,確認 /home/wooo/node_exporter_textfiles/storage_health.prom"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: storage-health-monitor
    host: "110"
    layer: systemd-110
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning once no awoooi_host_storage_monitor_up series with host="188"
# has existed for 15 minutes.
- alert: Host188StorageHealthMonitorMissing
  for: 15m
  expr: >-
    absent(awoooi_host_storage_monitor_up{host="188"})
  annotations:
    summary: "188 storage health textfile metric missing"
    description: "188 沒有輸出 storage_health.prom;dirty reboot、root read-only 與 fsck 證據目前不可觀測。"
    runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py,確認 /home/ollama/node_exporter_textfiles/storage_health.prom"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: storage-health-monitor
    host: "188"
    layer: systemd-188
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning when awoooi_host_storage_last_run_timestamp on host 110 or 188
# is more than 900 seconds behind time() for 10 minutes.
- alert: HostStorageHealthMonitorStale
  for: 10m
  expr: >-
    time() - awoooi_host_storage_last_run_timestamp{host=~"110|188"} > 900
  annotations:
    summary: "主機 {{ $labels.host }} storage health textfile stale"
    description: "storage health exporter 超過 15 分鐘沒有更新;重開機後檔案系統風險不可觀測。"
    runbook: "SSH 主機檢查 cron、/tmp/awoooi-storage-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: storage-health-monitor
    layer: host-storage
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Critical when awoooi_host_root_filesystem_readonly for mountpoint "/" on
# host 110 or 188 is above 0 for 1 minute.
- alert: HostRootFilesystemReadOnly
  for: 1m
  expr: >-
    awoooi_host_root_filesystem_readonly{host=~"110|188",mountpoint="/"} > 0
  annotations:
    summary: "主機 {{ $labels.host }} root filesystem 已變成 read-only"
    description: "root filesystem 被掛載為唯讀,服務可能仍暫時存活但寫入會失敗;禁止自動修復,先保全證據並規劃維護窗。"
    runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md §16:保全 journal/df/mount 證據,確認備份,再安排 console/offline fsck"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: root-filesystem
    layer: host-storage
    notification_type: TYPE-3
    severity: critical
    team: ops
|
||||
|
||||
# Critical when awoooi_host_storage_error_count with
# source="journalctl-kernel" and boot="current" on host 110 or 188 is
# above 0 for 5 minutes.
- alert: HostCurrentBootStorageErrorsDetected
  for: 5m
  expr: >-
    awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="current"} > 0
  annotations:
    summary: "主機 {{ $labels.host }} current boot 有 storage/kernel 錯誤"
    description: "目前開機週期已出現 filesystem、I/O 或 fsck 類錯誤;不可只重啟容器掩蓋問題。"
    runbook: "先執行 read-only 診斷:journalctl -k -p warning..alert、mount、df、smartctl/raid 狀態;必要時進入維護窗處理"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: kernel-storage
    layer: host-storage
    notification_type: TYPE-3
    severity: critical
    team: ops
|
||||
|
||||
# Warning when awoooi_host_storage_error_count with
# source="journalctl-kernel" and boot="previous" on host 110 or 188 is
# above 0 for 30 minutes.
- alert: HostPreviousBootStorageErrorsDetected
  for: 30m
  expr: >-
    awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="previous"} > 0
  annotations:
    summary: "主機 {{ $labels.host }} previous boot 保留 storage/fsck 錯誤證據"
    description: "上一個開機週期留有 storage/fsck 錯誤,代表這次重開機事故需要完成 fsck、備份與容量後續檢查。"
    runbook: "把證據寫入 docs/LOGBOOK.md,確認 full-stack cold-start gate 與 P3 gate;下一次維護窗補 offline fsck/SMART/RAID 檢查"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: dirty-reboot-evidence
    layer: host-storage
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
# Warning when the per-host sum of awoooi_host_storage_error_count with
# boot="last-fsck-log" on host 110 or 188 is above 0 for 30 minutes.
# Aggregating by host keeps the host label available to the summary.
- alert: HostFsckLogErrorsDetected
  for: 30m
  expr: >-
    sum by(host) (awoooi_host_storage_error_count{host=~"110|188",boot="last-fsck-log"}) > 0
  annotations:
    summary: "主機 {{ $labels.host }} fsck log 保留錯誤證據"
    description: "主機 fsck log 內仍有 inconsistency 或 I/O 類錯誤文字;這是事故後追蹤項,不應交給自動修復直接處理。"
    runbook: "確認 /run/initramfs/fsck.log 與 /var/log/fsck/*,將結果納入重開機事故報告與下次維護窗檢查項"
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: fsck-log
    layer: host-storage
    notification_type: TYPE-1
    severity: warning
    team: ops
|
||||
|
||||
Reference in New Issue
Block a user