Files
awoooi/ops/monitoring/alerts-unified.yml
2026-05-29 12:41:34 +08:00

2032 lines
91 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ops/monitoring/alerts-unified.yml
# AWOOOI 統一 Prometheus 告警規則
# 2026-04-05 Claude Code: 整合所有規則,加入統一 layer 標籤
# 2026-04-08 Claude Sonnet 4.6: 補 database_detail_alerts 群組 (6條詳細規則)
# 2026-04-12 Claude Sonnet 4.6: 補回 4 個僅存在主機的群組 (backup/flywheel/connectivity/infra-detailed)
# 部署目標: 192.168.0.110:/home/wooo/monitoring/alerts.yml
# 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
#
# 標籤規範:
# layer: k8s | docker | docker-110 | docker-188 | systemd-110 | systemd-188 | external
# component: 服務名稱
# team: ops | backend | ai | platform | security | finops | aiops
# host: "110" | "112" | "188" | "120" | "121"
# auto_repair: "true" | "false"
groups:
# =========================================================================
# Full-stack recovery scorecard recording rules
# =========================================================================
- name: full_stack_recovery_scorecard_rules
interval: 60s
rules:
- record: awoooi_recovery_core_ready
  # Product of four 0/1 ("bool") comparisons, so the recorded value is 1 only
  # when every factor holds for host 110 / scope 110_120_121_188:
  #   * cold_start_last_result{result="green"} equals 1
  #   * cold_start_warn_gates equals 0
  #   * cold_start_blocked_gates equals 0
  #   * less than 3600 s have passed since cold_start_last_green_timestamp
  # The `result` label is summed away so the factors can be joined on (host, scope).
  expr: |
    sum without(result) (
      awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} == bool 1
    )
    * on(host,scope) (
      awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} == bool 0
    )
    * on(host,scope) (
      awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} == bool 0
    )
    * on(host,scope) (
      (time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"}) < bool 3600
    )
- record: awoooi_recovery_dr_offsite_ready
  # Product of three 0/1 ("bool") comparisons joined on `host`, for host 110:
  # offsite backup configured, offsite backup fresh, credential escrow fresh.
  # The first two factors use max by(host); the escrow factor uses min by(host),
  # so a single non-fresh escrow series forces the result to 0.
  expr: |
    max by(host) (
      awoooi_backup_offsite_configured{host="110"} == bool 1
    )
    * on(host) max by(host) (
      awoooi_backup_offsite_fresh{host="110"} == bool 1
    )
    * on(host) min by(host) (
      awoooi_backup_credential_escrow_fresh{host="110"} == bool 1
    )
# =========================================================================
# 主機層告警 (host_alerts)
# =========================================================================
- name: host_alerts
rules:
- alert: HostDown
  # Fires when a scrape target whose job name starts with "node-exporter"
  # has reported up == 0 for one minute.
  expr: up{job=~"node-exporter.*"} == 0
  for: 1m
  labels:
    auto_repair: "false"
    layer: systemd-188
    severity: critical
    team: ops
  annotations:
    description: "Node Exporter 無回應超過 1 分鐘"
    summary: "主機 {{ $labels.host }} 不可達"
- alert: HostHighCpuLoad
  # 2026-05-05 ogt + Codex: early warning only. Sustained overload and
  # root-cause automation are handled by HostLoadAverageSustainedHigh.
  # Busy percentage = 100 minus the per-host average idle rate over 5 minutes.
  expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
  for: 10m
  labels:
    # MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP routing labels.
    mcp_provider: "ssh_host"
    host_type: "bare_metal"
    alert_category: "host_resource"
    auto_repair: "false"
    layer: systemd-188
    severity: warning
    team: ops
  annotations:
    description: "CPU 使用率超過 90% 持續 10 分鐘;若 load5/core 未超過 1.5,先視為容量觀察與診斷,不直接修復。"
    summary: "主機 {{ $labels.host }} CPU 高負載"
- alert: HostLoadAverageSustainedHigh
  # 2026-05-05 ogt + Codex: long-running overload baseline for hosts 110/188.
  # Why: CPU% only measures busy time and cannot fully express the runnable
  # queue built up by ClickHouse merges, Kafka backfill, or Chrome/Ollama.
  # load5 is divided by the number of CPUs, counted from the distinct `cpu`
  # label values of the idle-mode series on each host.
  expr: node_load5{host=~"110|188"} / on(host) count by(host) (count by(host,cpu) (node_cpu_seconds_total{mode="idle",host=~"110|188"})) > 1.5
  for: 15m
  labels:
    alert_category: "host_resource"
    auto_repair: "true"
    host_type: "bare_metal"
    layer: systemd-188
    mcp_provider: "ssh_host"
    severity: critical
    team: ops
  annotations:
    summary: "主機 {{ $labels.host }} load5/core 長時間過高"
    description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。"
    runbook: "先判斷高 load 來源ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter只允許 read-only 診斷,自動修復需走服務專屬 playbook。"
    # Diagnostics only: uptime, top CPU processes, and a docker stats snapshot.
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'"
- alert: HostOutOfMemory
  # Used-memory percentage, derived from MemAvailable / MemTotal, above 85%
  # for five minutes.
  expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
  for: 5m
  labels:
    alert_category: "host_resource"
    auto_repair: "true"
    host_type: "bare_metal"
    layer: systemd-188
    mcp_provider: "ssh_host"
    severity: warning
    team: ops
  annotations:
    description: "記憶體使用率超過 85%"
    summary: "主機 {{ $labels.host }} 記憶體不足"
- alert: HostOutOfDiskSpace
  # Used-space percentage per filesystem (tmpfs and overlay excluded) above
  # 85% for five minutes. The expression is not aggregated, so one alert is
  # produced per filesystem series; the description therefore names the
  # mountpoint and the measured value.
  expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    team: ops
    auto_repair: "true"
    mcp_provider: "ssh_host"
    host_type: "bare_metal"
    alert_category: "host_resource"
  annotations:
    summary: "主機 {{ $labels.host }} 磁碟空間不足"
    description: "{{ $labels.mountpoint }} 磁碟使用率 {{ $value | humanize }}% 超過 85%"
    # The SSH target is built from the `host` label, as in
    # HostLoadAverageSustainedHigh. `instance` is the scrape address and may
    # carry the exporter port, which is not a valid ssh destination.
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"
# =========================================================================
# K8s 叢集告警 (kubernetes_alerts)
# =========================================================================
- name: kubernetes_alerts
rules:
- alert: K3sNodeNotReady
  # The Ready/true condition series of a node has been 0 for two minutes.
  expr: kube_node_status_condition{condition="Ready", status="true"} == 0
  for: 2m
  labels:
    auto_repair: "false"
    layer: k8s
    severity: critical
    team: ops
  annotations:
    description: "節點超過 2 分鐘未達到 Ready 狀態"
    summary: "K3s 節點 {{ $labels.node }} 未就緒"
- alert: KubePodCrashLooping
  # Any non-zero container restart rate over the last 15 minutes in
  # awoooi-prod, sustained for five minutes.
  expr: rate(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m]) > 0
  for: 5m
  labels:
    auto_repair: "true"
    layer: k8s
    severity: warning
    team: ops
  annotations:
    description: "Pod 在過去 15 分鐘內重啟次數異常"
    summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 持續重啟"
- alert: KubePodNotReady
  # A pod in awoooi-prod that is in phase Running but has not been Ready for
  # five minutes. The phase filter matters: pods that have finished
  # (Succeeded/Failed, e.g. CronJob pods) also report ready == 0, and without
  # it they would fire this alert even though the description only promises
  # Running pods.
  expr: |
    kube_pod_status_ready{condition="true",namespace="awoooi-prod"} == 0
    and on(namespace, pod)
    kube_pod_status_phase{phase="Running",namespace="awoooi-prod"} == 1
  for: 5m
  labels:
    severity: warning
    layer: k8s
    team: ops
    auto_repair: "true"
  annotations:
    summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 未就緒"
    description: "Running 中的 Pod 超過 5 分鐘未達到 Ready 狀態"
- alert: KubeDeploymentReplicasMismatch
  # Desired replica count differs from the available replica count for a
  # deployment in awoooi-prod for ten minutes.
  expr: kube_deployment_spec_replicas{namespace="awoooi-prod"} != kube_deployment_status_replicas_available{namespace="awoooi-prod"}
  for: 10m
  labels:
    auto_repair: "true"
    layer: k8s
    severity: warning
    team: ops
  annotations:
    description: "期望副本數與可用副本數不一致超過 10 分鐘"
    summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} 副本數不匹配"
- alert: VeleroBackupFailed
  # The Velero backup failure counter increased within the last 24 hours.
  expr: increase(velero_backup_failure_total[24h]) > 0
  for: 5m
  labels:
    auto_repair: "false"
    component: velero
    layer: k8s
    severity: warning
    team: ops
  annotations:
    description: "過去 24 小時有備份失敗"
    summary: "Velero 備份失敗"
- alert: VeleroBackupNotRun
  # Fires when the backup health exporter reports the latest Completed Velero
  # backup as not fresh (value 0), or when the freshness metric itself is
  # missing. Without the absent() branch a dead exporter would silence this
  # critical alert. absent() with equality matchers returns a series labelled
  # host="110", namespace="velero", the same label set the first branch has.
  expr: |
    max by(host, namespace) (awoooi_velero_latest_completed_backup_fresh{host="110",namespace="velero"}) == 0
    or
    absent(awoooi_velero_latest_completed_backup_fresh{host="110",namespace="velero"})
  for: 10m
  labels:
    severity: critical
    layer: k8s
    team: ops
    component: velero
    auto_repair: "false"
  annotations:
    # The summary previously said 24 hours while the description said 25.
    # NOTE(review): aligned to the description's 25 hours; the real threshold
    # lives in the exporter and is not visible here, so please confirm.
    summary: "Velero 超過 25 小時未成功備份"
    description: "backup health exporter 顯示 latest Completed Velero backup 超過 25 小時或不存在,或 exporter 指標缺失。"
# Sprint C-2 Host rsync 備份告警 (2026-04-11 Claude Sonnet 4.6)
# backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
# node-exporter textfile collector 讀取此檔案暴露指標
- alert: HostBackupFailed
  # 2026-05-02 ogt + Claude Sonnet 4.6: removed the backup_110 collector-label check.
  # Root cause: node_textfile_scrape_error no longer carries a collector label,
  # so that condition was permanently absent()=true and the alert always fired.
  # Fix: decide only on whether backup_110_last_success_timestamp is missing
  # or older than 90000 s (25 h).
  expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000)
  for: 10m
  labels:
    severity: warning
    layer: docker-188
    team: ops
    component: backup
    host: "188"
    auto_repair: "true"
    alert_category: host_resource
  annotations:
    summary: "188 Host 備份超過 25 小時未成功"
    description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊"
    # The SSH target is fixed to host 188, matching the static `host` label
    # above. The absent() branch produces a series with no `instance` label,
    # so templating {{ $labels.instance }} rendered an empty ssh destination
    # exactly when the metric was missing.
    auto_repair_action: "ssh 192.168.0.188 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
# ADR-075: CoreDNS 解析失敗 (2026-04-12 ogt)
- alert: CoreDNSResolutionFailed
  # Share of CoreDNS responses with rcode SERVFAIL over the last 5 minutes.
  # The previous expression compared the absolute SERVFAIL rate (responses
  # per second) with 0.05, while the summary formats the value as a
  # percentage and the description speaks of a 5% ratio. Dividing by the
  # total response rate makes the value an actual ratio.
  expr: |
    sum(rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]))
    /
    sum(rate(coredns_dns_responses_total[5m])) > 0.05
  for: 5m
  labels:
    severity: critical
    layer: k8s
    team: ops
    auto_repair: "true"
    alert_category: kubernetes
    notification_type: TYPE-3
  annotations:
    summary: "CoreDNS SERVFAIL 率過高 {{ $value | humanizePercentage }}"
    description: "CoreDNS 在 5 分鐘內 SERVFAIL 回應率超過 5%K8s 服務間 DNS 解析可能失敗"
    runbook: "kubectl -n kube-system get pods -l k8s-app=kube-dns && kubectl -n kube-system logs -l k8s-app=kube-dns"
# =========================================================================
# 資料庫告警 (database_alerts)
# =========================================================================
- name: database_alerts
rules:
- alert: PostgreSQLDown
  # Either the postgres-exporter scrape target is down or the exporter
  # reports pg_up == 0, for one minute.
  expr: up{job="postgres-exporter"} == 0 or pg_up == 0
  for: 1m
  labels:
    auto_repair: "false"
    component: postgres
    host: "188"
    layer: systemd-188
    severity: critical
    team: ops
  annotations:
    description: "PostgreSQL Exporter 無法連接資料庫超過 1 分鐘"
    summary: "PostgreSQL 資料庫離線"
- alert: RedisDown
  # Either the redis-exporter scrape target is down or the exporter reports
  # redis_up == 0, for one minute.
  expr: up{job="redis-exporter"} == 0 or redis_up == 0
  for: 1m
  labels:
    auto_repair: "false"
    component: redis
    host: "188"
    layer: systemd-188
    severity: critical
    team: ops
  annotations:
    description: "Redis Exporter 無法連接 Redis 超過 1 分鐘"
    summary: "Redis 快取服務離線"
- alert: PostgreSQLHighConnections
  # Total connections per exporter instance above 80 for five minutes.
  # pg_stat_activity_count is exposed as one series per database and
  # connection state, so the bare metric only compared each individual
  # (datname, state) series with the threshold; summing gives the total the
  # description talks about.
  expr: sum by (instance) (pg_stat_activity_count) > 80
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    component: postgres
    # Added for consistency with the other PostgreSQL rules, which all carry host "188".
    host: "188"
    team: ops
    auto_repair: "false"
  annotations:
    summary: "PostgreSQL 連接數過高"
    description: "當前連接數 {{ $value }} 超過 80"
- alert: RedisMemoryHigh
  # Used memory above 80% of maxmemory. The leading "max_bytes > 0" term
  # skips instances that report a maxmemory of 0, so no division by zero.
  expr: redis_memory_max_bytes > 0 and redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
  for: 5m
  labels:
    auto_repair: "false"
    component: redis
    layer: systemd-188
    severity: warning
    team: ops
  annotations:
    description: "Redis 記憶體使用率超過 80%"
    summary: "Redis 記憶體使用過高"
# =========================================================================
# Sprint 5.2 Plan B: 資料庫詳細指標告警 (database_detail_alerts)
# 前置: postgres-exporter:9187 + redis-exporter:9121 on 192.168.0.188
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
# =========================================================================
- name: database_detail_alerts
rules:
# ---- PostgreSQL 詳細指標 ----
- alert: PostgreSQLSlowQueries
  # Longest transaction duration in awoooi_prod above 60 seconds for five minutes.
  expr: pg_stat_activity_max_tx_duration{datname="awoooi_prod"} > 60
  for: 5m
  labels:
    auto_repair: "false"
    component: postgres
    host: "188"
    layer: systemd-188
    severity: warning
    team: ops
  annotations:
    description: "awoooi_prod 資料庫最長事務超過 60 秒"
    summary: "PostgreSQL 有慢查詢 (>60s)"
- alert: PostgreSQLDeadlocks
  # The deadlock counter for awoooi_prod increased within the last 5 minutes.
  expr: increase(pg_stat_database_deadlocks{datname="awoooi_prod"}[5m]) > 0
  for: 1m
  labels:
    auto_repair: "false"
    component: postgres
    host: "188"
    layer: systemd-188
    severity: warning
    team: ops
  annotations:
    description: "過去 5 分鐘 awoooi_prod 資料庫有死鎖"
    summary: "PostgreSQL 死鎖發生"
- alert: PostgreSQLTooManyConnections
  # Connections to awoooi_prod above 50 for five minutes.
  # pg_stat_activity_count has one series per connection state, so the
  # states are summed per database before comparing; otherwise the threshold
  # only applied to each state on its own.
  expr: sum by (instance, datname) (pg_stat_activity_count{datname="awoooi_prod"}) > 50
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    component: postgres
    host: "188"
    team: ops
    auto_repair: "false"
  annotations:
    summary: "PostgreSQL 連接數過高 ({{ $value }})"
    description: "awoooi_prod 連接數超過 50"
# ---- Redis 詳細指標 ----
- alert: RedisKeyEviction
  # The evicted-keys counter increased within the last 5 minutes.
  expr: increase(redis_evicted_keys_total[5m]) > 0
  for: 1m
  labels:
    auto_repair: "false"
    component: redis
    host: "188"
    layer: systemd-188
    severity: warning
    team: ops
  annotations:
    description: "過去 5 分鐘有 Key 被驅逐,可能記憶體不足"
    summary: "Redis 發生 Key 驅逐"
- alert: RedisConnectionsHigh
  # More than 100 connected clients for five minutes.
  expr: redis_connected_clients > 100
  for: 5m
  labels:
    auto_repair: "false"
    component: redis
    host: "188"
    layer: systemd-188
    severity: warning
    team: ops
  annotations:
    description: "Redis 連接數超過 100"
    summary: "Redis 連接數過高 ({{ $value }})"
- alert: RedisCommandLatencyHigh
  # Average command latency over the last 5 minutes above 10 ms.
  # Two fixes compared with dividing the raw counters:
  #   * rate() makes this the recent average rather than the lifetime
  #     average since Redis started, which barely moves during an incident.
  #   * both sides are summed by instance so their label sets match; the
  #     duration counter is broken down per command (`cmd`) in redis_exporter
  #     while the processed counter is not, and one-to-one matching of
  #     unequal label sets returns nothing.
  # With no traffic the division yields NaN, which does not satisfy "> 0.01".
  expr: |
    sum by (instance) (rate(redis_commands_duration_seconds_total[5m]))
    /
    sum by (instance) (rate(redis_commands_processed_total[5m])) > 0.01
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    component: redis
    host: "188"
    team: ops
    auto_repair: "false"
  annotations:
    summary: "Redis 命令平均延遲過高 (>10ms)"
    description: "Redis 命令平均延遲超過 10ms"
# =========================================================================
# 服務可用性告警 (service_alerts) — 含 layer 標籤決定修復路徑
# =========================================================================
- name: service_alerts
rules:
# ---- 188 Docker 層 ----
- alert: OpenClawDown
# 2026-04-05 Claude Code: 修正舊命名 ClawBotDown → OpenClawDown
expr: up{job="clawbot"} == 0
for: 2m
labels:
severity: critical
layer: docker-188
component: openclaw
host: "188"
team: ops
auto_repair: "true"
annotations:
summary: "OpenClaw 服務離線"
description: "OpenClaw (192.168.0.188:8088) 已離線超過 2 分鐘"
- alert: SignOzDown
expr: probe_success{job="blackbox-http", instance=~".*3301.*"} == 0
for: 2m
labels:
severity: warning
layer: docker-188
component: signoz
host: "188"
team: ops
auto_repair: "true"
mcp_provider: "ssh_host"
target_host: "192.168.0.188"
alert_category: "devops_tool"
annotations:
summary: "SignOz 服務離線"
description: "SignOz (192.168.0.188:3301) 已離線超過 2 分鐘"
# ---- 110 Docker 層 ----
- alert: SentryDown
expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9000"} == 0
for: 2m
labels:
severity: warning
layer: docker-110
component: sentry
host: "110"
team: ops
auto_repair: "true"
mcp_provider: "ssh_host"
target_host: "192.168.0.110"
alert_category: "devops_tool"
annotations:
summary: "Sentry 服務離線"
description: "Sentry (192.168.0.110:9000) 已離線超過 2 分鐘"
- alert: HarborDown
expr: probe_success{job="blackbox-http", instance=~".*5000.*"} == 0
for: 2m
labels:
severity: critical
layer: docker-110
component: harbor
host: "110"
team: ops
auto_repair: "true"
mcp_provider: "ssh_host"
target_host: "192.168.0.110"
alert_category: "devops_tool"
annotations:
summary: "Harbor Registry 離線"
description: "Harbor (192.168.0.110:5000) 已離線超過 2 分鐘CD pipeline 將無法拉取映像"
- alert: GiteaDown
expr: probe_success{job="blackbox-http", instance="http://192.168.0.110:3001"} == 0
for: 2m
labels:
severity: critical
layer: docker-110
component: gitea
host: "110"
team: ops
auto_repair: "true"
mcp_provider: "ssh_host"
target_host: "192.168.0.110"
alert_category: "devops_tool"
annotations:
summary: "Gitea Git 服務離線"
description: "Gitea (192.168.0.110:3001) 已離線超過 2 分鐘CD pipeline 失效"
- alert: AlertmanagerDown
expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9093"} == 0
for: 2m
labels:
severity: critical
layer: docker-110
component: alertmanager
host: "110"
team: ops
auto_repair: "true"
annotations:
summary: "Alertmanager 離線"
description: "Alertmanager (192.168.0.110:9093) 已離線,所有告警將靜默"
# =========================================================================
# 告警鏈路監控 (alert_chain) — 防止 2026-03-26/04-05 事故重演
# =========================================================================
- name: alert_chain
rules:
- alert: AlertChainBroken_Alertmanager
expr: |
sum(rate(awoooi_webhook_requests_total{source="alertmanager",status!="success"}[5m]))
/ sum(rate(awoooi_webhook_requests_total{source="alertmanager"}[5m])) > 0.1
for: 10m
labels:
severity: critical
layer: k8s
team: platform
auto_repair: "false"
annotations:
summary: "Alertmanager Webhook 錯誤率 > 10%"
description: "告警鏈路可能斷裂,請執行 E2E 驗證"
- alert: AlertChainBroken_Sentry
expr: |
sum(rate(awoooi_webhook_requests_total{source="sentry",status!="success"}[5m]))
/ sum(rate(awoooi_webhook_requests_total{source="sentry"}[5m])) > 0.1
for: 10m
labels:
severity: warning
layer: k8s
team: platform
auto_repair: "false"
annotations:
summary: "Sentry Webhook 錯誤率 > 10%"
description: "Sentry 錯誤可能無法正確處理"
- alert: NoAlertsReceived2Hours
expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
for: 5m
labels:
severity: warning
layer: k8s
team: platform
auto_repair: "false"
annotations:
summary: "2 小時內未收到任何告警 ({{ $labels.source }})"
description: "可能是告警鏈路問題,請執行 Smoke Test"
- alert: AlertChainUnhealthy
expr: awoooi_alert_chain_healthy == 0
for: 5m
labels:
severity: critical
layer: k8s
team: platform
auto_repair: "false"
annotations:
summary: "告警鏈路不健康 ({{ $labels.source }})"
description: "告警鏈路標記為不健康,最近處理失敗"
# =========================================================================
# 自動修復監控 (auto_repair)
# =========================================================================
- name: auto_repair
rules:
- alert: AutoRepairLowSuccessRate
expr: awoooi_auto_repair_success_rate < 0.3
for: 30m
labels:
severity: warning
layer: k8s
team: backend
auto_repair: "false"
annotations:
summary: "自動修復成功率過低 ({{ $value | humanizePercentage }})"
description: "動作 {{ $labels.action }} 的成功率低於 30%,建議檢查 Playbook"
- alert: PermanentFixRequired
expr: sum(rate(awoooi_anomaly_escalation_total{level="PERMANENT_FIX"}[1h])) > 0
for: 1m
labels:
severity: critical
layer: k8s
team: backend
auto_repair: "false"
annotations:
summary: "需要永久修復的異常升級"
description: "有異常升級到 PERMANENT_FIX 級別,需要根本修復"
# =========================================================================
# Sprint 5.1: Docker 容器健康監控(docker-health-monitor 感知層接入)
# 由 docker-health-monitor.sh 送 Alertmanager 格式 webhook
# 或 Prometheus 自訂 exporter 上報時使用。
# auto_repair: "true" 代表允許 AWOOOI Guardrail 決策(非直接修復)
# 實際修復動作由 Service Registry 分級決定(ADR-062)
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
# =========================================================================
- name: docker_health_alerts
rules:
- alert: DockerContainerUnhealthy
expr: container_health_status{job="docker-health-monitor"} == 0
for: 2m
labels:
severity: warning
layer: docker
team: ops
auto_repair: "true"
mcp_provider: "ssh_host"
alert_category: "devops_tool"
annotations:
summary: "容器 {{ $labels.container }} 健康檢查失敗"
description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 健康狀態異常,持續 2 分鐘"
- alert: DockerContainerExited
expr: container_running_status{job="docker-health-monitor"} == 0
for: 1m
labels:
severity: critical
layer: docker
team: ops
auto_repair: "true"
mcp_provider: "ssh_host"
alert_category: "devops_tool"
annotations:
summary: "容器 {{ $labels.container }} 已停止"
description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead持續 1 分鐘"
- alert: DockerContainerCpuSustainedHigh
# 2026-05-05 ogt + Codex: Docker Compose 長期過載基線。
# Baseline: 單容器 >2 core 10m 為 warning用於提早抓 cadvisor、ClickHouse、momo-scheduler、Ollama runner 類問題。
expr: docker_container_cpu_cores > 2
for: 10m
labels:
severity: warning
layer: docker
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "true"
annotations:
summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core"
description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘,需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
runbook: "自動階段只做診斷;若是已知服務,交給 service-specific playbookClickHouse 降 merge/查 backlog、scheduler 限 concurrency、litellm/blackbox 查 liveliness。"
- alert: DockerContainerCpuRunawayCritical
expr: docker_container_cpu_cores > 4
for: 15m
labels:
severity: critical
layer: docker
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "true"
annotations:
summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core"
description: "{{ $labels.container_name }} 已持續吃超過 4 core會拖垮 110/188 主機;需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== CONTAINER ===\"; docker stats --no-stream {{ $labels.container_name }}; echo \"=== TOP ===\"; docker top {{ $labels.container_name }} -eo pid,ppid,stat,pcpu,pmem,comm,args | head -30'"
runbook: "禁止通用 docker restart先抓根因只有 health down 或 crash loop 才可走重啟。"
- alert: DockerContainerMemoryLimitPressure
# 2026-05-05 ogt + Codex: 防止亂設 memory limit 把 Kafka/Gitea/Taskworker 類服務卡死。
expr: docker_container_memory_limit_bytes > 0 and docker_container_memory_usage_bytes / docker_container_memory_limit_bytes > 0.85
for: 10m
labels:
severity: warning
layer: docker
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "true"
annotations:
summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%"
description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker需先判斷 workload不可直接降 limit。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
runbook: "若服務已接近 limit優先調整 retention/concurrency/cache再評估提高 memory禁止用更低 memory limit 當止血。"
- alert: DockerContainerRestartSpike
# 2026-05-05 ogt + Codex: cAdvisor v0.47 無 restart metric吃 node-exporter textfile docker_container_restart_count。
expr: increase(docker_container_restart_count[15m]) > 5 or increase(docker_container_inspect_restart_count[15m]) > 5
for: 3m
labels:
severity: critical
layer: docker
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "true"
annotations:
summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次"
description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增,避免再次出現 litellm 24,464 次靜默崩潰。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
runbook: "先抓 crash signature若是 config/DB/網路問題,修設定,不用無限 restart。"
- alert: DockerContainerMissingResourceLimit
  # 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
  # Fires when a listed container reports a CPU limit of 0 cores or a memory
  # limit of 0 bytes for 30 minutes.
  expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0)
  for: 30m
  labels:
    severity: warning
    layer: docker
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "true"
  annotations:
    summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit"
    description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail長時間尖峰可能拖垮 110/188。"
    # Annotations are expanded by Prometheus as Go templates, so the braces of
    # the `docker inspect --format` template must be emitted as literals via
    # {{ "{{" }} and {{ "}}" }}. Written bare, Prometheus tried to evaluate
    # .HostConfig itself and the annotation became a template-expansion error.
    # The rendered command is unchanged:
    #   --format "NanoCpus={{.HostConfig.NanoCpus}} Memory={{.HostConfig.Memory}}"
    auto_repair_action: |-
      ssh 192.168.0.{{ $labels.host }} 'docker inspect {{ $labels.container_name }} --format "NanoCpus={{ "{{" }}.HostConfig.NanoCpus{{ "}}" }} Memory={{ "{{" }}.HostConfig.Memory{{ "}}" }}"; docker stats --no-stream {{ $labels.container_name }}'
    runbook: "先盤點 workload再在 compose/service-specific playbook 補 cpus/mem_limit。不可盲目降 ClickHouse/Kafkamomo app/scheduler 可用 2 core/2GiB 起步。"
- alert: DockerGiteaActionsJobStale
# 2026-05-05 ogt + Codex: Gitea/act job containers can outlive workflow timeout and keep 110 hot.
expr: time() - docker_container_started_seconds{container_name=~"GITEA-ACTIONS-.*"} > 1200
for: 5m
labels:
severity: warning
layer: docker-110
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "true"
annotations:
summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
description: "{{ $labels.container_name }} 已超過 20 分鐘110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'"
runbook: "先執行 dry-run清理腳本會依 workflow/job 名稱套停止門檻。若 logs 空白且超過該 job timeout buffer再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
- alert: SystemdRunnerRestartSpike
# 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage.
expr: increase(systemd_unit_restarts_total{unit=~"actions\\.runner\\..*"}[15m]) > 2
for: 3m
labels:
severity: critical
layer: systemd-110
team: ops
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "true"
annotations:
summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次"
description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p NRestarts -p DropInPaths -p CPUQuotaPerSecUSec -p MemoryMax; journalctl -u {{ $labels.unit }} --since \"20 minutes ago\" --no-pager | tail -120'"
runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設,需人工或 sudo playbook 執行 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"
- alert: SystemdRunnerWatchdogEnabled
expr: systemd_unit_watchdog_seconds{unit=~"actions\\.runner\\..*"} > 0
for: 10m
labels:
severity: warning
layer: systemd-110
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "true"
annotations:
summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec"
description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p DropInPaths -p NRestarts'"
runbook: "確認 drop-in 來源;需要 sudo 時由人工套用 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"
- alert: SystemdRunnerMissingResourceQuota
expr: systemd_unit_cpu_quota_cores{unit=~"actions\\.runner\\..*"} == 0 or systemd_unit_memory_max_bytes{unit=~"actions\\.runner\\..*"} == 0
for: 30m
labels:
severity: warning
layer: systemd-110
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "true"
annotations:
summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota"
description: "{{ $labels.unit }} 仍為 unlimitedCI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState'"
runbook: "建議 baseline每個 runner CPUQuota=200%、MemoryMax=2G由 /home/wooo/scripts/apply-runner-systemd-guardrails.sh 套用,若仍過載再限制並行度或分流。"
# =========================================================================
# MinIO / Kali 告警
# =========================================================================
- name: minio_kali_alerts
rules:
- alert: MinIODown
expr: probe_success{job="blackbox-http", instance=~".*9001.*"} == 0
for: 2m
labels:
severity: warning
layer: docker-188
component: minio
host: "188"
team: ops
auto_repair: "true"
annotations:
summary: "MinIO (Velero 備份) 離線"
description: "MinIO (192.168.0.188:9001) 已離線超過 2 分鐘Velero 備份可能失敗"
- alert: KaliScannerDown
  # Blackbox HTTP probe of the Kali scanner on 192.168.0.112 failing for five minutes.
  # The dots in the address are escaped and the address must be followed by
  # ":" or "/" or the end of the string. The previous pattern treated each
  # "." as "any character" and also matched longer addresses such as
  # 192.168.0.1120.
  # NOTE(review): layer is docker-188 although host is "112"; left unchanged, please confirm.
  expr: probe_success{job="blackbox-http", instance=~".*192\\.168\\.0\\.112([:/].*)?"} == 0
  for: 5m
  labels:
    severity: info
    layer: docker-188
    component: kali
    host: "112"
    team: ops
    auto_repair: "false"
  annotations:
    summary: "Kali Scanner 離線"
    description: "Kali (192.168.0.112:8080) 離線,安全掃描功能暫停"
# =========================================================================
# Plan C — 外部網站監控 (Sprint 5.2, 2026-04-09 Claude Sonnet 4.6 Asia/Taipei)
# blackbox-http 已涵蓋 4 個外部網站,此群組提供結構化告警
# auto_repair: "true" — 由 AWOOOI Guardrail 決策(Service Registry 分級)
# =========================================================================
- name: external_website_alerts
rules:
- alert: MoWoooWorkDown
expr: probe_success{job="blackbox-http", instance="https://mo.wooo.work"} == 0
for: 3m
labels:
severity: critical
layer: external
component: momo-app
host: "188"
team: ops
auto_repair: "true"
annotations:
summary: "外部網站 mo.wooo.work 離線"
description: "mo.wooo.work 探測失敗超過 3 分鐘,容器 momo-app (188) 可能需要重啟"
- alert: TsenyangWebsiteDown
expr: probe_success{job="blackbox-http", instance="https://www.tsenyang.com"} == 0
for: 3m
labels:
severity: critical
layer: external
component: tsenyang-website
host: "188"
team: ops
auto_repair: "true"
annotations:
summary: "外部網站 tsenyang.com 離線"
description: "tsenyang.com 探測失敗超過 3 分鐘,容器 tsenyang-website (188) 可能需要重啟"
- alert: StockWoooWorkDown
expr: probe_success{job="blackbox-http", instance="http://stock.wooo.work"} == 0
for: 3m
labels:
severity: critical
layer: external
component: stock-platform
host: "110"
team: ops
auto_repair: "true"
annotations:
summary: "外部網站 stock.wooo.work 離線"
description: "stock.wooo.work 探測失敗超過 3 分鐘,容器 stock-platform (110) 可能需要重啟"
- alert: BitanWoooWorkDown
expr: probe_success{job="blackbox-http", instance="https://bitan.wooo.work"} == 0
for: 3m
labels:
severity: critical
layer: external
component: bitan-app
host: "188"
team: ops
auto_repair: "true"
annotations:
summary: "外部網站 bitan.wooo.work 離線"
description: "bitan.wooo.work 探測失敗超過 3 分鐘,容器 bitan-app (188) 可能需要重啟"
- alert: ExternalSiteSSLExpiringSoon
expr: probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time() < 14 * 24 * 3600
for: 1h
labels:
severity: warning
layer: external
component: ssl
team: ops
auto_repair: "false"
annotations:
summary: "SSL 憑證即將到期: {{ $labels.instance }}"
description: "{{ $labels.instance }} SSL 憑證將在 14 天內到期,請手動更新"
# =============================================================================
# ADR-075 新增規則群組 (2026-04-12 ogt)
# =============================================================================
- name: awoooi_secops_alerts
interval: 60s
rules:
- alert: UnauthorizedSSHLogin
expr: increase(node_failed_auth_attempts_total[5m]) > 10
for: 1m
labels:
severity: critical
layer: systemd-188
team: security
auto_repair: "false"
alert_category: secops
annotations:
summary: "異常 SSH 登入嘗試: {{ $labels.instance }}"
description: "5 分鐘內失敗登入 {{ $value }} 次,可能遭受暴力破解"
- name: awoooi_business_alerts
interval: 60s
rules:
- alert: AITokenCostSpike
expr: increase(awoooi_ai_token_cost_usd_total[1h]) > 10
for: 5m
labels:
severity: warning
layer: k8s
team: finops
auto_repair: "false"
alert_category: business
annotations:
summary: "AI Token 費用 1 小時內暴增 ${{ $value | humanize }}"
description: "AI API 調用費用異常,請檢查是否有迴圈或濫用"
- alert: GeminiAPIErrorRateHigh
expr: rate(awoooi_ai_request_errors_total{provider="gemini"}[5m]) / rate(awoooi_ai_requests_total{provider="gemini"}[5m]) > 0.2
for: 10m
labels:
severity: warning
layer: k8s
team: finops
auto_repair: "false"
alert_category: business
annotations:
summary: "Gemini API 錯誤率過高: {{ $value | humanizePercentage }}"
description: "Gemini API 5 分鐘錯誤率超過 20%AI 降級可能失效"
# ADR-075: 業務爬蟲健康 (2026-04-12 ogt)
- alert: MomoScraperSuccessLow
  # Share of successful Momo scraper requests over the last 5 minutes below 90%.
  # Both sides are aggregated with sum(). Without it, one-to-one vector
  # matching pairs the status="success" series only with the identical
  # status="success" series in the denominator, so the ratio was always 1
  # and the alert could never fire.
  # NOTE(review): this rule has no `team` label, unlike its siblings; not
  # added here because the owning team is not visible in this file.
  expr: |
    sum(rate(momo_scraper_requests_total{status="success"}[5m]))
    / sum(rate(momo_scraper_requests_total[5m])) < 0.9
  for: 10m
  labels:
    severity: warning
    layer: docker-110
    auto_repair: "false"
    alert_category: business
    notification_type: TYPE-3
  annotations:
    summary: "Momo 抓取成功率跌至 {{ $value | humanizePercentage }}"
    description: "Momo 爬蟲成功率低於 90%,業務資料可能缺失"
- name: awoooi_flywheel_meta_alerts
interval: 60s
rules:
- alert: FlywheelPlaybookZero
expr: awoooi_flywheel_playbook_count == 0
for: 1h
labels:
severity: critical
layer: k8s
team: aiops
auto_repair: "false"
alert_category: flywheel_health
notification_type: TYPE-8M
annotations:
summary: "飛輪 Playbook 數量為零AI 修復完全依賴 LLM"
description: "Redis 中無任何已批准 Playbook自動修復能力大幅降低"
runbook: "執行 scripts/cold_start_playbooks.py 冷啟動"
# 2026-05-03 ogt + Claude Opus 4.7(亞太)— anti-silencing 補配對告警
# NaN sentinel 不會被 < 0.1 誤觸;下方 FlywheelExecutionRateMissing 補「無資料」獨立告警
- alert: FlywheelExecutionSuccessLow
expr: awoooi_flywheel_execution_success_rate < 0.1
for: 2h
labels:
severity: warning
layer: k8s
team: aiops
auto_repair: "false"
alert_category: flywheel_health
notification_type: TYPE-8M
annotations:
summary: "飛輪執行成功率 {{ $value | humanizePercentage }} 低於 10%"
description: "連續 2 小時執行成功率不足 10%Playbook 可能已過時"
runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態"
- alert: FlywheelExecutionRateMissing
expr: absent(awoooi_flywheel_execution_success_rate) or (awoooi_flywheel_execution_success_rate != awoooi_flywheel_execution_success_rate)
for: 30m
labels:
severity: warning
layer: k8s
team: aiops
auto_repair: "false"
alert_category: flywheel_health
notification_type: TYPE-8M
annotations:
summary: "飛輪執行率指標 30 分鐘無資料"
description: "execution_success_rate 連續 30 分鐘為 NaN 或不存在Redis playbook 統計斷流(資料管線壞 / Redis flush / FlywheelStatsService 異常)"
runbook: "1) 檢查 Redis playbook:* keys 2) 檢查 FlywheelStatsService 日誌 3) curl /metrics 直接拉看 NaN 來源"
      # Warns when more than 10 KM entries remain unvectorized for 30m.
      - alert: FlywheelKMVectorizationLow
        expr: awoooi_flywheel_km_unvectorized_count > 10
        for: 30m
        labels:
          severity: warning
          layer: k8s
          team: aiops
          auto_repair: "false"
          alert_category: flywheel_health
          notification_type: TYPE-8M
        annotations:
          summary: "{{ $value }} 筆 KM 未向量化RAG 查詢命中率下降"
          description: "knowledge_entries 中 embedding IS NULL 超過 10 筆且持續 30 分鐘"
          runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態"
      # Warns when the alertname NULL rate stays above 0.05 (5%) for 30m.
      - alert: FlywheelAlertnameNullHigh
        expr: awoooi_flywheel_alertname_null_rate > 0.05
        for: 30m
        labels:
          severity: warning
          layer: k8s
          team: aiops
          auto_repair: "false"
          alert_category: flywheel_health
          notification_type: TYPE-8M
        annotations:
          summary: "飛輪 alertname NULL 率超過 5%"
          description: "alertname NULL 率 {{ $value | humanizePercentage }},影響路由準確性。"
          runbook: "執行 scripts/backfill_alertname.py 回填"
      # Warns when the stuck-incident gauge stays above 5 for 10m.
      # The "INVESTIGATING for over 24h" definition in the summary is not visible
      # in this expression; presumably it is applied by whatever exports the gauge.
      - alert: FlywheelIncidentsStuck
        expr: awoooi_flywheel_incidents_stuck > 5
        for: 10m
        labels:
          severity: warning
          layer: k8s
          team: aiops
          auto_repair: "false"
          alert_category: flywheel_health
          notification_type: TYPE-8M
        annotations:
          summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24h"
          description: "飛輪推理匹配節點可能堵塞,需人工清理或重新觸發診斷"
# =========================================================================
# 備份還原告警 (awoooi_backup_restore) — 從主機補回 2026-04-12
# =========================================================================
- name: awoooi_backup_restore
interval: 1m
rules:
      # Critical when the failed-job gauge for the velero backup-restore-test
      # CronJob is above 0 for 5m (max taken per host/namespace/cronjob).
      - alert: BackupRestoreTestFailed
        expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_failed_jobs{host="110",namespace="velero",cronjob="backup-restore-test"}) > 0
        for: 5m
        labels:
          severity: critical
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "備份還原 dry-run 測試失敗"
          description: "velero namespace 中保留了失敗的 backup-restore-test Job備份可能無法還原。立即人工驗證備份狀態。"
          runbook: "先找最新 Completed Velero backup再執行 restore dry-run禁止在 production namespace 做真還原"
      # Warns when the cron-present metric itself has been absent for 30m,
      # i.e. the monitoring signal is missing (distinct from the CronJob being missing).
      - alert: BackupRestoreTestMissing
        expr: absent(awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"})
        for: 30m
        labels:
          severity: warning
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "備份還原 dry-run 監控指標缺失"
          description: "Prometheus 沒有收到 awoooi_velero_restore_test_cron_present110 backup health exporter 或 120 kubectl 查詢可能失效。"
          runbook: "檢查 110 backup_health.prom、SSH 110→120、以及 velero namespace 的 backup-restore-test CronJob"
      # Critical when the cron-present metric is reported with value 0 for 15m.
      - alert: BackupRestoreTestCronMissing
        expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
        for: 15m
        labels:
          severity: critical
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "備份還原 dry-run CronJob 缺失"
          description: "velero namespace 找不到 backup-restore-test CronJob備份可還原性沒有定期驗證。"
          runbook: "kubectl apply k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml 與 16-cronjob-backup-restore-test.yaml"
      # Warns when the last-success-fresh flag is 0 for 10m.
      # The 8-day window quoted in the annotations is not part of this expression;
      # presumably it is applied where the flag is computed.
      - alert: BackupRestoreTestStale
        expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_last_success_fresh{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
        for: 10m
        labels:
          severity: warning
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "備份還原測試超過 8 天未執行"
          description: "backup-restore-test CronJob 沒有 8 天內成功紀錄;週排程 CronJob 可能失效。"
          runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
# =========================================================================
# Host / service / config backup health
# =========================================================================
- name: full_stack_backup_health_alerts
interval: 1m
rules:
      # Warns when no awoooi_backup_health_monitor_up series exists for host 110 for 20m.
      - alert: BackupHealthMonitorMissing110
        expr: absent(awoooi_backup_health_monitor_up{host="110"})
        for: 20m
        labels:
          severity: warning
          layer: host-backup
          component: backup-health-monitor
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 備份健康指標缺失"
          description: "110 沒有輸出 backup_health.prom無法確認資料庫、設定檔與服務備份是否新鮮。"
          runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
      # Warns when no awoooi_backup_health_monitor_up series exists for host 188 for 20m.
      - alert: BackupHealthMonitorMissing188
        expr: absent(awoooi_backup_health_monitor_up{host="188"})
        for: 20m
        labels:
          severity: warning
          layer: host-backup
          component: backup-health-monitor
          host: "188"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "188 備份健康指標缺失"
          description: "188 沒有輸出 backup_health.prom無法確認 110 rsync 與 momo PostgreSQL 備份是否新鮮。"
          runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
      # Warns when the exporter's last-run timestamp on 110 or 188 is more than
      # 1800s (30 min) old, sustained for 10m. The host comes from the series label.
      - alert: BackupHealthMonitorStale
        expr: time() - awoooi_backup_health_last_run_timestamp{host=~"110|188"} > 1800
        for: 10m
        labels:
          severity: warning
          layer: host-backup
          component: backup-health-monitor
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "{{ $labels.host }} 備份健康 exporter 超過 30 分鐘未更新"
          description: "backup health textfile exporter stale備份狀態不可觀測。"
          runbook: "SSH 主機檢查 cron、/tmp/awoooi-backup-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
      # Critical when a backup job on 110 or 188 reports configured == 0 for 15m.
      # The job name is read from the `exported_job` series label.
      - alert: BackupExpectedJobMissing
        expr: awoooi_backup_job_configured{host=~"110|188"} == 0
        for: 15m
        labels:
          severity: critical
          layer: host-backup
          component: backup-cron
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "{{ $labels.host }} 備份排程缺失:{{ $labels.exported_job }}"
          description: "預期備份 cron/config 不存在;下一次重開機後資料可能沒有可用還原點。"
          runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的備份章節補回 cron先 dry-run 再執行"
      # Warns when host 110 reports a duplicate-active-entry count above 0 for 15m;
      # $value is that count.
      - alert: BackupScheduleDuplicateActiveEntries
        expr: awoooi_backup_cron_active_duplicate_count{host="110"} > 0
        for: 15m
        labels:
          severity: warning
          layer: host-backup
          component: backup-cron
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 備份 crontab 有重複 active entries"
          description: "110 crontab 目前有 {{ $value }} 個 exact duplicate active entry可能造成 offsite sync、verifier 或 status job 重複執行。"
          runbook: "SSH 110 執行 `crontab -l | awk 'NF && $0 !~ /^#/ {count[$0]++} END {for (line in count) if (count[line] > 1) print count[line], line}'`,只移除重複 active entry不要刪除未理解的備份排程。"
- alert: BackupScheduleSingletonMismatch
expr: awoooi_backup_cron_singular_entry_ok{host="110"} == 0
for: 15m
labels:
severity: warning
layer: host-backup
component: backup-cron
host: "110"
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "110 備份排程單一入口異常:{{ $labels.entry }}"
description: "{{ $labels.entry }} 應該剛好只有一個 active cron entry目前 count={{ $value }},可能造成排程缺失或重複執行。"
runbook: "用 Ansible `110-devops.yml --tags backup_jobs` 收斂排程,並用 `scripts/reboot-recovery/reboot-recovery-readiness-audit.sh --live --no-color` 驗證。"
      # Critical when a backup script on 110 or 188 reports present == 0 for 15m.
      # The script name is read from the `script` series label.
      - alert: BackupScriptMissing
        expr: awoooi_backup_script_present{host=~"110|188"} == 0
        for: 15m
        labels:
          severity: critical
          layer: host-backup
          component: backup-script
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "{{ $labels.host }} 備份腳本缺失:{{ $labels.script }}"
          description: "備份排程可能存在,但實際腳本不存在或路徑漂移。"
          runbook: "從 repo 部署對應 scripts/backup 或 scripts/ops 腳本,確認權限 0755"
      # Critical when a backup job on 110 or 188 reports fresh == 0 for 15m.
      # The annotations read exported_job, max_age_hours, source and target from
      # the series labels; the age threshold itself is not part of this expression.
      - alert: BackupJobStale
        expr: awoooi_backup_job_fresh{host=~"110|188"} == 0
        for: 15m
        labels:
          severity: critical
          layer: host-backup
          component: backup-freshness
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "{{ $labels.host }} 備份過舊:{{ $labels.exported_job }}"
          description: "{{ $labels.exported_job }} 最新成功證據超過 {{ $labels.max_age_hours }} 小時或不存在;來源 {{ $labels.source }},目標 {{ $labels.target }}。"
          runbook: "先檢查備份 log 與磁碟空間,再手動執行對應備份;禁止直接刪除舊備份或 production 資料"
      # Warns when the last backup_all run on host 110 reports a failed-item count
      # above 0 for 10m; $value is that count.
      - alert: BackupAggregateRunFailed
        expr: awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"} > 0
        for: 10m
        labels:
          severity: warning
          layer: host-backup
          component: backup-all
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 全服務備份最近一次有 {{ $value }} 個失敗項目"
          description: "backup-all.sh 最近一次 aggregate run 仍有失敗;即使個別 DB 備份已手動補跑,也要重跑 aggregate backup 清除紅燈。"
          runbook: "SSH 110 檢查 /backup/logs/cron.log 與 /backup/logs/backup.log修正後執行 /backup/scripts/backup-all.sh"
      # Warns when a config-capture target labelled critical="true" on host 110
      # reports capture_ok == 0 for 10m. Target and source come from series labels.
      - alert: BackupConfigCapturePartial
        expr: awoooi_backup_config_capture_ok{host="110",critical="true"} == 0
        for: 10m
        labels:
          severity: warning
          layer: host-backup
          component: backup-config-capture
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 設定檔備份缺少關鍵目標:{{ $labels.target }}"
          description: "configs restic snapshot 雖可能存在,但最新設定檔備份未成功捕捉 {{ $labels.target }}source={{ $labels.source }}。這會影響下一次冷啟動或災難還原的設定還原完整性。"
          runbook: "先修復對應主機或 K8s API 可達性,再執行 /backup/scripts/backup-configs.sh確認 awoooi_backup_config_capture_ok 回到 1最後補跑 Google Drive/rclone offsite sync。"
      # Warns when the capture-status timestamp for host 110 is absent, or older
      # than 172800s (48h), for 30m.
      - alert: BackupConfigCaptureStatusStale
        expr: absent(awoooi_backup_config_capture_status_timestamp{host="110"}) or (time() - awoooi_backup_config_capture_status_timestamp{host="110"} > 172800)
        for: 30m
        labels:
          severity: warning
          layer: host-backup
          component: backup-config-capture
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 設定檔備份覆蓋率狀態缺失或過舊"
          description: "backup-configs.sh 沒有新鮮的 capture status無法判斷 110/120/121/188/K8s 設定檔是否真的被最新 snapshot 捕捉。"
          runbook: "部署新版 /backup/scripts/backup-configs.sh 與 /home/wooo/scripts/backup-health-textfile-exporter.py執行 /backup/scripts/backup-configs.sh 後刷新 textfile exporter。"
      # Critical when the integrity-fresh flag for scope="restic_check" on host 110
      # is absent or equal to 0 for 30m.
      - alert: BackupIntegrityCheckMissingOrFailed
        expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restic_check"}) or awoooi_backup_integrity_fresh{host="110",scope="restic_check"} == 0
        for: 30m
        labels:
          severity: critical
          layer: host-backup
          component: backup-integrity
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "110 備份倉庫完整性檢查缺失或失敗"
          description: "每週 restic check 沒有成功證據,或有 repo 檢查失敗;目前不能假設備份可讀。"
          runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode check`,先看 /backup/logs/backup-integrity.log禁止刪 repo 或 prune 直到確認原因"
      # Warns when the integrity-fresh flag for scope="restore_drill" on host 110
      # is absent or equal to 0 for 30m.
      - alert: BackupRestoreDrillMissingOrFailed
        expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restore_drill"}) or awoooi_backup_integrity_fresh{host="110",scope="restore_drill"} == 0
        for: 30m
        labels:
          severity: warning
          layer: host-backup
          component: backup-restore-drill
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 備份抽樣還原演練缺失或失敗"
          description: "每月 restore drill 沒有成功證據,備份雖可能新鮮,但尚未驗證可讀取還原。"
          runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode restore-drill`;只允許還原到隔離暫存目錄,不得覆蓋 production"
      # Warns when the sum of the offsite-configured gauges for host 110 is 0 for 1m.
      # If the metric is entirely absent the sum is empty and this rule stays silent.
      - alert: BackupOffsiteCopyNotConfigured
        expr: sum by(host) (awoooi_backup_offsite_configured{host="110"}) == 0
        for: 1m
        labels:
          severity: warning
          layer: host-backup
          component: backup-offsite
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 尚未配置離機備份 provider"
          description: "backup health exporter 未偵測到 Google Drive/rclone 或其他 offsite provider 配置;本地 restic 全綠仍不等於異地可恢復。"
          runbook: "在 110 以 `/backup/scripts/configure-offsite-rclone.sh --interactive` 建立 Google Drive remote產生 `/backup/offsite/*last_success` 證據;不得把 provider token 寫入 repo、Telegram 或 Prometheus label。"
      # Warns after 2h when, for host 110, an offsite provider is configured
      # (sum > 0) but no fresh marker exists (sum == 0), AND either full sync is
      # not enabled (sum == 0) or its enable timestamp is more than 30h old.
      # The second group suppresses the alert while a recently enabled full sync
      # is still inside that 30h window.
      # NOTE(review): the summary quotes 48 hours, which does not appear in this
      # expression; presumably it is the freshness window applied where
      # awoooi_backup_offsite_fresh is computed. Confirm.
      - alert: BackupOffsiteCopyStale
        expr: |
          (
            (sum by(host) (awoooi_backup_offsite_configured{host="110"}) > 0)
            and
            (sum by(host) (awoooi_backup_offsite_fresh{host="110"}) == 0)
          )
          and
          (
            (sum by(host) (awoooi_backup_offsite_full_sync_enabled{host="110"}) == 0)
            or
            ((time() - max by(host) (awoooi_backup_offsite_full_sync_enabled_timestamp{host="110"})) > 30 * 3600)
          )
        for: 2h
        labels:
          severity: warning
          layer: host-backup
          component: backup-offsite
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 離機備份超過 48 小時未成功"
          description: "已偵測到 offsite provider 配置,但沒有新鮮成功標記;本地備份可能無法抵抗整台 110 遺失。"
          runbook: "SSH 110 檢查 Google Drive/rclone 同步 log 與 `/backup/offsite/*last_success`full sync 需在 enable marker 與低負載門檻成立後由 `/backup/scripts/sync-offsite-backups.sh --mode sync` 鏡像本地 latest-only repo。"
      # Warns after 15m when either retention flag for host 110 is absent or not 1:
      # the latest-only flag, or the rclone offsite delete-old flag.
      - alert: BackupRetentionPolicyNotLatestOnly
        expr: |
          absent(awoooi_backup_retention_latest_only{host="110"})
          or
          awoooi_backup_retention_latest_only{host="110"} != 1
          or
          absent(awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"})
          or
          awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"} != 1
        for: 15m
        labels:
          severity: warning
          layer: host-backup
          component: backup-retention
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 備份保留策略不是 latest-only"
          description: "operator 要求所有備份只保留最新一份;本地 restic 必須 keep-last=1Google Drive/rclone 必須在成功 mirror 後刪除遠端舊檔。"
          runbook: "檢查 `/backup/scripts/common.sh` 的 BACKUP_RETENTION_MODE=latest、KEEP_LAST=1 與 OFFSITE_SYNC_DELETE_OLD=1刷新 backup-health textfile必要時在備份成功後執行 `/backup/scripts/enforce-latest-only-retention.sh`。"
      # Warns when a restic-type job on host 110 reports more than 1 snapshot for
      # 30m; $value is the snapshot count and exported_job names the repo.
      - alert: BackupSnapshotRetentionExceeded
        expr: awoooi_backup_job_snapshot_count{host="110",type="restic"} > 1
        for: 30m
        labels:
          severity: warning
          layer: host-backup
          component: backup-retention
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 備份 repo {{ $labels.exported_job }} 保留超過 1 份 snapshot"
          description: "{{ $labels.exported_job }} 目前有 {{ $value }} 份 restic snapshotlatest-only 策略要求每個 repo 全域只保留最新 1 份。"
          runbook: "SSH 110 執行 `/backup/scripts/enforce-latest-only-retention.sh`;若仍未收斂,確認 `common.sh` 使用 `restic forget --group-by \"\" --keep-last 1 --prune`,避免 restic 依 path/tag 分組保留多份。"
      # Warns after 30m when the rclone offsite marker is fresh (== 1) but there is
      # no matching remote_verify_ok == 1 series for the same host and provider.
      # `unless` also covers the case where the verify metric is missing entirely.
      - alert: BackupOffsiteFullVerifyFailed
        expr: |
          awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1
          unless on(host, provider)
          (awoooi_backup_offsite_remote_verify_ok{host="110",provider="rclone"} == 1)
        for: 30m
        labels:
          severity: warning
          layer: host-backup
          component: backup-offsite
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 Google Drive full sync 完成但遠端驗證未通過"
          description: "full offsite marker 已 fresh但 verify-offsite-full-sync.sh 沒有證明 13 個 Google Drive repo 都可列出且符合 latest-only。"
          runbook: "SSH 110 執行 `/backup/scripts/verify-offsite-full-sync.sh --write-textfile --no-color`,檢查 `/backup/logs/offsite-full-sync-verify.log` 與 `/home/wooo/node_exporter_textfiles/offsite_full_sync_verify.prom`。"
      # Warns after 30m when a remote repo reports more than 1 snapshot, but only
      # while the rclone offsite marker for the same host/provider is fresh (== 1).
      # $value is the remote snapshot count; the repo name comes from the `repo` label.
      - alert: BackupOffsiteRemoteSnapshotRetentionExceeded
        expr: |
          (awoooi_backup_offsite_remote_snapshot_count{host="110",provider="rclone"} > 1)
          and on(host, provider)
          (awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1)
        for: 30m
        labels:
          severity: warning
          layer: host-backup
          component: backup-retention
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "Google Drive repo {{ $labels.repo }} 保留超過 1 份 snapshot"
          description: "{{ $labels.repo }} 在 Google Drive/rclone 遠端目前有 {{ $value }} 份 snapshotlatest-only 策略要求遠端也只保留最新一份。"
          runbook: "確認 110 `/backup/scripts/sync-offsite-backups.sh --mode sync` 使用 `rclone sync`、`OFFSITE_SYNC_DELETE_OLD=1`、`RCLONE_DRIVE_USE_TRASH=false`,再於低峰重新執行 full sync 與 verifier。"
      # Warns when a credential-escrow item on host 110 reports fresh == 0 for 1m.
      # The item name comes from the `item` series label; the 31-day window quoted
      # in the description is not part of this expression.
      - alert: BackupCredentialEscrowEvidenceMissing
        expr: awoooi_backup_credential_escrow_fresh{host="110"} == 0
        for: 1m
        labels:
          severity: warning
          layer: host-backup
          component: credential-escrow
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "備份憑證金庫證據缺失或過期:{{ $labels.item }}"
          description: "{{ $labels.item }} 沒有 31 天內人工驗證證據;重建時可能找不到 restic/offsite/break-glass/DNS/OAuth 復原材料。"
          runbook: "在密碼管理器或離線加密金庫完成雙人覆核後,只建立不含 secret 的 `/backup/escrow-evidence/{{ $labels.item }}.last_verified` 時間戳證據。"
# =========================================================================
# 基礎設施詳細告警 (awoooi_infrastructure_detailed) — 從主機補回 2026-04-12
# =========================================================================
- name: awoooi_infrastructure_detailed
interval: 60s
rules:
- alert: DockerContainerUnhealthyDetailed
expr: |
count by (name, instance) (
container_tasks_state{state="running", instance=~"192.168.0.188.*"}
) == 0
or
container_last_seen{instance=~"192.168.0.188.*", name!=""} < (time() - 120)
for: 5m
labels:
severity: warning
layer: docker-188
alert_category: infrastructure
notification_type: TYPE-3
auto_repair: "false"
annotations:
summary: "188 主機容器 {{ $labels.name }} 異常"
description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動或不在 running 狀態。"
runbook: "SSH 到 192.168.0.188docker inspect {{ $labels.name }} 確認健康狀態"
      # Warns when a Redis stream length stays above 500 for 10m; the stream name
      # comes from the `stream` series label and $value is the length.
      - alert: RedisStreamBacklogHigh
        expr: awoooi_redis_stream_len > 500
        for: 10m
        labels:
          severity: warning
          layer: docker-188
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆"
          description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。"
          runbook: "檢查 consumer group lagXINFO GROUPS <stream-key>"
      # 2026-04-19 Hermes E3 decision: PostgreSQLDiskGrowthRate deprecated
      # Root cause: 500MB/h growth is normal PG WAL behaviour (commits/checkpoints) and should not alert
      # In the past 30d it fired 7 times; every time the AI judged NO_ACTION or wrongly attempted a kubectl rollout restart that failed
      # Commander's decision (2026-04-19 18:xx Taipei): option C — delete the old rule and alert on absolute disk usage instead
      # -----------------------------------------------------------------
      # Warns when root-filesystem usage, (size - avail) / size * 100, stays above
      # 80% for 10m. tmpfs and overlay filesystems are excluded.
      # NOTE(review): the selector has no host/instance filter, so this matches
      # every scraped root filesystem, while `layer` is statically systemd-188.
      # Confirm that is intended.
      - alert: HostDiskUsageHigh
        expr: |
          (
            node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
            - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
          )
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
          * 100 > 80
        for: 10m
        labels:
          severity: warning
          layer: systemd-188
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
          supersedes: PostgreSQLDiskGrowthRate
        annotations:
          summary: "主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>80%)"
          description: "磁碟使用率持續 10 分鐘超過 80%, 需清理或擴容. 常見原因: PG WAL, 日誌, container images, 舊 build cache."
          runbook: "SSH 該主機: df -h / && du -sh /var/lib/postgresql/*/pg_wal /var/log /var/lib/docker"
      # Critical tier of the root-filesystem usage check: same formula as
      # HostDiskUsageHigh, with a 90% threshold and a 5m hold. Above 90% both
      # tiers match at the same time.
      # NOTE(review): as with the warning tier, there is no host/instance filter
      # although `layer` is statically systemd-188.
      - alert: HostDiskUsageCritical
        expr: |
          (
            node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
            - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
          )
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
          * 100 > 90
        for: 5m
        labels:
          severity: critical
          layer: systemd-188
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "true"
          supersedes: PostgreSQLDiskGrowthRate
        annotations:
          summary: "🔴 主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>90%, critical)"
          description: "磁碟即將滿, 需立即清理. 超過 95% 可能導致服務中斷."
          runbook: "立即 SSH 該主機: df -h / && du -sh /* 2>/dev/null | sort -h | tail -10"
# =========================================================================
# 主機連通性告警 (awoooi_host_connectivity) — 從主機補回 2026-04-12
# =========================================================================
- name: awoooi_host_connectivity
interval: 60s
rules:
      # Critical when a probe in job "host-connectivity" reports probe_success == 0
      # for 5m; the probed target is the `instance` label.
      - alert: HostNetworkPartition
        expr: probe_success{job="host-connectivity"} == 0
        for: 5m
        labels:
          severity: critical
          layer: systemd-188
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "true"
        annotations:
          summary: "主機 {{ $labels.instance }} 無法連通"
          description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。"
          runbook: "SSH 檢查路由和防火牆規則"
# =========================================================================
# 監控工具自監控 (infra_self_monitoring) — ADR-090 Phase 7
# 2026-04-19 Claude Opus 4.7 / 鐵律:監控工具必須被監控
# 設計:不寫死 CPU% 或 MB 數,改用 (配額佔比) + (throttle 訊號) 動態判斷
# 配額由 docker-compose 宣告,告警條件 = 使用量 / 配額 > 0.8
# 比寫死 80% 更智能 — 配額改告警閾值自動跟著變
# =========================================================================
- name: infra_self_monitoring
interval: 1m
rules:
      # --- cadvisor self-monitoring ---
      # Critical when any scrape target whose job name contains "cadvisor" has
      # up == 0 for 5m.
      - alert: CadvisorDown
        expr: up{job=~".*cadvisor.*"} == 0
        for: 5m
        labels:
          severity: critical
          layer: docker-110-188
          component: cadvisor
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "true"
        annotations:
          summary: "cAdvisor ({{ $labels.instance }}) 停擺"
          description: "主機 {{ $labels.instance }} 的 cadvisor 已停擺 5 分鐘,容器監控中斷。"
          runbook: "SSH 主機 docker compose up -d cadvisor檢查 OOMKill 訊號"
- alert: CadvisorMemoryPressure
expr: container_memory_usage_bytes{name="cadvisor"} / container_spec_memory_limit_bytes{name="cadvisor"} > 0.8
for: 10m
labels:
severity: warning
component: cadvisor
team: ops
alert_category: infrastructure
notification_type: TYPE-1
auto_repair: "false"
annotations:
summary: "cAdvisor 記憶體使用率 > 80% limit"
description: "cadvisor 記憶體用量 / mem_limit = {{ $value | humanizePercentage }},接近 OOMKill。"
runbook: "若頻繁觸發 → 檢查 cardinality 是否持續成長,考慮調整 --disable_metrics"
      # Warns when the cadvisor container accrues more than 0.5s of CFS throttled
      # time per second (5m rate) for 15m.
      - alert: CadvisorCPUThrottled
        expr: rate(container_cpu_cfs_throttled_seconds_total{name="cadvisor"}[5m]) > 0.5
        for: 15m
        labels:
          severity: warning
          component: cadvisor
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "cAdvisor CPU 被 throttle配額不足"
          description: "cadvisor 每秒被 throttle {{ $value }} 秒,表示實際需求超過 cpus 配額。"
          runbook: "調高 docker-compose cpus 設定,或檢查 scrape interval / cardinality"
      # --- node-exporter self-monitoring ---
      # Critical when a target whose job name starts with "node-exporter" or
      # "node_exporter" has up == 0 for 5m.
      - alert: NodeExporterDown
        expr: up{job=~"node-exporter.*|node_exporter.*"} == 0
        for: 5m
        labels:
          severity: critical
          component: node-exporter
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "true"
        annotations:
          summary: "node-exporter ({{ $labels.instance }}) 停擺"
          description: "主機 {{ $labels.instance }} node-exporter 已停擺 5 分鐘,主機 metrics 中斷。"
          runbook: "SSH 主機檢查 docker ps node-exporter重啟 docker compose up -d node-exporter"
      # Warns when the node-exporter container accrues more than 0.5s of CFS
      # throttled time per second (5m rate) for 15m.
      - alert: NodeExporterCPUThrottled
        expr: rate(container_cpu_cfs_throttled_seconds_total{name="node-exporter"}[5m]) > 0.5
        for: 15m
        labels:
          severity: warning
          component: node-exporter
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "node-exporter CPU 被 throttle配額不足"
          description: "node-exporter 每秒被 throttle {{ $value }} 秒。可能 collector 未適度 disable。"
          runbook: "檢查 node-exporter --collector.* flags 是否該關掉閒置硬體 probe"
      # --- Sentry self-hosted self-monitoring (110) ---
      # 2026-04-25 ogt + Claude Opus 4.7: fixed the root cause of false alerts
      # The old rule used container_memory_usage_bytes (which includes page cache), so when ClickHouse
      # ran large queries the OS cached SSTables in page cache and the ratio jumped to 88.5%, a false alarm
      # (hard evidence from 2026-04-23 23:13: usage_bytes=88.5% / working_set=7.8%).
      # Switched to container_memory_working_set_bytes — this is the "real working set"
      # (RSS + active page cache, excluding inactive page cache) that the K8s/Docker OOM killer actually tracks.
      # Reference: https://github.com/google/cadvisor/blob/master/info/v1/container.go
      # Warns when working set / memory limit stays above 0.85 for 10m for containers
      # whose name matches .*sentry.*clickhouse.*.
      # NOTE(review): a limit reported as 0 would make the ratio +Inf and match;
      # confirm every matched container declares a memory limit.
      - alert: SentryClickHouseMemoryPressure
        expr: container_memory_working_set_bytes{name=~".*sentry.*clickhouse.*"} / container_spec_memory_limit_bytes{name=~".*sentry.*clickhouse.*"} > 0.85
        for: 10m
        labels:
          severity: warning
          component: sentry-clickhouse
          team: platform
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "Sentry ClickHouse 工作集記憶體 > 85% limit"
          description: "sentry clickhouse working_set / mem_limit = {{ $value | humanizePercentage }} (排除 page cache)。"
          runbook: "檢查 Sentry 查詢壓力;確認非 page cache 假象;必要時調整 /opt/sentry/docker-compose.override.yml clickhouse mem_limit"
      # Warns when a Sentry ClickHouse container accrues more than 1.0s of CFS
      # throttled time per second (5m rate) for 15m.
      # The cpus=4.0 figure in the description is hard-coded text, not read from a metric.
      - alert: SentryClickHouseCPUThrottled
        expr: rate(container_cpu_cfs_throttled_seconds_total{name=~".*sentry.*clickhouse.*"}[5m]) > 1.0
        for: 15m
        labels:
          severity: warning
          component: sentry-clickhouse
          team: platform
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "Sentry ClickHouse CPU 持續被 throttle"
          description: "每秒 throttle {{ $value }} 秒,配額 cpus=4.0 可能不足。"
          runbook: "檢查 Sentry retention / query pattern必要時調高 override.yml cpus"
      # --- Gitea self-monitoring ---
      # Warns when the gitea container's working set / memory limit stays above
      # 0.85 for 10m.
      # NOTE(review): a limit reported as 0 would make the ratio +Inf and match;
      # confirm the container declares a memory limit.
      - alert: GiteaMemoryPressure
        # 2026-04-25 ogt + Claude Sonnet 4.6 — same false-alarm root cause as ClickHouse:
        # container_memory_usage_bytes includes page cache (OS inactive, ignored by the OOM killer) → inflated false alarms
        # Switched to container_memory_working_set_bytes (RSS + active cache): real pressure; cadvisor metric, applies to Docker + K8s
        expr: container_memory_working_set_bytes{name="gitea"} / container_spec_memory_limit_bytes{name="gitea"} > 0.85
        for: 10m
        labels:
          severity: warning
          component: gitea
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "Gitea 記憶體工作集 > 85% limit"
          description: "gitea working_set / mem_limit = {{ $value | humanizePercentage }}(真實記憶體壓力,非 page cache 干擾)。"
          runbook: "檢查 CI/CD 任務堆積;必要時調高 docker-compose mem_limit"
      # Warns when the gitea or gitea-runner container accrues more than 1.0s of
      # CFS throttled time per second (5m rate) for 15m; the `name` label tells
      # which of the two is affected.
      - alert: GiteaCPUThrottled
        expr: rate(container_cpu_cfs_throttled_seconds_total{name=~"gitea|gitea-runner"}[5m]) > 1.0
        for: 15m
        labels:
          severity: warning
          component: gitea
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "Gitea / Runner CPU 持續被 throttle"
          description: "{{ $labels.name }} 每秒 throttle {{ $value }} 秒CD peak 可能卡關。"
          runbook: "檢查 job 並行度;考慮縮減並行或調高 cpus"
      # --- Monitoring-of-monitoring meta layer (Prometheus itself) ---
      # Critical when a target in job "prometheus" has up == 0 for 2m.
      # NOTE(review): this rule is evaluated by Prometheus; if the instance that
      # evaluates it is the one that is down, it cannot fire. Confirm a second
      # Prometheus or an external dead-man's-switch covers that case.
      - alert: PrometheusDown
        expr: up{job="prometheus"} == 0
        for: 2m
        labels:
          severity: critical
          component: prometheus
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "Prometheus ({{ $labels.instance }}) 停擺"
          description: "Prometheus 自己停擺 → 所有其他告警失效"
          runbook: "SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus"
# =========================================================================
# Full-stack cold-start recovery gate
# =========================================================================
- name: cold_start_recovery_alerts
rules:
      # Critical after 10m when any of these holds for the host-110 drift guard:
      # its last-run timestamp is absent, the timestamp is more than 900s old,
      # the missing-required-rule count is above 0, or the
      # current-matches-canonical flag is 0.
      - alert: PrometheusRuleDriftGuardFailed
        expr: |
          absent(awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"})
          or
          (time() - max by(host) (awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"}) > 900)
          or
          (awoooi_prometheus_rule_drift_guard_missing_required_count{host="110"} > 0)
          or
          (awoooi_prometheus_rule_drift_guard_current_matches_canonical{host="110"} == 0)
        for: 10m
        labels:
          severity: critical
          layer: systemd-110
          component: prometheus-rule-drift-guard
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "Prometheus 規則漂移防護失效"
          description: "110 Prometheus rule drift guard 沒有新鮮成功指標、required rules 缺失,或 active alerts.yml 不等於 canonical rules。"
          runbook: "執行 `bash scripts/ops/deploy-alerts.sh` 重新部署 canonical rules 與 drift guard等待 1-2 個 Prometheus evaluation cycle 後重跑 readiness audit。"
      # Warns when the drift guard's `repaired` gauge for host 110 is above 0 for 1m.
      - alert: PrometheusRuleDriftAutoRepaired
        expr: awoooi_prometheus_rule_drift_guard_repaired{host="110"} > 0
        for: 1m
        labels:
          severity: warning
          layer: systemd-110
          component: prometheus-rule-drift-guard
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "Prometheus 規則漂移已被自動修復"
          description: "110 drift guard 最近一次偵測到 active Prometheus rules 漂移,已回復 canonical rules 並 reload Prometheus。"
          runbook: "檢查 `/home/wooo/logs/prometheus-rule-drift-guard.log` 與 `/home/wooo/monitoring/alerts.yml.guard.bak.*`,找出誰覆寫了 active rules。"
      # Warns when no awoooi_cold_start_monitor_up series exists for host 110 and
      # scope 110_120_121_188 for 15m.
      - alert: ColdStartMonitorMissing
        expr: absent(awoooi_cold_start_monitor_up{host="110",scope="110_120_121_188"})
        for: 15m
        labels:
          severity: warning
          layer: systemd-110
          component: cold-start-monitor
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "Cold-start monitor textfile metric missing"
          description: "110 沒有輸出 awoooi_cold_start_monitor_up重開機恢復 gate 目前不可觀測。"
          runbook: "執行 scripts/reboot-recovery/install-cold-start-monitor-110.sh確認 /home/wooo/node_exporter_textfiles/cold_start_recovery.prom"
      # Warns when the cold-start monitor's last-run timestamp is more than 900s
      # (15 min) old for 10m; $value is the age in seconds.
      - alert: ColdStartMonitorStale
        expr: time() - awoooi_cold_start_last_run_timestamp{host="110",scope="110_120_121_188"} > 900
        for: 10m
        labels:
          severity: warning
          layer: systemd-110
          component: cold-start-monitor
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "Cold-start monitor stale"
          description: "cold-start monitor 超過 15 分鐘沒有更新,距離上次執行 {{ $value | humanizeDuration }}。"
          runbook: "SSH 110 檢查 crontab、/tmp/awoooi-cold-start-monitor.cron.log、cold-start-last.log"
      # Critical when the cold-start gate reports more than 0 blocked gates for 5m;
      # $value is the blocked-gate count.
      - alert: ColdStartRecoveryBlocked
        expr: awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} > 0
        for: 5m
        labels:
          severity: critical
          layer: full-stack
          component: cold-start-gate
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "Full-stack cold-start recovery BLOCKED"
          description: "cold-start gate 有 {{ $value }} 個 BLOCKED gate。AI 修復需保持 observe-only先處理第一個 blocked gate。"
          runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的 P0→P2 順序修復"
      # Critical when the cold-start blocker-reason gauge for
      # reason="k3s_node_filesystem_error", target="120" is above 0 for 5m.
      # The metric is reported by host 110; the static `host` label is set to the
      # affected node, 120.
      - alert: K3sNodeFilesystemErrorGateBlocked
        expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="k3s_node_filesystem_error",target="120"} > 0
        for: 5m
        labels:
          severity: critical
          layer: k3s
          component: node-filesystem
          host: "120"
          target_host: "120"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "120 K3s 節點 filesystem error 阻擋重開機放行"
          description: "cold-start log 偵測到 120 Node event 仍有 EXT4/I/O/deleted inode 類錯誤;即使 Pod Running、網站 200也不可宣告下一次重開機安全。"
          runbook: "查看 110 `/home/wooo/reboot-recovery/cold-start-last.log`,執行 `scripts/reboot-recovery/120-fsck-maintenance-checklist.sh` 做維護前只讀檢查;維護窗內用 console/rescue 對 120 root LV 執行 fsck禁止 online fsck。"
      # Critical when the cold-start blocker-reason gauge for
      # reason="host_unreachable", target="120" is above 0 for 3m.
      # The metric is reported by host 110; the static `host` label is set to the
      # affected host, 120.
      - alert: ColdStartHost120Unreachable
        expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="host_unreachable",target="120"} > 0
        for: 3m
        labels:
          severity: critical
          layer: host
          component: host-reachability
          host: "120"
          target_host: "120"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "false"
        annotations:
          summary: "120 主機不可達Full-stack cold-start 已阻擋"
          description: "110 cold-start monitor 無法 ping/SSH 192.168.0.120;目前只能由 121/VIP 撐住 K3s不能宣告所有主機重開機恢復完成。"
          runbook: "查看 120 console。若停在 initramfs/manual fsck先對 root LV 做離線 fsck若主機關機或網卡異常先恢復電源/網路,再重跑 full-stack cold-start gate。禁止從自動修復直接重啟其他服務掩蓋主機離線。"
      # Warns when the cold-start gate reports more than 0 WARN gates for 15m;
      # $value is the warn-gate count.
      - alert: ColdStartRecoveryDegraded
        expr: awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} > 0
        for: 15m
        labels:
          severity: warning
          layer: full-stack
          component: cold-start-gate
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "Full-stack cold-start recovery DEGRADED"
          description: "cold-start gate 有 {{ $value }} 個 WARN gate核心可用但不應放行 runner/CD/AI auto-repair full execution。"
          runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log修到 PASS/WARN/BLOCKED = green"
      # Warns when the last GREEN timestamp of the cold-start gate is more than
      # 3600s (1h) old for 15m; $value is the age in seconds.
      - alert: ColdStartLastGreenTooOld
        expr: time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"} > 3600
        for: 15m
        labels:
          severity: warning
          layer: full-stack
          component: cold-start-gate
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "Full-stack cold-start gate has not been GREEN recently"
          description: "距離上次 GREEN 已超過 {{ $value | humanizeDuration }};需要確認 110/120/121/188 與排程/網站 gate。"
          runbook: "執行 SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test"
# =========================================================================
# Host storage health / dirty reboot evidence
# =========================================================================
- name: host_storage_health_alerts
rules:
      # Warns when no awoooi_host_storage_monitor_up series exists for host 110 for 15m.
      - alert: Host110StorageHealthMonitorMissing
        expr: absent(awoooi_host_storage_monitor_up{host="110"})
        for: 15m
        labels:
          severity: warning
          layer: systemd-110
          component: storage-health-monitor
          host: "110"
          team: ops
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "false"
        annotations:
          summary: "110 storage health textfile metric missing"
          description: "110 沒有輸出 storage_health.promdirty reboot、root read-only 與 fsck 證據目前不可觀測。"
          runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py確認 /home/wooo/node_exporter_textfiles/storage_health.prom"
# Warns when no awoooi_host_storage_monitor_up series with host="188"
# exists for 15 minutes (absent() only returns a result when the selector
# matches nothing). auto_repair is "false" for this alert.
- alert: Host188StorageHealthMonitorMissing
  expr: absent(awoooi_host_storage_monitor_up{host="188"})
  for: 15m
  labels:
    severity: warning
    layer: systemd-188
    component: storage-health-monitor
    host: "188"
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "false"
  annotations:
    summary: "188 storage health textfile metric missing"
    # Separator after the file name keeps "storage_health.prom" readable as a path.
    description: "188 沒有輸出 storage_health.prom,dirty reboot、root read-only 與 fsck 證據目前不可觀測。"
    runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py,確認 /home/ollama/node_exporter_textfiles/storage_health.prom"
# Warns when awoooi_host_storage_last_run_timestamp for host 110 or 188 is
# more than 900 seconds behind the current time, sustained for 10 minutes.
# The host shown in the summary comes from the metric's own `host` label.
- alert: HostStorageHealthMonitorStale
  for: 10m
  expr: 'time() - awoooi_host_storage_last_run_timestamp{host=~"110|188"} > 900'
  annotations:
    summary: '主機 {{ $labels.host }} storage health textfile stale'
    description: 'storage health exporter 超過 15 分鐘沒有更新;重開機後檔案系統風險不可觀測。'
    runbook: 'SSH 主機檢查 cron、/tmp/awoooi-storage-health-textfile-exporter.cron.log 與 node-exporter textfile collector'
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: storage-health-monitor
    layer: host-storage
    notification_type: TYPE-1
    severity: warning
    team: ops
# Critical alert: fires when awoooi_host_root_filesystem_readonly for
# mountpoint "/" on host 110 or 188 is above 0 for 1 minute.
# auto_repair is "false" for this alert.
- alert: HostRootFilesystemReadOnly
  expr: awoooi_host_root_filesystem_readonly{host=~"110|188",mountpoint="/"} > 0
  for: 1m
  labels:
    severity: critical
    layer: host-storage
    component: root-filesystem
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-3
    auto_repair: "false"
  annotations:
    summary: "主機 {{ $labels.host }} root filesystem 已變成 read-only"
    description: "root filesystem 被掛載為唯讀,服務可能仍暫時存活但寫入會失敗;禁止自動修復,先保全證據並規劃維護窗。"
    # Colon separates the SOP section reference from the steps that follow it.
    runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md §16:保全 journal/df/mount 證據,確認備份,再安排 console/offline fsck"
# Critical alert: fires when awoooi_host_storage_error_count with
# source="journalctl-kernel", boot="current" on host 110 or 188 is above 0
# for 5 minutes. auto_repair is "false" for this alert.
- alert: HostCurrentBootStorageErrorsDetected
  expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="current"} > 0
  for: 5m
  labels:
    severity: critical
    layer: host-storage
    component: kernel-storage
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-3
    auto_repair: "false"
  annotations:
    summary: "主機 {{ $labels.host }} current boot 有 storage/kernel 錯誤"
    description: "目前開機週期已出現 filesystem、I/O 或 fsck 類錯誤;不可只重啟容器掩蓋問題。"
    # Colon separates the instruction from the list of diagnostic commands.
    runbook: "先執行 read-only 診斷:journalctl -k -p warning..alert、mount、df、smartctl/raid 狀態;必要時進入維護窗處理"
# Warns when awoooi_host_storage_error_count with
# source="journalctl-kernel", boot="previous" on host 110 or 188 is above 0
# for 30 minutes. auto_repair is "false" for this alert.
- alert: HostPreviousBootStorageErrorsDetected
  expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="previous"} > 0
  for: 30m
  labels:
    severity: warning
    layer: host-storage
    component: dirty-reboot-evidence
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "false"
  annotations:
    summary: "主機 {{ $labels.host }} previous boot 保留 storage/fsck 錯誤證據"
    description: "上一個開機週期留有 storage/fsck 錯誤,代表這次重開機事故需要完成 fsck、備份與容量後續檢查。"
    # Commas separate the three follow-up steps so the file path is not fused
    # with the next instruction.
    runbook: "把證據寫入 docs/LOGBOOK.md,確認 full-stack cold-start gate 與 P3 gate,下一次維護窗補 offline fsck/SMART/RAID 檢查"
# Warns when the per-host sum of awoooi_host_storage_error_count with
# boot="last-fsck-log" (host 110 or 188) is above 0 for 30 minutes.
# `sum by(host)` keeps only the `host` label, which the summary template reads.
- alert: HostFsckLogErrorsDetected
  for: 30m
  expr: 'sum by(host) (awoooi_host_storage_error_count{host=~"110|188",boot="last-fsck-log"}) > 0'
  annotations:
    summary: '主機 {{ $labels.host }} fsck log 保留錯誤證據'
    description: '主機 fsck log 內仍有 inconsistency 或 I/O 類錯誤文字;這是事故後追蹤項,不應交給自動修復直接處理。'
    runbook: '確認 /run/initramfs/fsck.log 與 /var/log/fsck/*,將結果納入重開機事故報告與下次維護窗檢查項'
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: fsck-log
    layer: host-storage
    notification_type: TYPE-1
    severity: warning
    team: ops