2032 lines
91 KiB
YAML
2032 lines
91 KiB
YAML
# ops/monitoring/alerts-unified.yml
|
||
# AWOOOI 統一 Prometheus 告警規則
|
||
# 2026-04-05 Claude Code: 整合所有規則,加入統一 layer 標籤
|
||
# 2026-04-08 Claude Sonnet 4.6: 補 database_detail_alerts 群組 (6條詳細規則)
|
||
# 2026-04-12 Claude Sonnet 4.6: 補回 4 個僅存在主機的群組 (backup/flywheel/connectivity/infra-detailed)
|
||
# 部署目標: 192.168.0.110:/home/wooo/monitoring/alerts.yml
|
||
# 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
|
||
#
|
||
# 標籤規範:
|
||
# layer: k8s | docker | docker-110 | docker-188 | systemd-110 | systemd-188 | external
|
||
# component: 服務名稱
|
||
# team: ops | backend | ai | platform | security | finops
|
||
# host: "110" | "188" | "120" | "121" | "112"
|
||
# auto_repair: "true" | "false"
|
||
|
||
groups:
|
||
|
||
# =========================================================================
|
||
# Full-stack recovery scorecard recording rules
|
||
# =========================================================================
|
||
- name: full_stack_recovery_scorecard_rules
  # Both recording rules below are evaluated every 60 seconds.
  interval: 60s
  rules:
    # Product of four 0/1 factors (every comparison uses `bool`), so the
    # recorded value is 1 only when all of the following hold for host 110 /
    # scope 110_120_121_188:
    #   - the last cold-start result series with result="green" equals 1
    #   - warn gates equal 0
    #   - blocked gates equal 0
    #   - the last green timestamp is less than 3600 seconds old
    # `sum without(result)` drops the result label so the factors can be
    # matched on (host, scope).
    - record: awoooi_recovery_core_ready
      expr: |
        sum without(result) (
          awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} == bool 1
        )
        * on(host,scope) (
          awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} == bool 0
        )
        * on(host,scope) (
          awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} == bool 0
        )
        * on(host,scope) (
          (time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"}) < bool 3600
        )

    # Product of three 0/1 factors per host: offsite backup configured,
    # offsite backup fresh, and credential escrow fresh. The first two use
    # max (any series at 1 is enough); the escrow factor uses min (every
    # series must be 1).
    - record: awoooi_recovery_dr_offsite_ready
      expr: |
        max by(host) (
          awoooi_backup_offsite_configured{host="110"} == bool 1
        )
        * on(host) max by(host) (
          awoooi_backup_offsite_fresh{host="110"} == bool 1
        )
        * on(host) min by(host) (
          awoooi_backup_credential_escrow_fresh{host="110"} == bool 1
        )
|
||
|
||
# =========================================================================
|
||
# 主機層告警 (host_alerts)
|
||
# =========================================================================
|
||
- name: host_alerts
|
||
rules:
|
||
- alert: HostDown
|
||
expr: up{job=~"node-exporter.*"} == 0
|
||
for: 1m
|
||
labels:
|
||
severity: critical
|
||
layer: systemd-188
|
||
team: ops
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "主機 {{ $labels.host }} 不可達"
|
||
description: "Node Exporter 無回應超過 1 分鐘"
|
||
|
||
- alert: HostHighCpuLoad
|
||
# 2026-05-05 ogt + Codex: keep this as early warning only.
|
||
# Sustained overload/root-cause automation is handled by HostLoadAverageSustainedHigh.
|
||
expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: systemd-188
|
||
team: ops
|
||
auto_repair: "false"
|
||
# MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP 路由標籤
|
||
mcp_provider: "ssh_host"
|
||
host_type: "bare_metal"
|
||
alert_category: "host_resource"
|
||
annotations:
|
||
summary: "主機 {{ $labels.host }} CPU 高負載"
|
||
description: "CPU 使用率超過 90% 持續 10 分鐘;若 load5/core 未超過 1.5,先視為容量觀察與診斷,不直接修復。"
|
||
|
||
- alert: HostLoadAverageSustainedHigh
|
||
# 2026-05-05 ogt + Codex: 110/188 長時間過載基線。
|
||
# Why: CPU% 只看 busy time,無法充分表達 ClickHouse merge、Kafka 回補、Chrome/Ollama 造成的 runnable queue。
|
||
expr: node_load5{host=~"110|188"} / on(host) count by(host) (count by(host,cpu) (node_cpu_seconds_total{mode="idle",host=~"110|188"})) > 1.5
|
||
for: 15m
|
||
labels:
|
||
severity: critical
|
||
layer: systemd-188
|
||
team: ops
|
||
auto_repair: "true"
|
||
mcp_provider: "ssh_host"
|
||
host_type: "bare_metal"
|
||
alert_category: "host_resource"
|
||
annotations:
|
||
summary: "主機 {{ $labels.host }} load5/core 長時間過高"
|
||
description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。"
|
||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'"
|
||
runbook: "先判斷高 load 來源:ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter;只允許 read-only 診斷,自動修復需走服務專屬 playbook。"
|
||
|
||
- alert: HostOutOfMemory
|
||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
layer: systemd-188
|
||
team: ops
|
||
auto_repair: "true"
|
||
mcp_provider: "ssh_host"
|
||
host_type: "bare_metal"
|
||
alert_category: "host_resource"
|
||
annotations:
|
||
summary: "主機 {{ $labels.host }} 記憶體不足"
|
||
description: "記憶體使用率超過 85%"
|
||
|
||
# Warns when any non-tmpfs/overlay filesystem has been more than 85% full
# for 5 minutes.
- alert: HostOutOfDiskSpace
  expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    team: ops
    auto_repair: "true"
    mcp_provider: "ssh_host"
    host_type: "bare_metal"
    alert_category: "host_resource"
  annotations:
    summary: "主機 {{ $labels.host }} 磁碟空間不足"
    description: "磁碟使用率超過 85%"
    # Read-only diagnostics. The ssh target is built from the `host` label,
    # the same way the other host/docker actions in this file do it;
    # `instance` is the scrape target address and is not a reliable ssh
    # destination.
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"
|
||
|
||
# =========================================================================
|
||
# K8s 叢集告警 (kubernetes_alerts)
|
||
# =========================================================================
|
||
- name: kubernetes_alerts
|
||
rules:
|
||
- alert: K3sNodeNotReady
|
||
expr: kube_node_status_condition{condition="Ready", status="true"} == 0
|
||
for: 2m
|
||
labels:
|
||
severity: critical
|
||
layer: k8s
|
||
team: ops
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "K3s 節點 {{ $labels.node }} 未就緒"
|
||
description: "節點超過 2 分鐘未達到 Ready 狀態"
|
||
|
||
- alert: KubePodCrashLooping
|
||
expr: rate(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m]) > 0
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: ops
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 持續重啟"
|
||
description: "Pod 在過去 15 分鐘內重啟次數異常"
|
||
|
||
# Warns when a pod in awoooi-prod that is in the Running phase has reported
# Ready=false for 5 minutes.
- alert: KubePodNotReady
  # The join on kube_pod_status_phase restricts the alert to Running pods,
  # which is what the description promises; without it, any pod that reports
  # Ready=false (for example a completed one) would also match.
  expr: |
    kube_pod_status_ready{condition="true",namespace="awoooi-prod"} == 0
    and on(namespace, pod)
    kube_pod_status_phase{phase="Running",namespace="awoooi-prod"} == 1
  for: 5m
  labels:
    severity: warning
    layer: k8s
    team: ops
    auto_repair: "true"
  annotations:
    summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 未就緒"
    description: "Running 中的 Pod 超過 5 分鐘未達到 Ready 狀態"
|
||
|
||
- alert: KubeDeploymentReplicasMismatch
|
||
expr: kube_deployment_spec_replicas{namespace="awoooi-prod"} != kube_deployment_status_replicas_available{namespace="awoooi-prod"}
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: ops
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} 副本數不匹配"
|
||
description: "期望副本數與可用副本數不一致超過 10 分鐘"
|
||
|
||
- alert: VeleroBackupFailed
|
||
expr: increase(velero_backup_failure_total[24h]) > 0
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: ops
|
||
component: velero
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "Velero 備份失敗"
|
||
description: "過去 24 小時有備份失敗"
|
||
|
||
- alert: VeleroBackupNotRun
|
||
expr: max by(host, namespace) (awoooi_velero_latest_completed_backup_fresh{host="110",namespace="velero"}) == 0
|
||
for: 10m
|
||
labels:
|
||
severity: critical
|
||
layer: k8s
|
||
team: ops
|
||
component: velero
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "Velero 超過 24 小時未成功備份"
|
||
description: "backup health exporter 顯示 latest Completed Velero backup 超過 25 小時或不存在。"
|
||
|
||
# Sprint C-2 Host rsync 備份告警 (2026-04-11 Claude Sonnet 4.6)
|
||
# backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
|
||
# node-exporter textfile collector 讀取此檔案暴露指標
|
||
# Warns when the backup_110_last_success_timestamp metric is missing or is
# older than 90000 seconds (25 hours), sustained for 10 minutes.
- alert: HostBackupFailed
  # 2026-05-02 ogt + Claude Sonnet 4.6: removed the backup_110 collector-label check.
  # Root cause: node_textfile_scrape_error no longer carries a collector label,
  # so that condition was permanently absent()=true and the alert always fired.
  # Fix: decide only on whether backup_110_last_success_timestamp is missing or stale.
  expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000)
  for: 10m
  labels:
    severity: warning
    layer: docker-188
    team: ops
    component: backup
    host: "188"
    auto_repair: "true"
    alert_category: host_resource
  annotations:
    summary: "188 Host 備份超過 25 小時未成功"
    description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊"
    # The absent() branch produces a series with no `instance` label, so the
    # ssh target cannot be templated from it. This rule is pinned to host 188
    # (see the static `host` label), so the address is fixed.
    auto_repair_action: "ssh 192.168.0.188 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
|
||
|
||
# ADR-075: CoreDNS 解析失敗 (2026-04-12 ogt)
|
||
# Critical when more than 5% of CoreDNS responses over the last 5 minutes
# were SERVFAIL, sustained for 5 minutes.
- alert: CoreDNSResolutionFailed
  # Ratio of SERVFAIL responses to all responses. The summary renders $value
  # with humanizePercentage and the description states a 5% threshold, so the
  # expression has to be a fraction rather than a raw per-second rate.
  expr: |
    sum(rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]))
      / sum(rate(coredns_dns_responses_total[5m])) > 0.05
  for: 5m
  labels:
    severity: critical
    layer: k8s
    team: ops
    auto_repair: "true"
    alert_category: kubernetes
    notification_type: TYPE-3
  annotations:
    summary: "CoreDNS SERVFAIL 率過高 {{ $value | humanizePercentage }}"
    description: "CoreDNS 在 5 分鐘內 SERVFAIL 回應率超過 5%,K8s 服務間 DNS 解析可能失敗"
    runbook: "kubectl -n kube-system get pods -l k8s-app=kube-dns && kubectl -n kube-system logs -l k8s-app=kube-dns"
|
||
|
||
# =========================================================================
|
||
# 資料庫告警 (database_alerts)
|
||
# =========================================================================
|
||
- name: database_alerts
|
||
rules:
|
||
- alert: PostgreSQLDown
|
||
expr: up{job="postgres-exporter"} == 0 or pg_up == 0
|
||
for: 1m
|
||
labels:
|
||
severity: critical
|
||
layer: systemd-188
|
||
component: postgres
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "PostgreSQL 資料庫離線"
|
||
description: "PostgreSQL Exporter 無法連接資料庫超過 1 分鐘"
|
||
|
||
- alert: RedisDown
|
||
expr: up{job="redis-exporter"} == 0 or redis_up == 0
|
||
for: 1m
|
||
labels:
|
||
severity: critical
|
||
layer: systemd-188
|
||
component: redis
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "Redis 快取服務離線"
|
||
description: "Redis Exporter 無法連接 Redis 超過 1 分鐘"
|
||
|
||
# Warns when the total number of PostgreSQL connections on an exporter
# instance has stayed above 80 for 5 minutes.
- alert: PostgreSQLHighConnections
  # Summed per instance so the threshold applies to the total connection
  # count that the description reports, not to each individual series.
  # NOTE(review): assumes pg_stat_activity_count is split by labels such as
  # datname/state, as in the stock postgres_exporter; if it is a single
  # series the sum changes nothing.
  expr: sum by (instance) (pg_stat_activity_count) > 80
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    component: postgres
    team: ops
    auto_repair: "false"
  annotations:
    summary: "PostgreSQL 連接數過高"
    description: "當前連接數 {{ $value }} 超過 80"
|
||
|
||
- alert: RedisMemoryHigh
|
||
expr: redis_memory_max_bytes > 0 and redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
layer: systemd-188
|
||
component: redis
|
||
team: ops
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "Redis 記憶體使用過高"
|
||
description: "Redis 記憶體使用率超過 80%"
|
||
|
||
# =========================================================================
|
||
# Sprint 5.2 Plan B: 資料庫詳細指標告警 (database_detail_alerts)
|
||
# 前置: postgres-exporter:9187 + redis-exporter:9121 on 192.168.0.188
|
||
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
|
||
# =========================================================================
|
||
- name: database_detail_alerts
|
||
rules:
|
||
# ---- PostgreSQL 詳細指標 ----
|
||
- alert: PostgreSQLSlowQueries
|
||
expr: pg_stat_activity_max_tx_duration{datname="awoooi_prod"} > 60
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
layer: systemd-188
|
||
component: postgres
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "PostgreSQL 有慢查詢 (>60s)"
|
||
description: "awoooi_prod 資料庫最長事務超過 60 秒"
|
||
|
||
- alert: PostgreSQLDeadlocks
|
||
expr: increase(pg_stat_database_deadlocks{datname="awoooi_prod"}[5m]) > 0
|
||
for: 1m
|
||
labels:
|
||
severity: warning
|
||
layer: systemd-188
|
||
component: postgres
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "PostgreSQL 死鎖發生"
|
||
description: "過去 5 分鐘 awoooi_prod 資料庫有死鎖"
|
||
|
||
# Warns when the awoooi_prod database has had more than 50 connections in
# total for 5 minutes.
- alert: PostgreSQLTooManyConnections
  # Summed per instance and database so the threshold applies to the
  # database's total connection count rather than to each series.
  # NOTE(review): assumes pg_stat_activity_count carries extra labels such as
  # state; if it is one series per database the sum changes nothing.
  expr: sum by (instance, datname) (pg_stat_activity_count{datname="awoooi_prod"}) > 50
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    component: postgres
    host: "188"
    team: ops
    auto_repair: "false"
  annotations:
    summary: "PostgreSQL 連接數過高 ({{ $value }})"
    description: "awoooi_prod 連接數超過 50"
|
||
|
||
# ---- Redis 詳細指標 ----
|
||
- alert: RedisKeyEviction
|
||
expr: increase(redis_evicted_keys_total[5m]) > 0
|
||
for: 1m
|
||
labels:
|
||
severity: warning
|
||
layer: systemd-188
|
||
component: redis
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "Redis 發生 Key 驅逐"
|
||
description: "過去 5 分鐘有 Key 被驅逐,可能記憶體不足"
|
||
|
||
- alert: RedisConnectionsHigh
|
||
expr: redis_connected_clients > 100
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
layer: systemd-188
|
||
component: redis
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "Redis 連接數過高 ({{ $value }})"
|
||
description: "Redis 連接數超過 100"
|
||
|
||
# Warns when the average Redis command latency over the last 5 minutes has
# stayed above 10 ms for 5 minutes.
- alert: RedisCommandLatencyHigh
  # Seconds spent per command over the 5-minute window. rate() measures
  # recent latency; dividing the raw counters would give a lifetime average
  # that barely moves. Both sides are aggregated to `instance` so their label
  # sets match for the division.
  # NOTE(review): assumes the duration counter carries a per-command label
  # that the processed counter lacks, as in the stock redis_exporter.
  expr: |
    sum by (instance) (rate(redis_commands_duration_seconds_total[5m]))
      / sum by (instance) (rate(redis_commands_processed_total[5m])) > 0.01
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    component: redis
    host: "188"
    team: ops
    auto_repair: "false"
  annotations:
    summary: "Redis 命令平均延遲過高 (>10ms)"
    description: "Redis 命令平均延遲超過 10ms"
|
||
|
||
# =========================================================================
|
||
# 服務可用性告警 (service_alerts) — 含 layer 標籤決定修復路徑
|
||
# =========================================================================
|
||
- name: service_alerts
|
||
rules:
|
||
# ---- 188 Docker 層 ----
|
||
- alert: OpenClawDown
|
||
# 2026-04-05 Claude Code: 修正舊命名 ClawBotDown → OpenClawDown
|
||
expr: up{job="clawbot"} == 0
|
||
for: 2m
|
||
labels:
|
||
severity: critical
|
||
layer: docker-188
|
||
component: openclaw
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "OpenClaw 服務離線"
|
||
description: "OpenClaw (192.168.0.188:8088) 已離線超過 2 分鐘"
|
||
|
||
- alert: SignOzDown
|
||
expr: probe_success{job="blackbox-http", instance=~".*3301.*"} == 0
|
||
for: 2m
|
||
labels:
|
||
severity: warning
|
||
layer: docker-188
|
||
component: signoz
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "true"
|
||
mcp_provider: "ssh_host"
|
||
target_host: "192.168.0.188"
|
||
alert_category: "devops_tool"
|
||
annotations:
|
||
summary: "SignOz 服務離線"
|
||
description: "SignOz (192.168.0.188:3301) 已離線超過 2 分鐘"
|
||
|
||
# ---- 110 Docker 層 ----
|
||
- alert: SentryDown
|
||
expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9000"} == 0
|
||
for: 2m
|
||
labels:
|
||
severity: warning
|
||
layer: docker-110
|
||
component: sentry
|
||
host: "110"
|
||
team: ops
|
||
auto_repair: "true"
|
||
mcp_provider: "ssh_host"
|
||
target_host: "192.168.0.110"
|
||
alert_category: "devops_tool"
|
||
annotations:
|
||
summary: "Sentry 服務離線"
|
||
description: "Sentry (192.168.0.110:9000) 已離線超過 2 分鐘"
|
||
|
||
- alert: HarborDown
|
||
expr: probe_success{job="blackbox-http", instance=~".*5000.*"} == 0
|
||
for: 2m
|
||
labels:
|
||
severity: critical
|
||
layer: docker-110
|
||
component: harbor
|
||
host: "110"
|
||
team: ops
|
||
auto_repair: "true"
|
||
mcp_provider: "ssh_host"
|
||
target_host: "192.168.0.110"
|
||
alert_category: "devops_tool"
|
||
annotations:
|
||
summary: "Harbor Registry 離線"
|
||
description: "Harbor (192.168.0.110:5000) 已離線超過 2 分鐘,CD pipeline 將無法拉取映像"
|
||
|
||
- alert: GiteaDown
|
||
expr: probe_success{job="blackbox-http", instance="http://192.168.0.110:3001"} == 0
|
||
for: 2m
|
||
labels:
|
||
severity: critical
|
||
layer: docker-110
|
||
component: gitea
|
||
host: "110"
|
||
team: ops
|
||
auto_repair: "true"
|
||
mcp_provider: "ssh_host"
|
||
target_host: "192.168.0.110"
|
||
alert_category: "devops_tool"
|
||
annotations:
|
||
summary: "Gitea Git 服務離線"
|
||
description: "Gitea (192.168.0.110:3001) 已離線超過 2 分鐘,CD pipeline 失效"
|
||
|
||
- alert: AlertmanagerDown
|
||
expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9093"} == 0
|
||
for: 2m
|
||
labels:
|
||
severity: critical
|
||
layer: docker-110
|
||
component: alertmanager
|
||
host: "110"
|
||
team: ops
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "Alertmanager 離線"
|
||
description: "Alertmanager (192.168.0.110:9093) 已離線,所有告警將靜默"
|
||
|
||
# =========================================================================
|
||
# 告警鏈路監控 (alert_chain) — 防止 2026-03-26/04-05 事故重演
|
||
# =========================================================================
|
||
- name: alert_chain
|
||
rules:
|
||
- alert: AlertChainBroken_Alertmanager
|
||
expr: |
|
||
sum(rate(awoooi_webhook_requests_total{source="alertmanager",status!="success"}[5m]))
|
||
/ sum(rate(awoooi_webhook_requests_total{source="alertmanager"}[5m])) > 0.1
|
||
for: 10m
|
||
labels:
|
||
severity: critical
|
||
layer: k8s
|
||
team: platform
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "Alertmanager Webhook 錯誤率 > 10%"
|
||
description: "告警鏈路可能斷裂,請執行 E2E 驗證"
|
||
|
||
- alert: AlertChainBroken_Sentry
|
||
expr: |
|
||
sum(rate(awoooi_webhook_requests_total{source="sentry",status!="success"}[5m]))
|
||
/ sum(rate(awoooi_webhook_requests_total{source="sentry"}[5m])) > 0.1
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: platform
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "Sentry Webhook 錯誤率 > 10%"
|
||
description: "Sentry 錯誤可能無法正確處理"
|
||
|
||
- alert: NoAlertsReceived2Hours
|
||
expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: platform
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "2 小時內未收到任何告警 ({{ $labels.source }})"
|
||
description: "可能是告警鏈路問題,請執行 Smoke Test"
|
||
|
||
- alert: AlertChainUnhealthy
|
||
expr: awoooi_alert_chain_healthy == 0
|
||
for: 5m
|
||
labels:
|
||
severity: critical
|
||
layer: k8s
|
||
team: platform
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "告警鏈路不健康 ({{ $labels.source }})"
|
||
description: "告警鏈路標記為不健康,最近處理失敗"
|
||
|
||
# =========================================================================
|
||
# 自動修復監控 (auto_repair)
|
||
# =========================================================================
|
||
- name: auto_repair
|
||
rules:
|
||
- alert: AutoRepairLowSuccessRate
|
||
expr: awoooi_auto_repair_success_rate < 0.3
|
||
for: 30m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: backend
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "自動修復成功率過低 ({{ $value | humanizePercentage }})"
|
||
description: "動作 {{ $labels.action }} 的成功率低於 30%,建議檢查 Playbook"
|
||
|
||
- alert: PermanentFixRequired
|
||
expr: sum(rate(awoooi_anomaly_escalation_total{level="PERMANENT_FIX"}[1h])) > 0
|
||
for: 1m
|
||
labels:
|
||
severity: critical
|
||
layer: k8s
|
||
team: backend
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "需要永久修復的異常升級"
|
||
description: "有異常升級到 PERMANENT_FIX 級別,需要根本修復"
|
||
|
||
# =========================================================================
|
||
# Sprint 5.1: Docker 容器健康監控(docker-health-monitor 感知層接入)
|
||
# 由 docker-health-monitor.sh 送 Alertmanager 格式 webhook,
|
||
# 或 Prometheus 自訂 exporter 上報時使用。
|
||
# auto_repair: "true" 代表允許 AWOOOI Guardrail 決策(非直接修復)
|
||
# 實際修復動作由 Service Registry 分級決定(ADR-062)
|
||
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
|
||
# =========================================================================
|
||
- name: docker_health_alerts
|
||
rules:
|
||
- alert: DockerContainerUnhealthy
|
||
expr: container_health_status{job="docker-health-monitor"} == 0
|
||
for: 2m
|
||
labels:
|
||
severity: warning
|
||
layer: docker
|
||
team: ops
|
||
auto_repair: "true"
|
||
mcp_provider: "ssh_host"
|
||
alert_category: "devops_tool"
|
||
annotations:
|
||
summary: "容器 {{ $labels.container }} 健康檢查失敗"
|
||
description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 健康狀態異常,持續 2 分鐘"
|
||
|
||
- alert: DockerContainerExited
|
||
expr: container_running_status{job="docker-health-monitor"} == 0
|
||
for: 1m
|
||
labels:
|
||
severity: critical
|
||
layer: docker
|
||
team: ops
|
||
auto_repair: "true"
|
||
mcp_provider: "ssh_host"
|
||
alert_category: "devops_tool"
|
||
annotations:
|
||
summary: "容器 {{ $labels.container }} 已停止"
|
||
description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead,持續 1 分鐘"
|
||
|
||
- alert: DockerContainerCpuSustainedHigh
|
||
# 2026-05-05 ogt + Codex: Docker Compose 長期過載基線。
|
||
# Baseline: 單容器 >2 core 10m 為 warning;用於提早抓 cadvisor、ClickHouse、momo-scheduler、Ollama runner 類問題。
|
||
expr: docker_container_cpu_cores > 2
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: docker
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core"
|
||
description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘,需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。"
|
||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
|
||
runbook: "自動階段只做診斷;若是已知服務,交給 service-specific playbook:ClickHouse 降 merge/查 backlog、scheduler 限 concurrency、litellm/blackbox 查 liveliness。"
|
||
|
||
- alert: DockerContainerCpuRunawayCritical
|
||
expr: docker_container_cpu_cores > 4
|
||
for: 15m
|
||
labels:
|
||
severity: critical
|
||
layer: docker
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core"
|
||
description: "{{ $labels.container_name }} 已持續吃超過 4 core,會拖垮 110/188 主機;需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。"
|
||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== CONTAINER ===\"; docker stats --no-stream {{ $labels.container_name }}; echo \"=== TOP ===\"; docker top {{ $labels.container_name }} -eo pid,ppid,stat,pcpu,pmem,comm,args | head -30'"
|
||
runbook: "禁止通用 docker restart;先抓根因,只有 health down 或 crash loop 才可走重啟。"
|
||
|
||
- alert: DockerContainerMemoryLimitPressure
|
||
# 2026-05-05 ogt + Codex: 防止亂設 memory limit 把 Kafka/Gitea/Taskworker 類服務卡死。
|
||
expr: docker_container_memory_limit_bytes > 0 and docker_container_memory_usage_bytes / docker_container_memory_limit_bytes > 0.85
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: docker
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%"
|
||
description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker,需先判斷 workload,不可直接降 limit。"
|
||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
|
||
runbook: "若服務已接近 limit:優先調整 retention/concurrency/cache,再評估提高 memory;禁止用更低 memory limit 當止血。"
|
||
|
||
- alert: DockerContainerRestartSpike
|
||
# 2026-05-05 ogt + Codex: cAdvisor v0.47 無 restart metric,吃 node-exporter textfile docker_container_restart_count。
|
||
expr: increase(docker_container_restart_count[15m]) > 5 or increase(docker_container_inspect_restart_count[15m]) > 5
|
||
for: 3m
|
||
labels:
|
||
severity: critical
|
||
layer: docker
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次"
|
||
description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增,避免再次出現 litellm 24,464 次靜默崩潰。"
|
||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
|
||
runbook: "先抓 crash signature;若是 config/DB/網路問題,修設定,不用無限 restart。"
|
||
|
||
# Warns when one of the listed Compose services has reported a CPU limit or a
# memory limit of 0 for 30 minutes.
- alert: DockerContainerMissingResourceLimit
  # 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
  expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0)
  for: 30m
  labels:
    severity: warning
    layer: docker
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "true"
  annotations:
    summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit"
    description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail,長時間尖峰可能拖垮 110/188。"
    # Annotations are expanded by Prometheus' Go template engine first, so the
    # `docker inspect --format` string is wrapped in a raw template string
    # (backticks). Prometheus then emits the {{.HostConfig...}} placeholders
    # literally instead of trying, and failing, to evaluate them itself.
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker inspect {{ $labels.container_name }} --format \"{{ `NanoCpus={{.HostConfig.NanoCpus}} Memory={{.HostConfig.Memory}}` }}\"; docker stats --no-stream {{ $labels.container_name }}'"
    runbook: "先盤點 workload,再在 compose/service-specific playbook 補 cpus/mem_limit。不可盲目降 ClickHouse/Kafka;momo app/scheduler 可用 2 core/2GiB 起步。"
|
||
|
||
- alert: DockerGiteaActionsJobStale
|
||
# 2026-05-05 ogt + Codex: Gitea/act job containers can outlive workflow timeout and keep 110 hot.
|
||
expr: time() - docker_container_started_seconds{container_name=~"GITEA-ACTIONS-.*"} > 1200
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
layer: docker-110
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
|
||
description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
|
||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'"
|
||
runbook: "先執行 dry-run;清理腳本會依 workflow/job 名稱套停止門檻。若 logs 空白且超過該 job timeout buffer,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
|
||
|
||
- alert: SystemdRunnerRestartSpike
|
||
# 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage.
|
||
expr: increase(systemd_unit_restarts_total{unit=~"actions\\.runner\\..*"}[15m]) > 2
|
||
for: 3m
|
||
labels:
|
||
severity: critical
|
||
layer: systemd-110
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次"
|
||
description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增;110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。"
|
||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p NRestarts -p DropInPaths -p CPUQuotaPerSecUSec -p MemoryMax; journalctl -u {{ $labels.unit }} --since \"20 minutes ago\" --no-pager | tail -120'"
|
||
runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設,需人工或 sudo playbook 執行 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"
|
||
|
||
- alert: SystemdRunnerWatchdogEnabled
|
||
expr: systemd_unit_watchdog_seconds{unit=~"actions\\.runner\\..*"} > 0
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: systemd-110
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec"
|
||
description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。"
|
||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p DropInPaths -p NRestarts'"
|
||
runbook: "確認 drop-in 來源;需要 sudo 時由人工套用 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"
|
||
|
||
- alert: SystemdRunnerMissingResourceQuota
|
||
expr: systemd_unit_cpu_quota_cores{unit=~"actions\\.runner\\..*"} == 0 or systemd_unit_memory_max_bytes{unit=~"actions\\.runner\\..*"} == 0
|
||
for: 30m
|
||
labels:
|
||
severity: warning
|
||
layer: systemd-110
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota"
|
||
description: "{{ $labels.unit }} 仍為 unlimited;CI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。"
|
||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState'"
|
||
runbook: "建議 baseline:每個 runner CPUQuota=200%、MemoryMax=2G;由 /home/wooo/scripts/apply-runner-systemd-guardrails.sh 套用,若仍過載再限制並行度或分流。"
|
||
|
||
# =========================================================================
|
||
# MinIO / Kali 告警
|
||
# =========================================================================
|
||
- name: minio_kali_alerts
|
||
rules:
|
||
- alert: MinIODown
|
||
expr: probe_success{job="blackbox-http", instance=~".*9001.*"} == 0
|
||
for: 2m
|
||
labels:
|
||
severity: warning
|
||
layer: docker-188
|
||
component: minio
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "MinIO (Velero 備份) 離線"
|
||
description: "MinIO (192.168.0.188:9001) 已離線超過 2 分鐘,Velero 備份可能失敗"
|
||
|
||
- alert: KaliScannerDown
|
||
expr: probe_success{job="blackbox-http", instance=~".*192.168.0.112.*"} == 0
|
||
for: 5m
|
||
labels:
|
||
severity: info
|
||
layer: docker-188
|
||
component: kali
|
||
host: "112"
|
||
team: ops
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "Kali Scanner 離線"
|
||
description: "Kali (192.168.0.112:8080) 離線,安全掃描功能暫停"
|
||
|
||
# =========================================================================
|
||
# Plan C — 外部網站監控 (Sprint 5.2, 2026-04-09 Claude Sonnet 4.6 Asia/Taipei)
|
||
# blackbox-http 已涵蓋 4 個外部網站,此群組提供結構化告警
|
||
# auto_repair: "true" — 由 AWOOOI Guardrail 決策(Service Registry 分級)
|
||
# =========================================================================
|
||
- name: external_website_alerts
|
||
rules:
|
||
- alert: MoWoooWorkDown
|
||
expr: probe_success{job="blackbox-http", instance="https://mo.wooo.work"} == 0
|
||
for: 3m
|
||
labels:
|
||
severity: critical
|
||
layer: external
|
||
component: momo-app
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "外部網站 mo.wooo.work 離線"
|
||
description: "mo.wooo.work 探測失敗超過 3 分鐘,容器 momo-app (188) 可能需要重啟"
|
||
|
||
- alert: TsenyangWebsiteDown
|
||
expr: probe_success{job="blackbox-http", instance="https://www.tsenyang.com"} == 0
|
||
for: 3m
|
||
labels:
|
||
severity: critical
|
||
layer: external
|
||
component: tsenyang-website
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "外部網站 tsenyang.com 離線"
|
||
description: "tsenyang.com 探測失敗超過 3 分鐘,容器 tsenyang-website (188) 可能需要重啟"
|
||
|
||
- alert: StockWoooWorkDown
|
||
expr: probe_success{job="blackbox-http", instance="http://stock.wooo.work"} == 0
|
||
for: 3m
|
||
labels:
|
||
severity: critical
|
||
layer: external
|
||
component: stock-platform
|
||
host: "110"
|
||
team: ops
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "外部網站 stock.wooo.work 離線"
|
||
description: "stock.wooo.work 探測失敗超過 3 分鐘,容器 stock-platform (110) 可能需要重啟"
|
||
|
||
- alert: BitanWoooWorkDown
|
||
expr: probe_success{job="blackbox-http", instance="https://bitan.wooo.work"} == 0
|
||
for: 3m
|
||
labels:
|
||
severity: critical
|
||
layer: external
|
||
component: bitan-app
|
||
host: "188"
|
||
team: ops
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "外部網站 bitan.wooo.work 離線"
|
||
description: "bitan.wooo.work 探測失敗超過 3 分鐘,容器 bitan-app (188) 可能需要重啟"
|
||
|
||
- alert: ExternalSiteSSLExpiringSoon
|
||
expr: probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time() < 14 * 24 * 3600
|
||
for: 1h
|
||
labels:
|
||
severity: warning
|
||
layer: external
|
||
component: ssl
|
||
team: ops
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "SSL 憑證即將到期: {{ $labels.instance }}"
|
||
description: "{{ $labels.instance }} SSL 憑證將在 14 天內到期,請手動更新"
|
||
|
||
# =============================================================================
|
||
# ADR-075 新增規則群組 (2026-04-12 ogt)
|
||
# =============================================================================
|
||
|
||
- name: awoooi_secops_alerts
|
||
interval: 60s
|
||
rules:
|
||
- alert: UnauthorizedSSHLogin
|
||
expr: increase(node_failed_auth_attempts_total[5m]) > 10
|
||
for: 1m
|
||
labels:
|
||
severity: critical
|
||
layer: systemd-188
|
||
team: security
|
||
auto_repair: "false"
|
||
alert_category: secops
|
||
annotations:
|
||
summary: "異常 SSH 登入嘗試: {{ $labels.instance }}"
|
||
description: "5 分鐘內失敗登入 {{ $value }} 次,可能遭受暴力破解"
|
||
|
||
- name: awoooi_business_alerts
|
||
interval: 60s
|
||
rules:
|
||
- alert: AITokenCostSpike
|
||
expr: increase(awoooi_ai_token_cost_usd_total[1h]) > 10
|
||
for: 5m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: finops
|
||
auto_repair: "false"
|
||
alert_category: business
|
||
annotations:
|
||
summary: "AI Token 費用 1 小時內暴增 ${{ $value | humanize }}"
|
||
description: "AI API 調用費用異常,請檢查是否有迴圈或濫用"
|
||
- alert: GeminiAPIErrorRateHigh
|
||
expr: rate(awoooi_ai_request_errors_total{provider="gemini"}[5m]) / rate(awoooi_ai_requests_total{provider="gemini"}[5m]) > 0.2
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: finops
|
||
auto_repair: "false"
|
||
alert_category: business
|
||
annotations:
|
||
summary: "Gemini API 錯誤率過高: {{ $value | humanizePercentage }}"
|
||
description: "Gemini API 5 分鐘錯誤率超過 20%,AI 降級可能失效"
|
||
# ADR-075: 業務爬蟲健康 (2026-04-12 ogt)
|
||
- alert: MomoScraperSuccessLow
  # Fires when the 5-minute success ratio of the Momo scraper stays below 90%
  # for 10 minutes.
  #
  # Both operands are aggregated `without(status)`: PromQL binary operators
  # match series on their full label set, so dividing the raw
  # status="success" vector by the unfiltered vector would pair the success
  # series only with itself and always produce 1.
  #
  # The `or ... * 0` arm substitutes 0 for label sets that have traffic but no
  # status="success" series yet, so a scraper that fails on every request is
  # still evaluated (0 / total) instead of silently dropping out.
  # When there is no traffic at all the ratio is 0/0 = NaN, which does not
  # satisfy `< 0.9`, so an idle scraper does not alert.
  expr: |
    (
      sum without(status) (rate(momo_scraper_requests_total{status="success"}[5m]))
      or
      sum without(status) (rate(momo_scraper_requests_total[5m])) * 0
    )
    / sum without(status) (rate(momo_scraper_requests_total[5m])) < 0.9
  for: 10m
  labels:
    severity: warning
    layer: docker-110
    auto_repair: "false"
    alert_category: business
    notification_type: TYPE-3
  annotations:
    summary: "Momo 抓取成功率跌至 {{ $value | humanizePercentage }}"
    description: "Momo 爬蟲成功率低於 90%,業務資料可能缺失"
|
||
|
||
- name: awoooi_flywheel_meta_alerts
|
||
interval: 60s
|
||
rules:
|
||
- alert: FlywheelPlaybookZero
|
||
expr: awoooi_flywheel_playbook_count == 0
|
||
for: 1h
|
||
labels:
|
||
severity: critical
|
||
layer: k8s
|
||
team: aiops
|
||
auto_repair: "false"
|
||
alert_category: flywheel_health
|
||
notification_type: TYPE-8M
|
||
annotations:
|
||
summary: "飛輪 Playbook 數量為零,AI 修復完全依賴 LLM"
|
||
description: "Redis 中無任何已批准 Playbook,自動修復能力大幅降低"
|
||
runbook: "執行 scripts/cold_start_playbooks.py 冷啟動"
|
||
# 2026-05-03 ogt + Claude Opus 4.7(亞太)— anti-silencing 補配對告警
|
||
# NaN sentinel 不會被 < 0.1 誤觸;下方 FlywheelExecutionRateMissing 補「無資料」獨立告警
|
||
- alert: FlywheelExecutionSuccessLow
|
||
expr: awoooi_flywheel_execution_success_rate < 0.1
|
||
for: 2h
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: aiops
|
||
auto_repair: "false"
|
||
alert_category: flywheel_health
|
||
notification_type: TYPE-8M
|
||
annotations:
|
||
summary: "飛輪執行成功率 {{ $value | humanizePercentage }} 低於 10%"
|
||
description: "連續 2 小時執行成功率不足 10%,Playbook 可能已過時"
|
||
runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態"
|
||
- alert: FlywheelExecutionRateMissing
|
||
expr: absent(awoooi_flywheel_execution_success_rate) or (awoooi_flywheel_execution_success_rate != awoooi_flywheel_execution_success_rate)
|
||
for: 30m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: aiops
|
||
auto_repair: "false"
|
||
alert_category: flywheel_health
|
||
notification_type: TYPE-8M
|
||
annotations:
|
||
summary: "飛輪執行率指標 30 分鐘無資料"
|
||
description: "execution_success_rate 連續 30 分鐘為 NaN 或不存在,Redis playbook 統計斷流(資料管線壞 / Redis flush / FlywheelStatsService 異常)"
|
||
runbook: "1) 檢查 Redis playbook:* keys 2) 檢查 FlywheelStatsService 日誌 3) curl /metrics 直接拉看 NaN 來源"
|
||
- alert: FlywheelKMVectorizationLow
|
||
expr: awoooi_flywheel_km_unvectorized_count > 10
|
||
for: 30m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: aiops
|
||
auto_repair: "false"
|
||
alert_category: flywheel_health
|
||
notification_type: TYPE-8M
|
||
annotations:
|
||
summary: "{{ $value }} 筆 KM 未向量化,RAG 查詢命中率下降"
|
||
description: "knowledge_entries 中 embedding IS NULL 超過 10 筆且持續 30 分鐘"
|
||
runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態"
|
||
- alert: FlywheelAlertnameNullHigh
|
||
expr: awoooi_flywheel_alertname_null_rate > 0.05
|
||
for: 30m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: aiops
|
||
auto_repair: "false"
|
||
alert_category: flywheel_health
|
||
notification_type: TYPE-8M
|
||
annotations:
|
||
summary: "飛輪 alertname NULL 率超過 5%"
|
||
description: "alertname NULL 率 {{ $value | humanizePercentage }},影響路由準確性。"
|
||
runbook: "執行 scripts/backfill_alertname.py 回填"
|
||
- alert: FlywheelIncidentsStuck
|
||
expr: awoooi_flywheel_incidents_stuck > 5
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: k8s
|
||
team: aiops
|
||
auto_repair: "false"
|
||
alert_category: flywheel_health
|
||
notification_type: TYPE-8M
|
||
annotations:
|
||
summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24h"
|
||
description: "飛輪推理匹配節點可能堵塞,需人工清理或重新觸發診斷"
|
||
|
||
# =========================================================================
|
||
# 備份還原告警 (awoooi_backup_restore) — 從主機補回 2026-04-12
|
||
# =========================================================================
|
||
- name: awoooi_backup_restore
|
||
interval: 1m
|
||
rules:
|
||
- alert: BackupRestoreTestFailed
|
||
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_failed_jobs{host="110",namespace="velero",cronjob="backup-restore-test"}) > 0
|
||
for: 5m
|
||
labels:
|
||
severity: critical
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "備份還原 dry-run 測試失敗"
|
||
description: "velero namespace 中保留了失敗的 backup-restore-test Job,備份可能無法還原。立即人工驗證備份狀態。"
|
||
runbook: "先找最新 Completed Velero backup,再執行 restore dry-run;禁止在 production namespace 做真還原"
|
||
|
||
- alert: BackupRestoreTestMissing
|
||
expr: absent(awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"})
|
||
for: 30m
|
||
labels:
|
||
severity: warning
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "備份還原 dry-run 監控指標缺失"
|
||
description: "Prometheus 沒有收到 awoooi_velero_restore_test_cron_present;110 backup health exporter 或 120 kubectl 查詢可能失效。"
|
||
runbook: "檢查 110 backup_health.prom、SSH 110→120、以及 velero namespace 的 backup-restore-test CronJob"
|
||
|
||
- alert: BackupRestoreTestCronMissing
|
||
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
|
||
for: 15m
|
||
labels:
|
||
severity: critical
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "備份還原 dry-run CronJob 缺失"
|
||
description: "velero namespace 找不到 backup-restore-test CronJob;備份可還原性沒有定期驗證。"
|
||
runbook: "kubectl apply k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml 與 16-cronjob-backup-restore-test.yaml"
|
||
|
||
- alert: BackupRestoreTestStale
|
||
expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_last_success_fresh{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "備份還原測試超過 8 天未執行"
|
||
description: "backup-restore-test CronJob 沒有 8 天內成功紀錄;週排程 CronJob 可能失效。"
|
||
runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
|
||
|
||
# =========================================================================
|
||
# Host / service / config backup health
|
||
# =========================================================================
|
||
- name: full_stack_backup_health_alerts
|
||
interval: 1m
|
||
rules:
|
||
- alert: BackupHealthMonitorMissing110
|
||
expr: absent(awoooi_backup_health_monitor_up{host="110"})
|
||
for: 20m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-health-monitor
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 備份健康指標缺失"
|
||
description: "110 沒有輸出 backup_health.prom,無法確認資料庫、設定檔與服務備份是否新鮮。"
|
||
runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
|
||
|
||
- alert: BackupHealthMonitorMissing188
|
||
expr: absent(awoooi_backup_health_monitor_up{host="188"})
|
||
for: 20m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-health-monitor
|
||
host: "188"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "188 備份健康指標缺失"
|
||
description: "188 沒有輸出 backup_health.prom,無法確認 110 rsync 與 momo PostgreSQL 備份是否新鮮。"
|
||
runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
|
||
|
||
- alert: BackupHealthMonitorStale
|
||
expr: time() - awoooi_backup_health_last_run_timestamp{host=~"110|188"} > 1800
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-health-monitor
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "{{ $labels.host }} 備份健康 exporter 超過 30 分鐘未更新"
|
||
description: "backup health textfile exporter stale,備份狀態不可觀測。"
|
||
runbook: "SSH 主機檢查 cron、/tmp/awoooi-backup-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
|
||
|
||
- alert: BackupExpectedJobMissing
|
||
expr: awoooi_backup_job_configured{host=~"110|188"} == 0
|
||
for: 15m
|
||
labels:
|
||
severity: critical
|
||
layer: host-backup
|
||
component: backup-cron
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "{{ $labels.host }} 備份排程缺失:{{ $labels.exported_job }}"
|
||
description: "預期備份 cron/config 不存在;下一次重開機後資料可能沒有可用還原點。"
|
||
runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的備份章節補回 cron,先 dry-run 再執行"
|
||
|
||
- alert: BackupScheduleDuplicateActiveEntries
|
||
expr: awoooi_backup_cron_active_duplicate_count{host="110"} > 0
|
||
for: 15m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-cron
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 備份 crontab 有重複 active entries"
|
||
description: "110 crontab 目前有 {{ $value }} 個 exact duplicate active entry;可能造成 offsite sync、verifier 或 status job 重複執行。"
|
||
runbook: "SSH 110 執行 `crontab -l | awk 'NF && $0 !~ /^#/ {count[$0]++} END {for (line in count) if (count[line] > 1) print count[line], line}'`,只移除重複 active entry,不要刪除未理解的備份排程。"
|
||
|
||
- alert: BackupScheduleSingletonMismatch
  # Fires when the exporter reports that a backup cron entry on host 110 is
  # not present exactly once (the `_ok` gauge is 0) for 15 minutes.
  # The expression keeps only samples equal to 0, so `$value` is always 0 here
  # and must not be presented as the number of cron entries; the annotation
  # therefore names the entry via `$labels.entry` without quoting a count.
  expr: awoooi_backup_cron_singular_entry_ok{host="110"} == 0
  for: 15m
  labels:
    severity: warning
    layer: host-backup
    component: backup-cron
    host: "110"
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "false"
  annotations:
    summary: "110 備份排程單一入口異常:{{ $labels.entry }}"
    description: "{{ $labels.entry }} 應該剛好只有一個 active cron entry;目前 active entry 數量不是 1(缺失或重複),可能造成排程缺失或重複執行。"
    runbook: "用 Ansible `110-devops.yml --tags backup_jobs` 收斂排程,並用 `scripts/reboot-recovery/reboot-recovery-readiness-audit.sh --live --no-color` 驗證。"
|
||
|
||
- alert: BackupScriptMissing
|
||
expr: awoooi_backup_script_present{host=~"110|188"} == 0
|
||
for: 15m
|
||
labels:
|
||
severity: critical
|
||
layer: host-backup
|
||
component: backup-script
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "{{ $labels.host }} 備份腳本缺失:{{ $labels.script }}"
|
||
description: "備份排程可能存在,但實際腳本不存在或路徑漂移。"
|
||
runbook: "從 repo 部署對應 scripts/backup 或 scripts/ops 腳本,確認權限 0755"
|
||
|
||
- alert: BackupJobStale
|
||
expr: awoooi_backup_job_fresh{host=~"110|188"} == 0
|
||
for: 15m
|
||
labels:
|
||
severity: critical
|
||
layer: host-backup
|
||
component: backup-freshness
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "{{ $labels.host }} 備份過舊:{{ $labels.exported_job }}"
|
||
description: "{{ $labels.exported_job }} 最新成功證據超過 {{ $labels.max_age_hours }} 小時或不存在;來源 {{ $labels.source }},目標 {{ $labels.target }}。"
|
||
runbook: "先檢查備份 log 與磁碟空間,再手動執行對應備份;禁止直接刪除舊備份或 production 資料"
|
||
|
||
- alert: BackupAggregateRunFailed
|
||
expr: awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"} > 0
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-all
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 全服務備份最近一次有 {{ $value }} 個失敗項目"
|
||
description: "backup-all.sh 最近一次 aggregate run 仍有失敗;即使個別 DB 備份已手動補跑,也要重跑 aggregate backup 清除紅燈。"
|
||
runbook: "SSH 110 檢查 /backup/logs/cron.log 與 /backup/logs/backup.log,修正後執行 /backup/scripts/backup-all.sh"
|
||
|
||
- alert: BackupConfigCapturePartial
|
||
expr: awoooi_backup_config_capture_ok{host="110",critical="true"} == 0
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-config-capture
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 設定檔備份缺少關鍵目標:{{ $labels.target }}"
|
||
description: "configs restic snapshot 雖可能存在,但最新設定檔備份未成功捕捉 {{ $labels.target }};source={{ $labels.source }}。這會影響下一次冷啟動或災難還原的設定還原完整性。"
|
||
runbook: "先修復對應主機或 K8s API 可達性,再執行 /backup/scripts/backup-configs.sh,確認 awoooi_backup_config_capture_ok 回到 1,最後補跑 Google Drive/rclone offsite sync。"
|
||
|
||
- alert: BackupConfigCaptureStatusStale
|
||
expr: absent(awoooi_backup_config_capture_status_timestamp{host="110"}) or (time() - awoooi_backup_config_capture_status_timestamp{host="110"} > 172800)
|
||
for: 30m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-config-capture
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 設定檔備份覆蓋率狀態缺失或過舊"
|
||
description: "backup-configs.sh 沒有新鮮的 capture status;無法判斷 110/120/121/188/K8s 設定檔是否真的被最新 snapshot 捕捉。"
|
||
runbook: "部署新版 /backup/scripts/backup-configs.sh 與 /home/wooo/scripts/backup-health-textfile-exporter.py,執行 /backup/scripts/backup-configs.sh 後刷新 textfile exporter。"
|
||
|
||
- alert: BackupIntegrityCheckMissingOrFailed
|
||
expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restic_check"}) or awoooi_backup_integrity_fresh{host="110",scope="restic_check"} == 0
|
||
for: 30m
|
||
labels:
|
||
severity: critical
|
||
layer: host-backup
|
||
component: backup-integrity
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 備份倉庫完整性檢查缺失或失敗"
|
||
description: "每週 restic check 沒有成功證據,或有 repo 檢查失敗;目前不能假設備份可讀。"
|
||
runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode check`,先看 /backup/logs/backup-integrity.log;禁止刪 repo 或 prune 直到確認原因"
|
||
|
||
- alert: BackupRestoreDrillMissingOrFailed
|
||
expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restore_drill"}) or awoooi_backup_integrity_fresh{host="110",scope="restore_drill"} == 0
|
||
for: 30m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-restore-drill
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 備份抽樣還原演練缺失或失敗"
|
||
description: "每月 restore drill 沒有成功證據,備份雖可能新鮮,但尚未驗證可讀取還原。"
|
||
runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode restore-drill`;只允許還原到隔離暫存目錄,不得覆蓋 production"
|
||
|
||
- alert: BackupOffsiteCopyNotConfigured
|
||
expr: sum by(host) (awoooi_backup_offsite_configured{host="110"}) == 0
|
||
for: 1m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-offsite
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 尚未配置離機備份 provider"
|
||
description: "backup health exporter 未偵測到 Google Drive/rclone 或其他 offsite provider 配置;本地 restic 全綠仍不等於異地可恢復。"
|
||
runbook: "在 110 以 `/backup/scripts/configure-offsite-rclone.sh --interactive` 建立 Google Drive remote,產生 `/backup/offsite/*last_success` 證據;不得把 provider token 寫入 repo、Telegram 或 Prometheus label。"
|
||
|
||
- alert: BackupOffsiteCopyStale
|
||
expr: |
|
||
(
|
||
(sum by(host) (awoooi_backup_offsite_configured{host="110"}) > 0)
|
||
and
|
||
(sum by(host) (awoooi_backup_offsite_fresh{host="110"}) == 0)
|
||
)
|
||
and
|
||
(
|
||
(sum by(host) (awoooi_backup_offsite_full_sync_enabled{host="110"}) == 0)
|
||
or
|
||
((time() - max by(host) (awoooi_backup_offsite_full_sync_enabled_timestamp{host="110"})) > 30 * 3600)
|
||
)
|
||
for: 2h
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-offsite
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 離機備份超過 48 小時未成功"
|
||
description: "已偵測到 offsite provider 配置,但沒有新鮮成功標記;本地備份可能無法抵抗整台 110 遺失。"
|
||
runbook: "SSH 110 檢查 Google Drive/rclone 同步 log 與 `/backup/offsite/*last_success`;full sync 需在 enable marker 與低負載門檻成立後由 `/backup/scripts/sync-offsite-backups.sh --mode sync` 鏡像本地 latest-only repo。"
|
||
|
||
- alert: BackupRetentionPolicyNotLatestOnly
|
||
expr: |
|
||
absent(awoooi_backup_retention_latest_only{host="110"})
|
||
or
|
||
awoooi_backup_retention_latest_only{host="110"} != 1
|
||
or
|
||
absent(awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"})
|
||
or
|
||
awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"} != 1
|
||
for: 15m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-retention
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 備份保留策略不是 latest-only"
|
||
description: "operator 要求所有備份只保留最新一份;本地 restic 必須 keep-last=1,Google Drive/rclone 必須在成功 mirror 後刪除遠端舊檔。"
|
||
runbook: "檢查 `/backup/scripts/common.sh` 的 BACKUP_RETENTION_MODE=latest、KEEP_LAST=1 與 OFFSITE_SYNC_DELETE_OLD=1,刷新 backup-health textfile;必要時在備份成功後執行 `/backup/scripts/enforce-latest-only-retention.sh`。"
|
||
|
||
- alert: BackupSnapshotRetentionExceeded
|
||
expr: awoooi_backup_job_snapshot_count{host="110",type="restic"} > 1
|
||
for: 30m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-retention
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 備份 repo {{ $labels.exported_job }} 保留超過 1 份 snapshot"
|
||
description: "{{ $labels.exported_job }} 目前有 {{ $value }} 份 restic snapshot;latest-only 策略要求每個 repo 全域只保留最新 1 份。"
|
||
runbook: "SSH 110 執行 `/backup/scripts/enforce-latest-only-retention.sh`;若仍未收斂,確認 `common.sh` 使用 `restic forget --group-by \"\" --keep-last 1 --prune`,避免 restic 依 path/tag 分組保留多份。"
|
||
|
||
- alert: BackupOffsiteFullVerifyFailed
|
||
expr: |
|
||
awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1
|
||
unless on(host, provider)
|
||
(awoooi_backup_offsite_remote_verify_ok{host="110",provider="rclone"} == 1)
|
||
for: 30m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-offsite
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "110 Google Drive full sync 完成但遠端驗證未通過"
|
||
description: "full offsite marker 已 fresh,但 verify-offsite-full-sync.sh 沒有證明 13 個 Google Drive repo 都可列出且符合 latest-only。"
|
||
runbook: "SSH 110 執行 `/backup/scripts/verify-offsite-full-sync.sh --write-textfile --no-color`,檢查 `/backup/logs/offsite-full-sync-verify.log` 與 `/home/wooo/node_exporter_textfiles/offsite_full_sync_verify.prom`。"
|
||
|
||
- alert: BackupOffsiteRemoteSnapshotRetentionExceeded
|
||
expr: |
|
||
(awoooi_backup_offsite_remote_snapshot_count{host="110",provider="rclone"} > 1)
|
||
and on(host, provider)
|
||
(awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1)
|
||
for: 30m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: backup-retention
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "Google Drive repo {{ $labels.repo }} 保留超過 1 份 snapshot"
|
||
description: "{{ $labels.repo }} 在 Google Drive/rclone 遠端目前有 {{ $value }} 份 snapshot;latest-only 策略要求遠端也只保留最新一份。"
|
||
runbook: "確認 110 `/backup/scripts/sync-offsite-backups.sh --mode sync` 使用 `rclone sync`、`OFFSITE_SYNC_DELETE_OLD=1`、`RCLONE_DRIVE_USE_TRASH=false`,再於低峰重新執行 full sync 與 verifier。"
|
||
|
||
- alert: BackupCredentialEscrowEvidenceMissing
|
||
expr: awoooi_backup_credential_escrow_fresh{host="110"} == 0
|
||
for: 1m
|
||
labels:
|
||
severity: warning
|
||
layer: host-backup
|
||
component: credential-escrow
|
||
host: "110"
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "備份憑證金庫證據缺失或過期:{{ $labels.item }}"
|
||
description: "{{ $labels.item }} 沒有 31 天內人工驗證證據;重建時可能找不到 restic/offsite/break-glass/DNS/OAuth 復原材料。"
|
||
runbook: "在密碼管理器或離線加密金庫完成雙人覆核後,只建立不含 secret 的 `/backup/escrow-evidence/{{ $labels.item }}.last_verified` 時間戳證據。"
|
||
|
||
# =========================================================================
|
||
# 基礎設施詳細告警 (awoooi_infrastructure_detailed) — 從主機補回 2026-04-12
|
||
# =========================================================================
|
||
- name: awoooi_infrastructure_detailed
|
||
interval: 60s
|
||
rules:
|
||
- alert: DockerContainerUnhealthyDetailed
  # Fires when a named container on host 188 has not been seen by cAdvisor
  # for more than 120 seconds, sustained for 5 minutes.
  #
  # A `count by (name, instance) (container_tasks_state{state="running"}) == 0`
  # clause is intentionally not used: count() returns the number of series,
  # and with no matching series it returns no sample at all rather than 0,
  # so such a clause can never fire. Staleness of container_last_seen is the
  # signal that actually triggers this alert.
  # NOTE(review): if an explicit "not running" check is wanted, pick a metric
  # whose value (not series count) reflects the state, and confirm it is
  # populated by the deployed cAdvisor flags before adding it.
  expr: |
    container_last_seen{instance=~"192.168.0.188.*", name!=""} < (time() - 120)
  for: 5m
  labels:
    severity: warning
    layer: docker-188
    alert_category: infrastructure
    notification_type: TYPE-3
    auto_repair: "false"
  annotations:
    summary: "188 主機容器 {{ $labels.name }} 異常"
    description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動或不在 running 狀態。"
    runbook: "SSH 到 192.168.0.188:docker inspect {{ $labels.name }} 確認健康狀態"
|
||
|
||
- alert: RedisStreamBacklogHigh
|
||
expr: awoooi_redis_stream_len > 500
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: docker-188
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆"
|
||
description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。"
|
||
runbook: "檢查 consumer group lag:XINFO GROUPS <stream-key>"
|
||
|
||
# 2026-04-19 Hermes E3 決策: PostgreSQLDiskGrowthRate deprecated
|
||
# 真因: 500MB/h 增長是 PG WAL 正常行為 (commits/checkpoints),不該告警
|
||
# 過去 30d 觸發 7 次,全部 AI 判 NO_ACTION 或誤判 kubectl rollout restart 失敗
|
||
# 統帥決策 (2026-04-19 18:xx Taipei): 選項 C 刪除舊規則 + 改用絕對磁碟使用率
|
||
# -----------------------------------------------------------------
|
||
- alert: HostDiskUsageHigh
|
||
expr: |
|
||
(
|
||
node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||
- node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||
)
|
||
/ node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||
* 100 > 80
|
||
for: 10m
|
||
labels:
|
||
severity: warning
|
||
layer: systemd-188
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "false"
|
||
supersedes: PostgreSQLDiskGrowthRate
|
||
annotations:
|
||
summary: "主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>80%)"
|
||
description: "磁碟使用率持續 10 分鐘超過 80%, 需清理或擴容. 常見原因: PG WAL, 日誌, container images, 舊 build cache."
|
||
runbook: "SSH 該主機: df -h / && du -sh /var/lib/postgresql/*/pg_wal /var/log /var/lib/docker"
|
||
|
||
- alert: HostDiskUsageCritical
|
||
expr: |
|
||
(
|
||
node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||
- node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||
)
|
||
/ node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
|
||
* 100 > 90
|
||
for: 5m
|
||
labels:
|
||
severity: critical
|
||
layer: systemd-188
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "true"
|
||
supersedes: PostgreSQLDiskGrowthRate
|
||
annotations:
|
||
summary: "🔴 主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>90%, critical)"
|
||
description: "磁碟即將滿, 需立即清理. 超過 95% 可能導致服務中斷."
|
||
runbook: "立即 SSH 該主機: df -h / && du -sh /* 2>/dev/null | sort -h | tail -10"
|
||
|
||
# =========================================================================
|
||
# 主機連通性告警 (awoooi_host_connectivity) — 從主機補回 2026-04-12
|
||
# =========================================================================
|
||
- name: awoooi_host_connectivity
|
||
interval: 60s
|
||
rules:
|
||
- alert: HostNetworkPartition
|
||
expr: probe_success{job="host-connectivity"} == 0
|
||
for: 5m
|
||
labels:
|
||
severity: critical
|
||
layer: systemd-188
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "主機 {{ $labels.instance }} 無法連通"
|
||
description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。"
|
||
runbook: "SSH 檢查路由和防火牆規則"
|
||
|
||
# =========================================================================
|
||
# 監控工具自監控 (infra_self_monitoring) — ADR-090 Phase 7
|
||
# 2026-04-19 Claude Opus 4.7 / 鐵律:監控工具必須被監控
|
||
# 設計:不寫死 CPU% 或 MB 數,改用 (配額佔比) + (throttle 訊號) 動態判斷
|
||
# 配額由 docker-compose 宣告,告警條件 = 使用量 / 配額 > 0.8
|
||
# 比寫死 80% 更智能 — 配額改告警閾值自動跟著變
|
||
# =========================================================================
|
||
- name: infra_self_monitoring
|
||
interval: 1m
|
||
rules:
|
||
|
||
# --- cadvisor 自監控 ---
|
||
- alert: CadvisorDown
|
||
expr: up{job=~".*cadvisor.*"} == 0
|
||
for: 5m
|
||
labels:
|
||
severity: critical
|
||
layer: docker-110-188
|
||
component: cadvisor
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "cAdvisor ({{ $labels.instance }}) 停擺"
|
||
description: "主機 {{ $labels.instance }} 的 cadvisor 已停擺 5 分鐘,容器監控中斷。"
|
||
runbook: "SSH 主機 docker compose up -d cadvisor;檢查 OOMKill 訊號"
|
||
|
||
- alert: CadvisorMemoryPressure
  # Fires when cAdvisor's own working-set memory exceeds 80% of its configured
  # memory limit for 10 minutes.
  #
  # Uses container_memory_working_set_bytes instead of
  # container_memory_usage_bytes, matching the Sentry ClickHouse and Gitea
  # rules in this group: usage_bytes includes page cache and produced false
  # alarms there, while working_set is the figure tied to OOM pressure.
  #
  # The `> 0` filter on the limit drops containers whose limit is reported as
  # 0 (no limit configured); without it the division yields +Inf and the
  # alert would fire permanently.
  expr: |
    container_memory_working_set_bytes{name="cadvisor"}
      / (container_spec_memory_limit_bytes{name="cadvisor"} > 0) > 0.8
  for: 10m
  labels:
    severity: warning
    component: cadvisor
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "false"
  annotations:
    summary: "cAdvisor 記憶體使用率 > 80% limit"
    description: "cadvisor working_set / mem_limit = {{ $value | humanizePercentage }},接近 OOMKill。"
    runbook: "若頻繁觸發 → 檢查 cardinality 是否持續成長,考慮調整 --disable_metrics"
|
||
|
||
- alert: CadvisorCPUThrottled
|
||
expr: rate(container_cpu_cfs_throttled_seconds_total{name="cadvisor"}[5m]) > 0.5
|
||
for: 15m
|
||
labels:
|
||
severity: warning
|
||
component: cadvisor
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "cAdvisor CPU 被 throttle(配額不足)"
|
||
description: "cadvisor 每秒被 throttle {{ $value }} 秒,表示實際需求超過 cpus 配額。"
|
||
runbook: "調高 docker-compose cpus 設定,或檢查 scrape interval / cardinality"
|
||
|
||
# --- node-exporter 自監控 ---
|
||
- alert: NodeExporterDown
|
||
expr: up{job=~"node-exporter.*|node_exporter.*"} == 0
|
||
for: 5m
|
||
labels:
|
||
severity: critical
|
||
component: node-exporter
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-3
|
||
auto_repair: "true"
|
||
annotations:
|
||
summary: "node-exporter ({{ $labels.instance }}) 停擺"
|
||
description: "主機 {{ $labels.instance }} node-exporter 已停擺 5 分鐘,主機 metrics 中斷。"
|
||
runbook: "SSH 主機檢查 docker ps node-exporter;重啟 docker compose up -d node-exporter"
|
||
|
||
- alert: NodeExporterCPUThrottled
|
||
expr: rate(container_cpu_cfs_throttled_seconds_total{name="node-exporter"}[5m]) > 0.5
|
||
for: 15m
|
||
labels:
|
||
severity: warning
|
||
component: node-exporter
|
||
team: ops
|
||
alert_category: infrastructure
|
||
notification_type: TYPE-1
|
||
auto_repair: "false"
|
||
annotations:
|
||
summary: "node-exporter CPU 被 throttle(配額不足)"
|
||
description: "node-exporter 每秒被 throttle {{ $value }} 秒。可能 collector 未適度 disable。"
|
||
runbook: "檢查 node-exporter --collector.* flags 是否該關掉閒置硬體 probe"
|
||
|
||
# --- Sentry self-hosted 自監控(110)---
|
||
# 2026-04-25 ogt + Claude Opus 4.7: 修正假告警根因
|
||
# 舊規則用 container_memory_usage_bytes(含 page cache),導致 ClickHouse
|
||
# 執行大查詢時 OS 把 SSTable 緩存進 page cache,比例衝到 88.5% 觸發誤報
|
||
# (2026-04-23 23:13 鐵證:usage_bytes=88.5% / working_set=7.8%)。
|
||
# 改用 container_memory_working_set_bytes — 這才是 K8s/Docker OOM killer
|
||
# 實際追蹤的「真實工作集」(RSS + active page cache),不含 inactive page cache。
|
||
# 參考: https://github.com/google/cadvisor/blob/master/info/v1/container.go
|
||
- alert: SentryClickHouseMemoryPressure
  # Fires when the Sentry ClickHouse container's working-set memory exceeds
  # 85% of its memory limit for 10 minutes.
  # working_set is used (not usage_bytes) so that page cache does not inflate
  # the ratio.
  # The `> 0` filter on the limit skips containers whose limit is reported as
  # 0 (no limit configured); dividing by 0 would yield +Inf and keep the alert
  # firing regardless of real memory pressure.
  expr: |
    container_memory_working_set_bytes{name=~".*sentry.*clickhouse.*"}
      / (container_spec_memory_limit_bytes{name=~".*sentry.*clickhouse.*"} > 0) > 0.85
  for: 10m
  labels:
    severity: warning
    component: sentry-clickhouse
    team: platform
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "false"
  annotations:
    summary: "Sentry ClickHouse 工作集記憶體 > 85% limit"
    description: "sentry clickhouse working_set / mem_limit = {{ $value | humanizePercentage }} (排除 page cache)。"
    runbook: "檢查 Sentry 查詢壓力;確認非 page cache 假象;必要時調整 /opt/sentry/docker-compose.override.yml clickhouse mem_limit"
|
||
|
||
- alert: SentryClickHouseCPUThrottled
  # Warns when a Sentry ClickHouse container is CFS-throttled for more than
  # 1.0 s per second (5m rate) for 15 minutes.
  expr: 'rate(container_cpu_cfs_throttled_seconds_total{name=~".*sentry.*clickhouse.*"}[5m]) > 1.0'
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: sentry-clickhouse
    notification_type: TYPE-1
    severity: warning
    team: platform
  annotations:
    summary: 'Sentry ClickHouse CPU 持續被 throttle'
    description: '每秒 throttle {{ $value }} 秒,配額 cpus=4.0 可能不足。'
    runbook: '檢查 Sentry retention / query pattern;必要時調高 override.yml cpus'
|
||
|
||
# --- Gitea 自監控 ---
|
||
- alert: GiteaMemoryPressure
  # 2026-04-25 ogt + Claude Sonnet 4.6 — same false-alarm root cause as the
  # ClickHouse rule: container_memory_usage_bytes includes page cache
  # (inactive pages the OOM killer ignores), which inflates the ratio.
  # container_memory_working_set_bytes (RSS + active cache) reflects real
  # pressure and is exported by cAdvisor for both Docker and K8s.
  # The `> 0` filter on the limit drops containers that have no memory limit
  # configured (cAdvisor exports 0 in that case); without it the division
  # yields +Inf, which is always > 0.85 and would fire a false alert.
  expr: >-
    container_memory_working_set_bytes{name="gitea"}
    / (container_spec_memory_limit_bytes{name="gitea"} > 0)
    > 0.85
  for: 10m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: gitea
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: 'Gitea 記憶體工作集 > 85% limit'
    description: 'gitea working_set / mem_limit = {{ $value | humanizePercentage }}(真實記憶體壓力,非 page cache 干擾)。'
    runbook: '檢查 CI/CD 任務堆積;必要時調高 docker-compose mem_limit'
|
||
|
||
- alert: GiteaCPUThrottled
  # Warns when the "gitea" or "gitea-runner" container is CFS-throttled for
  # more than 1.0 s per second (5m rate) for 15 minutes; the description
  # names the affected container via the `name` label.
  expr: 'rate(container_cpu_cfs_throttled_seconds_total{name=~"gitea|gitea-runner"}[5m]) > 1.0'
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: gitea
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: 'Gitea / Runner CPU 持續被 throttle'
    description: '{{ $labels.name }} 每秒 throttle {{ $value }} 秒,CD peak 可能卡關。'
    runbook: '檢查 job 並行度;考慮縮減並行或調高 cpus'
|
||
|
||
# --- 監控自監控元層(Prometheus 本身)---
|
||
- alert: PrometheusDown
  # Critical when a target of the "prometheus" scrape job reports up == 0
  # for 2 minutes.
  expr: 'up{job="prometheus"} == 0'
  for: 2m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: prometheus
    notification_type: TYPE-3
    severity: critical
    team: ops
  annotations:
    summary: 'Prometheus ({{ $labels.instance }}) 停擺'
    description: 'Prometheus 自己停擺 → 所有其他告警失效'
    runbook: 'SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus'
|
||
|
||
# =========================================================================
|
||
# Full-stack cold-start recovery gate
|
||
# =========================================================================
|
||
- name: cold_start_recovery_alerts
|
||
rules:
|
||
- alert: PrometheusRuleDriftGuardFailed
  # Critical when, for 10 minutes, any of the following holds for host 110:
  # the drift guard's last-run timestamp is absent, the last run is older
  # than 900 s, the missing-required-rules count is above 0, or the
  # current-matches-canonical gauge is 0.
  expr: |
    absent(awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"})
    or
    (time() - max by(host) (awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"}) > 900)
    or
    (awoooi_prometheus_rule_drift_guard_missing_required_count{host="110"} > 0)
    or
    (awoooi_prometheus_rule_drift_guard_current_matches_canonical{host="110"} == 0)
  for: 10m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: prometheus-rule-drift-guard
    host: '110'
    layer: systemd-110
    notification_type: TYPE-3
    severity: critical
    team: ops
  annotations:
    summary: 'Prometheus 規則漂移防護失效'
    description: '110 Prometheus rule drift guard 沒有新鮮成功指標、required rules 缺失,或 active alerts.yml 不等於 canonical rules。'
    runbook: '執行 `bash scripts/ops/deploy-alerts.sh` 重新部署 canonical rules 與 drift guard,等待 1-2 個 Prometheus evaluation cycle 後重跑 readiness audit。'
|
||
|
||
- alert: PrometheusRuleDriftAutoRepaired
  # Warns when the drift guard's `repaired` gauge for host 110 stays above 0
  # for 1 minute.
  expr: 'awoooi_prometheus_rule_drift_guard_repaired{host="110"} > 0'
  for: 1m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: prometheus-rule-drift-guard
    host: '110'
    layer: systemd-110
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: 'Prometheus 規則漂移已被自動修復'
    description: '110 drift guard 最近一次偵測到 active Prometheus rules 漂移,已回復 canonical rules 並 reload Prometheus。'
    runbook: '檢查 `/home/wooo/logs/prometheus-rule-drift-guard.log` 與 `/home/wooo/monitoring/alerts.yml.guard.bak.*`,找出誰覆寫了 active rules。'
|
||
|
||
- alert: ColdStartMonitorMissing
  # Warns when awoooi_cold_start_monitor_up for host 110 / scope
  # 110_120_121_188 has been absent for 15 minutes.
  expr: 'absent(awoooi_cold_start_monitor_up{host="110",scope="110_120_121_188"})'
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: cold-start-monitor
    layer: systemd-110
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: 'Cold-start monitor textfile metric missing'
    description: '110 沒有輸出 awoooi_cold_start_monitor_up;重開機恢復 gate 目前不可觀測。'
    runbook: '執行 scripts/reboot-recovery/install-cold-start-monitor-110.sh,確認 /home/wooo/node_exporter_textfiles/cold_start_recovery.prom'
|
||
|
||
- alert: ColdStartMonitorStale
  # Warns when the cold-start monitor's last-run timestamp is more than
  # 900 s old for 10 minutes; $value is the elapsed time in seconds.
  expr: 'time() - awoooi_cold_start_last_run_timestamp{host="110",scope="110_120_121_188"} > 900'
  for: 10m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: cold-start-monitor
    layer: systemd-110
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: 'Cold-start monitor stale'
    description: 'cold-start monitor 超過 15 分鐘沒有更新,距離上次執行 {{ $value | humanizeDuration }}。'
    runbook: 'SSH 110 檢查 crontab、/tmp/awoooi-cold-start-monitor.cron.log、cold-start-last.log'
|
||
|
||
- alert: ColdStartRecoveryBlocked
  # Critical when the cold-start gate reports one or more BLOCKED gates for
  # 5 minutes; $value is the blocked-gate count.
  expr: 'awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} > 0'
  for: 5m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: cold-start-gate
    layer: full-stack
    notification_type: TYPE-3
    severity: critical
    team: ops
  annotations:
    summary: 'Full-stack cold-start recovery BLOCKED'
    description: 'cold-start gate 有 {{ $value }} 個 BLOCKED gate。AI 修復需保持 observe-only,先處理第一個 blocked gate。'
    runbook: '查看 /home/wooo/reboot-recovery/cold-start-last.log;依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的 P0→P2 順序修復'
|
||
|
||
- alert: K3sNodeFilesystemErrorGateBlocked
  # Critical when the cold-start blocker-reason metric reports
  # reason="k3s_node_filesystem_error" for target 120 for 5 minutes.
  # The selector matches host="110" (where the metric is emitted), while the
  # rule's own `host` / `target_host` labels are set to "120".
  expr: 'awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="k3s_node_filesystem_error",target="120"} > 0'
  for: 5m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: node-filesystem
    host: '120'
    layer: k3s
    notification_type: TYPE-3
    severity: critical
    target_host: '120'
    team: ops
  annotations:
    summary: '120 K3s 節點 filesystem error 阻擋重開機放行'
    description: 'cold-start log 偵測到 120 Node event 仍有 EXT4/I/O/deleted inode 類錯誤;即使 Pod Running、網站 200,也不可宣告下一次重開機安全。'
    runbook: '查看 110 `/home/wooo/reboot-recovery/cold-start-last.log`,執行 `scripts/reboot-recovery/120-fsck-maintenance-checklist.sh` 做維護前只讀檢查;維護窗內用 console/rescue 對 120 root LV 執行 fsck,禁止 online fsck。'
|
||
|
||
- alert: ColdStartHost120Unreachable
  # Critical when the cold-start blocker-reason metric reports
  # reason="host_unreachable" for target 120 for 3 minutes.
  # The selector matches host="110", while the rule's own `host` /
  # `target_host` labels are set to "120".
  expr: 'awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="host_unreachable",target="120"} > 0'
  for: 3m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: host-reachability
    host: '120'
    layer: host
    notification_type: TYPE-3
    severity: critical
    target_host: '120'
    team: ops
  annotations:
    summary: '120 主機不可達,Full-stack cold-start 已阻擋'
    description: '110 cold-start monitor 無法 ping/SSH 192.168.0.120;目前只能由 121/VIP 撐住 K3s,不能宣告所有主機重開機恢復完成。'
    runbook: '查看 120 console。若停在 initramfs/manual fsck,先對 root LV 做離線 fsck;若主機關機或網卡異常,先恢復電源/網路,再重跑 full-stack cold-start gate。禁止從自動修復直接重啟其他服務掩蓋主機離線。'
|
||
|
||
- alert: ColdStartRecoveryDegraded
  # Warns when the cold-start gate reports one or more WARN gates for
  # 15 minutes; $value is the warn-gate count.
  expr: 'awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} > 0'
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: cold-start-gate
    layer: full-stack
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: 'Full-stack cold-start recovery DEGRADED'
    description: 'cold-start gate 有 {{ $value }} 個 WARN gate;核心可用但不應放行 runner/CD/AI auto-repair full execution。'
    runbook: '查看 /home/wooo/reboot-recovery/cold-start-last.log,修到 PASS/WARN/BLOCKED = green'
|
||
|
||
- alert: ColdStartLastGreenTooOld
  # Warns when the last GREEN cold-start timestamp is more than 3600 s old
  # for 15 minutes; $value is the elapsed time in seconds.
  expr: 'time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"} > 3600'
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: cold-start-gate
    layer: full-stack
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: 'Full-stack cold-start gate has not been GREEN recently'
    description: '距離上次 GREEN 已超過 {{ $value | humanizeDuration }};需要確認 110/120/121/188 與排程/網站 gate。'
    runbook: '執行 SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test'
|
||
|
||
# =========================================================================
|
||
# Host storage health / dirty reboot evidence
|
||
# =========================================================================
|
||
- name: host_storage_health_alerts
|
||
rules:
|
||
- alert: Host110StorageHealthMonitorMissing
  # Warns when awoooi_host_storage_monitor_up{host="110"} has been absent
  # for 15 minutes.
  expr: 'absent(awoooi_host_storage_monitor_up{host="110"})'
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: storage-health-monitor
    host: '110'
    layer: systemd-110
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: '110 storage health textfile metric missing'
    description: '110 沒有輸出 storage_health.prom;dirty reboot、root read-only 與 fsck 證據目前不可觀測。'
    runbook: '用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py,確認 /home/wooo/node_exporter_textfiles/storage_health.prom'
|
||
|
||
- alert: Host188StorageHealthMonitorMissing
  # Warns when awoooi_host_storage_monitor_up{host="188"} has been absent
  # for 15 minutes.
  expr: 'absent(awoooi_host_storage_monitor_up{host="188"})'
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: storage-health-monitor
    host: '188'
    layer: systemd-188
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: '188 storage health textfile metric missing'
    description: '188 沒有輸出 storage_health.prom;dirty reboot、root read-only 與 fsck 證據目前不可觀測。'
    runbook: '用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py,確認 /home/ollama/node_exporter_textfiles/storage_health.prom'
|
||
|
||
- alert: HostStorageHealthMonitorStale
  # Warns when the storage-health exporter's last-run timestamp on host 110
  # or 188 is more than 900 s old for 10 minutes.
  expr: 'time() - awoooi_host_storage_last_run_timestamp{host=~"110|188"} > 900'
  for: 10m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: storage-health-monitor
    layer: host-storage
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: '主機 {{ $labels.host }} storage health textfile stale'
    description: 'storage health exporter 超過 15 分鐘沒有更新;重開機後檔案系統風險不可觀測。'
    runbook: 'SSH 主機檢查 cron、/tmp/awoooi-storage-health-textfile-exporter.cron.log 與 node-exporter textfile collector'
|
||
|
||
- alert: HostRootFilesystemReadOnly
  # Critical when the root-filesystem read-only gauge for mountpoint "/" on
  # host 110 or 188 stays above 0 for 1 minute.
  expr: 'awoooi_host_root_filesystem_readonly{host=~"110|188",mountpoint="/"} > 0'
  for: 1m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: root-filesystem
    layer: host-storage
    notification_type: TYPE-3
    severity: critical
    team: ops
  annotations:
    summary: '主機 {{ $labels.host }} root filesystem 已變成 read-only'
    description: 'root filesystem 被掛載為唯讀,服務可能仍暫時存活但寫入會失敗;禁止自動修復,先保全證據並規劃維護窗。'
    runbook: '依 docs/runbooks/FULL-STACK-COLD-START-SOP.md §16:保全 journal/df/mount 證據,確認備份,再安排 console/offline fsck'
|
||
|
||
- alert: HostCurrentBootStorageErrorsDetected
  # Critical when the storage error count with source="journalctl-kernel"
  # and boot="current" on host 110 or 188 stays above 0 for 5 minutes.
  expr: 'awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="current"} > 0'
  for: 5m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: kernel-storage
    layer: host-storage
    notification_type: TYPE-3
    severity: critical
    team: ops
  annotations:
    summary: '主機 {{ $labels.host }} current boot 有 storage/kernel 錯誤'
    description: '目前開機週期已出現 filesystem、I/O 或 fsck 類錯誤;不可只重啟容器掩蓋問題。'
    runbook: '先執行 read-only 診斷:journalctl -k -p warning..alert、mount、df、smartctl/raid 狀態;必要時進入維護窗處理'
|
||
|
||
- alert: HostPreviousBootStorageErrorsDetected
  # Warns when the storage error count with source="journalctl-kernel" and
  # boot="previous" on host 110 or 188 stays above 0 for 30 minutes.
  expr: 'awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="previous"} > 0'
  for: 30m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: dirty-reboot-evidence
    layer: host-storage
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: '主機 {{ $labels.host }} previous boot 保留 storage/fsck 錯誤證據'
    description: '上一個開機週期留有 storage/fsck 錯誤,代表這次重開機事故需要完成 fsck、備份與容量後續檢查。'
    runbook: '把證據寫入 docs/LOGBOOK.md,確認 full-stack cold-start gate 與 P3 gate;下一次維護窗補 offline fsck/SMART/RAID 檢查'
|
||
|
||
- alert: HostFsckLogErrorsDetected
  # Warns when the per-host sum of storage error counts with
  # boot="last-fsck-log" on host 110 or 188 stays above 0 for 30 minutes.
  # Aggregating by host leaves `host` as the only label on the result.
  expr: 'sum by(host) (awoooi_host_storage_error_count{host=~"110|188",boot="last-fsck-log"}) > 0'
  for: 30m
  labels:
    alert_category: infrastructure
    auto_repair: 'false'
    component: fsck-log
    layer: host-storage
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: '主機 {{ $labels.host }} fsck log 保留錯誤證據'
    description: '主機 fsck log 內仍有 inconsistency 或 I/O 類錯誤文字;這是事故後追蹤項,不應交給自動修復直接處理。'
    runbook: '確認 /run/initramfs/fsck.log 與 /var/log/fsck/*,將結果納入重開機事故報告與下次維護窗檢查項'
|