Files
awoooi/ops/monitoring/alerts.yml
Your Name 2ec7f6f440
Some checks failed
Code Review / ai-code-review (push) Successful in 14s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 31s
CD Pipeline / tests (push) Successful in 1m59s
CD Pipeline / build-and-deploy (push) Successful in 7m36s
CD Pipeline / post-deploy-checks (push) Failing after 43s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
fix(ops): harden heartbeat and momo alert noise
2026-06-24 19:38:33 +08:00

1357 lines
60 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ops/monitoring/alerts-unified.yml
# AWOOOI 統一 Prometheus 告警規則
# 2026-04-05 Claude Code: 整合所有規則,加入統一 layer 標籤
# 2026-04-08 Claude Sonnet 4.6: 補 database_detail_alerts 群組 (6條詳細規則)
# 2026-04-12 Claude Sonnet 4.6: 補回 4 個僅存在主機的群組 (backup/flywheel/connectivity/infra-detailed)
# 部署目標: 192.168.0.110:/home/wooo/monitoring/alerts.yml
# 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
#
# 標籤規範:
# layer: k8s | docker | docker-110 | docker-188 | systemd-110 | systemd-188 | external
# component: 服務名稱
# team: ops | backend | ai | platform | security | finops | aiops
# host: "110" | "112" | "188" | "120" | "121"
# auto_repair: "true" | "false"
groups:
# =========================================================================
# 主機層告警 (host_alerts)
# =========================================================================
- name: host_alerts
rules:
- alert: HostDown
  # Fires when any node-exporter scrape target has reported up == 0 for 1 minute.
  expr: up{job=~"node-exporter.*"} == 0
  for: 1m
  labels:
    auto_repair: "false"
    layer: systemd-188
    severity: critical
    team: ops
  annotations:
    description: "Node Exporter 無回應超過 1 分鐘"
    summary: "主機 {{ $labels.host }} 不可達"
- alert: HostHighCpuLoad
  # 2026-05-05 ogt + Codex: keep this as early warning only.
  # Sustained overload/root-cause automation is handled by HostLoadAverageSustainedHigh.
  # The expression aggregates with `avg by(host)`, so `host` is the only label left
  # on the resulting alert; annotation templates must not use `$labels.instance`.
  expr: 100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
  for: 10m
  labels:
    severity: warning
    layer: systemd-188
    team: ops
    auto_repair: "true"
    # MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP routing labels
    mcp_provider: "ssh_host"
    host_type: "bare_metal"
    alert_category: "host_resource"
  annotations:
    summary: "主機 {{ $labels.host }} CPU 高負載"
    description: "CPU 使用率超過 90% 持續 10 分鐘;若 load5/core 未超過 1.5,先視為容量觀察與診斷,不直接修復。"
    # 2026-05-02 ogt + Claude Sonnet 4.6: steer the LLM toward SSH diagnostics instead of kubectl.
    # The SSH target is built from the host label (same form as HostLoadAverageSustainedHigh)
    # because `$labels.instance` is aggregated away and would render as an empty string.
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷;禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)"
    runbook: "host CPU 高負載排查:先 SSH ps aux 看 top 進程若為第三方服務Sentry/ClickHouse 等)寫 ADR 升級資源或調 limit禁止 kubectl restart 跨 domain"
- alert: HostLoadAverageSustainedHigh
  # 2026-05-05 ogt + Codex: sustained-overload baseline for hosts 110/188.
  # Why: CPU% only reflects busy time and cannot fully express the runnable queue
  # built up by ClickHouse merges, Kafka backfill, Chrome or Ollama.
  # The expression divides load5 by the number of CPU cores counted per host.
  expr: node_load5{host=~"110|188"} / on(host) count by(host) (count by(host,cpu) (node_cpu_seconds_total{mode="idle",host=~"110|188"})) > 1.5
  for: 15m
  labels:
    alert_category: "host_resource"
    auto_repair: "true"
    host_type: "bare_metal"
    layer: systemd-188
    mcp_provider: "ssh_host"
    severity: critical
    team: ops
  annotations:
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'"
    description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。"
    runbook: "先判斷高 load 來源ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter只允許 read-only 診斷,自動修復需走服務專屬 playbook。"
    summary: "主機 {{ $labels.host }} load5/core 長時間過高"
- alert: HostOutOfMemory
  # Fires when used memory (1 - MemAvailable/MemTotal) stays above 85% for 5 minutes.
  expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    team: ops
    auto_repair: "true"
    mcp_provider: "ssh_host"
    host_type: "bare_metal"
    alert_category: "host_resource"
  annotations:
    summary: "主機 {{ $labels.host }} 記憶體不足"
    description: "記憶體使用率超過 85%"
    # 2026-05-02 ogt + Claude Sonnet 4.6: steer the LLM toward SSH diagnostics.
    # The SSH target is derived from the host label (same form as
    # HostLoadAverageSustainedHigh). NOTE(review): `$labels.instance` of a
    # node-exporter target normally carries the scrape port ("ip:9100"), which
    # ssh does not accept as a hostname — confirm against the scrape config.
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'ps aux --sort=-%mem | head -20' (host 記憶體診斷;禁 kubectl restart — 主因常為第三方服務)"
    runbook: "host 記憶體不足排查SSH 看 top 進程;若為第三方服務需擴容或調 limit"
- alert: HostOutOfDiskSpace
  # Fires when a non-tmpfs/overlay filesystem stays above 85% usage for 5 minutes.
  expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    team: ops
    auto_repair: "true"
    mcp_provider: "ssh_host"
    host_type: "bare_metal"
    alert_category: "host_resource"
  annotations:
    summary: "主機 {{ $labels.host }} 磁碟空間不足"
    description: "磁碟使用率超過 85%"
    # The SSH target is derived from the host label (same form as
    # HostLoadAverageSustainedHigh). NOTE(review): `$labels.instance` of a
    # node-exporter target normally carries the scrape port ("ip:9100"), which
    # ssh does not accept as a hostname — confirm against the scrape config.
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"
# =========================================================================
# K8s 叢集告警 (kubernetes_alerts)
# =========================================================================
- name: kubernetes_alerts
rules:
- alert: K3sNodeNotReady
  # Fires when a node's Ready=true condition has been 0 for 2 minutes.
  expr: kube_node_status_condition{condition="Ready", status="true"} == 0
  for: 2m
  labels:
    auto_repair: "false"
    layer: k8s
    severity: critical
    team: ops
  annotations:
    description: "節點超過 2 分鐘未達到 Ready 狀態"
    summary: "K3s 節點 {{ $labels.node }} 未就緒"
- alert: KubePodCrashLooping
  # Fires when containers in awoooi-prod keep a non-zero restart rate
  # (15m window) for 5 minutes.
  expr: rate(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m]) > 0
  for: 5m
  labels:
    auto_repair: "true"
    layer: k8s
    severity: warning
    team: ops
  annotations:
    description: "Pod 在過去 15 分鐘內重啟次數異常"
    summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 持續重啟"
- alert: KubePodNotReady
  # Fires when a pod in awoooi-prod reports ready{condition="true"} == 0 for 5 minutes.
  expr: kube_pod_status_ready{condition="true",namespace="awoooi-prod"} == 0
  for: 5m
  labels:
    auto_repair: "true"
    layer: k8s
    severity: warning
    team: ops
  annotations:
    description: "Running 中的 Pod 超過 5 分鐘未達到 Ready 狀態"
    summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 未就緒"
- alert: KubeDeploymentReplicasMismatch
  # Fires when desired and available replica counts of an awoooi-prod
  # deployment differ for 10 minutes.
  expr: kube_deployment_spec_replicas{namespace="awoooi-prod"} != kube_deployment_status_replicas_available{namespace="awoooi-prod"}
  for: 10m
  labels:
    auto_repair: "true"
    layer: k8s
    severity: warning
    team: ops
  annotations:
    description: "期望副本數與可用副本數不一致超過 10 分鐘"
    summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} 副本數不匹配"
- alert: VeleroBackupFailed
  # Fires when the Velero failure counter increased within the last 24 hours.
  expr: increase(velero_backup_failure_total[24h]) > 0
  for: 5m
  labels:
    auto_repair: "false"
    component: velero
    layer: k8s
    severity: warning
    team: ops
  annotations:
    description: "過去 24 小時有備份失敗"
    summary: "Velero 備份失敗"
- alert: VeleroBackupNotRun
  # Fires when the last successful Velero backup is older than 86400 s (24 h).
  expr: time() - velero_backup_last_successful_timestamp > 86400
  for: 10m
  labels:
    auto_repair: "false"
    component: velero
    layer: k8s
    severity: critical
    team: ops
  annotations:
    description: "最後一次成功備份超過 24 小時"
    summary: "Velero 超過 24 小時未成功備份"
# Sprint C-2 Host rsync 備份告警 (2026-04-11 Claude Sonnet 4.6)
# backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
# node-exporter textfile collector 讀取此檔案暴露指標
- alert: HostBackupFailed
  # 2026-05-02 ogt + Claude Sonnet 4.6: removed the backup_110 collector label condition.
  # Root cause: node_textfile_scrape_error no longer carries a collector field, so the
  # old condition always evaluated as absent and raised false alerts.
  # Fix: decide on whether backup_110_last_success_timestamp is missing or stale
  # (older than 90000 s, i.e. 25 hours).
  expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000)
  for: 10m
  labels:
    severity: warning
    layer: docker-188
    team: ops
    component: backup
    host: "188"
    auto_repair: "true"
    alert_category: host_resource
  annotations:
    summary: "188 Host 備份超過 25 小時未成功"
    description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊"
    # The SSH target is hard-coded to host 188 (this rule is pinned to host "188").
    # The absent() branch produces a series without an `instance` label, so
    # `{{ $labels.instance }}` would render as an empty string in that case.
    auto_repair_action: "ssh 192.168.0.188 'ps aux --sort=-%cpu | head -15; echo \"=== BACKUP STATUS ===\"; ls -lah /home/ollama/backup/110 2>/dev/null || true; echo \"=== LAST SUCCESS ===\"; cat /home/ollama/backup/110/last_success 2>/dev/null || true; echo \"=== BACKUP LOG ===\"; tail -80 /home/ollama/backup/110/backup.log 2>/dev/null || true; echo \"=== DISK ===\"; df -h /home/ollama /backup / 2>/dev/null || df -h'"
# ADR-075: CoreDNS 解析失敗 (2026-04-12 ogt)
- alert: CoreDNSResolutionFailed
  # Fires when more than 5% of CoreDNS responses over 5 minutes are SERVFAIL.
  # The value is a ratio (SERVFAIL responses / all responses) so that it matches
  # the `humanizePercentage` summary and the "5%" wording of the description;
  # a bare rate() would be responses per second, not a percentage.
  expr: |
    sum(rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]))
      / sum(rate(coredns_dns_responses_total[5m])) > 0.05
  for: 5m
  labels:
    severity: critical
    layer: k8s
    team: ops
    auto_repair: "true"
    alert_category: kubernetes
    notification_type: TYPE-3
  annotations:
    summary: "CoreDNS SERVFAIL 率過高 {{ $value | humanizePercentage }}"
    description: "CoreDNS 在 5 分鐘內 SERVFAIL 回應率超過 5%K8s 服務間 DNS 解析可能失敗"
    runbook: "kubectl -n kube-system get pods -l k8s-app=kube-dns && kubectl -n kube-system logs -l k8s-app=kube-dns"
# =========================================================================
# 資料庫告警 (database_alerts)
# =========================================================================
- name: database_alerts
rules:
- alert: PostgreSQLDown
  # Fires when the postgres-exporter target is down or the exporter reports pg_up == 0.
  expr: up{job="postgres-exporter"} == 0 or pg_up == 0
  for: 1m
  labels:
    auto_repair: "false"
    component: postgres
    host: "188"
    layer: systemd-188
    severity: critical
    team: ops
  annotations:
    description: "PostgreSQL Exporter 無法連接資料庫超過 1 分鐘"
    summary: "PostgreSQL 資料庫離線"
- alert: RedisDown
  # Fires when the redis-exporter target is down or the exporter reports redis_up == 0.
  expr: up{job="redis-exporter"} == 0 or redis_up == 0
  for: 1m
  labels:
    auto_repair: "false"
    component: redis
    host: "188"
    layer: systemd-188
    severity: critical
    team: ops
  annotations:
    description: "Redis Exporter 無法連接 Redis 超過 1 分鐘"
    summary: "Redis 快取服務離線"
- alert: PostgreSQLHighConnections
  # Fires when the total number of connections on an instance exceeds 80 for 5 minutes.
  # The counts are summed per instance so the threshold applies to the total that
  # the description reports. NOTE(review): postgres_exporter normally exposes
  # pg_stat_activity_count split by datname/state, in which case the unsummed
  # comparison only fires when a single (database, state) pair exceeds 80 — confirm.
  expr: sum by (instance) (pg_stat_activity_count) > 80
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    component: postgres
    team: ops
    auto_repair: "false"
  annotations:
    summary: "PostgreSQL 連接數過高"
    description: "當前連接數 {{ $value }} 超過 80"
- alert: RedisMemoryHigh
  # Fires when used memory exceeds 80% of maxmemory; the leading
  # `redis_memory_max_bytes > 0` guard skips instances without a configured limit.
  expr: redis_memory_max_bytes > 0 and redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
  for: 5m
  labels:
    auto_repair: "false"
    component: redis
    layer: systemd-188
    severity: warning
    team: ops
  annotations:
    description: "Redis 記憶體使用率超過 80%"
    summary: "Redis 記憶體使用過高"
# =========================================================================
# Sprint 5.2 Plan B: 資料庫詳細指標告警 (database_detail_alerts)
# 前置: postgres-exporter:9187 + redis-exporter:9121 on 192.168.0.188
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
# =========================================================================
- name: database_detail_alerts
rules:
# ---- PostgreSQL 詳細指標 ----
- alert: PostgreSQLSlowQueries
  # Fires when the longest transaction in awoooi_prod exceeds 60 for 5 minutes
  # (the annotations describe the unit as seconds).
  expr: pg_stat_activity_max_tx_duration{datname="awoooi_prod"} > 60
  for: 5m
  labels:
    auto_repair: "false"
    component: postgres
    host: "188"
    layer: systemd-188
    severity: warning
    team: ops
  annotations:
    description: "awoooi_prod 資料庫最長事務超過 60 秒"
    summary: "PostgreSQL 有慢查詢 (>60s)"
- alert: PostgreSQLDeadlocks
  # Fires when the awoooi_prod deadlock counter increased within the last 5 minutes.
  expr: increase(pg_stat_database_deadlocks{datname="awoooi_prod"}[5m]) > 0
  for: 1m
  labels:
    auto_repair: "false"
    component: postgres
    host: "188"
    layer: systemd-188
    severity: warning
    team: ops
  annotations:
    description: "過去 5 分鐘 awoooi_prod 資料庫有死鎖"
    summary: "PostgreSQL 死鎖發生"
- alert: PostgreSQLTooManyConnections
  # Fires when the awoooi_prod database holds more than 50 connections for 5 minutes.
  # The counts are summed per instance and database so the threshold applies to the
  # database total. NOTE(review): postgres_exporter normally splits
  # pg_stat_activity_count by connection state, in which case the unsummed
  # comparison only fires when one single state exceeds 50 — confirm.
  expr: sum by (instance, datname) (pg_stat_activity_count{datname="awoooi_prod"}) > 50
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    component: postgres
    host: "188"
    team: ops
    auto_repair: "false"
  annotations:
    summary: "PostgreSQL 連接數過高 ({{ $value }})"
    description: "awoooi_prod 連接數超過 50"
# ---- Redis 詳細指標 ----
- alert: RedisKeyEviction
  # Fires when the evicted-keys counter increased within the last 5 minutes.
  expr: increase(redis_evicted_keys_total[5m]) > 0
  for: 1m
  labels:
    auto_repair: "false"
    component: redis
    host: "188"
    layer: systemd-188
    severity: warning
    team: ops
  annotations:
    description: "過去 5 分鐘有 Key 被驅逐,可能記憶體不足"
    summary: "Redis 發生 Key 驅逐"
- alert: RedisConnectionsHigh
  # Fires when more than 100 clients stay connected for 5 minutes.
  expr: redis_connected_clients > 100
  for: 5m
  labels:
    auto_repair: "false"
    component: redis
    host: "188"
    layer: systemd-188
    severity: warning
    team: ops
  annotations:
    description: "Redis 連接數超過 100"
    summary: "Redis 連接數過高 ({{ $value }})"
- alert: RedisCommandLatencyHigh
  # Fires when the average command latency over the last 5 minutes exceeds 10 ms.
  # Both counters are converted to 5m rates so the ratio reflects current latency
  # rather than the average since the exporter started, and both sides are summed
  # per instance so their label sets match for the division.
  # NOTE(review): redis_exporter normally exposes the duration counter per `cmd`
  # while redis_commands_processed_total has no such label — confirm.
  expr: |
    sum by (instance) (rate(redis_commands_duration_seconds_total[5m]))
      / sum by (instance) (rate(redis_commands_processed_total[5m])) > 0.01
  for: 5m
  labels:
    severity: warning
    layer: systemd-188
    component: redis
    host: "188"
    team: ops
    auto_repair: "false"
  annotations:
    summary: "Redis 命令平均延遲過高 (>10ms)"
    description: "Redis 命令平均延遲超過 10ms"
# =========================================================================
# 服務可用性告警 (service_alerts) — 含 layer 標籤決定修復路徑
# =========================================================================
- name: service_alerts
rules:
# ---- 188 Docker 層 ----
- alert: OpenClawDown
  # 2026-04-05 Claude Code: renamed from the old ClawBotDown to OpenClawDown.
  # Fires when the scrape target of job "clawbot" has been down for 2 minutes.
  expr: up{job="clawbot"} == 0
  for: 2m
  labels:
    auto_repair: "true"
    component: openclaw
    host: "188"
    layer: docker-188
    severity: critical
    team: ops
  annotations:
    description: "OpenClaw (192.168.0.188:8088) 已離線超過 2 分鐘"
    summary: "OpenClaw 服務離線"
- alert: SignOzDown
  # Fires when the blackbox HTTP probe whose instance contains "3301" fails for 2 minutes.
  expr: probe_success{job="blackbox-http", instance=~".*3301.*"} == 0
  for: 2m
  labels:
    alert_category: "devops_tool"
    auto_repair: "true"
    component: signoz
    host: "188"
    layer: docker-188
    mcp_provider: "ssh_host"
    severity: warning
    target_host: "192.168.0.188"
    team: ops
  annotations:
    description: "SignOz (192.168.0.188:3301) 已離線超過 2 分鐘"
    summary: "SignOz 服務離線"
# ---- 110 Docker 層 ----
- alert: SentryDown
  # Fires when the blackbox TCP probe of 192.168.0.110:9000 fails for 2 minutes.
  expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9000"} == 0
  for: 2m
  labels:
    alert_category: "devops_tool"
    auto_repair: "true"
    component: sentry
    host: "110"
    layer: docker-110
    mcp_provider: "ssh_host"
    severity: warning
    target_host: "192.168.0.110"
    team: ops
  annotations:
    description: "Sentry (192.168.0.110:9000) 已離線超過 2 分鐘"
    summary: "Sentry 服務離線"
- alert: HarborDown
  # Fires when the blackbox HTTP probe whose instance contains "5000" fails for 2 minutes.
  expr: probe_success{job="blackbox-http", instance=~".*5000.*"} == 0
  for: 2m
  labels:
    alert_category: "devops_tool"
    auto_repair: "true"
    component: harbor
    host: "110"
    layer: docker-110
    mcp_provider: "ssh_host"
    severity: critical
    target_host: "192.168.0.110"
    team: ops
  annotations:
    description: "Harbor (192.168.0.110:5000) 已離線超過 2 分鐘CD pipeline 將無法拉取映像"
    summary: "Harbor Registry 離線"
- alert: GiteaDown
  # Fires when the blackbox HTTP probe of http://192.168.0.110:3001 fails for 2 minutes.
  expr: probe_success{job="blackbox-http", instance="http://192.168.0.110:3001"} == 0
  for: 2m
  labels:
    alert_category: "devops_tool"
    auto_repair: "true"
    component: gitea
    host: "110"
    layer: docker-110
    mcp_provider: "ssh_host"
    severity: critical
    target_host: "192.168.0.110"
    team: ops
  annotations:
    description: "Gitea (192.168.0.110:3001) 已離線超過 2 分鐘CD pipeline 失效"
    summary: "Gitea Git 服務離線"
- alert: AlertmanagerDown
  # Fires when the blackbox TCP probe of 192.168.0.110:9093 fails for 2 minutes.
  expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9093"} == 0
  for: 2m
  labels:
    auto_repair: "true"
    component: alertmanager
    host: "110"
    layer: docker-110
    severity: critical
    team: ops
  annotations:
    description: "Alertmanager (192.168.0.110:9093) 已離線,所有告警將靜默"
    summary: "Alertmanager 離線"
# =========================================================================
# 告警鏈路監控 (alert_chain) — 防止 2026-03-26/04-05 事故重演
# =========================================================================
- name: alert_chain
rules:
- alert: AlertChainBroken_Alertmanager
  # Fires when more than 10% of Alertmanager webhook requests (5m rate)
  # have a non-success status for 10 minutes.
  expr: |
    sum(rate(awoooi_webhook_requests_total{source="alertmanager",status!="success"}[5m]))
    / sum(rate(awoooi_webhook_requests_total{source="alertmanager"}[5m])) > 0.1
  for: 10m
  labels:
    auto_repair: "false"
    layer: k8s
    severity: critical
    team: platform
  annotations:
    description: "告警鏈路可能斷裂,請執行 E2E 驗證"
    summary: "Alertmanager Webhook 錯誤率 > 10%"
- alert: AlertChainBroken_Sentry
  # Fires when more than 10% of Sentry webhook requests (5m rate)
  # have a non-success status for 10 minutes.
  expr: |
    sum(rate(awoooi_webhook_requests_total{source="sentry",status!="success"}[5m]))
    / sum(rate(awoooi_webhook_requests_total{source="sentry"}[5m])) > 0.1
  for: 10m
  labels:
    auto_repair: "false"
    layer: k8s
    severity: warning
    team: platform
  annotations:
    description: "Sentry 錯誤可能無法正確處理"
    summary: "Sentry Webhook 錯誤率 > 10%"
- alert: NoAlertsReceived2Hours
  # Fires when the newest Alertmanager-chain success timestamp is older than 7200 s (2 h).
  expr: time() - max by (source)(awoooi_alert_chain_last_success_timestamp{source="alertmanager"}) > 7200
  for: 5m
  labels:
    auto_repair: "false"
    layer: k8s
    severity: warning
    team: platform
  annotations:
    description: "Alertmanager 是固定主鏈路Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test"
    summary: "Alertmanager 主鏈路 2 小時內未收到告警"
- alert: SourceProviderIngestionStale
  # Fires per source (sentry / signoz) when its newest success timestamp
  # is older than 86400 s (24 h) for 15 minutes.
  expr: |
    time() - max by (source) (
    awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
    ) > 86400
  for: 15m
  labels:
    alert_category: "alertchain_provider_freshness"
    auto_repair: "false"
    component: source-ingestion
    layer: k8s
    severity: warning
    team: platform
  annotations:
    description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
    runbook: "先查 /api/v1/webhooks/{{ $labels.source }}/health再查 /api/v1/platform/events/dossier/coverage?provider={{ $labels.source }};若 endpoint OK 但 latest stale檢查上游 Sentry/SignOz notification channel 或排程 smoke。"
    summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
- alert: AlertChainUnhealthy
  # Fires when the alert-chain health gauge has been 0 for 5 minutes.
  expr: awoooi_alert_chain_healthy == 0
  for: 5m
  labels:
    auto_repair: "false"
    layer: k8s
    severity: critical
    team: platform
  annotations:
    description: "告警鏈路標記為不健康,最近處理失敗"
    summary: "告警鏈路不健康 ({{ $labels.source }})"
# =========================================================================
# 自動修復監控 (auto_repair)
# =========================================================================
- name: auto_repair
rules:
- alert: AutoRepairLowSuccessRate
  # Fires when the auto-repair success rate stays below 0.3 for 30 minutes.
  expr: awoooi_auto_repair_success_rate < 0.3
  for: 30m
  labels:
    auto_repair: "false"
    layer: k8s
    severity: warning
    team: backend
  annotations:
    description: "動作 {{ $labels.action }} 的成功率低於 30%,建議檢查 Playbook"
    summary: "自動修復成功率過低 ({{ $value | humanizePercentage }})"
- alert: PermanentFixRequired
  # Fires when any PERMANENT_FIX escalation was counted within the last hour.
  expr: sum(rate(awoooi_anomaly_escalation_total{level="PERMANENT_FIX"}[1h])) > 0
  for: 1m
  labels:
    auto_repair: "false"
    layer: k8s
    severity: critical
    team: backend
  annotations:
    description: "有異常升級到 PERMANENT_FIX 級別,需要根本修復"
    summary: "需要永久修復的異常升級"
# =========================================================================
# Sprint 5.1: Docker 容器健康監控(docker-health-monitor 感知層接入)
# 由 docker-health-monitor.sh 送 Alertmanager 格式 webhook
# 或 Prometheus 自訂 exporter 上報時使用。
# auto_repair: "true" 代表允許 AWOOOI Guardrail 決策(非直接修復)
# 實際修復動作由 Service Registry 分級決定ADR-062
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
# =========================================================================
- name: docker_health_alerts
rules:
- alert: DockerContainerUnhealthy
  # Fires when docker-health-monitor reports health status 0 for a container for 2 minutes.
  expr: container_health_status{job="docker-health-monitor"} == 0
  for: 2m
  labels:
    alert_category: "devops_tool"
    auto_repair: "true"
    layer: docker
    mcp_provider: "ssh_host"
    severity: warning
    team: ops
  annotations:
    description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 健康狀態異常,持續 2 分鐘"
    summary: "容器 {{ $labels.container }} 健康檢查失敗"
- alert: DockerContainerExited
  # Fires when docker-health-monitor reports running status 0 for a container for 1 minute.
  expr: container_running_status{job="docker-health-monitor"} == 0
  for: 1m
  labels:
    alert_category: "devops_tool"
    auto_repair: "true"
    layer: docker
    mcp_provider: "ssh_host"
    severity: critical
    team: ops
  annotations:
    description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead持續 1 分鐘"
    summary: "容器 {{ $labels.container }} 已停止"
- alert: DockerContainerCpuSustainedHigh
  # 2026-05-05 ogt + Codex: long-running overload baseline for Docker Compose.
  # Baseline: a single container above 2 cores for 10m is a warning, meant to catch
  # cadvisor, ClickHouse, momo-scheduler and Ollama-runner type problems early.
  expr: docker_container_cpu_cores > 2
  for: 10m
  labels:
    alert_category: infrastructure
    auto_repair: "true"
    layer: docker
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
    description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘,需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。"
    runbook: "自動階段只做診斷;若是已知服務,交給 service-specific playbookClickHouse 降 merge/查 backlog、scheduler 限 concurrency、litellm/blackbox 查 liveliness。"
    summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core"
- alert: DockerContainerCpuRunawayCritical
  # Fires when a single container stays above 4 CPU cores for 15 minutes.
  expr: docker_container_cpu_cores > 4
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: "true"
    layer: docker
    notification_type: TYPE-3
    severity: critical
    team: ops
  annotations:
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== CONTAINER ===\"; docker stats --no-stream {{ $labels.container_name }}; echo \"=== TOP ===\"; docker top {{ $labels.container_name }} -eo pid,ppid,stat,pcpu,pmem,comm,args | head -30'"
    description: "{{ $labels.container_name }} 已持續吃超過 4 core會拖垮 110/188 主機;需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。"
    runbook: "禁止通用 docker restart先抓根因只有 health down 或 crash loop 才可走重啟。"
    summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core"
- alert: DockerContainerMemoryLimitPressure
  # 2026-05-05 ogt + Codex: guard against carelessly set memory limits starving
  # Kafka/Gitea/Taskworker type services.
  # Only containers with a non-zero limit are evaluated (usage / limit > 0.85).
  expr: docker_container_memory_limit_bytes > 0 and docker_container_memory_usage_bytes / docker_container_memory_limit_bytes > 0.85
  for: 10m
  labels:
    alert_category: infrastructure
    auto_repair: "true"
    layer: docker
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
    description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker需先判斷 workload不可直接降 limit。"
    runbook: "若服務已接近 limit優先調整 retention/concurrency/cache再評估提高 memory禁止用更低 memory limit 當止血。"
    summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%"
- alert: DockerContainerRestartSpike
  # 2026-05-05 ogt + Codex: cAdvisor v0.47 has no restart metric; consume
  # docker_container_restart_count from the node-exporter textfile collector.
  # Either restart counter rising by more than 5 within 15 minutes triggers the alert.
  expr: increase(docker_container_restart_count[15m]) > 5 or increase(docker_container_inspect_restart_count[15m]) > 5
  for: 3m
  labels:
    alert_category: infrastructure
    auto_repair: "true"
    layer: docker
    notification_type: TYPE-3
    severity: critical
    team: ops
  annotations:
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
    description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增,避免再次出現 litellm 24,464 次靜默崩潰。"
    runbook: "先抓 crash signature若是 config/DB/網路問題,修設定,不用無限 restart。"
    summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次"
- alert: DockerContainerMissingResourceLimit
  # 2026-05-05 ogt + Codex: catch Compose services that silently run with unlimited CPU/memory.
  # Fires when a listed container reports a CPU limit or a memory limit of 0 for 30 minutes.
  expr: (docker_container_cpu_limit_cores{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0) or (docker_container_memory_limit_bytes{container_name=~"momo-.*|litellm|gitea|sentry-self-hosted-(clickhouse|kafka|taskworker|taskbroker|taskscheduler|redis|uptime-checker|process-spans).*"} == 0)
  for: 30m
  labels:
    severity: warning
    layer: docker
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "true"
  annotations:
    summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit"
    description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail長時間尖峰可能拖垮 110/188。"
    # Annotations are expanded by Prometheus as Go templates, so the `docker inspect
    # --format` placeholders must be emitted as literal text. They are wrapped in
    # template raw strings (backticks); written bare, Prometheus would try to
    # evaluate `.HostConfig` on its own template data and fail to expand the annotation.
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker inspect {{ $labels.container_name }} --format \"NanoCpus={{ `{{.HostConfig.NanoCpus}}` }} Memory={{ `{{.HostConfig.Memory}}` }}\"; docker stats --no-stream {{ $labels.container_name }}'"
    runbook: "先盤點 workload再在 compose/service-specific playbook 補 cpus/mem_limit。不可盲目降 ClickHouse/Kafkamomo app/scheduler 可用 2 core/2GiB 起步。"
- alert: DockerGiteaActionsJobStale
  # 2026-05-05 ogt + Codex: Gitea/act job containers can outlive workflow timeout and keep 110 hot.
  # Fires when a GITEA-ACTIONS-* container started more than 1200 s (20 min) ago.
  expr: time() - docker_container_started_seconds{container_name=~"GITEA-ACTIONS-.*"} > 1200
  for: 5m
  labels:
    alert_category: infrastructure
    auto_repair: "true"
    layer: docker-110
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'"
    description: "{{ $labels.container_name }} 已超過 20 分鐘110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
    runbook: "先執行 dry-run清理腳本會依 workflow/job 名稱套停止門檻。若 logs 空白且超過該 job timeout buffer再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
    summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
- alert: SystemdRunnerRestartSpike
  # 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage.
  # Fires when an actions.runner.* unit restarted more than 2 times within 15 minutes.
  expr: increase(systemd_unit_restarts_total{unit=~"actions\\.runner\\..*"}[15m]) > 2
  for: 3m
  labels:
    alert_category: infrastructure
    auto_repair: "true"
    layer: systemd-110
    notification_type: TYPE-3
    severity: critical
    team: ops
  annotations:
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p NRestarts -p DropInPaths -p CPUQuotaPerSecUSec -p MemoryMax; journalctl -u {{ $labels.unit }} --since \"20 minutes ago\" --no-pager | tail -120'"
    description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。"
    runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設,需人工或 sudo playbook 執行 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"
    summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次"
- alert: SystemdRunnerWatchdogEnabled
  # Fires when an actions.runner.* unit reports a non-zero watchdog interval for 10 minutes.
  expr: systemd_unit_watchdog_seconds{unit=~"actions\\.runner\\..*"} > 0
  for: 10m
  labels:
    alert_category: infrastructure
    auto_repair: "true"
    layer: systemd-110
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p DropInPaths -p NRestarts'"
    description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。"
    runbook: "確認 drop-in 來源;需要 sudo 時由人工套用 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"
    summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec"
- alert: SystemdRunnerMissingResourceQuota
  # Fires when an actions.runner.* unit reports a CPU quota or a memory max of 0
  # for 30 minutes.
  expr: systemd_unit_cpu_quota_cores{unit=~"actions\\.runner\\..*"} == 0 or systemd_unit_memory_max_bytes{unit=~"actions\\.runner\\..*"} == 0
  for: 30m
  labels:
    alert_category: infrastructure
    auto_repair: "true"
    layer: systemd-110
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState'"
    description: "{{ $labels.unit }} 仍為 unlimitedCI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。"
    runbook: "建議 baseline每個 runner CPUQuota=200%、MemoryMax=2G由 /home/wooo/scripts/apply-runner-systemd-guardrails.sh 套用,若仍過載再限制並行度或分流。"
    summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota"
# =========================================================================
# MinIO / Kali 告警
# =========================================================================
- name: minio_kali_alerts
rules:
- alert: MinIODown
  # Fires when the blackbox HTTP probe whose instance contains "9001" fails for 2 minutes.
  expr: probe_success{job="blackbox-http", instance=~".*9001.*"} == 0
  for: 2m
  labels:
    auto_repair: "true"
    component: minio
    host: "188"
    layer: docker-188
    severity: warning
    team: ops
  annotations:
    description: "MinIO (192.168.0.188:9001) 已離線超過 2 分鐘Velero 備份可能失敗"
    summary: "MinIO (Velero 備份) 離線"
- alert: KaliScannerDown
  # Fires when the blackbox HTTP probe whose instance matches 192.168.0.112
  # fails for 5 minutes. NOTE(review): layer is docker-188 while host is "112" — confirm.
  expr: probe_success{job="blackbox-http", instance=~".*192.168.0.112.*"} == 0
  for: 5m
  labels:
    auto_repair: "false"
    component: kali
    host: "112"
    layer: docker-188
    severity: info
    team: ops
  annotations:
    description: "Kali (192.168.0.112:8080) 離線,安全掃描功能暫停"
    summary: "Kali Scanner 離線"
# =========================================================================
# Plan C — 外部網站監控 (Sprint 5.2, 2026-04-09 Claude Sonnet 4.6 Asia/Taipei)
# blackbox-http 已涵蓋 4 個外部網站,此群組提供結構化告警
# auto_repair: "true" — 由 AWOOOI Guardrail 決策(Service Registry 分級)
# =========================================================================
- name: external_website_alerts
rules:
- alert: MoWoooWorkDown
  # Fires when the blackbox HTTP probe of https://mo.wooo.work fails for 3 minutes.
  # auto_repair is "false": the annotations direct diagnosis before any restart.
  expr: probe_success{job="blackbox-http", instance="https://mo.wooo.work"} == 0
  for: 3m
  labels:
    auto_repair: "false"
    component: momo-pro-system
    host: "188"
    layer: external
    severity: critical
    team: ops
  annotations:
    description: "mo.wooo.work public route 探測失敗超過 3 分鐘;先檢查 https://mo.wooo.work/health、188 local http://127.0.0.1:5003/health 與容器 momo-pro-system不可只因 502 盲目重啟。"
    runbook: "先比對 public health、188 local app health、momo-pro-system / momo-db / momo-scheduler 狀態與最新 import/data freshness若 public 502 但 local healthy優先查 188 Nginx / upstream / TLS不直接重啟容器。"
    summary: "外部網站 mo.wooo.work 離線"
- alert: TsenyangWebsiteDown
  # Fires when the blackbox HTTP probe of https://www.tsenyang.com fails for 3 minutes.
  expr: probe_success{job="blackbox-http", instance="https://www.tsenyang.com"} == 0
  for: 3m
  labels:
    auto_repair: "true"
    component: tsenyang-website
    host: "188"
    layer: external
    severity: critical
    team: ops
  annotations:
    description: "tsenyang.com 探測失敗超過 3 分鐘,容器 tsenyang-website (188) 可能需要重啟"
    summary: "外部網站 tsenyang.com 離線"
- alert: StockWoooWorkDown
  # Fires when the blackbox HTTP probe of http://stock.wooo.work fails for 3 minutes.
  expr: probe_success{job="blackbox-http", instance="http://stock.wooo.work"} == 0
  for: 3m
  labels:
    auto_repair: "true"
    component: stock-platform
    host: "110"
    layer: external
    severity: critical
    team: ops
  annotations:
    description: "stock.wooo.work 探測失敗超過 3 分鐘,容器 stock-platform (110) 可能需要重啟"
    summary: "外部網站 stock.wooo.work 離線"
- alert: BitanWoooWorkDown
  # Fires when the blackbox HTTP probe of https://bitan.wooo.work fails for 3 minutes.
  expr: probe_success{job="blackbox-http", instance="https://bitan.wooo.work"} == 0
  for: 3m
  labels:
    auto_repair: "true"
    component: bitan-app
    host: "188"
    layer: external
    severity: critical
    team: ops
  annotations:
    description: "bitan.wooo.work 探測失敗超過 3 分鐘,容器 bitan-app (188) 可能需要重啟"
    summary: "外部網站 bitan.wooo.work 離線"
- alert: ExternalSiteSSLExpiringSoon
  # Fires when the earliest certificate seen by a blackbox HTTP probe expires
  # in less than 14 days (14 * 24 * 3600 s), sustained for 1 hour.
  expr: probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time() < 14 * 24 * 3600
  for: 1h
  labels:
    auto_repair: "false"
    component: ssl
    layer: external
    severity: warning
    team: ops
  annotations:
    description: "{{ $labels.instance }} SSL 憑證將在 14 天內到期,請手動更新"
    summary: "SSL 憑證即將到期: {{ $labels.instance }}"
# =============================================================================
# ADR-075 新增規則群組 (2026-04-12 ogt)
# =============================================================================
- name: awoooi_secops_alerts
interval: 60s
rules:
- alert: UnauthorizedSSHLogin
  # Fires when the failed-auth counter grows by more than 10 within 5 minutes.
  expr: increase(node_failed_auth_attempts_total[5m]) > 10
  for: 1m
  labels:
    alert_category: secops
    auto_repair: "false"
    layer: systemd-188
    severity: critical
    team: security
  annotations:
    description: "5 分鐘內失敗登入 {{ $value }} 次,可能遭受暴力破解"
    summary: "異常 SSH 登入嘗試: {{ $labels.instance }}"
- name: awoooi_business_alerts
interval: 60s
rules:
- alert: AITokenCostSpike
  # Fires when the AI token cost counter grows by more than 10 (USD) within 1 hour.
  expr: increase(awoooi_ai_token_cost_usd_total[1h]) > 10
  for: 5m
  labels:
    alert_category: business
    auto_repair: "false"
    layer: k8s
    severity: warning
    team: finops
  annotations:
    description: "AI API 調用費用異常,請檢查是否有迴圈或濫用"
    summary: "AI Token 費用 1 小時內暴增 ${{ $value | humanize }}"
- alert: GeminiAPIErrorRateHigh
  # Fires when Gemini request errors exceed 20% of Gemini requests (5m rates)
  # for 10 minutes.
  expr: rate(awoooi_ai_request_errors_total{provider="gemini"}[5m]) / rate(awoooi_ai_requests_total{provider="gemini"}[5m]) > 0.2
  for: 10m
  labels:
    alert_category: business
    auto_repair: "false"
    layer: k8s
    severity: warning
    team: finops
  annotations:
    description: "Gemini API 5 分鐘錯誤率超過 20%AI 降級可能失效"
    summary: "Gemini API 錯誤率過高: {{ $value | humanizePercentage }}"
# ADR-075: 業務爬蟲健康 (2026-04-12 ogt)
- alert: MomoScraperSuccessLow
  # Fires when fewer than 90% of Momo scraper requests succeed (5m rates) for 10 minutes.
  # Both sides are aggregated with sum() before dividing. Without aggregation the
  # binary operator matches series on identical label sets, so the status="success"
  # numerator only pairs with the status="success" denominator and the ratio is
  # always 1 — the alert could never fire.
  expr: |
    sum(rate(momo_scraper_requests_total{status="success"}[5m]))
      / sum(rate(momo_scraper_requests_total[5m])) < 0.9
  for: 10m
  labels:
    severity: warning
    layer: docker-110
    auto_repair: "false"
    alert_category: business
    notification_type: TYPE-3
  annotations:
    summary: "Momo 抓取成功率跌至 {{ $value | humanizePercentage }}"
    description: "Momo 爬蟲成功率低於 90%,業務資料可能缺失"
- name: awoooi_flywheel_meta_alerts
interval: 60s
rules:
- alert: FlywheelPlaybookZero
  # Fires when the flywheel playbook count has been 0 for 1 hour.
  expr: awoooi_flywheel_playbook_count == 0
  for: 1h
  labels:
    alert_category: flywheel_health
    auto_repair: "false"
    layer: k8s
    notification_type: TYPE-8M
    severity: critical
    team: aiops
  annotations:
    description: "Redis 中無任何已批准 Playbook自動修復能力大幅降低"
    runbook: "執行 scripts/cold_start_playbooks.py 冷啟動"
    summary: "飛輪 Playbook 數量為零AI 修復完全依賴 LLM"
# 2026-05-03 ogt + Claude Opus 4.7(亞太)— anti-silencing 補配對告警
# NaN sentinel 不會被 < 0.1 誤觸;下方 FlywheelExecutionRateMissing 補「無資料」獨立告警
- alert: FlywheelExecutionSuccessLow
expr: awoooi_flywheel_execution_success_rate < 0.1
for: 2h
labels:
severity: warning
layer: k8s
team: aiops
auto_repair: "false"
alert_category: flywheel_health
notification_type: TYPE-8M
annotations:
summary: "飛輪執行成功率 {{ $value | humanizePercentage }} 低於 10%"
description: "連續 2 小時執行成功率不足 10%Playbook 可能已過時"
runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態"
- alert: FlywheelExecutionRateMissing
expr: absent(awoooi_flywheel_execution_success_rate) or (awoooi_flywheel_execution_success_rate != awoooi_flywheel_execution_success_rate)
for: 30m
labels:
severity: warning
layer: k8s
team: aiops
auto_repair: "false"
alert_category: flywheel_health
notification_type: TYPE-8M
annotations:
summary: "飛輪執行率指標 30 分鐘無資料"
description: "execution_success_rate 連續 30 分鐘為 NaN 或不存在Redis playbook 統計斷流(資料管線壞 / Redis flush / FlywheelStatsService 異常)"
runbook: "1) 檢查 Redis playbook:* keys 2) 檢查 FlywheelStatsService 日誌 3) curl /metrics 直接拉看 NaN 來源"
- alert: FlywheelKMVectorizationLow
expr: awoooi_flywheel_km_unvectorized_count > 10
for: 30m
labels:
severity: warning
layer: k8s
team: aiops
auto_repair: "false"
alert_category: flywheel_health
notification_type: TYPE-8M
annotations:
summary: "{{ $value }} 筆 KM 未向量化RAG 查詢命中率下降"
description: "knowledge_entries 中 embedding IS NULL 超過 10 筆且持續 30 分鐘"
runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態"
- alert: FlywheelAlertnameNullHigh
expr: awoooi_flywheel_alertname_null_rate > 0.05
for: 30m
labels:
severity: warning
layer: k8s
team: aiops
auto_repair: "false"
alert_category: flywheel_health
notification_type: TYPE-8M
annotations:
summary: "飛輪 alertname NULL 率超過 5%"
description: "alertname NULL 率 {{ $value | humanizePercentage }},影響路由準確性。"
runbook: "執行 scripts/backfill_alertname.py 回填"
- alert: FlywheelIncidentsStuck
expr: awoooi_flywheel_incidents_stuck > 5
for: 10m
labels:
severity: warning
layer: k8s
team: aiops
auto_repair: "false"
alert_category: flywheel_health
notification_type: TYPE-8M
annotations:
summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24h"
description: "飛輪推理匹配節點可能堵塞,需人工清理或重新觸發診斷"
# =========================================================================
# 備份還原告警 (awoooi_backup_restore) — 從主機補回 2026-04-12
# =========================================================================
- name: awoooi_backup_restore
  # Rules in this group are evaluated once per hour.
  interval: 1h
  rules:
    # Restore-test success gauge equals 0, held for 5m.
    - alert: BackupRestoreTestFailed
      expr: >-
        awoooi_backup_restore_test_success == 0
      for: 5m
      labels:
        alert_category: infrastructure
        auto_repair: "false"
        notification_type: TYPE-3
        severity: critical
      annotations:
        summary: "備份還原 dry-run 測試失敗"
        description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。"
        runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"

    # Last restore-test timestamp is older than 691200 s (8 days), held for 10m.
    - alert: BackupRestoreTestStale
      expr: >-
        (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
      for: 10m
      labels:
        alert_category: infrastructure
        auto_repair: "false"
        notification_type: TYPE-3
        severity: warning
      annotations:
        summary: "備份還原測試超過 8 天未執行"
        description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
        runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
# =========================================================================
# 基礎設施詳細告警 (awoooi_infrastructure_detailed) — 從主機補回 2026-04-12
# =========================================================================
- name: awoooi_infrastructure_detailed
  interval: 60s
  rules:
    # Fires when a named container on instance 192.168.0.188* has not been
    # seen for more than 120 s, sustained for 5m.
    #
    # The previous expression also had a
    #   count by (name, instance) (container_tasks_state{state="running", ...}) == 0
    # branch. `count` over existing series is always >= 1 and yields nothing
    # when no series exist, so `== 0` could never match; the branch was dead
    # code and has been removed. Runtime behaviour is unchanged.
    # NOTE(review): if a real "no running tasks" check is wanted, first verify
    # that this cAdvisor actually populates container_tasks_state before
    # adding a `sum by (name, instance) (...) == 0` condition.
    - alert: DockerContainerUnhealthyDetailed
      expr: |
        container_last_seen{instance=~"192.168.0.188.*", name!=""} < (time() - 120)
      for: 5m
      labels:
        severity: warning
        layer: docker-188
        alert_category: infrastructure
        notification_type: TYPE-3
        auto_repair: "false"
      annotations:
        summary: "188 主機容器 {{ $labels.name }} 異常"
        description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動或不在 running 狀態。"
        runbook: "SSH 到 192.168.0.188docker inspect {{ $labels.name }} 確認健康狀態"
    # Fires when a Redis stream length gauge stays above 500 for 10m.
    - alert: RedisStreamBacklogHigh
      expr: awoooi_redis_stream_len > 500
      for: 10m
      labels:
        severity: warning
        layer: docker-188
        alert_category: infrastructure
        notification_type: TYPE-3
        auto_repair: "false"
      annotations:
        summary: "Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆"
        description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。"
        runbook: "檢查 consumer group lagXINFO GROUPS <stream-key>"
    # 2026-04-19 Hermes E3 decision: PostgreSQLDiskGrowthRate deprecated.
    # Root cause: 500MB/h growth is normal PG WAL behaviour
    # (commits/checkpoints) and should not alert.
    # It fired 7 times in the past 30d; each time the AI either judged
    # NO_ACTION or mis-diagnosed it and a kubectl rollout restart failed.
    # Commander decision (2026-04-19 18:xx Taipei): option C — delete the old
    # rule and alert on absolute disk usage instead.
    # -----------------------------------------------------------------
    # Root filesystem used percentage above 80 for 10m.
    - alert: HostDiskUsageHigh
      expr: |
        (
          node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
          - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
        )
        / node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
        * 100 > 80
      for: 10m
      labels:
        severity: warning
        layer: systemd-188
        alert_category: host_resource
        notification_type: TYPE-3
        # 2026-05-02 ogt + Claude Sonnet 4.6: ADR-068 flywheel — disk full SOP.
        # auto_repair: false -> true, routed to ssh_host MCP Group B
        # `ssh_docker_prune`. The tool has a built-in >=75% disk guard and is
        # a no-op below that threshold, to avoid accidental deletion.
        auto_repair: "true"
        mcp_provider: "ssh_host"
        host_type: "bare_metal"
        supersedes: PostgreSQLDiskGrowthRate
      annotations:
        summary: "主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>80%)"
        description: "磁碟使用率持續 10 分鐘超過 80%, 需清理或擴容. 常見原因: PG WAL, 日誌, container images, 舊 build cache."
        auto_repair_action: "ssh {{ $labels.instance }} docker prune (image+volume+builder; gated by 75% disk usage)"
        runbook: "SSH 該主機: df -h / && du -sh /var/lib/postgresql/*/pg_wal /var/log /var/lib/docker"
    # Root filesystem used percentage above 90 for 5m.
    # NOTE(review): auto_repair is "true" here, but unlike HostDiskUsageHigh
    # this rule sets no mcp_provider / host_type labels — confirm the repair
    # router does not need them for this alert.
    - alert: HostDiskUsageCritical
      expr: |
        (
          node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
          - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
        )
        / node_filesystem_size_bytes{fstype!~"tmpfs|overlay", mountpoint="/"}
        * 100 > 90
      for: 5m
      labels:
        severity: critical
        layer: systemd-188
        alert_category: infrastructure
        notification_type: TYPE-3
        auto_repair: "true"
        supersedes: PostgreSQLDiskGrowthRate
      annotations:
        summary: "🔴 主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>90%, critical)"
        description: "磁碟即將滿, 需立即清理. 超過 95% 可能導致服務中斷."
        runbook: "立即 SSH 該主機: df -h / && du -sh /* 2>/dev/null | sort -h | tail -10"
# =========================================================================
# 主機連通性告警 (awoooi_host_connectivity) — 從主機補回 2026-04-12
# =========================================================================
- name: awoooi_host_connectivity
  interval: 60s
  rules:
    # probe_success for job "host-connectivity" has been 0 for 5m.
    - alert: HostNetworkPartition
      expr: >-
        probe_success{job="host-connectivity"} == 0
      for: 5m
      labels:
        alert_category: infrastructure
        auto_repair: "true"
        layer: systemd-188
        notification_type: TYPE-3
        severity: critical
      annotations:
        summary: "主機 {{ $labels.instance }} 無法連通"
        description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。"
        runbook: "SSH 檢查路由和防火牆規則"
# =========================================================================
# 監控工具自監控 (infra_self_monitoring) — ADR-090 Phase 7
# 2026-04-19 Claude Opus 4.7 / 鐵律:監控工具必須被監控
# 設計:不寫死 CPU% 或 MB 數,改用 (配額佔比) + (throttle 訊號) 動態判斷
# 配額由 docker-compose 宣告,告警條件 = 使用量 / 配額 > 0.8
# 比寫死 80% 更智能 — 配額改告警閾值自動跟著變
# =========================================================================
- name: infra_self_monitoring
interval: 1m
rules:
# --- cadvisor 自監控 ---
# Any scrape target whose job name contains "cadvisor" has been down for 5m.
# NOTE(review): layer "docker-110-188" is not one of the values listed in the
# label convention at the top of the file — confirm routing accepts it.
- alert: CadvisorDown
  expr: >-
    up{job=~".*cadvisor.*"} == 0
  for: 5m
  labels:
    alert_category: infrastructure
    auto_repair: "true"
    component: cadvisor
    layer: docker-110-188
    notification_type: TYPE-3
    severity: critical
    team: ops
  annotations:
    summary: "cAdvisor ({{ $labels.instance }}) 停擺"
    description: "主機 {{ $labels.instance }} 的 cadvisor 已停擺 5 分鐘,容器監控中斷。"
    runbook: "SSH 主機 docker compose up -d cadvisor檢查 OOMKill 訊號"
# cadvisor working-set memory above 80% of its memory limit for 10m.
#
# Uses container_memory_working_set_bytes rather than
# container_memory_usage_bytes: usage_bytes includes page cache and reads
# artificially high, which is the false-alarm cause already documented for the
# Sentry ClickHouse and Gitea memory rules in this group. This rule is now
# consistent with them.
- alert: CadvisorMemoryPressure
  expr: container_memory_working_set_bytes{name="cadvisor"} / container_spec_memory_limit_bytes{name="cadvisor"} > 0.8
  for: 10m
  labels:
    severity: warning
    component: cadvisor
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "false"
  annotations:
    summary: "cAdvisor 記憶體使用率 > 80% limit"
    description: "cadvisor 記憶體用量 / mem_limit = {{ $value | humanizePercentage }},接近 OOMKill。"
    runbook: "若頻繁觸發 → 檢查 cardinality 是否持續成長,考慮調整 --disable_metrics"
# cadvisor's CFS throttled-seconds counter grows faster than 0.5 s per second
# (5m rate), sustained for 15m.
- alert: CadvisorCPUThrottled
  expr: >-
    rate(container_cpu_cfs_throttled_seconds_total{name="cadvisor"}[5m]) > 0.5
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: cadvisor
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: "cAdvisor CPU 被 throttle配額不足"
    description: "cadvisor 每秒被 throttle {{ $value }} 秒,表示實際需求超過 cpus 配額。"
    runbook: "調高 docker-compose cpus 設定,或檢查 scrape interval / cardinality"
# --- node-exporter 自監控 ---
# A target whose job name starts with "node-exporter" or "node_exporter" has
# been down for 5m.
- alert: NodeExporterDown
  expr: >-
    up{job=~"node-exporter.*|node_exporter.*"} == 0
  for: 5m
  labels:
    alert_category: infrastructure
    auto_repair: "true"
    component: node-exporter
    notification_type: TYPE-3
    severity: critical
    team: ops
  annotations:
    summary: "node-exporter ({{ $labels.instance }}) 停擺"
    description: "主機 {{ $labels.instance }} node-exporter 已停擺 5 分鐘,主機 metrics 中斷。"
    runbook: "SSH 主機檢查 docker ps node-exporter重啟 docker compose up -d node-exporter"
# node-exporter's CFS throttled-seconds counter grows faster than 0.5 s per
# second (5m rate), sustained for 15m.
- alert: NodeExporterCPUThrottled
  expr: >-
    rate(container_cpu_cfs_throttled_seconds_total{name="node-exporter"}[5m]) > 0.5
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: node-exporter
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: "node-exporter CPU 被 throttle配額不足"
    description: "node-exporter 每秒被 throttle {{ $value }} 秒。可能 collector 未適度 disable。"
    runbook: "檢查 node-exporter --collector.* flags 是否該關掉閒置硬體 probe"
# --- Sentry self-hosted 自監控(110)---
# 2026-04-25 ogt + Claude Opus 4.7: 修正假告警根因
# 舊規則用 container_memory_usage_bytes(含 page cache),導致 ClickHouse
# 執行大查詢時 OS 把 SSTable 緩存進 page cache,比例衝到 88.5% 觸發誤報
# (2026-04-23 23:13 鐵證:usage_bytes=88.5% / working_set=7.8%)。
# 改用 container_memory_working_set_bytes — 這才是 K8s/Docker OOM killer
# 實際追蹤的「真實工作集」(RSS + active page cache),不含 inactive page cache。
# 參考: https://github.com/google/cadvisor/blob/master/info/v1/container.go
# Working-set memory of containers named like *sentry*clickhouse* exceeds 85%
# of their memory limit for 10m.
- alert: SentryClickHouseMemoryPressure
  expr: >-
    container_memory_working_set_bytes{name=~".*sentry.*clickhouse.*"}
    / container_spec_memory_limit_bytes{name=~".*sentry.*clickhouse.*"}
    > 0.85
  for: 10m
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: sentry-clickhouse
    notification_type: TYPE-1
    severity: warning
    team: platform
  annotations:
    summary: "Sentry ClickHouse 工作集記憶體 > 85% limit"
    description: "sentry clickhouse working_set / mem_limit = {{ $value | humanizePercentage }} (排除 page cache)。"
    runbook: "檢查 Sentry 查詢壓力;確認非 page cache 假象;必要時調整 /opt/sentry/docker-compose.override.yml clickhouse mem_limit"
# CFS throttled-seconds rate (5m) of containers named like *sentry*clickhouse*
# exceeds 1.0 s per second, sustained for 15m.
- alert: SentryClickHouseCPUThrottled
  expr: >-
    rate(container_cpu_cfs_throttled_seconds_total{name=~".*sentry.*clickhouse.*"}[5m]) > 1.0
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: sentry-clickhouse
    notification_type: TYPE-1
    severity: warning
    team: platform
  annotations:
    summary: "Sentry ClickHouse CPU 持續被 throttle"
    description: "每秒 throttle {{ $value }} 秒,配額 cpus=4.0 可能不足。"
    runbook: "檢查 Sentry retention / query pattern必要時調高 override.yml cpus"
# --- Gitea 自監控 ---
# gitea working-set memory above 85% of its memory limit for 10m.
- alert: GiteaMemoryPressure
  # 2026-04-25 ogt + Claude Sonnet 4.6 — same false-alarm root cause as the
  # ClickHouse rule: container_memory_usage_bytes includes page cache
  # (OS-managed, inactive, ignored by the OOM killer) and so reads
  # artificially high. Switched to container_memory_working_set_bytes
  # (RSS + active cache, i.e. real pressure); cadvisor exposes it for both
  # Docker and K8s.
  expr: >-
    container_memory_working_set_bytes{name="gitea"}
    / container_spec_memory_limit_bytes{name="gitea"}
    > 0.85
  for: 10m
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: gitea
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: "Gitea 記憶體工作集 > 85% limit"
    description: "gitea working_set / mem_limit = {{ $value | humanizePercentage }}(真實記憶體壓力,非 page cache 干擾)。"
    runbook: "檢查 CI/CD 任務堆積;必要時調高 docker-compose mem_limit"
# CFS throttled-seconds rate (5m) of the "gitea" or "gitea-runner" container
# exceeds 1.0 s per second, sustained for 15m.
- alert: GiteaCPUThrottled
  expr: >-
    rate(container_cpu_cfs_throttled_seconds_total{name=~"gitea|gitea-runner"}[5m]) > 1.0
  for: 15m
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: gitea
    notification_type: TYPE-1
    severity: warning
    team: ops
  annotations:
    summary: "Gitea / Runner CPU 持續被 throttle"
    description: "{{ $labels.name }} 每秒 throttle {{ $value }} 秒CD peak 可能卡關。"
    runbook: "檢查 job 並行度;考慮縮減並行或調高 cpus"
# --- 監控自監控元層(Prometheus 本身)---
# The scrape target with job "prometheus" has been down for 2m.
# NOTE(review): if the Prometheus instance evaluating this rule is the only
# target under job="prometheus", the rule cannot fire while that instance is
# down — confirm an external heartbeat / dead-man's-switch covers that case.
- alert: PrometheusDown
  expr: >-
    up{job="prometheus"} == 0
  for: 2m
  labels:
    alert_category: infrastructure
    auto_repair: "false"
    component: prometheus
    notification_type: TYPE-3
    severity: critical
    team: ops
  annotations:
    summary: "Prometheus ({{ $labels.instance }}) 停擺"
    description: "Prometheus 自己停擺 → 所有其他告警失效"
    runbook: "SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus"