All checks were successful
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 39s
飛輪一直 GUARDRAIL_BLOCKED 的根本原因: Prometheus rule 標籤 auto_repair=false 強制 HITL Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
621 lines
22 KiB
YAML
621 lines
22 KiB
YAML
# ops/monitoring/alerts-unified.yml
|
||
# AWOOOI 統一 Prometheus 告警規則
|
||
# 2026-04-05 Claude Code: 整合所有規則,加入統一 layer 標籤
|
||
# 2026-04-08 Claude Sonnet 4.6: 補 database_detail_alerts 群組 (6條詳細規則)
|
||
# 部署目標: 192.168.0.110:/home/wooo/monitoring/alerts.yml
|
||
# 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
|
||
#
|
||
# 標籤規範:
|
||
# layer: k8s | docker-110 | docker-188 | systemd-188
|
||
# component: 服務名稱
|
||
# team: ops | backend | ai | platform
|
||
# host: "110" | "188" | "120" | "121"
|
||
# auto_repair: "true" | "false"
|
||
|
||
groups:
|
||
|
||
# =========================================================================
|
||
# 主機層告警 (host_alerts)
|
||
# =========================================================================
|
||
- name: host_alerts
  # Host-level rules evaluated against node-exporter metrics.
  # NOTE(review): all four rules carry the static label layer "systemd-188",
  # yet none of the expressions is limited to a single host -- confirm that
  # the fixed layer value is intended for alerts coming from other hosts.
  rules:
    # A scrape target whose job matches node-exporter.* has reported
    # up == 0 for 1 minute.
    - alert: HostDown
      expr: up{job=~"node-exporter.*"} == 0
      for: 1m
      labels:
        severity: critical
        layer: systemd-188
        team: ops
        auto_repair: "false"
      annotations:
        summary: "主機 {{ $labels.host }} 不可達"
        description: "Node Exporter 無回應超過 1 分鐘"

    # 100 minus the per-host average idle-CPU rate (5m window, expressed as
    # a percentage) has stayed above 80 for 5 minutes.
    - alert: HostHighCpuLoad
      expr: >-
        100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m]))
        * 100) > 80
      for: 5m
      labels:
        severity: warning
        layer: systemd-188
        team: ops
        auto_repair: "true"
      annotations:
        summary: "主機 {{ $labels.host }} CPU 高負載"
        description: "CPU 使用率超過 80%"

    # Used-memory percentage, derived from MemAvailable / MemTotal, has
    # stayed above 85 for 5 minutes.
    - alert: HostOutOfMemory
      expr: >-
        (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
        * 100 > 85
      for: 5m
      labels:
        severity: warning
        layer: systemd-188
        team: ops
        auto_repair: "false"
      annotations:
        summary: "主機 {{ $labels.host }} 記憶體不足"
        description: "記憶體使用率超過 85%"

    # Used-space percentage of any filesystem other than tmpfs/overlay has
    # stayed above 85 for 5 minutes.
    - alert: HostOutOfDiskSpace
      expr: >-
        (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
        / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85
      for: 5m
      labels:
        severity: warning
        layer: systemd-188
        team: ops
        auto_repair: "false"
      annotations:
        summary: "主機 {{ $labels.host }} 磁碟空間不足"
        description: "磁碟使用率超過 85%"
|
||
|
||
# =========================================================================
|
||
# K8s 叢集告警 (kubernetes_alerts)
|
||
# =========================================================================
|
||
- name: kubernetes_alerts
  # Cluster-level rules: node readiness, workload health in the
  # awoooi-prod namespace, and Velero backup status.
  rules:
    # The Ready=true condition series of a node has been 0 for 2 minutes.
    - alert: K3sNodeNotReady
      expr: kube_node_status_condition{condition="Ready", status="true"} == 0
      for: 2m
      labels:
        severity: critical
        layer: k8s
        team: ops
        auto_repair: "false"
      annotations:
        summary: "K3s 節點 {{ $labels.node }} 未就緒"
        description: "節點超過 2 分鐘未達到 Ready 狀態"

    # The container restart counter in awoooi-prod has a non-zero rate over
    # the last 15 minutes, sustained for 5 minutes.
    - alert: KubePodCrashLooping
      expr: >-
        rate(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m])
        > 0
      for: 5m
      labels:
        severity: warning
        layer: k8s
        team: ops
        auto_repair: "true"
      annotations:
        summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 持續重啟"
        description: "Pod 在過去 15 分鐘內重啟次數異常"

    # A pod in awoooi-prod has not been Ready for 5 minutes.
    # The join on kube_pod_status_phase restricts the alert to pods that are
    # in phase Running, which is what the description promises; without it,
    # pods that have finished (e.g. completed Job pods) also report
    # Ready == 0 and would fire this alert indefinitely.
    - alert: KubePodNotReady
      expr: >-
        kube_pod_status_ready{condition="true",namespace="awoooi-prod"} == 0
        and on(namespace, pod)
        kube_pod_status_phase{phase="Running",namespace="awoooi-prod"} == 1
      for: 5m
      labels:
        severity: warning
        layer: k8s
        team: ops
        auto_repair: "true"
      annotations:
        summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 未就緒"
        description: "Running 中的 Pod 超過 5 分鐘未達到 Ready 狀態"

    # Desired and available replica counts of a deployment in awoooi-prod
    # have differed for 10 minutes.
    - alert: KubeDeploymentReplicasMismatch
      expr: >-
        kube_deployment_spec_replicas{namespace="awoooi-prod"}
        != kube_deployment_status_replicas_available{namespace="awoooi-prod"}
      for: 10m
      labels:
        severity: warning
        layer: k8s
        team: ops
        auto_repair: "true"
      annotations:
        summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} 副本數不匹配"
        description: "期望副本數與可用副本數不一致超過 10 分鐘"

    # The Velero backup failure counter increased within the last 24 hours.
    - alert: VeleroBackupFailed
      expr: increase(velero_backup_failure_total[24h]) > 0
      for: 5m
      labels:
        severity: warning
        layer: k8s
        team: ops
        component: velero
        auto_repair: "false"
      annotations:
        summary: "Velero 備份失敗"
        description: "過去 24 小時有備份失敗"

    # More than 86400 seconds (24 hours) have passed since the last
    # successful Velero backup timestamp.
    - alert: VeleroBackupNotRun
      expr: time() - velero_backup_last_successful_timestamp > 86400
      for: 10m
      labels:
        severity: critical
        layer: k8s
        team: ops
        component: velero
        auto_repair: "false"
      annotations:
        summary: "Velero 超過 24 小時未成功備份"
        description: "最後一次成功備份超過 24 小時"
|
||
|
||
# =========================================================================
|
||
# 資料庫告警 (database_alerts)
|
||
# =========================================================================
|
||
- name: database_alerts
  # Availability and saturation rules for PostgreSQL and Redis, both
  # labelled as running on host 188 (layer systemd-188).
  rules:
    # Either the postgres-exporter scrape is down or the exporter reports
    # that it cannot reach the database (pg_up == 0), for 1 minute.
    - alert: PostgreSQLDown
      expr: up{job="postgres-exporter"} == 0 or pg_up == 0
      for: 1m
      labels:
        severity: critical
        layer: systemd-188
        component: postgres
        host: "188"
        team: ops
        auto_repair: "false"
      annotations:
        summary: "PostgreSQL 資料庫離線"
        description: "PostgreSQL Exporter 無法連接資料庫超過 1 分鐘"

    # Either the redis-exporter scrape is down or the exporter reports that
    # it cannot reach Redis (redis_up == 0), for 1 minute.
    - alert: RedisDown
      expr: up{job="redis-exporter"} == 0 or redis_up == 0
      for: 1m
      labels:
        severity: critical
        layer: systemd-188
        component: redis
        host: "188"
        team: ops
        auto_repair: "false"
      annotations:
        summary: "Redis 快取服務離線"
        description: "Redis Exporter 無法連接 Redis 超過 1 分鐘"

    # Total PostgreSQL connections per exporter instance above 80 for
    # 5 minutes.
    # pg_stat_activity_count is exported as separate series (per database
    # and connection state), so the series are summed before the threshold
    # is applied; comparing each series on its own would only fire when a
    # single database/state combination exceeds 80, and $value in the
    # description would not be the total connection count.
    - alert: PostgreSQLHighConnections
      expr: sum by (instance) (pg_stat_activity_count) > 80
      for: 5m
      labels:
        severity: warning
        layer: systemd-188
        component: postgres
        # host added for consistency with the other postgres rule in this
        # group, which carries the same layer/component labels.
        host: "188"
        team: ops
        auto_repair: "false"
      annotations:
        summary: "PostgreSQL 連接數過高"
        description: "當前連接數 {{ $value }} 超過 80"

    # Redis memory usage above 80% of maxmemory for 5 minutes. The leading
    # "redis_memory_max_bytes > 0" guard skips instances with no memory
    # limit configured, where the ratio would be a division by zero.
    - alert: RedisMemoryHigh
      expr: >-
        redis_memory_max_bytes > 0
        and redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
      for: 5m
      labels:
        severity: warning
        layer: systemd-188
        component: redis
        # host added for consistency with the other redis rule in this
        # group, which carries the same layer/component labels.
        host: "188"
        team: ops
        auto_repair: "false"
      annotations:
        summary: "Redis 記憶體使用過高"
        description: "Redis 記憶體使用率超過 80%"
|
||
|
||
# =========================================================================
|
||
# Sprint 5.2 Plan B: 資料庫詳細指標告警 (database_detail_alerts)
|
||
# 前置: postgres-exporter:9187 + redis-exporter:9121 on 192.168.0.188
|
||
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
|
||
# =========================================================================
|
||
- name: database_detail_alerts
  # Detailed PostgreSQL / Redis rules built on postgres-exporter and
  # redis-exporter metrics.
  rules:
    # ---- PostgreSQL detailed metrics ----

    # The longest-running transaction in awoooi_prod has exceeded
    # 60 seconds, sustained for 5 minutes.
    - alert: PostgreSQLSlowQueries
      expr: pg_stat_activity_max_tx_duration{datname="awoooi_prod"} > 60
      for: 5m
      labels:
        severity: warning
        layer: systemd-188
        component: postgres
        host: "188"
        team: ops
        auto_repair: "false"
      annotations:
        summary: "PostgreSQL 有慢查詢 (>60s)"
        description: "awoooi_prod 資料庫最長事務超過 60 秒"

    # The deadlock counter of awoooi_prod increased within the last
    # 5 minutes.
    - alert: PostgreSQLDeadlocks
      expr: increase(pg_stat_database_deadlocks{datname="awoooi_prod"}[5m]) > 0
      for: 1m
      labels:
        severity: warning
        layer: systemd-188
        component: postgres
        host: "188"
        team: ops
        auto_repair: "false"
      annotations:
        summary: "PostgreSQL 死鎖發生"
        description: "過去 5 分鐘 awoooi_prod 資料庫有死鎖"

    # Total connections to awoooi_prod above 50 for 5 minutes.
    # pg_stat_activity_count is exported as separate series per connection
    # state, so the series are summed per instance/database first; the
    # unaggregated comparison would only fire when one single state exceeds
    # 50 and $value in the summary would not be the total.
    - alert: PostgreSQLTooManyConnections
      expr: >-
        sum by (instance, datname)
        (pg_stat_activity_count{datname="awoooi_prod"}) > 50
      for: 5m
      labels:
        severity: warning
        layer: systemd-188
        component: postgres
        host: "188"
        team: ops
        auto_repair: "false"
      annotations:
        summary: "PostgreSQL 連接數過高 ({{ $value }})"
        description: "awoooi_prod 連接數超過 50"

    # ---- Redis detailed metrics ----

    # The evicted-keys counter increased within the last 5 minutes.
    - alert: RedisKeyEviction
      expr: increase(redis_evicted_keys_total[5m]) > 0
      for: 1m
      labels:
        severity: warning
        layer: systemd-188
        component: redis
        host: "188"
        team: ops
        auto_repair: "false"
      annotations:
        summary: "Redis 發生 Key 驅逐"
        description: "過去 5 分鐘有 Key 被驅逐,可能記憶體不足"

    # More than 100 connected clients for 5 minutes.
    - alert: RedisConnectionsHigh
      expr: redis_connected_clients > 100
      for: 5m
      labels:
        severity: warning
        layer: systemd-188
        component: redis
        host: "188"
        team: ops
        auto_repair: "false"
      annotations:
        summary: "Redis 連接數過高 ({{ $value }})"
        description: "Redis 連接數超過 100"

    # Average command latency over the last 5 minutes above 10 ms:
    # seconds spent executing commands divided by commands processed.
    # Both sides are counters, so they are wrapped in rate(); dividing the
    # raw counters gives the lifetime average since the server started,
    # which barely reacts to a current slowdown. Both sides are also
    # aggregated per instance so their label sets match -- the duration
    # counter is presumably split by command while the processed counter is
    # not, in which case the unaggregated division returns no series at all
    # (verify against the deployed redis-exporter version).
    - alert: RedisCommandLatencyHigh
      expr: >-
        sum by (instance) (rate(redis_commands_duration_seconds_total[5m]))
        / sum by (instance) (rate(redis_commands_processed_total[5m])) > 0.01
      for: 5m
      labels:
        severity: warning
        layer: systemd-188
        component: redis
        host: "188"
        team: ops
        auto_repair: "false"
      annotations:
        summary: "Redis 命令平均延遲過高 (>10ms)"
        description: "Redis 命令平均延遲超過 10ms"
|
||
|
||
# =========================================================================
|
||
# 服務可用性告警 (service_alerts) — 含 layer 標籤決定修復路徑
|
||
# =========================================================================
|
||
- name: service_alerts
  # Service availability rules. Each rule carries layer/component/host
  # labels in addition to auto_repair.
  rules:
    # ---- Docker layer on host 188 ----

    # 2026-04-05 Claude Code: alert renamed from the legacy ClawBotDown to
    # OpenClawDown (the scrape job selector below is still "clawbot").
    # Fires when that job has been down for 2 minutes.
    - alert: OpenClawDown
      expr: up{job="clawbot"} == 0
      for: 2m
      labels:
        severity: critical
        layer: docker-188
        component: openclaw
        host: "188"
        team: ops
        auto_repair: "true"
      annotations:
        summary: "OpenClaw 服務離線"
        description: "OpenClaw (192.168.0.188:8088) 已離線超過 2 分鐘"

    # blackbox HTTP probe of any instance containing "3301" has failed for
    # 2 minutes.
    - alert: SignOzDown
      expr: probe_success{job="blackbox-http", instance=~".*3301.*"} == 0
      for: 2m
      labels:
        severity: warning
        layer: docker-188
        component: signoz
        host: "188"
        team: ops
        auto_repair: "true"
      annotations:
        summary: "SignOz 服務離線"
        description: "SignOz (192.168.0.188:3301) 已離線超過 2 分鐘"

    # ---- Docker layer on host 110 ----

    # blackbox TCP probe of 192.168.0.110:9000 has failed for 2 minutes.
    - alert: SentryDown
      expr: >-
        probe_success{job="blackbox-tcp", instance="192.168.0.110:9000"}
        == 0
      for: 2m
      labels:
        severity: warning
        layer: docker-110
        component: sentry
        host: "110"
        team: ops
        auto_repair: "true"
      annotations:
        summary: "Sentry 服務離線"
        description: "Sentry (192.168.0.110:9000) 已離線超過 2 分鐘"

    # blackbox HTTP probe of any instance containing "5000" has failed for
    # 2 minutes.
    - alert: HarborDown
      expr: probe_success{job="blackbox-http", instance=~".*5000.*"} == 0
      for: 2m
      labels:
        severity: critical
        layer: docker-110
        component: harbor
        host: "110"
        team: ops
        auto_repair: "true"
      annotations:
        summary: "Harbor Registry 離線"
        description: "Harbor (192.168.0.110:5000) 已離線超過 2 分鐘,CD pipeline 將無法拉取映像"

    # blackbox HTTP probe of http://192.168.0.110:3001 has failed for
    # 2 minutes.
    - alert: GiteaDown
      expr: >-
        probe_success{job="blackbox-http", instance="http://192.168.0.110:3001"}
        == 0
      for: 2m
      labels:
        severity: critical
        layer: docker-110
        component: gitea
        host: "110"
        team: ops
        auto_repair: "true"
      annotations:
        summary: "Gitea Git 服務離線"
        description: "Gitea (192.168.0.110:3001) 已離線超過 2 分鐘,CD pipeline 失效"

    # blackbox TCP probe of 192.168.0.110:9093 has failed for 2 minutes.
    # NOTE(review): this alert is about Alertmanager itself -- confirm there
    # is a delivery path that still works while Alertmanager is down.
    - alert: AlertmanagerDown
      expr: >-
        probe_success{job="blackbox-tcp", instance="192.168.0.110:9093"}
        == 0
      for: 2m
      labels:
        severity: critical
        layer: docker-110
        component: alertmanager
        host: "110"
        team: ops
        auto_repair: "true"
      annotations:
        summary: "Alertmanager 離線"
        description: "Alertmanager (192.168.0.110:9093) 已離線,所有告警將靜默"
|
||
|
||
# =========================================================================
|
||
# 告警鏈路監控 (alert_chain) — 防止 2026-03-26/04-05 事故重演
|
||
# =========================================================================
|
||
- name: alert_chain
  # Rules that watch the alert delivery chain itself, based on the
  # awoooi_webhook_* and awoooi_alert_chain_* metrics.
  rules:
    # Share of non-success webhook requests from source "alertmanager"
    # (5m rates) above 10%, sustained for 10 minutes.
    - alert: AlertChainBroken_Alertmanager
      expr: |
        sum(rate(awoooi_webhook_requests_total{source="alertmanager",status!="success"}[5m]))
        / sum(rate(awoooi_webhook_requests_total{source="alertmanager"}[5m])) > 0.1
      for: 10m
      labels:
        severity: critical
        layer: k8s
        team: platform
        auto_repair: "false"
      annotations:
        summary: "Alertmanager Webhook 錯誤率 > 10%"
        description: "告警鏈路可能斷裂,請執行 E2E 驗證"

    # Same ratio as above for source "sentry".
    - alert: AlertChainBroken_Sentry
      expr: |
        sum(rate(awoooi_webhook_requests_total{source="sentry",status!="success"}[5m]))
        / sum(rate(awoooi_webhook_requests_total{source="sentry"}[5m])) > 0.1
      for: 10m
      labels:
        severity: warning
        layer: k8s
        team: platform
        auto_repair: "false"
      annotations:
        summary: "Sentry Webhook 錯誤率 > 10%"
        description: "Sentry 錯誤可能無法正確處理"

    # For each source, more than 7200 seconds (2 hours) have passed since
    # the most recent last-success timestamp.
    - alert: NoAlertsReceived2Hours
      expr: >-
        time() - max by (source)(awoooi_alert_chain_last_success_timestamp)
        > 7200
      for: 5m
      labels:
        severity: warning
        layer: k8s
        team: platform
        auto_repair: "false"
      annotations:
        summary: "2 小時內未收到任何告警 ({{ $labels.source }})"
        description: "可能是告警鏈路問題,請執行 Smoke Test"

    # The chain health gauge has been 0 for 5 minutes.
    - alert: AlertChainUnhealthy
      expr: awoooi_alert_chain_healthy == 0
      for: 5m
      labels:
        severity: critical
        layer: k8s
        team: platform
        auto_repair: "false"
      annotations:
        summary: "告警鏈路不健康 ({{ $labels.source }})"
        description: "告警鏈路標記為不健康,最近處理失敗"
|
||
|
||
# =========================================================================
|
||
# 自動修復監控 (auto_repair)
|
||
# =========================================================================
|
||
- name: auto_repair
  # Rules that watch the auto-repair subsystem's own metrics.
  rules:
    # The success-rate metric has been below 0.3 for 30 minutes. The value
    # is rendered with humanizePercentage in the summary, i.e. it is treated
    # as a 0-1 ratio.
    - alert: AutoRepairLowSuccessRate
      expr: awoooi_auto_repair_success_rate < 0.3
      for: 30m
      labels:
        severity: warning
        layer: k8s
        team: backend
        auto_repair: "false"
      annotations:
        summary: "自動修復成功率過低 ({{ $value | humanizePercentage }})"
        description: "動作 {{ $labels.action }} 的成功率低於 30%,建議檢查 Playbook"

    # The escalation counter for level PERMANENT_FIX has a non-zero rate
    # over the last hour.
    - alert: PermanentFixRequired
      expr: >-
        sum(rate(awoooi_anomaly_escalation_total{level="PERMANENT_FIX"}[1h]))
        > 0
      for: 1m
      labels:
        severity: critical
        layer: k8s
        team: backend
        auto_repair: "false"
      annotations:
        summary: "需要永久修復的異常升級"
        description: "有異常升級到 PERMANENT_FIX 級別,需要根本修復"
|
||
|
||
# =========================================================================
|
||
# Sprint 5.1: Docker 容器健康監控(docker-health-monitor 感知層接入)
|
||
# 由 docker-health-monitor.sh 送 Alertmanager 格式 webhook,
|
||
# 或 Prometheus 自訂 exporter 上報時使用。
|
||
# auto_repair: "true" 代表允許 AWOOOI Guardrail 決策(非直接修復)
|
||
# 實際修復動作由 Service Registry 分級決定(ADR-062)
|
||
# 2026-04-08 Claude Sonnet 4.6 Asia/Taipei
|
||
# =========================================================================
|
||
- name: docker_health_alerts
  # Container health rules fed by the docker-health-monitor job.
  # NOTE(review): both rules use layer "docker", which is not one of the
  # layer values enumerated in this file's header label convention
  # (docker-110 / docker-188) -- confirm downstream consumers accept it.
  rules:
    # A container's health-status gauge has been 0 for 2 minutes.
    - alert: DockerContainerUnhealthy
      expr: container_health_status{job="docker-health-monitor"} == 0
      for: 2m
      labels:
        severity: warning
        layer: docker
        team: ops
        auto_repair: "true"
      annotations:
        summary: "容器 {{ $labels.container }} 健康檢查失敗"
        description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 健康狀態異常,持續 2 分鐘"

    # A container's running-status gauge has been 0 for 1 minute.
    - alert: DockerContainerExited
      expr: container_running_status{job="docker-health-monitor"} == 0
      for: 1m
      labels:
        severity: critical
        layer: docker
        team: ops
        auto_repair: "true"
      annotations:
        summary: "容器 {{ $labels.container }} 已停止"
        description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead,持續 1 分鐘"
|
||
|
||
# =========================================================================
|
||
# MinIO / Kali 告警
|
||
# =========================================================================
|
||
- name: minio_kali_alerts
  # Availability rules for MinIO (Velero backup target) and the Kali
  # scanner, both based on blackbox HTTP probes.
  rules:
    # blackbox HTTP probe of any instance containing "9001" has failed for
    # 2 minutes.
    - alert: MinIODown
      expr: probe_success{job="blackbox-http", instance=~".*9001.*"} == 0
      for: 2m
      labels:
        severity: warning
        layer: docker-188
        component: minio
        host: "188"
        team: ops
        auto_repair: "true"
      annotations:
        summary: "MinIO (Velero 備份) 離線"
        description: "MinIO (192.168.0.188:9001) 已離線超過 2 分鐘,Velero 備份可能失敗"

    # blackbox HTTP probe of any instance containing the address
    # 192.168.0.112 has failed for 5 minutes.
    # The dots of the IP are escaped so they match literal dots only; an
    # unescaped "." in a regex matches any character. Inside a double-quoted
    # PromQL string the backslash itself must be doubled.
    # NOTE(review): labelled layer "docker-188" while host is "112" --
    # confirm which of the two is correct.
    - alert: KaliScannerDown
      expr: >-
        probe_success{job="blackbox-http", instance=~".*192\\.168\\.0\\.112.*"}
        == 0
      for: 5m
      labels:
        severity: info
        layer: docker-188
        component: kali
        host: "112"
        team: ops
        auto_repair: "false"
      annotations:
        summary: "Kali Scanner 離線"
        description: "Kali (192.168.0.112:8080) 離線,安全掃描功能暫停"
|
||
|
||
# =========================================================================
|
||
# Plan C — 外部網站監控 (Sprint 5.2, 2026-04-09 Claude Sonnet 4.6 Asia/Taipei)
|
||
# blackbox-http 已涵蓋 4 個外部網站,此群組提供結構化告警
|
||
# auto_repair: "true" — 由 AWOOOI Guardrail 決策(Service Registry 分級)
|
||
# =========================================================================
|
||
- name: external_website_alerts
  # Structured alerts for the externally reachable websites probed by the
  # blackbox-http job, plus a certificate-expiry rule.
  # NOTE(review): these rules use layer "external", which is not one of the
  # layer values enumerated in this file's header label convention --
  # confirm downstream consumers accept it.
  rules:
    # Probe of https://mo.wooo.work has failed for 3 minutes.
    - alert: MoWoooWorkDown
      expr: >-
        probe_success{job="blackbox-http", instance="https://mo.wooo.work"}
        == 0
      for: 3m
      labels:
        severity: critical
        layer: external
        component: momo-app
        host: "188"
        team: ops
        auto_repair: "true"
      annotations:
        summary: "外部網站 mo.wooo.work 離線"
        description: "mo.wooo.work 探測失敗超過 3 分鐘,容器 momo-app (188) 可能需要重啟"

    # Probe of https://www.tsenyang.com has failed for 3 minutes.
    - alert: TsenyangWebsiteDown
      expr: >-
        probe_success{job="blackbox-http", instance="https://www.tsenyang.com"}
        == 0
      for: 3m
      labels:
        severity: critical
        layer: external
        component: tsenyang-website
        host: "188"
        team: ops
        auto_repair: "true"
      annotations:
        summary: "外部網站 tsenyang.com 離線"
        description: "tsenyang.com 探測失敗超過 3 分鐘,容器 tsenyang-website (188) 可能需要重啟"

    # Probe of http://stock.wooo.work (plain HTTP, unlike its siblings) has
    # failed for 3 minutes.
    - alert: StockWoooWorkDown
      expr: >-
        probe_success{job="blackbox-http", instance="http://stock.wooo.work"}
        == 0
      for: 3m
      labels:
        severity: critical
        layer: external
        component: stock-platform
        host: "110"
        team: ops
        auto_repair: "true"
      annotations:
        summary: "外部網站 stock.wooo.work 離線"
        description: "stock.wooo.work 探測失敗超過 3 分鐘,容器 stock-platform (110) 可能需要重啟"

    # Probe of https://bitan.wooo.work has failed for 3 minutes.
    - alert: BitanWoooWorkDown
      expr: >-
        probe_success{job="blackbox-http", instance="https://bitan.wooo.work"}
        == 0
      for: 3m
      labels:
        severity: critical
        layer: external
        component: bitan-app
        host: "188"
        team: ops
        auto_repair: "true"
      annotations:
        summary: "外部網站 bitan.wooo.work 離線"
        description: "bitan.wooo.work 探測失敗超過 3 分鐘,容器 bitan-app (188) 可能需要重啟"

    # The earliest certificate expiry seen by any blackbox-http probe is
    # less than 14 days (14 * 24 * 3600 seconds) away, sustained for 1 hour.
    - alert: ExternalSiteSSLExpiringSoon
      expr: >-
        probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()
        < 14 * 24 * 3600
      for: 1h
      labels:
        severity: warning
        layer: external
        component: ssl
        team: ops
        auto_repair: "false"
      annotations:
        summary: "SSL 憑證即將到期: {{ $labels.instance }}"
        description: "{{ $labels.instance }} SSL 憑證將在 14 天內到期,請手動更新"
|