From 99be215e8318b7cafd4b4d7617277c1ae6af53e1 Mon Sep 17 00:00:00 2001 From: OG T Date: Thu, 2 Apr 2026 14:02:50 +0800 Subject: [PATCH] =?UTF-8?q?fix(monitoring):=20R1=20Review=20=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3=20=E2=80=94=20Blackbox=20DNS/PSA=20label/=E5=91=8A?= =?UTF-8?q?=E8=AD=A6=E9=96=BE=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical: Blackbox Exporter replacement 從 K8s DNS 改為主機 IP (192.168.0.188:9115) Important: Descheduler namespace 顯式宣告 PSA restricted labels Suggestion: failedJobsHistoryLimit 3→1, 新增 MinioDiskUsageCritical 5% 告警 R1 Review by: 首席架構師 (Phase O-1) Co-Authored-By: Claude Opus 4.6 (1M context) --- k8s/descheduler/descheduler.yaml | 6 +++++- k8s/monitoring/minio-kali-alerts.yaml | 11 +++++++++++ k8s/monitoring/prometheus-config-phase-o.yaml | 4 +++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/k8s/descheduler/descheduler.yaml b/k8s/descheduler/descheduler.yaml index e2962c67..9278bc62 100644 --- a/k8s/descheduler/descheduler.yaml +++ b/k8s/descheduler/descheduler.yaml @@ -12,6 +12,10 @@ metadata: name: descheduler labels: app.kubernetes.io/name: descheduler + # R1 Review: 顯式宣告 PSA restricted + pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/audit: restricted + pod-security.kubernetes.io/warn: restricted --- apiVersion: v1 kind: ServiceAccount @@ -111,7 +115,7 @@ spec: schedule: "0 */2 * * *" # Every 2 hours concurrencyPolicy: Forbid successfulJobsHistoryLimit: 3 - failedJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 jobTemplate: spec: template: diff --git a/k8s/monitoring/minio-kali-alerts.yaml b/k8s/monitoring/minio-kali-alerts.yaml index 73df2a11..c32ab5a4 100644 --- a/k8s/monitoring/minio-kali-alerts.yaml +++ b/k8s/monitoring/minio-kali-alerts.yaml @@ -31,6 +31,17 @@ groups: summary: "MinIO 磁碟剩餘空間不足 20%" description: "MinIO 可用空間: {{ $value | humanizePercentage }}" + # R1 Review: 加 5% critical 磁碟告警作為最後防線 + - alert: MinioDiskUsageCritical + expr: minio_cluster_capacity_usable_free_bytes / minio_cluster_capacity_usable_total_bytes < 0.05 + for: 5m + labels: + severity: critical + service: minio + annotations: + summary: "MinIO 磁碟剩餘空間不足 5% (緊急)" + description: "MinIO 可用空間: {{ $value | humanizePercentage }},備份即將失敗" + # MinIO 離線磁碟 - alert: MinioOfflineDisk expr: minio_cluster_disk_offline_total > 0 diff --git a/k8s/monitoring/prometheus-config-phase-o.yaml b/k8s/monitoring/prometheus-config-phase-o.yaml index 029aea4f..096dfbba 100644 --- a/k8s/monitoring/prometheus-config-phase-o.yaml +++ b/k8s/monitoring/prometheus-config-phase-o.yaml @@ -57,8 +57,10 @@ target_label: __param_target - source_labels: [__param_target] target_label: instance + # R1 Review Critical Fix: Prometheus 在主機上,無法解析 K8s DNS + # Blackbox Exporter 部署在 .188 主機上 - target_label: __address__ - replacement: blackbox-exporter:9115 + replacement: 192.168.0.188:9115 # ============================================================================= # 驗證指令