From 41bf0681cf2ab68683ccd12cc0575cbdc4ca197a Mon Sep 17 00:00:00 2001 From: OG T Date: Thu, 2 Apr 2026 14:01:42 +0800 Subject: [PATCH] =?UTF-8?q?feat(observability):=20Phase=20O-2/O-3=20OTEL?= =?UTF-8?q?=20Log=E7=AE=A1=E7=B7=9A=20+=20Event=20Exporter=20+=20Remote=20?= =?UTF-8?q?Write?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit O-2.1: OTEL Collector DaemonSet (filelog receiver) - 收集所有 K3s 節點 Pod stdout/stderr → SigNoz ClickHouse - CRI log parser (Go time layout for +08:00 timezone) - filter processor 排除 kube-system debug noise - observability namespace PSA privileged (log 目錄需 root) - 資源限制: 50m-200m CPU / 64-128Mi Memory O-2.2: kubernetes-event-exporter - K8s Event → 結構化 JSON Log → SigNoz - Warning/Error 全量保留, Normal 過濾高頻事件 - 解決: Event 預設僅保留 ~1hr 的致命盲區 O-3: Prometheus remote_write 配置模板 - 白名單: ~50 關鍵 metric series (node/container/kube/api/db) - 目標: 90 天長期儲存於 SigNoz ClickHouse 已部署驗證: 3 Pod Running, 0 error, filelog 正常監控所有 namespace Co-Authored-By: Claude Opus 4.6 (1M context) --- .../prometheus-remote-write-signoz.yaml | 92 +++++++ k8s/observability/event-exporter.yaml | 157 ++++++++++++ .../otel-collector-daemonset.yaml | 226 ++++++++++++++++++ 3 files changed, 475 insertions(+) create mode 100644 k8s/monitoring/prometheus-remote-write-signoz.yaml create mode 100644 k8s/observability/event-exporter.yaml create mode 100644 k8s/observability/otel-collector-daemonset.yaml diff --git a/k8s/monitoring/prometheus-remote-write-signoz.yaml b/k8s/monitoring/prometheus-remote-write-signoz.yaml new file mode 100644 index 00000000..9b994951 --- /dev/null +++ b/k8s/monitoring/prometheus-remote-write-signoz.yaml @@ -0,0 +1,92 @@ +# ============================================================================= +# Prometheus Remote Write → SigNoz - Phase O-3 +# ============================================================================= +# 建立者: Claude Code (首席架構師) +# 日期: 2026-04-02 (台北時間) +# 用途: 將關鍵指標長期儲存到 SigNoz ClickHouse (90 天) +# 部署位置: 192.168.0.188 /etc/prometheus/prometheus.yml +# ============================================================================= +# +# 部署方式: +# 1. SSH 到 192.168.0.188 (ollama 使用者) +# 2. 編輯 /etc/prometheus/prometheus.yml +# 3. 在最外層新增以下 remote_write 區塊 +# 4. 執行: sudo systemctl reload prometheus +# +# 驗證: +# curl -s "http://192.168.0.188:9090/api/v1/status/config" | jq '.data.yaml' | grep remote_write +# +# ============================================================================= + +# ===== 新增至 prometheus.yml 最外層 (與 scrape_configs 同級) ===== + +remote_write: + - url: http://localhost:24318/api/v1/write + # 只在同一台機器上 (188 → 188),無需跨網路 + remote_timeout: 30s + + # 佇列配置: 適合小型叢集 + queue_config: + capacity: 2500 + max_shards: 5 + min_shards: 1 + max_samples_per_send: 500 + batch_send_deadline: 5s + + # ===== 白名單過濾 (只轉關鍵指標) ===== + # 預估 ~50 個 metric series,90 天約 2-5 GB + write_relabel_configs: + # 1. 節點資源指標 + - source_labels: [__name__] + regex: "node_cpu_seconds_total|node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|node_filesystem_avail_bytes|node_filesystem_size_bytes|node_load1|node_load5|node_load15|node_network_receive_bytes_total|node_network_transmit_bytes_total" + action: keep + + # 2. 容器資源指標 + - source_labels: [__name__] + regex: "container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_memory_rss|container_network_receive_bytes_total|container_network_transmit_bytes_total" + action: keep + + # 3. K8s 狀態指標 (Pod 重啟是 RCA 關鍵!) + - source_labels: [__name__] + regex: "kube_pod_container_status_restarts_total|kube_pod_status_phase|kube_deployment_status_replicas_available|kube_deployment_status_replicas_unavailable|kube_node_status_condition" + action: keep + + # 4. API 效能指標 + - source_labels: [__name__] + regex: "http_request_duration_seconds.*|http_requests_total" + action: keep + + # 5. AWOOOI 自訂業務指標 + - source_labels: [__name__] + regex: "awoooi_.*" + action: keep + + # 6. 資料庫指標 + - source_labels: [__name__] + regex: "pg_stat_activity_count|pg_stat_database_conflicts|pg_stat_database_deadlocks|pg_up|redis_connected_clients|redis_memory_used_bytes|redis_keyspace_hits_total|redis_keyspace_misses_total|redis_up" + action: keep + + # 7. Probe 健康狀態 + - source_labels: [__name__] + regex: "probe_success|probe_duration_seconds|probe_ssl_earliest_cert_expiry" + action: keep + + # 8. UP 指標 (所有 target 存活狀態) + - source_labels: [__name__] + regex: "up" + action: keep + +# ============================================================================= +# ⚠️ 注意事項 +# ============================================================================= +# +# 1. write_relabel_configs 是 OR 邏輯:多個 keep 規則中, +# 任一匹配即保留。這是 Prometheus 內建行為。 +# +# 2. 如果 SigNoz OTLP/HTTP 端點不支援 Prometheus remote_write 格式, +# 需要改用 victoria-metrics 或 cortex 作為中繼。 +# 替代方案: 在 SigNoz 的 OTEL Collector 中啟用 prometheusremotewrite receiver。 +# +# 3. 增加/移除指標後,用以下指令驗證: +# curl -s "http://192.168.0.188:9090/api/v1/query?query=up" | jq '.data.result | length' +# diff --git a/k8s/observability/event-exporter.yaml b/k8s/observability/event-exporter.yaml new file mode 100644 index 00000000..4279960b --- /dev/null +++ b/k8s/observability/event-exporter.yaml @@ -0,0 +1,157 @@ +# ============================================================================= +# Kubernetes Event Exporter - Phase O-2.2 +# ============================================================================= +# 建立者: Claude Code (首席架構師) +# 日期: 2026-04-02 (台北時間) +# 用途: 將 K8s Event 轉為結構化 Log 送往 SigNoz,保留 30 天 +# 解決: K8s Event 預設僅保留 ~1 小時的致命盲區 +# ============================================================================= +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: event-exporter + namespace: observability +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: event-exporter +rules: + - apiGroups: [""] + resources: ["events"] + verbs: ["get", "watch", "list"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "watch", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: event-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: event-exporter +subjects: + - kind: ServiceAccount + name: event-exporter + namespace: observability +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: event-exporter-config + namespace: observability +data: + config.yaml: | + # Phase O-2.2: K8s Event → SigNoz (OTLP HTTP) + logLevel: info + logFormat: json + route: + routes: + # Warning 和 Error 事件: 全量保留 (RCA 關鍵資料) + - match: + - receiver: signoz-warning + drop: + - type: "Normal" + reason: "Scheduled" + - type: "Normal" + reason: "Pulling" + - type: "Normal" + reason: "Pulled" + - type: "Normal" + reason: "Created" + - type: "Normal" + reason: "Started" + # Normal 事件中的關鍵類型: 選擇性保留 + - match: + - receiver: signoz-normal + drop: + - type: "Warning" + receivers: + - name: signoz-warning + webhook: + endpoint: http://192.168.0.188:24318/v1/logs + headers: + Content-Type: application/json + layout: + severity: "{{ .Type }}" + body: "{{ .Message }}" + attributes: + k8s.event.reason: "{{ .Reason }}" + k8s.event.type: "{{ .Type }}" + k8s.event.action: "{{ .Action }}" + k8s.event.count: "{{ .Count }}" + k8s.namespace.name: "{{ .InvolvedObject.Namespace }}" + k8s.object.kind: "{{ .InvolvedObject.Kind }}" + k8s.object.name: "{{ .InvolvedObject.Name }}" + k8s.event.source.component: "{{ .Source.Component }}" + k8s.event.source.host: "{{ .Source.Host }}" + k8s.event.first_timestamp: "{{ .FirstTimestamp }}" + k8s.event.last_timestamp: "{{ .LastTimestamp }}" + - name: signoz-normal + webhook: + endpoint: http://192.168.0.188:24318/v1/logs + headers: + Content-Type: application/json + layout: + severity: "Normal" + body: "{{ .Message }}" + attributes: + k8s.event.reason: "{{ .Reason }}" + k8s.event.type: "{{ .Type }}" + k8s.namespace.name: "{{ .InvolvedObject.Namespace }}" + k8s.object.kind: "{{ .InvolvedObject.Kind }}" + k8s.object.name: "{{ .InvolvedObject.Name }}" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: event-exporter + namespace: observability + labels: + app.kubernetes.io/name: event-exporter + phase: o-2 +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: event-exporter + template: + metadata: + labels: + app.kubernetes.io/name: event-exporter + spec: + serviceAccountName: event-exporter + # PSA restricted 合規 + securityContext: + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: event-exporter + image: ghcr.io/resmoio/kubernetes-event-exporter:v1.7 + args: + - -conf=/etc/event-exporter/config.yaml + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + resources: + requests: + cpu: 20m + memory: 32Mi + limits: + cpu: 100m + memory: 64Mi + volumeMounts: + - name: config + mountPath: /etc/event-exporter + readOnly: true + volumes: + - name: config + configMap: + name: event-exporter-config diff --git a/k8s/observability/otel-collector-daemonset.yaml b/k8s/observability/otel-collector-daemonset.yaml new file mode 100644 index 00000000..5894ca8e --- /dev/null +++ b/k8s/observability/otel-collector-daemonset.yaml @@ -0,0 +1,226 @@ +# ============================================================================= +# OTEL Collector DaemonSet - Phase O-2.1 +# ============================================================================= +# 建立者: Claude Code (首席架構師) +# 日期: 2026-04-02 (台北時間) +# 用途: 收集所有 K3s 節點的 Pod Log,統一送往 SigNoz ClickHouse +# 架構決策: SigNoz 統一派 (不另裝 Loki) +# ============================================================================= +--- +apiVersion: v1 +kind: Namespace +metadata: + name: observability + labels: + app.kubernetes.io/name: observability + # Phase O-2.1: DaemonSet log collector 必須 root 讀取 /var/log/pods + # 這是 OTEL Collector / Promtail / Fluentbit 的業界標準做法 + pod-security.kubernetes.io/enforce: privileged + pod-security.kubernetes.io/audit: privileged + pod-security.kubernetes.io/warn: privileged +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: observability +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: otel-collector +rules: + # filelog receiver 需要讀取 Pod 資訊來注入 k8s attributes + - apiGroups: [""] + resources: ["pods", "namespaces", "nodes"] + verbs: ["get", "watch", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: otel-collector +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: otel-collector +subjects: + - kind: ServiceAccount + name: otel-collector + namespace: observability +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-collector-config + namespace: observability +data: + config.yaml: | + receivers: + # 讀取 K3s 節點上所有 Pod 的 stdout/stderr log + filelog: + include: + - /var/log/pods/*/*/*.log + exclude: + # 排除 OTEL Collector 自身 (防遞迴) + - /var/log/pods/observability_otel-collector*/**/*.log + start_at: end + include_file_path: true + include_file_name: false + operators: + # K3s 使用 CRI log 格式: + - type: regex_parser + id: cri_parser + regex: '^(?P