feat(observability): Phase O-2/O-3 OTEL Log管線 + Event Exporter + Remote Write
O-2.1: OTEL Collector DaemonSet (filelog receiver) - 收集所有 K3s 節點 Pod stdout/stderr → SigNoz ClickHouse - CRI log parser (Go time layout for +08:00 timezone) - filter processor 排除 kube-system debug noise - observability namespace PSA privileged (log 目錄需 root) - 資源限制: 50m-200m CPU / 64-128Mi Memory O-2.2: kubernetes-event-exporter - K8s Event → 結構化 JSON Log → SigNoz - Warning/Error 全量保留, Normal 過濾高頻事件 - 解決: Event 預設僅保留 ~1hr 的致命盲區 O-3: Prometheus remote_write 配置模板 - 白名單: ~50 關鍵 metric series (node/container/kube/api/db) - 目標: 90 天長期儲存於 SigNoz ClickHouse 已部署驗證: 3 Pod Running, 0 error, filelog 正常監控所有 namespace Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
92
k8s/monitoring/prometheus-remote-write-signoz.yaml
Normal file
92
k8s/monitoring/prometheus-remote-write-signoz.yaml
Normal file
@@ -0,0 +1,92 @@
|
||||
# =============================================================================
|
||||
# Prometheus Remote Write → SigNoz - Phase O-3
|
||||
# =============================================================================
|
||||
# 建立者: Claude Code (首席架構師)
|
||||
# 日期: 2026-04-02 (台北時間)
|
||||
# 用途: 將關鍵指標長期儲存到 SigNoz ClickHouse (90 天)
|
||||
# 部署位置: 192.168.0.188 /etc/prometheus/prometheus.yml
|
||||
# =============================================================================
|
||||
#
|
||||
# 部署方式:
|
||||
# 1. SSH 到 192.168.0.188 (ollama 使用者)
|
||||
# 2. 編輯 /etc/prometheus/prometheus.yml
|
||||
# 3. 在最外層新增以下 remote_write 區塊
|
||||
# 4. 執行: sudo systemctl reload prometheus
|
||||
#
|
||||
# 驗證:
|
||||
# curl -s "http://192.168.0.188:9090/api/v1/status/config" | jq '.data.yaml' | grep remote_write
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
# ===== 新增至 prometheus.yml 最外層 (與 scrape_configs 同級) =====
|
||||
|
||||
remote_write:
  - url: http://localhost:24318/api/v1/write
    # Prometheus and the receiving endpoint run on the same host (188 -> 188),
    # so samples never cross the network.
    remote_timeout: 30s

    # Queue settings sized for a small cluster.
    queue_config:
      capacity: 2500
      max_shards: 5
      min_shards: 1
      max_samples_per_send: 500
      batch_send_deadline: 5s

    # ===== Whitelist filter (forward key metrics only) =====
    # Original estimate: ~50 metric names, roughly 2-5 GB over 90 days.
    #
    # Relabel rules run one after another, and every `keep` rule drops all
    # series whose name its regex does not match. A whitelist therefore has to
    # be ONE keep rule whose regex lists every allowed name; a chain of keep
    # rules with disjoint regexes would forward nothing at all.
    #
    # The regex below is a single string: each trailing backslash escapes the
    # line break (YAML double-quoted scalar) and the leading indentation of the
    # next line is discarded, so no whitespace ends up inside the pattern.
    # Groups, in order:
    #   1. Node resources
    #   2. Container resources
    #   3. Kubernetes state (pod restarts are key for RCA)
    #   4. API performance
    #   5. AWOOOI custom business metrics
    #   6. Databases (PostgreSQL / Redis)
    #   7. Probe health
    #   8. `up` (liveness of every scrape target)
    write_relabel_configs:
      - source_labels: [__name__]
        regex: "\
          node_cpu_seconds_total|node_memory_MemAvailable_bytes|\
          node_memory_MemTotal_bytes|node_filesystem_avail_bytes|\
          node_filesystem_size_bytes|node_load1|node_load5|node_load15|\
          node_network_receive_bytes_total|node_network_transmit_bytes_total|\
          container_cpu_usage_seconds_total|container_memory_working_set_bytes|\
          container_memory_rss|container_network_receive_bytes_total|\
          container_network_transmit_bytes_total|\
          kube_pod_container_status_restarts_total|kube_pod_status_phase|\
          kube_deployment_status_replicas_available|\
          kube_deployment_status_replicas_unavailable|\
          kube_node_status_condition|\
          http_request_duration_seconds.*|http_requests_total|\
          awoooi_.*|\
          pg_stat_activity_count|pg_stat_database_conflicts|\
          pg_stat_database_deadlocks|pg_up|\
          redis_connected_clients|redis_memory_used_bytes|\
          redis_keyspace_hits_total|redis_keyspace_misses_total|redis_up|\
          probe_success|probe_duration_seconds|probe_ssl_earliest_cert_expiry|\
          up"
        action: keep

# =============================================================================
# Notes
# =============================================================================
#
# 1. write_relabel_configs rules are applied sequentially (AND, not OR): a
#    series is forwarded only if it survives every rule. To whitelist several
#    metric families, extend the single keep regex above instead of adding
#    another keep rule.
#
# 2. If the SigNoz OTLP/HTTP endpoint does not accept the Prometheus
#    remote_write format, a relay such as victoria-metrics or cortex is needed.
#    Alternative: enable a prometheusremotewrite receiver in SigNoz's OTEL
#    Collector.
#
# 3. After adding/removing metrics, verify with:
#    curl -s "http://192.168.0.188:9090/api/v1/query?query=up" | jq '.data.result | length'
#
|
||||
157
k8s/observability/event-exporter.yaml
Normal file
157
k8s/observability/event-exporter.yaml
Normal file
@@ -0,0 +1,157 @@
|
||||
# =============================================================================
|
||||
# Kubernetes Event Exporter - Phase O-2.2
|
||||
# =============================================================================
|
||||
# 建立者: Claude Code (首席架構師)
|
||||
# 日期: 2026-04-02 (台北時間)
|
||||
# 用途: 將 K8s Event 轉為結構化 Log 送往 SigNoz,保留 30 天
|
||||
# 解決: K8s Event 預設僅保留 ~1 小時的致命盲區
|
||||
# =============================================================================
|
||||
---
|
||||
# Identity the event-exporter pod runs as; permissions are granted through the
# ClusterRoleBinding of the same name.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: observability
  name: event-exporter
|
||||
---
|
||||
# Read-only, cluster-wide access to Events and Namespaces (core API group).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: event-exporter
rules:
  - apiGroups:
      - ""
    resources:
      - events
      - namespaces
    verbs:
      - get
      - watch
      - list
|
||||
---
|
||||
# Grants the event-exporter ClusterRole to the event-exporter ServiceAccount
# in the observability namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: event-exporter
subjects:
  - kind: ServiceAccount
    namespace: observability
    name: event-exporter
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: event-exporter
|
||||
---
|
||||
# Configuration for kubernetes-event-exporter: routes cluster Events to two
# webhook receivers that post JSON to SigNoz.
#
# Routing (a route forwards an event unless one of its `drop` rules matches;
# fields inside one drop rule are AND-ed, separate rules are OR-ed):
#   - Warning events           -> signoz-warning (all of them)
#   - Normal events            -> signoz-normal, except the high-frequency
#                                 pod-lifecycle reasons listed below
# Each event reaches at most one receiver, so nothing is stored twice.
#
# NOTE(review): /v1/logs is the OTLP/HTTP logs path. Confirm that the receiver
# behind port 24318 accepts the flat JSON layout defined here rather than
# requiring the OTLP `resourceLogs` envelope.
apiVersion: v1
kind: ConfigMap
metadata:
  name: event-exporter-config
  namespace: observability
data:
  config.yaml: |
    # Phase O-2.2: K8s Event -> SigNoz (OTLP HTTP)
    logLevel: info
    logFormat: json
    route:
      routes:
        # Warning events: keep everything (key RCA data).
        - match:
            - receiver: signoz-warning
          drop:
            - type: "Normal"
        # Normal events: keep selectively, dropping high-frequency reasons.
        - match:
            - receiver: signoz-normal
          drop:
            - type: "Warning"
            - type: "Normal"
              reason: "Scheduled"
            - type: "Normal"
              reason: "Pulling"
            - type: "Normal"
              reason: "Pulled"
            - type: "Normal"
              reason: "Created"
            - type: "Normal"
              reason: "Started"
    receivers:
      - name: signoz-warning
        webhook:
          endpoint: http://192.168.0.188:24318/v1/logs
          headers:
            Content-Type: application/json
          layout:
            severity: "{{ .Type }}"
            body: "{{ .Message }}"
            attributes:
              k8s.event.reason: "{{ .Reason }}"
              k8s.event.type: "{{ .Type }}"
              k8s.event.action: "{{ .Action }}"
              k8s.event.count: "{{ .Count }}"
              k8s.namespace.name: "{{ .InvolvedObject.Namespace }}"
              k8s.object.kind: "{{ .InvolvedObject.Kind }}"
              k8s.object.name: "{{ .InvolvedObject.Name }}"
              k8s.event.source.component: "{{ .Source.Component }}"
              k8s.event.source.host: "{{ .Source.Host }}"
              k8s.event.first_timestamp: "{{ .FirstTimestamp }}"
              k8s.event.last_timestamp: "{{ .LastTimestamp }}"
      - name: signoz-normal
        webhook:
          endpoint: http://192.168.0.188:24318/v1/logs
          headers:
            Content-Type: application/json
          layout:
            severity: "Normal"
            body: "{{ .Message }}"
            attributes:
              k8s.event.reason: "{{ .Reason }}"
              k8s.event.type: "{{ .Type }}"
              k8s.namespace.name: "{{ .InvolvedObject.Namespace }}"
              k8s.object.kind: "{{ .InvolvedObject.Kind }}"
              k8s.object.name: "{{ .InvolvedObject.Name }}"
|
||||
---
|
||||
# Single-replica Deployment running kubernetes-event-exporter with the
# configuration mounted read-only from the event-exporter-config ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: event-exporter
  namespace: observability
  labels:
    app.kubernetes.io/name: event-exporter
    phase: o-2
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: event-exporter
  template:
    metadata:
      labels:
        app.kubernetes.io/name: event-exporter
    spec:
      serviceAccountName: event-exporter
      # Pod- and container-level settings below follow the Pod Security
      # "restricted" profile: non-root user, default seccomp profile, no
      # privilege escalation, all capabilities dropped.
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        seccompProfile:
          type: RuntimeDefault
      volumes:
        - name: config
          configMap:
            name: event-exporter-config
      containers:
        - name: event-exporter
          image: ghcr.io/resmoio/kubernetes-event-exporter:v1.7
          args:
            - -conf=/etc/event-exporter/config.yaml
          volumeMounts:
            - name: config
              mountPath: /etc/event-exporter
              readOnly: true
          resources:
            limits:
              cpu: 100m
              memory: 64Mi
            requests:
              cpu: 20m
              memory: 32Mi
          securityContext:
            readOnlyRootFilesystem: true
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - "ALL"
|
||||
226
k8s/observability/otel-collector-daemonset.yaml
Normal file
226
k8s/observability/otel-collector-daemonset.yaml
Normal file
@@ -0,0 +1,226 @@
|
||||
# =============================================================================
|
||||
# OTEL Collector DaemonSet - Phase O-2.1
|
||||
# =============================================================================
|
||||
# 建立者: Claude Code (首席架構師)
|
||||
# 日期: 2026-04-02 (台北時間)
|
||||
# 用途: 收集所有 K3s 節點的 Pod Log,統一送往 SigNoz ClickHouse
|
||||
# 架構決策: SigNoz 統一派 (不另裝 Loki)
|
||||
# =============================================================================
|
||||
---
|
||||
# Namespace for the observability stack.
apiVersion: v1
kind: Namespace
metadata:
  name: observability
  labels:
    app.kubernetes.io/name: observability
    # Phase O-2.1: the DaemonSet log collector in this namespace runs as root
    # and mounts /var/log/pods from the host, so Pod Security Admission is set
    # to "privileged" for enforce, audit and warn alike.
    pod-security.kubernetes.io/warn: privileged
    pod-security.kubernetes.io/audit: privileged
    pod-security.kubernetes.io/enforce: privileged
|
||||
---
|
||||
# Identity the otel-collector DaemonSet pods run as.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: observability
  name: otel-collector
|
||||
---
|
||||
# Read-only, cluster-wide access to Pods, Namespaces and Nodes, intended for
# enriching log records with Kubernetes metadata.
# NOTE(review): the collector config in this file derives namespace/pod/
# container from the log file path and defines no k8sattributes processor, so
# these permissions look unused at the moment - confirm before keeping them.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: otel-collector
rules:
  - apiGroups:
      - ""
    resources:
      - pods
      - namespaces
      - nodes
    verbs:
      - get
      - watch
      - list
|
||||
---
|
||||
# Grants the otel-collector ClusterRole to the otel-collector ServiceAccount
# in the observability namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: otel-collector
subjects:
  - kind: ServiceAccount
    namespace: observability
    name: otel-collector
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: otel-collector
|
||||
---
|
||||
# OpenTelemetry Collector configuration for the node-level log pipeline:
#   filelog (tail pod log files) -> memory_limiter -> filter/drop_noise
#   -> resource -> batch -> otlphttp (SigNoz)
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-collector-config
  namespace: observability
data:
  config.yaml: |
    receivers:
      # Tail stdout/stderr logs of every pod on this K3s node.
      filelog:
        include:
          - /var/log/pods/*/*/*.log
        exclude:
          # Skip the collector's own logs (prevents a feedback loop).
          - /var/log/pods/observability_otel-collector*/**/*.log
        start_at: end
        include_file_path: true
        include_file_name: false
        operators:
          # K3s writes CRI-format lines: <timestamp> <stream> <flags> <log>
          # NOTE(review): partial lines (CRI flag "P") are not recombined, so
          # each fragment becomes its own record - confirm that is acceptable.
          - type: regex_parser
            id: cri_parser
            regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<flags>[^ ]*) (?P<log>.*)$'
            timestamp:
              parse_from: attributes.time
              # Example: 2026-04-02T13:58:56.472854397+08:00
              # Go native layout (not strptime). "Z07:00" accepts both a
              # numeric offset (+08:00) and a literal "Z" (UTC), so nodes
              # configured for UTC parse correctly as well.
              layout_type: gotime
              layout: '2006-01-02T15:04:05.999999999Z07:00'
          # Extract namespace / pod / container from the log file path.
          - type: regex_parser
            id: extract_metadata
            parse_from: attributes["log.file.path"]
            regex: '^/var/log/pods/(?P<namespace>[^_]+)_(?P<pod>[^_]+)_[^/]+/(?P<container>[^/]+)/'
          # Promote the extracted values to resource attributes.
          - type: move
            from: attributes.namespace
            to: resource["k8s.namespace.name"]
          - type: move
            from: attributes.pod
            to: resource["k8s.pod.name"]
          - type: move
            from: attributes.container
            to: resource["k8s.container.name"]
          # Use the parsed message as the log body.
          - type: move
            from: attributes.log
            to: body

    processors:
      # Batch before export to reduce network overhead.
      batch:
        send_batch_size: 200
        send_batch_max_size: 500
        timeout: 5s

      # Drop low-value logs to ease ClickHouse storage pressure.
      filter/drop_noise:
        logs:
          exclude:
            match_type: regexp
            # Targets high-frequency debug output from kube-system.
            resource_attributes:
              - key: k8s.namespace.name
                value: "kube-system"
            bodies:
              - ".*level=debug.*"
              - ".*Listing and watching.*"
              - ".*reflector\\.go.*"

      # Memory ceiling to avoid OOM kills.
      memory_limiter:
        check_interval: 5s
        limit_mib: 100
        spike_limit_mib: 30

      # Attach node and environment information.
      resource:
        attributes:
          - key: k8s.node.name
            value: "${env:K8S_NODE_NAME}"
            action: upsert
          - key: deployment.environment
            value: "prod"
            action: upsert

    exporters:
      # Ship to the SigNoz OTEL Collector over HTTP.
      otlphttp:
        endpoint: http://192.168.0.188:24318
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
        sending_queue:
          enabled: true
          num_consumers: 2
          queue_size: 100

    service:
      pipelines:
        logs:
          receivers: [filelog]
          processors: [memory_limiter, filter/drop_noise, resource, batch]
          exporters: [otlphttp]
|
||||
---
|
||||
# Runs one OpenTelemetry Collector per node to tail pod logs from the host's
# /var/log/pods and forward them according to otel-collector-config.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: otel-collector
  namespace: observability
  labels:
    app.kubernetes.io/name: otel-collector
    app.kubernetes.io/component: log-collector
    phase: o-2
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: otel-collector
  template:
    metadata:
      labels:
        app.kubernetes.io/name: otel-collector
        app.kubernetes.io/component: log-collector
    spec:
      serviceAccountName: otel-collector
      # The observability namespace uses the "privileged" PSA level because
      # /var/log/pods is root:root 0750 on the host, so the filelog receiver
      # has to run as root to read it.
      securityContext:
        runAsUser: 0
        runAsGroup: 0
      containers:
        - name: otel-collector
          image: otel/opentelemetry-collector-contrib:0.96.0
          args:
            - --config=/etc/otel/config.yaml
          env:
            # Node name consumed by the `resource` processor as k8s.node.name.
            - name: K8S_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          securityContext:
            readOnlyRootFilesystem: true
            # Root is needed only to read the log files; the process never
            # has to gain further privileges, so forbid escalation (matches
            # the event-exporter Deployment's hardening).
            allowPrivilegeEscalation: false
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 128Mi
          volumeMounts:
            - name: config
              mountPath: /etc/otel
              readOnly: true
            - name: varlogpods
              mountPath: /var/log/pods
              readOnly: true
            # Writable scratch space, since the root filesystem is read-only.
            # NOTE(review): the collector config defines no storage extension,
            # so filelog offsets are not checkpointed here; with
            # `start_at: end` a restarted collector resumes from the end of
            # each file.
            - name: tmp
              mountPath: /tmp
      volumes:
        - name: config
          configMap:
            name: otel-collector-config
        - name: varlogpods
          hostPath:
            path: /var/log/pods
            type: Directory
        - name: tmp
          emptyDir:
            sizeLimit: 50Mi
      tolerations:
        # Schedule on every node, control-plane included.
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
|
||||
Reference in New Issue
Block a user