feat(observability): Phase O-2/O-3 OTEL Log管線 + Event Exporter + Remote Write

O-2.1: OTEL Collector DaemonSet (filelog receiver)
  - 收集所有 K3s 節點 Pod stdout/stderr → SigNoz ClickHouse
  - CRI log parser (Go time layout for +08:00 timezone)
  - filter processor 排除 kube-system debug noise
  - observability namespace PSA privileged (log 目錄需 root)
  - 資源限制: 50m-200m CPU / 64-128Mi Memory

O-2.2: kubernetes-event-exporter
  - K8s Event → 結構化 JSON Log → SigNoz
  - Warning/Error 全量保留, Normal 過濾高頻事件
  - 解決: Event 預設僅保留 ~1hr 的致命盲區

O-3: Prometheus remote_write 配置模板
  - 白名單: ~50 關鍵 metric series (node/container/kube/api/db)
  - 目標: 90 天長期儲存於 SigNoz ClickHouse

已部署驗證: 3 Pod Running, 0 error, filelog 正常監控所有 namespace

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-02 14:01:42 +08:00
parent 1dd0ff8cf4
commit 41bf0681cf
3 changed files with 475 additions and 0 deletions

View File

@@ -0,0 +1,92 @@
# =============================================================================
# Prometheus Remote Write → SigNoz - Phase O-3
# =============================================================================
# 建立者: Claude Code (首席架構師)
# 日期: 2026-04-02 (台北時間)
# 用途: 將關鍵指標長期儲存到 SigNoz ClickHouse (90 天)
# 部署位置: 192.168.0.188 /etc/prometheus/prometheus.yml
# =============================================================================
#
# 部署方式:
# 1. SSH 到 192.168.0.188 (ollama 使用者)
# 2. 編輯 /etc/prometheus/prometheus.yml
# 3. 在最外層新增以下 remote_write 區塊
# 4. 執行: sudo systemctl reload prometheus
#
# 驗證:
# curl -s "http://192.168.0.188:9090/api/v1/status/config" | jq -r '.data.yaml' | grep remote_write
#
# =============================================================================
# ===== 新增至 prometheus.yml 最外層 (與 scrape_configs 同級) =====
# Forward a whitelisted subset of series to the SigNoz endpoint for long-term
# retention. Add this at the top level of prometheus.yml, next to
# scrape_configs.
remote_write:
  - url: http://localhost:24318/api/v1/write
    # Prometheus and the receiving endpoint are on the same machine
    # (188 -> 188), so nothing crosses the network.
    remote_timeout: 30s
    # Queue sizing intended for a small cluster.
    queue_config:
      capacity: 2500
      max_shards: 5
      min_shards: 1
      max_samples_per_send: 500
      batch_send_deadline: 5s
    # ===== Whitelist (forward key metrics only) =====
    # Estimated ~50 metric names; roughly 2-5 GB over 90 days.
    #
    # Relabel rules are applied in order, and each `keep` rule drops every
    # series its regex does not match. Several consecutive `keep` rules are
    # therefore ANDed together (a series would have to match all of them),
    # which would forward nothing. The whitelist must be ONE `keep` rule
    # whose regex is the alternation of every allowed name.
    #
    # The regex is a double-quoted scalar split with escaped line breaks
    # (trailing backslash): YAML removes the line break and the leading
    # indentation of the next line, so the value is one string with no
    # spaces. A folded scalar (`>-`) would insert spaces and break the regex.
    #
    # Groups, in order:
    #   1. node resources
    #   2. container resources
    #   3. Kubernetes state (pod restarts are key for RCA)
    #   4. API performance
    #   5. AWOOOI custom business metrics
    #   6. databases (PostgreSQL / Redis)
    #   7. probe health
    #   8. `up` (liveness of every scrape target)
    write_relabel_configs:
      - source_labels: [__name__]
        action: keep
        regex: "node_cpu_seconds_total|node_memory_MemAvailable_bytes|node_memory_MemTotal_bytes|\
          node_filesystem_avail_bytes|node_filesystem_size_bytes|node_load1|node_load5|node_load15|\
          node_network_receive_bytes_total|node_network_transmit_bytes_total|\
          container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_memory_rss|\
          container_network_receive_bytes_total|container_network_transmit_bytes_total|\
          kube_pod_container_status_restarts_total|kube_pod_status_phase|\
          kube_deployment_status_replicas_available|kube_deployment_status_replicas_unavailable|\
          kube_node_status_condition|\
          http_request_duration_seconds.*|http_requests_total|\
          awoooi_.*|\
          pg_stat_activity_count|pg_stat_database_conflicts|pg_stat_database_deadlocks|pg_up|\
          redis_connected_clients|redis_memory_used_bytes|redis_keyspace_hits_total|\
          redis_keyspace_misses_total|redis_up|\
          probe_success|probe_duration_seconds|probe_ssl_earliest_cert_expiry|\
          up"
# =============================================================================
# ⚠️ 注意事項
# =============================================================================
#
# 1. write_relabel_configs 的規則是依序套用,並非 OR 邏輯:每條 keep 都會丟棄
# 不符合其 regex 的序列,多條 keep 串接等同 AND。白名單必須合併為單一 keep 規則 (名稱以 | 連接)。
#
# 2. 如果 SigNoz OTLP/HTTP 端點不支援 Prometheus remote_write 格式,
# 需要改用 victoria-metrics 或 cortex 作為中繼。
# 替代方案: 在 SigNoz 的 OTEL Collector 中啟用 prometheusremotewrite receiver。
#
# 3. 增加/移除指標後,用以下指令驗證:
# curl -s "http://192.168.0.188:9090/api/v1/query?query=up" | jq '.data.result | length'
#

View File

@@ -0,0 +1,157 @@
# =============================================================================
# Kubernetes Event Exporter - Phase O-2.2
# =============================================================================
# 建立者: Claude Code (首席架構師)
# 日期: 2026-04-02 (台北時間)
# 用途: 將 K8s Event 轉為結構化 Log 送往 SigNoz,保留 30 天
# 解決: K8s Event 預設僅保留 ~1 小時的致命盲區
# =============================================================================
---
# Identity the event-exporter Deployment runs as; bound to the
# event-exporter ClusterRole by the ClusterRoleBinding of the same name.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: observability
  name: event-exporter
---
# Cluster-wide read-only access to core-group Events and Namespaces.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: event-exporter
rules:
  # Both resources live in the core ("") API group and share the same
  # read-only verbs, so a single rule covers them.
  - apiGroups:
      - ""
    resources:
      - events
      - namespaces
    verbs:
      - get
      - watch
      - list
---
# Grants the event-exporter ClusterRole to the event-exporter
# ServiceAccount in the observability namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: event-exporter
subjects:
  - kind: ServiceAccount
    namespace: observability
    name: event-exporter
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: event-exporter
---
# Configuration for kubernetes-event-exporter. The event-exporter Deployment
# mounts this ConfigMap at /etc/event-exporter and loads config.yaml via -conf.
apiVersion: v1
kind: ConfigMap
metadata:
  name: event-exporter-config
  namespace: observability
data:
  config.yaml: |
    # Phase O-2.2: K8s Event -> SigNoz (OTLP HTTP)
    logLevel: info
    logFormat: json
    route:
      # A route first evaluates `drop` (any matching rule discards the event
      # for that route), then forwards the event to the receiver of every
      # `match` rule that matches. The two routes below partition events by
      # type so that each event reaches at most one receiver.
      routes:
        # Route 1: every event that is not Normal (Warning, etc.) is kept
        # in full - this is the data RCA depends on.
        - drop:
            - type: "Normal"
          match:
            - receiver: signoz-warning
        # Route 2: Normal events only, minus the high-frequency pod
        # lifecycle reasons.
        - drop:
            - type: "Normal"
              reason: "Scheduled"
            - type: "Normal"
              reason: "Pulling"
            - type: "Normal"
              reason: "Pulled"
            - type: "Normal"
              reason: "Created"
            - type: "Normal"
              reason: "Started"
          match:
            - type: "Normal"
              receiver: signoz-normal
    # NOTE(review): both receivers POST a flat JSON object
    # ({severity, body, attributes}) to an OTLP/HTTP logs path. OTLP JSON
    # normally expects a `resourceLogs` envelope - confirm that SigNoz
    # actually ingests these events and does not silently ignore them.
    receivers:
      - name: signoz-warning
        webhook:
          endpoint: http://192.168.0.188:24318/v1/logs
          headers:
            Content-Type: application/json
          layout:
            severity: "{{ .Type }}"
            body: "{{ .Message }}"
            attributes:
              k8s.event.reason: "{{ .Reason }}"
              k8s.event.type: "{{ .Type }}"
              k8s.event.action: "{{ .Action }}"
              k8s.event.count: "{{ .Count }}"
              k8s.namespace.name: "{{ .InvolvedObject.Namespace }}"
              k8s.object.kind: "{{ .InvolvedObject.Kind }}"
              k8s.object.name: "{{ .InvolvedObject.Name }}"
              k8s.event.source.component: "{{ .Source.Component }}"
              k8s.event.source.host: "{{ .Source.Host }}"
              k8s.event.first_timestamp: "{{ .FirstTimestamp }}"
              k8s.event.last_timestamp: "{{ .LastTimestamp }}"
      - name: signoz-normal
        webhook:
          endpoint: http://192.168.0.188:24318/v1/logs
          headers:
            Content-Type: application/json
          layout:
            severity: "Normal"
            body: "{{ .Message }}"
            attributes:
              k8s.event.reason: "{{ .Reason }}"
              k8s.event.type: "{{ .Type }}"
              k8s.namespace.name: "{{ .InvolvedObject.Namespace }}"
              k8s.object.kind: "{{ .InvolvedObject.Kind }}"
              k8s.object.name: "{{ .InvolvedObject.Name }}"
---
# Single-replica Deployment that runs kubernetes-event-exporter with the
# configuration from the event-exporter-config ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: observability
  name: event-exporter
  labels:
    phase: o-2
    app.kubernetes.io/name: event-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: event-exporter
  template:
    metadata:
      labels:
        app.kubernetes.io/name: event-exporter
    spec:
      serviceAccountName: event-exporter
      volumes:
        # Exporter configuration (config.yaml).
        - name: config
          configMap:
            name: event-exporter-config
      # Pod-level settings aimed at the Pod Security "restricted" profile.
      securityContext:
        seccompProfile:
          type: RuntimeDefault
        runAsUser: 65534
        runAsNonRoot: true
      containers:
        - name: event-exporter
          image: ghcr.io/resmoio/kubernetes-event-exporter:v1.7
          args:
            - -conf=/etc/event-exporter/config.yaml
          volumeMounts:
            - name: config
              mountPath: /etc/event-exporter
              readOnly: true
          resources:
            limits:
              cpu: 100m
              memory: 64Mi
            requests:
              cpu: 20m
              memory: 32Mi
          # Container-level hardening: no privilege escalation, immutable
          # root filesystem, no Linux capabilities.
          securityContext:
            capabilities:
              drop:
                - "ALL"
            readOnlyRootFilesystem: true
            allowPrivilegeEscalation: false

View File

@@ -0,0 +1,226 @@
# =============================================================================
# OTEL Collector DaemonSet - Phase O-2.1
# =============================================================================
# 建立者: Claude Code (首席架構師)
# 日期: 2026-04-02 (台北時間)
# 用途: 收集所有 K3s 節點的 Pod Log,統一送往 SigNoz ClickHouse
# 架構決策: SigNoz 統一派 (不另裝 Loki)
# =============================================================================
---
# Namespace for the observability stack (log collector, event exporter).
apiVersion: v1
kind: Namespace
metadata:
  name: observability
  labels:
    app.kubernetes.io/name: observability
    # Phase O-2.1: Pod Security Admission is set to "privileged" for all
    # three modes because the log-collector DaemonSet runs as root to read
    # /var/log/pods. The original author describes this as the common
    # approach for OTEL Collector / Promtail / Fluent Bit.
    pod-security.kubernetes.io/warn: privileged
    pod-security.kubernetes.io/audit: privileged
    pod-security.kubernetes.io/enforce: privileged
---
# Identity the otel-collector DaemonSet pods run as.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: observability
  name: otel-collector
---
# Cluster-wide read-only access to Pods, Namespaces and Nodes.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: otel-collector
rules:
  # Intended to let the collector look up Pod information for k8s
  # attribute enrichment.
  # NOTE(review): the collector config in this file derives namespace, pod
  # and container from the log file path and defines no k8sattributes
  # processor, so these permissions look unused - confirm before keeping.
  - apiGroups:
      - ""
    resources:
      - pods
      - namespaces
      - nodes
    verbs:
      - get
      - watch
      - list
---
# Grants the otel-collector ClusterRole to the otel-collector
# ServiceAccount in the observability namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: otel-collector
subjects:
  - kind: ServiceAccount
    namespace: observability
    name: otel-collector
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: otel-collector
---
# OpenTelemetry Collector configuration for the otel-collector DaemonSet:
# tail Pod log files on the node, parse the CRI line format, attach k8s
# metadata taken from the file path, and export logs over OTLP/HTTP.
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-collector-config
  namespace: observability
data:
  config.yaml: |
    extensions:
      # Persist filelog read offsets so a restarted container resumes where
      # it stopped. Without a storage extension the receiver keeps no
      # checkpoint and, with `start_at: end`, skips every line written
      # while it was down. /tmp is the writable emptyDir mounted by the
      # DaemonSet (offsets survive container restarts, not Pod deletion).
      file_storage:
        directory: /tmp
    receivers:
      # Tail stdout/stderr logs of every Pod on this K3s node.
      filelog:
        include:
          - /var/log/pods/*/*/*.log
        exclude:
          # Skip the collector's own logs (prevents a feedback loop).
          - /var/log/pods/observability_otel-collector*/**/*.log
        start_at: end
        storage: file_storage
        include_file_path: true
        include_file_name: false
        operators:
          # K3s writes the CRI log format: <timestamp> <stream> <flags> <log>
          - type: regex_parser
            id: cri_parser
            regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<flags>[^ ]*) (?P<log>.*)$'
            timestamp:
              parse_from: attributes.time
              # Example CRI timestamp: 2026-04-02T13:58:56.472854397+08:00
              # Go native layout (not strptime). `Z07:00` accepts both a
              # numeric offset (+08:00) and a literal `Z` (UTC nodes); a
              # plain `-07:00` layout fails to parse timestamps ending in Z.
              layout_type: gotime
              layout: '2006-01-02T15:04:05.999999999Z07:00'
          # Extract namespace / pod / container from the log file path.
          - type: regex_parser
            id: extract_metadata
            parse_from: attributes["log.file.path"]
            regex: '^/var/log/pods/(?P<namespace>[^_]+)_(?P<pod>[^_]+)_[^/]+/(?P<container>[^/]+)/'
          # Promote the extracted values to resource attributes.
          - type: move
            from: attributes.namespace
            to: resource["k8s.namespace.name"]
          - type: move
            from: attributes.pod
            to: resource["k8s.pod.name"]
          - type: move
            from: attributes.container
            to: resource["k8s.container.name"]
          # Use the parsed message as the log body.
          - type: move
            from: attributes.log
            to: body
    processors:
      # Batch before export to reduce network overhead.
      batch:
        send_batch_size: 200
        send_batch_max_size: 500
        timeout: 5s
      # Drop low-value logs (reduces ClickHouse storage pressure).
      filter/drop_noise:
        logs:
          exclude:
            match_type: regexp
            # High-frequency debug output from kube-system.
            resource_attributes:
              - key: k8s.namespace.name
                value: "kube-system"
            bodies:
              - ".*level=debug.*"
              - ".*Listing and watching.*"
              - ".*reflector\\.go.*"
      # Memory guard to avoid OOM kills.
      memory_limiter:
        check_interval: 5s
        limit_mib: 100
        spike_limit_mib: 30
      # Attach node and environment information.
      resource:
        attributes:
          - key: k8s.node.name
            value: "${env:K8S_NODE_NAME}"
            action: upsert
          - key: deployment.environment
            value: "prod"
            action: upsert
    exporters:
      # Send to the SigNoz OTEL Collector over OTLP/HTTP.
      otlphttp:
        endpoint: http://192.168.0.188:24318
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
        sending_queue:
          enabled: true
          num_consumers: 2
          queue_size: 100
    service:
      extensions: [file_storage]
      pipelines:
        logs:
          receivers: [filelog]
          processors: [memory_limiter, filter/drop_noise, resource, batch]
          exporters: [otlphttp]
---
# Runs one OpenTelemetry Collector per node to tail Pod logs from the host's
# /var/log/pods directory, using the otel-collector-config ConfigMap.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: otel-collector
  namespace: observability
  labels:
    app.kubernetes.io/name: otel-collector
    app.kubernetes.io/component: log-collector
    phase: o-2
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: otel-collector
  template:
    metadata:
      labels:
        app.kubernetes.io/name: otel-collector
        app.kubernetes.io/component: log-collector
    spec:
      serviceAccountName: otel-collector
      # The observability namespace uses the "privileged" Pod Security level.
      # Reason given by the original author: /var/log/pods is root:root 0750,
      # so the filelog receiver has to read it as root.
      securityContext:
        runAsUser: 0
        runAsGroup: 0
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: otel-collector
          image: otel/opentelemetry-collector-contrib:0.96.0
          args:
            - --config=/etc/otel/config.yaml
          env:
            # Node name, referenced by the collector config as
            # ${env:K8S_NODE_NAME}.
            - name: K8S_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          # Running as UID 0 is only needed for file ownership on the log
          # directory; the process needs no Linux capabilities and must not
          # be able to gain privileges, so both are locked down.
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - "ALL"
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 128Mi
          volumeMounts:
            - name: config
              mountPath: /etc/otel
              readOnly: true
            - name: varlogpods
              mountPath: /var/log/pods
              readOnly: true
            # Writable scratch space (the root filesystem is read-only); any
            # on-disk collector state such as checkpoints has to live here.
            - name: tmp
              mountPath: /tmp
      volumes:
        - name: config
          configMap:
            name: otel-collector-config
        - name: varlogpods
          hostPath:
            path: /var/log/pods
            type: Directory
        - name: tmp
          emptyDir:
            sizeLimit: 50Mi
      tolerations:
        # Also schedule onto control-plane nodes so every node is covered.
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule