debugger 全 codebase 追根溯源後揪出 5 處 PROMETHEUS_URL drift 殘存
(根因:docs/reference/SERVICE-ENDPOINTS.md 早期把 Prometheus 標在 188
是整個 codebase drift 的源頭)。

本次修最急的 2 處:

## 🔴🔴 kured.yaml:132(守門員失效風險)
- 188 → 110
- kured 跑 reboot 前會查 Prometheus alerts,連錯主機 = 跳過保護直接 reboot 主機
- 對齊 ConfigMap + config.py PROMETHEUS_URL

## 🟡 monitoring.py:67(單一事實源)
- 寫死 110:9090 改用 settings.PROMETHEUS_URL
- 主機巧合正確但繞過 ConfigMap 注入機制
- 未來 Prometheus 再遷移避免再次 drift

## 暫不修
- k3s_monitor_service.py:38 用 121:30090 是 K3s NodePort 內網端點
  與外部 PROMETHEUS_URL 概念不同,需新增 PROMETHEUS_INTERNAL_URL setting
- 其他 docstring + 文件 drift(SERVICE-ENDPOINTS.md 等)留待後續

## 驗證
1552 unit tests 全綠(無回歸)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
150 lines
3.8 KiB
YAML
150 lines
3.8 KiB
YAML
# =============================================================================
|
|
# Kured (KUbernetes REboot Daemon)
|
|
# =============================================================================
|
|
# K4.1 2026-03-28: Automatic node reboot for OS updates
|
|
# Deployed by: Claude Code (首席架構師)
|
|
# Maintenance window: 02:00-04:00 (UTC+8 台北時間)
|
|
# =============================================================================
|
|
---
|
|
# Dedicated namespace holding every namespaced kured object in this manifest
# (ServiceAccount, Role, RoleBinding, DaemonSet).
apiVersion: v1
kind: Namespace
metadata:
  labels:
    app.kubernetes.io/name: kured
  name: kured
|
|
---
|
|
# Identity the kured DaemonSet pods run as (referenced by serviceAccountName
# in the DaemonSet and by the subjects of both bindings in this manifest).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: kured
  name: kured
|
|
---
|
|
# Cluster-scoped permissions for kured: get/patch nodes, list/get/delete pods,
# get DaemonSets, and create pod evictions.
# NOTE(review): presumably these back kured's cordon/drain/uncordon flow;
# confirm against the upstream kured RBAC manifest before trimming anything.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kured
rules:
  # Node objects: read and patch.
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - patch
  # Pods: enumerate, read and delete.
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - list
      - delete
      - get
  # DaemonSets (apps group): read-only at cluster scope.
  - apiGroups:
      - apps
    resources:
      - daemonsets
    verbs:
      - get
  # Eviction subresource: create only.
  - apiGroups:
      - ""
    resources:
      - pods/eviction
    verbs:
      - create
|
|
---
|
|
# Grants the cluster-scoped "kured" ClusterRole to the kured ServiceAccount
# in the kured namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kured
subjects:
  - kind: ServiceAccount
    namespace: kured
    name: kured
roleRef:
  kind: ClusterRole
  name: kured
  apiGroup: rbac.authorization.k8s.io
|
|
---
|
|
# Namespaced permission to update a DaemonSet inside the kured namespace.
# The rule is pinned with resourceNames to the single DaemonSet named "kured"
# (the one this manifest passes to kured via --ds-namespace / --ds-name), so
# the ServiceAccount cannot modify any other DaemonSet that may later be
# deployed into this namespace.
# NOTE(review): upstream kured documents that it stores its reboot lock on
# its own DaemonSet, which is why only that object needs "update"; confirm
# if --ds-name is ever changed, because resourceNames must change with it.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: kured
  namespace: kured
rules:
  - apiGroups: ["apps"]
    resources: ["daemonsets"]
    resourceNames: ["kured"]
    verbs: ["update"]
|
|
---
|
|
# Grants the namespaced "kured" Role to the kured ServiceAccount, both in the
# kured namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: kured
  name: kured
subjects:
  - kind: ServiceAccount
    namespace: kured
    name: kured
roleRef:
  kind: Role
  name: kured
  apiGroup: rbac.authorization.k8s.io
|
|
---
|
|
# kured DaemonSet: runs /usr/bin/kured as a privileged, hostPID pod on every
# node. The two tolerations below also admit nodes carrying the
# control-plane / master NoSchedule taints.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  namespace: kured
  name: kured
  labels:
    app.kubernetes.io/version: "1.15.1"
    app.kubernetes.io/name: kured
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      app.kubernetes.io/name: kured
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kured
    spec:
      serviceAccountName: kured
      restartPolicy: Always
      hostPID: true
      tolerations:
        - effect: NoSchedule
          operator: Exists
          key: node-role.kubernetes.io/control-plane
        - effect: NoSchedule
          operator: Exists
          key: node-role.kubernetes.io/master
      volumes:
        # Host /var/run, exposed so the container sees the reboot sentinel.
        - name: hostrun
          hostPath:
            path: /var/run
      containers:
        - name: kured
          image: ghcr.io/kubereboot/kured:1.15.1
          imagePullPolicy: IfNotPresent
          securityContext:
            privileged: true
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
          env:
            # Name of the node this pod was scheduled onto.
            - name: KURED_NODE_ID
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          volumeMounts:
            - mountPath: /var/run
              name: hostrun
          command:
            - /usr/bin/kured
            # Sentinel file path; resolved through the hostrun mount above.
            - --reboot-sentinel=/var/run/reboot-required
            # Location of this DaemonSet (setting these fixed a
            # CrashLoopBackOff).
            - --ds-namespace=kured
            - --ds-name=kured
            # Maintenance window: 02:00-04:00 Taipei time (UTC+8).
            - --start-time=02:00
            - --end-time=04:00
            - --time-zone=Asia/Taipei
            # Interval: 1 hour.
            # NOTE(review): the original comment called this the reboot
            # interval; upstream describes --period as the sentinel check
            # period - confirm which is intended.
            - --period=1h
            # Originally labelled "PodDisruptionBudget check".
            # NOTE(review): upstream documents this flag as a label selector
            # for pods whose presence on a node blocks its reboot, not a PDB
            # lookup - confirm intent.
            - --blocking-pod-selector=app.kubernetes.io/name=awoooi
            # Prometheus endpoint kured consults for alerts before rebooting
            # (originally labelled "Prometheus metrics").
            # 2026-04-29 ogt + Claude Opus 4.7: drift fix - 188 is the Ollama
            # Hub; Prometheus actually runs on 110. Pointing kured at the
            # wrong host means the alert gatekeeper is skipped and the node
            # is rebooted directly.
            # Keep aligned with ConfigMap 04-configmap.yaml and config.py
            # PROMETHEUS_URL.
            - --prometheus-url=http://192.168.0.110:9090
|