I2: nemotron.analyze() 補上 system role (NIM 標準 message format)
- 舊: messages=[{role:user, ...}]
- 新: messages=[{role:system, ...}, {role:user, ...}]
- 效果: K8s operator 角色定義,改善 tool calling 品質
I4: drift_detector._is_allowlisted/_is_critical 用正則取代 strip
- 舊: replace('[*]','') 後 startswith/in → 無法匹配 containers[0]
- 新: [*] → \[\d+\] 正則,正確匹配所有索引
- 修復: containers[*].image 現在能匹配 containers[0].image
341 lines
11 KiB
Python
341 lines
11 KiB
Python
"""
|
||
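Drift Detector - Phase 25 P2 Config Drift Detection
=====================================================
Responsibility: compare the Git YAML manifests against the actual K8s state
and emit a structured list of DriftItem records.
Does not judge severity and does not interpret intent; it only performs a
factual comparison.

Version: v1.0
Created: 2026-04-04 (Taipei time zone)
Authors: ogt (chief architect, design) + Claude Code (implementation)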
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import subprocess
|
||
import uuid
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import structlog
|
||
import yaml
|
||
|
||
from src.models.drift import DriftItem, DriftLevel, DriftReport
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# Allowlisted fields: drift on these paths is recorded silently (no alert).
# "[*]" stands for any list index, e.g. containers[*] covers containers[0].
_DEFAULT_ALLOWLIST_FIELDS = frozenset({
    "spec.replicas",
    "spec.template.spec.containers[*].resources.requests",
    "spec.template.spec.containers[*].resources.limits",
    "metadata.annotations",
    "metadata.labels.pod-template-hash",
    "metadata.resourceVersion",
    "metadata.generation",
    "metadata.uid",
    "status",
})

# Critical fields: drift on these paths must be alerted on immediately.
_DEFAULT_CRITICAL_FIELDS = frozenset({
    "spec.template.spec.containers[*].image",
    "spec.template.spec.containers[*].env",
    "spec.template.spec.containers[*].ports",
    "spec.template.spec.volumes",
    "spec.template.spec.serviceAccountName",
})
|
||
|
||
|
||
class GitStateReader:
    """Read the desired K8s state from the YAML manifests under ``k8s_dir``.

    The files are read from the local directory tree; this class does not
    invoke git itself, so the checkout is presumably expected to be at the
    revision of interest (NOTE(review): confirm with the caller).
    """

    def __init__(self, k8s_dir: str = "k8s"):
        self._k8s_dir = Path(k8s_dir)

    async def read(self, namespace: str) -> dict[str, Any]:
        """Load every manifest that applies to ``namespace``.

        The blocking file I/O runs in the default executor so the event loop
        is not stalled.

        Args:
            namespace: K8s namespace to select manifests for.

        Returns:
            ``{resource_key: parsed_yaml_dict}`` where ``resource_key`` has
            the form ``"{kind}/{name}"``. An empty dict is returned (after a
            warning is logged) if reading fails for any reason.
        """
        try:
            # get_running_loop() is the supported way to obtain the loop from
            # inside a coroutine.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, self._read_sync, namespace)
        except Exception as e:
            logger.warning("git_state_read_failed", namespace=namespace, error=str(e))
            return {}

    def _read_sync(self, namespace: str) -> dict[str, Any]:
        """Synchronously walk ``k8s_dir`` and index the matching documents.

        Documents that declare a different namespace are skipped; documents
        that declare no namespace are kept. Files that cannot be read or
        parsed are logged at debug level and skipped.
        """
        resources: dict[str, Any] = {}

        if not self._k8s_dir.exists():
            logger.warning("k8s_dir_not_found", path=str(self._k8s_dir))
            return resources

        # Sorted so that, when two files define the same kind/name, the one
        # that wins is the same on every run.
        for yaml_file in sorted(self._k8s_dir.rglob("*.yaml")):
            try:
                # Manifests are read as UTF-8 regardless of the host locale.
                with open(yaml_file, encoding="utf-8") as f:
                    docs = list(yaml.safe_load_all(f))
            except Exception as e:
                logger.debug("yaml_parse_failed", file=str(yaml_file), error=str(e))
                continue

            for doc in docs:
                if not doc or not isinstance(doc, dict):
                    continue
                # "metadata:" with no value parses to None; treat it as empty
                # instead of letting it abort the rest of the file.
                metadata = doc.get("metadata") or {}
                if not isinstance(metadata, dict):
                    continue
                ns = metadata.get("namespace", "")
                if ns and ns != namespace:
                    continue
                kind = doc.get("kind", "")
                name = metadata.get("name", "")
                if kind and name:
                    resources[f"{kind}/{name}"] = doc

        return resources
|
||
|
||
|
||
class K8sStateReader:
    """Read the actual K8s state by running ``kubectl``."""

    # Resource types queried, mapped to the Kind used as a fallback when an
    # item in the kubectl output carries no "kind" of its own. A plain
    # str.capitalize() would produce "Configmap", which does not match the
    # "ConfigMap/<name>" keys built from the Git manifests.
    _KIND_BY_TYPE = {
        "deployment": "Deployment",
        "service": "Service",
        "configmap": "ConfigMap",
        "ingress": "Ingress",
    }

    async def read(self, namespace: str) -> dict[str, Any]:
        """Fetch the live resources of ``namespace`` through kubectl.

        The blocking subprocess calls run in the default executor.

        Args:
            namespace: K8s namespace to query.

        Returns:
            ``{resource_key: actual_resource_dict}`` keyed by
            ``"{kind}/{name}"``. An empty dict is returned (after a warning
            is logged) if reading fails for any reason.
        """
        try:
            # get_running_loop() is the supported way to obtain the loop from
            # inside a coroutine.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, self._read_sync, namespace)
        except Exception as e:
            logger.warning("k8s_state_read_failed", namespace=namespace, error=str(e))
            return {}

    def _read_sync(self, namespace: str) -> dict[str, Any]:
        """Run ``kubectl get <type> -n <namespace> -o yaml`` per resource type.

        A failure for one resource type (non-zero exit, timeout, unparsable
        output) is logged and does not prevent the other types from being
        read.
        """
        resources: dict[str, Any] = {}

        for rtype, default_kind in self._KIND_BY_TYPE.items():
            try:
                # Argument list (no shell), so the namespace is never
                # interpreted by a shell.
                proc = subprocess.run(
                    ["kubectl", "get", rtype, "-n", namespace, "-o", "yaml"],
                    capture_output=True,
                    text=True,
                    timeout=30,
                )
                if proc.returncode != 0:
                    logger.debug("kubectl_failed", type=rtype, stderr=proc.stderr[:200])
                    continue

                data = yaml.safe_load(proc.stdout)
                if not isinstance(data, dict) or data.get("kind") != "List":
                    continue

                # "items: null" and null metadata are tolerated rather than
                # aborting the whole resource type.
                for item in data.get("items") or []:
                    if not isinstance(item, dict):
                        continue
                    kind = item.get("kind") or default_kind
                    name = (item.get("metadata") or {}).get("name", "")
                    if name:
                        resources[f"{kind}/{name}"] = item

            except subprocess.TimeoutExpired:
                logger.warning("kubectl_timeout", type=rtype, namespace=namespace)
            except Exception as e:
                logger.warning("kubectl_error", type=rtype, error=str(e))

        return resources
|
||
|
||
|
||
class DriftDetector:
    """Compare Git manifests with the actual K8s state and emit DriftItems.

    Scope: factual comparison only. Each difference is tagged with a level
    derived from two static pattern sets (allowlist -> INFO, critical -> HIGH,
    anything else -> MEDIUM); no intent analysis is performed here.
    """

    def __init__(
        self,
        k8s_dir: str = "k8s",
        allowlist_fields: frozenset | None = None,
        critical_fields: frozenset | None = None,
    ):
        """Create a detector.

        Args:
            k8s_dir: Directory containing the Git-tracked YAML manifests.
            allowlist_fields: Field patterns reported as INFO. ``None`` selects
                the module defaults; an empty frozenset disables the allowlist.
            critical_fields: Field patterns reported as HIGH. ``None`` selects
                the module defaults; an empty frozenset disables them.
        """
        self._git_reader = GitStateReader(k8s_dir)
        self._k8s_reader = K8sStateReader()
        # Compare against None explicitly: an empty frozenset is a legitimate
        # "no patterns" configuration and must not fall back to the defaults.
        self._allowlist = (
            _DEFAULT_ALLOWLIST_FIELDS if allowlist_fields is None else allowlist_fields
        )
        self._critical_fields = (
            _DEFAULT_CRITICAL_FIELDS if critical_fields is None else critical_fields
        )

    async def scan(self, namespace: str, triggered_by: str = "cron") -> DriftReport:
        """Scan ``namespace`` for drift between Git and the cluster.

        Args:
            namespace: K8s namespace.
            triggered_by: Trigger source (cron / webhook / api).

        Returns:
            A DriftReport holding the DriftItem list and per-level counts
            (intent has not been analysed yet).
        """
        report_id = str(uuid.uuid4())[:8]

        logger.info("drift_scan_start", namespace=namespace, report_id=report_id)

        # Both readers are independent, so they are awaited concurrently.
        git_state, k8s_state = await asyncio.gather(
            self._git_reader.read(namespace),
            self._k8s_reader.read(namespace),
        )

        items: list[DriftItem] = []

        # Only resources that exist in Git are compared.
        for resource_key, git_resource in git_state.items():
            actual_resource = k8s_state.get(resource_key)
            if actual_resource is None:
                # Present in Git but absent from K8s (possibly not deployed yet).
                logger.debug("resource_missing_in_k8s", resource=resource_key)
                continue

            kind, name = resource_key.split("/", 1)
            items.extend(
                self._diff_resources(git_resource, actual_resource, kind, name, namespace)
            )

        high_count = sum(1 for i in items if i.drift_level == DriftLevel.HIGH)
        medium_count = sum(1 for i in items if i.drift_level == DriftLevel.MEDIUM)
        info_count = sum(1 for i in items if i.drift_level == DriftLevel.INFO)

        logger.info(
            "drift_scan_done",
            namespace=namespace,
            report_id=report_id,
            high=high_count,
            medium=medium_count,
            info=info_count,
        )

        return DriftReport(
            report_id=report_id,
            namespace=namespace,
            items=items,
            high_count=high_count,
            medium_count=medium_count,
            info_count=info_count,
            triggered_by=triggered_by,
        )

    def _diff_resources(
        self,
        git_res: dict,
        actual_res: dict,
        kind: str,
        name: str,
        namespace: str,
    ) -> list[DriftItem]:
        """Compare two resources field by field and return the differences."""
        items: list[DriftItem] = []

        # Only the spec section is compared (metadata has too many dynamic fields).
        git_spec = git_res.get("spec", {})
        actual_spec = actual_res.get("spec", {})

        diffs = self._flatten_diff("spec", git_spec, actual_spec)
        for field_path, (git_val, actual_val) in diffs.items():
            is_allowlisted = self._is_allowlisted(field_path)
            # The allowlist takes precedence over the critical set.
            if is_allowlisted:
                level = DriftLevel.INFO
            elif self._is_critical(field_path):
                level = DriftLevel.HIGH
            else:
                level = DriftLevel.MEDIUM

            items.append(DriftItem(
                resource_kind=kind,
                resource_name=name,
                namespace=namespace,
                field_path=field_path,
                git_value=git_val,
                actual_value=actual_val,
                drift_level=level,
                is_allowlisted=is_allowlisted,
            ))

        return items

    def _flatten_diff(
        self,
        prefix: str,
        git_dict: Any,
        actual_dict: Any,
    ) -> dict[str, tuple[Any, Any]]:
        """Recursively compare two values and return the leaf differences.

        Mappings are descended by key (``prefix.key``) and lists by position
        (``prefix[0]``), so that indexed patterns such as
        ``containers[*].image`` can match the produced paths. A key or index
        present on only one side is reported with ``None`` for the other side.

        Returns:
            ``{field_path: (git_val, actual_val)}`` in a deterministic order.
        """
        diffs: dict[str, tuple[Any, Any]] = {}

        if isinstance(git_dict, dict) and isinstance(actual_dict, dict):
            # Sorted (by str, in case of non-string YAML keys) so the result
            # order does not depend on set iteration order.
            for key in sorted(set(git_dict) | set(actual_dict), key=str):
                git_val = git_dict.get(key)
                actual_val = actual_dict.get(key)
                if git_val != actual_val:
                    diffs.update(
                        self._flatten_diff(f"{prefix}.{key}", git_val, actual_val)
                    )
        elif isinstance(git_dict, list) and isinstance(actual_dict, list):
            git_len, actual_len = len(git_dict), len(actual_dict)
            for index in range(max(git_len, actual_len)):
                git_val = git_dict[index] if index < git_len else None
                actual_val = actual_dict[index] if index < actual_len else None
                if git_val != actual_val:
                    diffs.update(
                        self._flatten_diff(f"{prefix}[{index}]", git_val, actual_val)
                    )
        elif git_dict != actual_dict:
            diffs[prefix] = (git_dict, actual_dict)

        return diffs

    @staticmethod
    def _pattern_matches(pattern: str, field_path: str) -> bool:
        """Return whether ``field_path`` is covered by ``pattern``.

        Two wildcards are supported:
        - ``[*]`` matches any list index (``containers[*]`` matches
          ``containers[0]``, ``containers[1]``, ...)
        - ``*`` matches one path segment (any run of characters without ``.``)

        The pattern acts as a prefix: a deeper ``field_path`` also matches as
        long as the pattern ends on a segment boundary (``.``, ``[`` or the
        end of the string).
        """
        import re as _re

        # Translate the pattern to a regex: [*] -> \[\d+\], * -> [^.]+
        regex = _re.escape(pattern)
        regex = regex.replace(r"\[\*\]", r"\[\d+\]")
        regex = regex.replace(r"\*", r"[^.]+")
        # The trailing group enforces the segment boundary, so that
        # "spec.replicas" does not match "spec.replicasFoo".
        return bool(_re.match(f"^{regex}(\\.|\\[|$)", field_path))

    def _is_allowlisted(self, field_path: str) -> bool:
        """Return whether the field is allowlisted (recorded silently, no alert)."""
        return any(self._pattern_matches(p, field_path) for p in self._allowlist)

    def _is_critical(self, field_path: str) -> bool:
        """Return whether the field is a critical field (HIGH level)."""
        return any(self._pattern_matches(p, field_path) for p in self._critical_fields)
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
_detector: DriftDetector | None = None


def get_drift_detector() -> DriftDetector:
    """Return the module-wide DriftDetector, creating it on first use."""
    global _detector
    if _detector is not None:
        return _detector
    # First call: build the detector with its default settings and keep it.
    _detector = DriftDetector()
    return _detector
|