feat(heartbeat): full K8s pod lifecycle state machine + regression tests
P0 #3 (徹底長期修系列) — 把 daily report 的 pod 健康判斷從「ready=False 一律告警」
升級到完整 K8s pod lifecycle state machine:

| Phase | 行為 |
|-------|------|
| Succeeded / Completed | 跳過(CronJob/Job 跑完正常) |
| Failed | 必告警 |
| Unknown | 必告警 |
| Pending <5min | 跳過(剛 schedule 合理) |
| Pending >=5min | 告警「image pull / scheduling 卡住」 |
| Running ready=True | 健康,跳過 |
| Running ready=False <2min | 跳過(剛起來 probe 還沒過) |
| Running ready=False >=2min | 告警「readiness probe fail / 啟動異常」 |
| restarts >=3 | 必告警(無論 phase) |

實作:
- PodInfo 加 start_time: Optional[str](從 .status.startTime)
- _get_pod_status kubectl custom-columns 加 STARTTIME
- _build_warnings 完整 state machine + 閾值常數

regression test (test_heartbeat_pod_state_machine.py 13 個) 覆蓋每個 phase + 邊界條件,含 2026-05-02 統帥截圖鐵證重現(3 個 drift-scanner Succeeded pod 不該觸發「需關注 3 項」假警報)。

Tests: 13 passed (新增 test_heartbeat_pod_state_machine.py)

接續 a38d9112(單純 Succeeded skip),這次徹底處理 Pending/Failed/Unknown + 時間閾值 + 沒 start_time 的保守告警。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -72,6 +72,9 @@ class PodInfo:
|
||||
ready: bool
|
||||
status: str
|
||||
restarts: int = 0
|
||||
# 2026-05-03 Claude Opus 4.7 + 統帥 ogt:P0 #3 K8s pod state machine 完整化
|
||||
# 加 start_time 才能判斷 Pending/NotReady 是「剛起來合理」還是「卡住該告警」
|
||||
start_time: Optional[str] = None # ISO 8601 from .status.startTime
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -517,7 +520,9 @@ class HeartbeatReportService:
|
||||
return s
|
||||
|
||||
async def _get_pod_status(self) -> list[PodInfo]:
|
||||
"""查 awoooi-prod namespace 的所有 Pod 狀態"""
|
||||
"""查 awoooi-prod namespace 的所有 Pod 狀態
|
||||
2026-05-03 Claude Opus 4.7 + 統帥 ogt:P0 #3 加抓 STARTTIME 才能做 K8s state machine 判斷
|
||||
"""
|
||||
pods: list[PodInfo] = []
|
||||
try:
|
||||
import subprocess
|
||||
@@ -525,7 +530,8 @@ class HeartbeatReportService:
|
||||
["kubectl", "-n", "awoooi-prod", "get", "pods",
|
||||
"--no-headers", "-o",
|
||||
"custom-columns=NAME:.metadata.name,READY:.status.containerStatuses[0].ready,"
|
||||
"STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount"],
|
||||
"STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount,"
|
||||
"STARTTIME:.status.startTime"],
|
||||
capture_output=True, text=True, timeout=8,
|
||||
)
|
||||
for line in r.stdout.strip().splitlines():
|
||||
@@ -535,7 +541,11 @@ class HeartbeatReportService:
|
||||
ready = parts[1].lower() == "true"
|
||||
status = parts[2]
|
||||
restarts = int(parts[3]) if len(parts) >= 4 and parts[3].isdigit() else 0
|
||||
pods.append(PodInfo(name=name, ready=ready, status=status, restarts=restarts))
|
||||
start_time = parts[4] if len(parts) >= 5 and parts[4] != "<none>" else None
|
||||
pods.append(PodInfo(
|
||||
name=name, ready=ready, status=status,
|
||||
restarts=restarts, start_time=start_time,
|
||||
))
|
||||
except Exception as e:
|
||||
logger.debug("heartbeat_pod_status_failed", error=str(e))
|
||||
return pods
|
||||
@@ -700,18 +710,58 @@ class HeartbeatReportService:
|
||||
if report.alert_pipeline.pending_approval > 10:
|
||||
warnings.append(f"PENDING 積壓 {report.alert_pipeline.pending_approval} 筆,需人工處理")
|
||||
|
||||
# Pod 異常
|
||||
# 2026-05-02 Claude Opus 4.7 + 統帥 ogt:CronJob/Job 跑完的 Pod (Succeeded/Completed)
|
||||
# ready=False 是設計(容器已退出),不是異常。原本邏輯每天推「Pod drift-scanner-* 未就緒
|
||||
# (Succeeded)」3 條 false positive,讓統帥誤以為告警重複。
|
||||
# Pod 異常 — 2026-05-03 Claude Opus 4.7 + 統帥 ogt:P0 #3 完整 K8s pod state machine
|
||||
# K8s pod phases (https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/):
|
||||
# Pending — 已建立但容器還沒起(短暫 OK,>5min 異常 = image pull / scheduling 卡)
|
||||
# Running — 至少 1 容器跑中(ready=False 短暫 OK,>2min 異常 = probe fail)
|
||||
# Succeeded — 全容器成功結束(CronJob/Job 正常,不算未就緒)
|
||||
# Failed — 全容器結束,至少 1 fail(必告警)
|
||||
# Unknown — 狀態無法取得(必告警)
|
||||
from datetime import datetime, timezone
|
||||
_now = datetime.now(timezone.utc)
|
||||
_PENDING_THRESHOLD_MIN = 5
|
||||
_NOT_READY_THRESHOLD_MIN = 2
|
||||
|
||||
def _age_minutes(start_time: Optional[str]) -> Optional[float]:
    """Convert an ISO 8601 start time into its age in minutes.

    The age is measured against the enclosing ``_now`` timestamp.

    Args:
        start_time: Timestamp text such as ``2026-05-03T12:34:56Z``, or
            ``None`` / empty when no start time is available.

    Returns:
        Elapsed minutes as a float, or ``None`` when ``start_time`` is
        missing or cannot be parsed / compared against ``_now``.
    """
    if start_time:
        try:
            # A trailing "Z" is rewritten as an explicit +00:00 offset
            # before the text is handed to fromisoformat.
            started_at = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
            # Subtracting an offset-less value from the aware ``_now`` raises
            # TypeError, which is folded into the "unknown age" result below.
            elapsed = _now - started_at
        except (ValueError, TypeError):
            return None
        return elapsed.total_seconds() / 60.0
    return None
|
||||
|
||||
for pod in report.pods:
|
||||
if pod.status in ("Succeeded", "Completed"):
|
||||
continue # CronJob/Job 跑完是成功,不算未就緒
|
||||
if not pod.ready:
|
||||
warnings.append(f"Pod {pod.name} 未就緒({pod.status})")
|
||||
phase = pod.status
|
||||
age_min = _age_minutes(pod.start_time)
|
||||
|
||||
# restart 次數高無論 phase 都告警(CrashLoop 中或跑完都值得追)
|
||||
# 放最前面,避免後面 continue 跳過
|
||||
if pod.restarts >= 3:
|
||||
warnings.append(f"Pod {pod.name} 重啟 {pod.restarts} 次")
|
||||
|
||||
if phase in ("Succeeded", "Completed"):
|
||||
# CronJob/Job 成功跑完,ready=False 是設計,phase 部分不算未就緒
|
||||
continue
|
||||
elif phase == "Failed":
|
||||
# 真正失敗 — 一定告警
|
||||
warnings.append(f"Pod {pod.name} Failed")
|
||||
elif phase == "Unknown":
|
||||
warnings.append(f"Pod {pod.name} 狀態 Unknown")
|
||||
elif phase == "Pending":
|
||||
# 短暫 Pending OK;持續 >5min 表示 image pull / scheduling 卡住
|
||||
if age_min is None or age_min >= _PENDING_THRESHOLD_MIN:
|
||||
age_str = f"{age_min:.0f}m" if age_min else "未知"
|
||||
warnings.append(f"Pod {pod.name} 持續 Pending {age_str}(image pull / scheduling 卡住)")
|
||||
elif phase == "Running" and not pod.ready:
|
||||
# Running 但 not ready:短暫 OK(剛起);>2min 表示 probe fail / 啟動慢
|
||||
if age_min is None or age_min >= _NOT_READY_THRESHOLD_MIN:
|
||||
age_str = f"{age_min:.0f}m" if age_min else "未知"
|
||||
warnings.append(f"Pod {pod.name} NotReady {age_str}(readiness probe fail / 啟動異常)")
|
||||
# Running + ready=True 是健康狀態,跳過
|
||||
|
||||
return warnings
|
||||
|
||||
|
||||
|
||||
172
apps/api/tests/test_heartbeat_pod_state_machine.py
Normal file
172
apps/api/tests/test_heartbeat_pod_state_machine.py
Normal file
@@ -0,0 +1,172 @@
|
||||
"""
|
||||
P0 #3 K8s pod state machine 測試
|
||||
2026-05-03 Claude Opus 4.7 + 統帥 ogt
|
||||
|
||||
驗證 _build_warnings() 對 5 種 K8s pod phase 的判斷邏輯:
|
||||
Pending — 短暫 OK,>5min 警告
|
||||
Running — ready=True OK;ready=False 短暫 OK,>2min 警告
|
||||
Succeeded/Completed — CronJob 跑完,跳過(不算未就緒)
|
||||
Failed — 必告警
|
||||
Unknown — 必告警
|
||||
restarts >= 3 — 必告警(無關 phase)
|
||||
|
||||
加 test 同時保護未來重構不誤砍 K8s lifecycle 處理。
|
||||
"""
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
import pytest
|
||||
|
||||
from src.services.heartbeat_report_service import (
|
||||
HeartbeatReport,
|
||||
HeartbeatReportService,
|
||||
PodInfo,
|
||||
)
|
||||
|
||||
|
||||
def _make_report(pods: list[PodInfo]) -> HeartbeatReport:
    """Build a HeartbeatReport that carries only a timestamp and ``pods``.

    Args:
        pods: Pod entries to place on the report.

    Returns:
        A report stamped with the current UTC time; every other field is
        left to the HeartbeatReport constructor.
    """
    created_at = datetime.now(timezone.utc)
    return HeartbeatReport(timestamp=created_at, pods=pods)
|
||||
|
||||
|
||||
def _start_time(minutes_ago: int) -> str:
    """Return a ``YYYY-MM-DDTHH:MM:SSZ`` timestamp ``minutes_ago`` minutes in the past.

    Args:
        minutes_ago: How far back from the current UTC time to go.

    Returns:
        The UTC timestamp formatted to whole seconds with a literal ``Z`` suffix.
    """
    offset = timedelta(minutes=minutes_ago)
    moment = datetime.now(timezone.utc) - offset
    return moment.strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
class TestPodStateMachine:
    """Cover ``_build_warnings`` for each Kubernetes pod phase and the restart rule."""

    def setup_method(self):
        self.svc = HeartbeatReportService()

    def _collect(self, pod):
        """Run ``_build_warnings`` on a report that holds only ``pod``."""
        return self.svc._build_warnings(_make_report([pod]))

    @staticmethod
    def _mentions(messages, *needles):
        """Return True when a single message contains every needle."""
        return any(all(needle in text for needle in needles) for text in messages)

    # --- Succeeded / Completed: a finished CronJob/Job is not "not ready" ---

    def test_succeeded_pod_no_warning(self):
        finished = PodInfo(name="drift-scanner-abc", ready=False, status="Succeeded")
        messages = self._collect(finished)
        assert not self._mentions(messages, "drift-scanner"), (
            f"Succeeded pod 不該被當未就緒,實際 warnings: {messages}"
        )

    def test_completed_pod_no_warning(self):
        finished = PodInfo(name="job-xyz", ready=False, status="Completed")
        messages = self._collect(finished)
        assert not self._mentions(messages, "job-xyz")

    # --- Failed: always warns ---

    def test_failed_pod_warns(self):
        broken = PodInfo(name="api-bad", ready=False, status="Failed")
        messages = self._collect(broken)
        assert self._mentions(messages, "api-bad", "Failed"), (
            f"Failed pod 必告警,實際 warnings: {messages}"
        )

    # --- Unknown: always warns ---

    def test_unknown_pod_warns(self):
        lost = PodInfo(name="api-unknown", ready=False, status="Unknown")
        messages = self._collect(lost)
        assert self._mentions(messages, "api-unknown", "Unknown")

    # --- Pending: fine briefly, warns once it has lasted past the threshold ---

    def test_pending_short_no_warning(self):
        # A pod that has been Pending for one minute must not warn.
        fresh = PodInfo(
            name="api-starting",
            ready=False,
            status="Pending",
            start_time=_start_time(minutes_ago=1),
        )
        messages = self._collect(fresh)
        assert not self._mentions(messages, "api-starting"), (
            f"Pending <5min 不該告警,實際: {messages}"
        )

    def test_pending_long_warns(self):
        # Ten minutes of Pending is reported as stuck image pull / scheduling.
        stuck = PodInfo(
            name="api-stuck",
            ready=False,
            status="Pending",
            start_time=_start_time(minutes_ago=10),
        )
        messages = self._collect(stuck)
        assert self._mentions(messages, "api-stuck", "Pending"), (
            f"Pending >5min 必告警,實際: {messages}"
        )

    def test_pending_no_starttime_warns(self):
        # Without a start time the check warns conservatively.
        timeless = PodInfo(
            name="api-no-time",
            ready=False,
            status="Pending",
            start_time=None,
        )
        messages = self._collect(timeless)
        assert self._mentions(messages, "api-no-time")

    # --- Running and ready: healthy, no warning ---

    def test_running_ready_no_warning(self):
        healthy = PodInfo(
            name="api-healthy",
            ready=True,
            status="Running",
            start_time=_start_time(minutes_ago=60),
        )
        messages = self._collect(healthy)
        assert not self._mentions(messages, "api-healthy"), (
            f"Running+Ready 不該告警,實際: {messages}"
        )

    # --- Running but not ready: fine briefly, warns past the threshold ---

    def test_running_not_ready_short_no_warning(self):
        # One minute of Running/not-ready: the probe has not passed yet.
        warming = PodInfo(
            name="api-warming",
            ready=False,
            status="Running",
            start_time=_start_time(minutes_ago=1),
        )
        messages = self._collect(warming)
        assert not self._mentions(messages, "api-warming"), (
            f"Running NotReady <2min 不該告警,實際: {messages}"
        )

    def test_running_not_ready_long_warns(self):
        # Five minutes of Running/not-ready is reported as a readiness failure.
        stale = PodInfo(
            name="api-stale",
            ready=False,
            status="Running",
            start_time=_start_time(minutes_ago=5),
        )
        messages = self._collect(stale)
        assert self._mentions(messages, "api-stale", "NotReady"), (
            f"Running NotReady >2min 必告警,實際: {messages}"
        )

    # --- restarts >= 3: always warns, regardless of phase ---

    def test_high_restarts_warns_even_if_running(self):
        flaky = PodInfo(
            name="api-flaky",
            ready=True,
            status="Running",
            start_time=_start_time(minutes_ago=30),
            restarts=5,
        )
        messages = self._collect(flaky)
        assert self._mentions(messages, "api-flaky", "重啟")

    def test_high_restarts_warns_for_succeeded_pod(self):
        # Even a Succeeded pod warns when its restart count is high.
        crashy = PodInfo(
            name="job-crashy",
            ready=False,
            status="Succeeded",
            restarts=10,
        )
        messages = self._collect(crashy)
        assert self._mentions(messages, "job-crashy", "重啟")
|
||||
|
||||
|
||||
class TestRegression:
    """Regression guard against the repeated daily false positive for finished pods."""

    def setup_method(self):
        self.svc = HeartbeatReportService()

    def test_3_drift_scanner_succeeded_pods_zero_warning(self):
        """Three Succeeded drift-scanner pods must yield no drift-scanner warning.

        Reproduces the 2026-05-02 report in which these pods were counted as
        three items needing attention.
        """
        finished_jobs = []
        for index in range(3):
            finished_jobs.append(
                PodInfo(
                    name=f"drift-scanner-{index}-pvbst",
                    ready=False,
                    status="Succeeded",
                )
            )
        messages = self.svc._build_warnings(_make_report(finished_jobs))
        offenders = [text for text in messages if "drift-scanner" in text]
        assert not offenders, (
            f"Codex stash + 我的修法後 Succeeded CronJob pod 不該告警,實際: {messages}"
        )
|
||||
Reference in New Issue
Block a user