P0 #3 (徹底長期修系列) — 把 daily report 的 pod 健康判斷從「ready=False 一律告警」升級到完整 K8s pod lifecycle state machine:

| Phase | 行為 |
|-------|------|
| Succeeded / Completed | 跳過(CronJob/Job 跑完正常) |
| Failed | 必告警 |
| Unknown | 必告警 |
| Pending <5min | 跳過(剛 schedule 合理) |
| Pending >=5min | 告警「image pull / scheduling 卡住」|
| Running ready=True | 健康,跳過 |
| Running ready=False <2min | 跳過(剛起來 probe 還沒過)|
| Running ready=False >=2min | 告警「readiness probe fail / 啟動異常」|
| restarts >=3 | 必告警(無論 phase)|

實作:
- PodInfo 加 start_time: Optional[str](從 .status.startTime)
- _get_pod_status kubectl custom-columns 加 STARTTIME
- _build_warnings 完整 state machine + 閾值常數

regression test (test_heartbeat_pod_state_machine.py 13 個) 覆蓋每個 phase + 邊界條件,含 2026-05-02 統帥截圖鐵證重現(3 個 drift-scanner Succeeded pod 不該觸發「需關注 3 項」假警報)。

Tests: 13 passed (新增 test_heartbeat_pod_state_machine.py)

接續 a38d9112(單純 Succeeded skip),這次徹底處理 Pending/Failed/Unknown + 時間閾值 + 沒 start_time 的保守告警。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
173 lines
6.7 KiB
Python
173 lines
6.7 KiB
Python
"""
P0 #3 K8s pod state machine 測試
2026-05-03 Claude Opus 4.7 + 統帥 ogt

驗證 _build_warnings() 對 5 種 K8s pod phase 的判斷邏輯:
  Pending — 短暫 OK,>5min 警告
  Running — ready=True OK;ready=False 短暫 OK,>2min 警告
  Succeeded/Completed — CronJob 跑完,跳過(不算未就緒)
  Failed — 必告警
  Unknown — 必告警
  restarts >= 3 — 必告警(無關 phase)

加 test 同時保護未來重構不誤砍 K8s lifecycle 處理。
"""
|
||
from datetime import datetime, timedelta, timezone
|
||
|
||
import pytest
|
||
|
||
from src.services.heartbeat_report_service import (
|
||
HeartbeatReport,
|
||
HeartbeatReportService,
|
||
PodInfo,
|
||
)
|
||
|
||
|
||
def _make_report(pods: list[PodInfo]) -> HeartbeatReport:
    """Build a minimal HeartbeatReport for tests.

    Only ``timestamp`` (the current UTC time) and ``pods`` are passed to the
    constructor; no other field is set here.

    Args:
        pods: Pod records to attach to the report.

    Returns:
        The freshly constructed HeartbeatReport.
    """
    captured_at = datetime.now(tz=timezone.utc)
    report = HeartbeatReport(timestamp=captured_at, pods=pods)
    return report
|
||
|
||
|
||
def _start_time(minutes_ago: int) -> str:
|
||
"""構造 N 分鐘前的 ISO 8601 startTime(K8s 格式)"""
|
||
dt = datetime.now(timezone.utc) - timedelta(minutes=minutes_ago)
|
||
return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
|
||
|
||
|
||
class TestPodStateMachine:
    """Cover the full K8s pod phase state machine of ``_build_warnings``.

    Every test builds one ``PodInfo``, wraps it in a minimal report and
    inspects the warnings the service returns for it.
    """

    def setup_method(self):
        # pytest invokes this before each test method.
        self.svc = HeartbeatReportService()

    def _warnings_for(self, pod):
        """Return the warnings produced for a report containing only ``pod``."""
        report = _make_report([pod])
        return self.svc._build_warnings(report)

    # --- Succeeded / Completed: CronJob finished, must not count as not-ready ---

    def test_succeeded_pod_no_warning(self):
        warnings = self._warnings_for(
            PodInfo(name="drift-scanner-abc", ready=False, status="Succeeded")
        )
        assert all("drift-scanner" not in w for w in warnings), (
            f"Succeeded pod 不該被當未就緒,實際 warnings: {warnings}"
        )

    def test_completed_pod_no_warning(self):
        warnings = self._warnings_for(
            PodInfo(name="job-xyz", ready=False, status="Completed")
        )
        assert all("job-xyz" not in w for w in warnings)

    # --- Failed: always warns ---

    def test_failed_pod_warns(self):
        warnings = self._warnings_for(
            PodInfo(name="api-bad", ready=False, status="Failed")
        )
        assert any(("api-bad" in w) and ("Failed" in w) for w in warnings), (
            f"Failed pod 必告警,實際 warnings: {warnings}"
        )

    # --- Unknown: always warns ---

    def test_unknown_pod_warns(self):
        warnings = self._warnings_for(
            PodInfo(name="api-unknown", ready=False, status="Unknown")
        )
        assert any(("api-unknown" in w) and ("Unknown" in w) for w in warnings)

    # --- Pending: fine briefly, warns after >5min ---

    def test_pending_short_no_warning(self):
        # A Pending pod created one minute ago should not warn.
        pod = PodInfo(
            name="api-starting",
            ready=False,
            status="Pending",
            start_time=_start_time(minutes_ago=1),
        )
        warnings = self._warnings_for(pod)
        assert all("api-starting" not in w for w in warnings), (
            f"Pending <5min 不該告警,實際: {warnings}"
        )

    def test_pending_long_warns(self):
        # Pending for 10 minutes -> image pull / scheduling is stuck.
        pod = PodInfo(
            name="api-stuck",
            ready=False,
            status="Pending",
            start_time=_start_time(minutes_ago=10),
        )
        warnings = self._warnings_for(pod)
        assert any(("api-stuck" in w) and ("Pending" in w) for w in warnings), (
            f"Pending >5min 必告警,實際: {warnings}"
        )

    def test_pending_no_starttime_warns(self):
        # Without a start_time the expected behavior is a conservative warning.
        pod = PodInfo(
            name="api-no-time",
            ready=False,
            status="Pending",
            start_time=None,
        )
        warnings = self._warnings_for(pod)
        assert any("api-no-time" in w for w in warnings)

    # --- Running ready=True: healthy, skipped ---

    def test_running_ready_no_warning(self):
        pod = PodInfo(
            name="api-healthy",
            ready=True,
            status="Running",
            start_time=_start_time(minutes_ago=60),
        )
        warnings = self._warnings_for(pod)
        assert all("api-healthy" not in w for w in warnings), (
            f"Running+Ready 不該告警,實際: {warnings}"
        )

    # --- Running ready=False: fine briefly, warns after >2min ---

    def test_running_not_ready_short_no_warning(self):
        # Running but not ready for one minute (just started, probe not passed yet).
        pod = PodInfo(
            name="api-warming",
            ready=False,
            status="Running",
            start_time=_start_time(minutes_ago=1),
        )
        warnings = self._warnings_for(pod)
        assert all("api-warming" not in w for w in warnings), (
            f"Running NotReady <2min 不該告警,實際: {warnings}"
        )

    def test_running_not_ready_long_warns(self):
        # Running but not ready for five minutes -> readiness probe failing.
        pod = PodInfo(
            name="api-stale",
            ready=False,
            status="Running",
            start_time=_start_time(minutes_ago=5),
        )
        warnings = self._warnings_for(pod)
        assert any(("api-stale" in w) and ("NotReady" in w) for w in warnings), (
            f"Running NotReady >2min 必告警,實際: {warnings}"
        )

    # --- restarts >= 3: always warns, regardless of phase ---

    def test_high_restarts_warns_even_if_running(self):
        pod = PodInfo(
            name="api-flaky",
            ready=True,
            status="Running",
            start_time=_start_time(minutes_ago=30),
            restarts=5,
        )
        warnings = self._warnings_for(pod)
        assert any(("api-flaky" in w) and ("重啟" in w) for w in warnings)

    def test_high_restarts_warns_for_succeeded_pod(self):
        # Even a Succeeded pod should warn on too many restarts
        # (it crash-looped before finally completing).
        pod = PodInfo(
            name="job-crashy",
            ready=False,
            status="Succeeded",
            restarts=10,
        )
        warnings = self._warnings_for(pod)
        assert any(("job-crashy" in w) and ("重啟" in w) for w in warnings)
|
||
|
||
|
||
class TestRegression:
    """Regression guard: the daily report must not repeat the same false positive for 24h on end."""

    def setup_method(self):
        # pytest invokes this before each test method.
        self.svc = HeartbeatReportService()

    def test_3_drift_scanner_succeeded_pods_zero_warning(self):
        """Reproduce the 2026-05-02 screenshot evidence.

        Three drift-scanner pods in phase Succeeded triggered a daily
        "needs attention: 3 items" (「需關注 3 項」) false alarm. With the fix
        in place the same fixture must yield no drift-scanner warning.
        """
        pods = [
            PodInfo(
                name=f"drift-scanner-{i}-pvbst",
                ready=False,
                status="Succeeded",
            )
            for i in range(3)
        ]
        report = _make_report(pods)
        warnings = self.svc._build_warnings(report)
        assert all("drift-scanner" not in w for w in warnings), (
            f"Codex stash + 我的修法後 Succeeded CronJob pod 不該告警,實際: {warnings}"
        )
|