Files
awoooi/apps/api/tests/test_heartbeat_pod_state_machine.py
Your Name 2ce722bda9
Some checks failed
Code Review / ai-code-review (push) Successful in 51s
CD Pipeline / tests (push) Successful in 2m59s
CD Pipeline / build-and-deploy (push) Has started running
CD Pipeline / post-deploy-checks (push) Has been cancelled
feat(heartbeat): full K8s pod lifecycle state machine + regression tests
P0 #3 (徹底長期修系列) — 把 daily report 的 pod 健康判斷從「ready=False 一律告警」
升級到完整 K8s pod lifecycle state machine:

| Phase | 行為 |
|-------|------|
| Succeeded / Completed | 跳過(CronJob/Job 跑完正常) |
| Failed | 必告警 |
| Unknown | 必告警 |
| Pending <5min | 跳過(剛 schedule 合理) |
| Pending >=5min | 告警「image pull / scheduling 卡住」|
| Running ready=True | 健康,跳過 |
| Running ready=False <2min | 跳過(剛起來 probe 還沒過)|
| Running ready=False >=2min | 告警「readiness probe fail / 啟動異常」|
| restarts >=3 | 必告警(無論 phase)|

實作:
- PodInfo 加 start_time: Optional[str](從 .status.startTime)
- _get_pod_status kubectl custom-columns 加 STARTTIME
- _build_warnings 完整 state machine + 閾值常數

regression test (test_heartbeat_pod_state_machine.py 13 個) 覆蓋每個 phase
+ 邊界條件,含 2026-05-02 統帥截圖鐵證重現(3 個 drift-scanner Succeeded
pod 不該觸發「需關注 3 項」假警報)。

Tests: 13 passed (新增 test_heartbeat_pod_state_machine.py)

接續 a38d9112(單純 Succeeded skip),這次徹底處理 Pending/Failed/Unknown
+ 時間閾值 + 沒 start_time 的保守告警。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 01:44:58 +08:00

173 lines
6.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
P0 #3 K8s pod state machine 測試
2026-05-03 Claude Opus 4.7 + 統帥 ogt
驗證 _build_warnings() 對 5 種 K8s pod phase 的判斷邏輯:
Pending — 短暫 OK>5min 警告
Running — ready=True OKready=False 短暫 OK>2min 警告
Succeeded/Completed — CronJob 跑完,跳過(不算未就緒)
Failed — 必告警
Unknown — 必告警
restarts >= 3 — 必告警(無關 phase
加 test 同時保護未來重構不誤砍 K8s lifecycle 處理。
"""
from datetime import datetime, timedelta, timezone
import pytest
from src.services.heartbeat_report_service import (
HeartbeatReport,
HeartbeatReportService,
PodInfo,
)
def _make_report(pods: list[PodInfo]) -> HeartbeatReport:
    """Return a minimal ``HeartbeatReport`` that sets only ``timestamp`` and ``pods``.

    Args:
        pods: The pod entries to place on the report.

    Returns:
        A report stamped with the current UTC time; every other field is left
        at whatever default ``HeartbeatReport`` defines.
    """
    created_at = datetime.now(timezone.utc)
    return HeartbeatReport(timestamp=created_at, pods=pods)
def _start_time(minutes_ago: int) -> str:
    """Return the UTC instant *minutes_ago* minutes before now as an ISO 8601 string.

    The result uses the ``YYYY-MM-DDTHH:MM:SSZ`` layout (second precision,
    literal ``Z`` suffix), which the tests feed in as a pod ``start_time``.
    """
    offset = timedelta(minutes=minutes_ago)
    moment = datetime.now(timezone.utc) - offset
    # format() on a datetime delegates to strftime with the given pattern.
    return format(moment, "%Y-%m-%dT%H:%M:%SZ")
class TestPodStateMachine:
    """Cover each K8s pod-phase branch of ``_build_warnings``, one pod per test."""

    def setup_method(self):
        # pytest calls this before every test method in the class.
        self.svc = HeartbeatReportService()

    def _warnings_for(self, **pod_fields):
        """Return the warnings for a report holding one pod built from *pod_fields*."""
        single_pod = PodInfo(**pod_fields)
        return self.svc._build_warnings(_make_report([single_pod]))

    @staticmethod
    def _mentions(messages, *fragments):
        """Return True when at least one message contains every one of *fragments*."""
        return any(all(part in msg for part in fragments) for msg in messages)

    # --- Succeeded / Completed: the CronJob finished, so not "not ready" ---

    def test_succeeded_pod_no_warning(self):
        found = self._warnings_for(
            name="drift-scanner-abc", ready=False, status="Succeeded"
        )
        assert not self._mentions(found, "drift-scanner"), (
            f"Succeeded pod 不該被當未就緒,實際 warnings: {found}"
        )

    def test_completed_pod_no_warning(self):
        found = self._warnings_for(name="job-xyz", ready=False, status="Completed")
        assert not self._mentions(found, "job-xyz")

    # --- Failed: must always warn ---

    def test_failed_pod_warns(self):
        found = self._warnings_for(name="api-bad", ready=False, status="Failed")
        assert self._mentions(found, "api-bad", "Failed"), (
            f"Failed pod 必告警,實際 warnings: {found}"
        )

    # --- Unknown: must always warn ---

    def test_unknown_pod_warns(self):
        found = self._warnings_for(name="api-unknown", ready=False, status="Unknown")
        assert self._mentions(found, "api-unknown", "Unknown")

    # --- Pending: fine while brief, warn once past 5 minutes ---

    def test_pending_short_no_warning(self):
        # A pod that has been Pending for only 1 minute should not warn.
        found = self._warnings_for(
            name="api-starting",
            ready=False,
            status="Pending",
            start_time=_start_time(minutes_ago=1),
        )
        assert not self._mentions(found, "api-starting"), (
            f"Pending <5min 不該告警,實際: {found}"
        )

    def test_pending_long_warns(self):
        # Pending for 10 minutes -> image pull / scheduling is stuck.
        found = self._warnings_for(
            name="api-stuck",
            ready=False,
            status="Pending",
            start_time=_start_time(minutes_ago=10),
        )
        assert self._mentions(found, "api-stuck", "Pending"), (
            f"Pending >5min 必告警,實際: {found}"
        )

    def test_pending_no_starttime_warns(self):
        # With no start_time available, warn conservatively.
        found = self._warnings_for(
            name="api-no-time",
            ready=False,
            status="Pending",
            start_time=None,
        )
        assert self._mentions(found, "api-no-time")

    # --- Running with ready=True: healthy, skipped ---

    def test_running_ready_no_warning(self):
        found = self._warnings_for(
            name="api-healthy",
            ready=True,
            status="Running",
            start_time=_start_time(minutes_ago=60),
        )
        assert not self._mentions(found, "api-healthy"), (
            f"Running+Ready 不該告警,實際: {found}"
        )

    # --- Running with ready=False: fine while brief, warn once past 2 minutes ---

    def test_running_not_ready_short_no_warning(self):
        # Running but not ready for 1 minute (just started; probe has not passed yet).
        found = self._warnings_for(
            name="api-warming",
            ready=False,
            status="Running",
            start_time=_start_time(minutes_ago=1),
        )
        assert not self._mentions(found, "api-warming"), (
            f"Running NotReady <2min 不該告警,實際: {found}"
        )

    def test_running_not_ready_long_warns(self):
        # Running but not ready for 5 minutes -> readiness probe failing.
        found = self._warnings_for(
            name="api-stale",
            ready=False,
            status="Running",
            start_time=_start_time(minutes_ago=5),
        )
        assert self._mentions(found, "api-stale", "NotReady"), (
            f"Running NotReady >2min 必告警,實際: {found}"
        )

    # --- restarts >= 3: must warn regardless of phase ---

    def test_high_restarts_warns_even_if_running(self):
        found = self._warnings_for(
            name="api-flaky",
            ready=True,
            status="Running",
            start_time=_start_time(minutes_ago=30),
            restarts=5,
        )
        assert self._mentions(found, "api-flaky", "重啟")

    def test_high_restarts_warns_for_succeeded_pod(self):
        # Even a Succeeded pod should warn on excessive restarts
        # (it crash-looped before finishing).
        found = self._warnings_for(
            name="job-crashy",
            ready=False,
            status="Succeeded",
            restarts=10,
        )
        assert self._mentions(found, "job-crashy", "重啟")
class TestRegression:
    """Regression guard: the daily report must not repeat the same false positive for 24h."""

    def setup_method(self):
        # pytest calls this before every test method in the class.
        self.svc = HeartbeatReportService()

    def test_3_drift_scanner_succeeded_pods_zero_warning(self):
        """Reproduce the 2026-05-02 screenshot case.

        Three Succeeded drift-scanner pods used to raise a daily
        "3 items need attention" false alarm. With the fix, the same fixture
        must produce no warning that names those pods.

        Note: the assertion only checks that no warning mentions
        "drift-scanner"; it does not assert that the warning list is empty.
        """
        finished_jobs = [
            PodInfo(name=f"drift-scanner-{idx}-pvbst", ready=False, status="Succeeded")
            for idx in range(3)
        ]
        report = _make_report(finished_jobs)
        raised = self.svc._build_warnings(report)
        assert all("drift-scanner" not in msg for msg in raised), (
            f"Codex stash + 我的修法後 Succeeded CronJob pod 不該告警,實際: {raised}"
        )