feat(heartbeat): full K8s pod lifecycle state machine + regression tests
Some checks failed
Code Review / ai-code-review (push) Successful in 51s
CD Pipeline / tests (push) Successful in 2m59s
CD Pipeline / build-and-deploy (push) Has started running
CD Pipeline / post-deploy-checks (push) Has been cancelled

P0 #3 (徹底長期修系列) — 把 daily report 的 pod 健康判斷從「ready=False 一律告警」
升級到完整 K8s pod lifecycle state machine:

| Phase | 行為 |
|-------|------|
| Succeeded / Completed | 跳過(CronJob/Job 跑完正常) |
| Failed | 必告警 |
| Unknown | 必告警 |
| Pending <5min | 跳過(剛 schedule 合理) |
| Pending >=5min | 告警「image pull / scheduling 卡住」|
| Running ready=True | 健康,跳過 |
| Running ready=False <2min | 跳過(剛起來 probe 還沒過)|
| Running ready=False >=2min | 告警「readiness probe fail / 啟動異常」|
| restarts >=3 | 必告警(無論 phase)|

實作:
- PodInfo 加 start_time: Optional[str](從 .status.startTime)
- _get_pod_status kubectl custom-columns 加 STARTTIME
- _build_warnings 完整 state machine + 閾值常數

regression test (test_heartbeat_pod_state_machine.py 13 個) 覆蓋每個 phase
+ 邊界條件,含 2026-05-02 統帥截圖鐵證重現(3 個 drift-scanner Succeeded
pod 不該觸發「需關注 3 項」假警報)。

Tests: 13 passed (新增 test_heartbeat_pod_state_machine.py)

接續 a38d9112(單純 Succeeded skip),這次徹底處理 Pending/Failed/Unknown
+ 時間閾值 + 沒 start_time 的保守告警。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-05-03 01:44:58 +08:00
parent f1362fcc8d
commit 2ce722bda9
2 changed files with 233 additions and 11 deletions

View File

@@ -72,6 +72,9 @@ class PodInfo:
ready: bool
status: str
restarts: int = 0
# 2026-05-03 Claude Opus 4.7 + 統帥 ogtP0 #3 K8s pod state machine 完整化
# 加 start_time 才能判斷 Pending/NotReady 是「剛起來合理」還是「卡住該告警」
start_time: Optional[str] = None # ISO 8601 from .status.startTime
@dataclass
@@ -517,7 +520,9 @@ class HeartbeatReportService:
return s
async def _get_pod_status(self) -> list[PodInfo]:
"""查 awoooi-prod namespace 的所有 Pod 狀態"""
"""查 awoooi-prod namespace 的所有 Pod 狀態
2026-05-03 Claude Opus 4.7 + 統帥 ogtP0 #3 加抓 STARTTIME 才能做 K8s state machine 判斷
"""
pods: list[PodInfo] = []
try:
import subprocess
@@ -525,7 +530,8 @@ class HeartbeatReportService:
["kubectl", "-n", "awoooi-prod", "get", "pods",
"--no-headers", "-o",
"custom-columns=NAME:.metadata.name,READY:.status.containerStatuses[0].ready,"
"STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount"],
"STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount,"
"STARTTIME:.status.startTime"],
capture_output=True, text=True, timeout=8,
)
for line in r.stdout.strip().splitlines():
@@ -535,7 +541,11 @@ class HeartbeatReportService:
ready = parts[1].lower() == "true"
status = parts[2]
restarts = int(parts[3]) if len(parts) >= 4 and parts[3].isdigit() else 0
pods.append(PodInfo(name=name, ready=ready, status=status, restarts=restarts))
start_time = parts[4] if len(parts) >= 5 and parts[4] != "<none>" else None
pods.append(PodInfo(
name=name, ready=ready, status=status,
restarts=restarts, start_time=start_time,
))
except Exception as e:
logger.debug("heartbeat_pod_status_failed", error=str(e))
return pods
@@ -700,18 +710,58 @@ class HeartbeatReportService:
if report.alert_pipeline.pending_approval > 10:
warnings.append(f"PENDING 積壓 {report.alert_pipeline.pending_approval} 筆,需人工處理")
# Pod 異常
# 2026-05-02 Claude Opus 4.7 + 統帥 ogtCronJob/Job 跑完的 Pod (Succeeded/Completed)
# ready=False 是設計容器已退出不是異常。原本邏輯每天推「Pod drift-scanner-* 未就緒
# (Succeeded)」3 條 false positive讓統帥誤以為告警重複。
# Pod 異常 — 2026-05-03 Claude Opus 4.7 + 統帥 ogtP0 #3 完整 K8s pod state machine
# K8s pod phases (https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/):
# Pending — 已建立但容器還沒起(短暫 OK>5min 異常 = image pull / scheduling 卡)
# Running — 至少 1 容器跑中ready=False 短暫 OK>2min 異常 = probe fail
# Succeeded — 全容器成功結束CronJob/Job 正常,不算未就緒)
# Failed — 全容器結束,至少 1 fail必告警
# Unknown — 狀態無法取得(必告警)
from datetime import datetime, timezone
_now = datetime.now(timezone.utc)
_PENDING_THRESHOLD_MIN = 5
_NOT_READY_THRESHOLD_MIN = 2
def _age_minutes(start_time: Optional[str]) -> Optional[float]:
    """Convert an ISO 8601 ``startTime`` string into its age in minutes.

    The age is measured against ``_now`` from the enclosing scope.

    Returns:
        Minutes elapsed since ``start_time``, or ``None`` when the value is
        empty/``None`` or cannot be parsed or subtracted.
    """
    if start_time:
        try:
            # K8s startTime looks like 2026-05-03T12:34:56Z; rewrite the
            # trailing "Z" as an explicit UTC offset before parsing.
            started_at = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
            elapsed = _now - started_at
            return elapsed.total_seconds() / 60.0
        except (ValueError, TypeError):
            return None
    return None
for pod in report.pods:
if pod.status in ("Succeeded", "Completed"):
continue # CronJob/Job 跑完是成功,不算未就緒
if not pod.ready:
warnings.append(f"Pod {pod.name} 未就緒({pod.status}")
phase = pod.status
age_min = _age_minutes(pod.start_time)
# restart 次數高無論 phase 都告警CrashLoop 中或跑完都值得追)
# 放最前面,避免後面 continue 跳過
if pod.restarts >= 3:
warnings.append(f"Pod {pod.name} 重啟 {pod.restarts}")
if phase in ("Succeeded", "Completed"):
# CronJob/Job 成功跑完ready=False 是設計phase 部分不算未就緒
continue
elif phase == "Failed":
# 真正失敗 — 一定告警
warnings.append(f"Pod {pod.name} Failed")
elif phase == "Unknown":
warnings.append(f"Pod {pod.name} 狀態 Unknown")
elif phase == "Pending":
# 短暫 Pending OK持續 >5min 表示 image pull / scheduling 卡住
if age_min is None or age_min >= _PENDING_THRESHOLD_MIN:
age_str = f"{age_min:.0f}m" if age_min else "未知"
warnings.append(f"Pod {pod.name} 持續 Pending {age_str}image pull / scheduling 卡住)")
elif phase == "Running" and not pod.ready:
# Running 但 not ready短暫 OK剛起>2min 表示 probe fail / 啟動慢
if age_min is None or age_min >= _NOT_READY_THRESHOLD_MIN:
age_str = f"{age_min:.0f}m" if age_min else "未知"
warnings.append(f"Pod {pod.name} NotReady {age_str}readiness probe fail / 啟動異常)")
# Running + ready=True 是健康狀態,跳過
return warnings

View File

@@ -0,0 +1,172 @@
"""
P0 #3 K8s pod state machine 測試
2026-05-03 Claude Opus 4.7 + 統帥 ogt
驗證 _build_warnings() 對 5 種 K8s pod phase 的判斷邏輯:
Pending — 短暫 OK,>=5min 警告
Running — ready=True OK;ready=False 短暫 OK,>=2min 警告
Succeeded/Completed — CronJob 跑完,跳過(不算未就緒)
Failed — 必告警
Unknown — 必告警
restarts >= 3 — 必告警(無關 phase)
加 test 同時保護未來重構不誤砍 K8s lifecycle 處理。
"""
from datetime import datetime, timedelta, timezone
import pytest
from src.services.heartbeat_report_service import (
HeartbeatReport,
HeartbeatReportService,
PodInfo,
)
def _make_report(pods: list[PodInfo]) -> HeartbeatReport:
    """Build a minimal HeartbeatReport that sets only ``timestamp`` and ``pods``."""
    created_at = datetime.now(timezone.utc)
    report = HeartbeatReport(timestamp=created_at, pods=pods)
    return report
def _start_time(minutes_ago: int) -> str:
    """Return a K8s-style ISO 8601 UTC timestamp for ``minutes_ago`` minutes before now."""
    offset = timedelta(minutes=minutes_ago)
    moment = datetime.now(timezone.utc) - offset
    return moment.strftime("%Y-%m-%dT%H:%M:%SZ")
class TestPodStateMachine:
    """Cover every K8s pod-phase branch handled by ``_build_warnings``."""

    def setup_method(self):
        # Each test gets its own service instance.
        self.svc = HeartbeatReportService()

    # --- Succeeded / Completed: a finished CronJob/Job is not "not ready" ---

    def test_succeeded_pod_no_warning(self):
        finished = PodInfo(name="drift-scanner-abc", ready=False, status="Succeeded")
        report = _make_report([finished])
        warnings = self.svc._build_warnings(report)
        mentions = [w for w in warnings if "drift-scanner" in w]
        assert not mentions, \
            f"Succeeded pod 不該被當未就緒,實際 warnings: {warnings}"

    def test_completed_pod_no_warning(self):
        finished = PodInfo(name="job-xyz", ready=False, status="Completed")
        report = _make_report([finished])
        warnings = self.svc._build_warnings(report)
        assert all("job-xyz" not in w for w in warnings)

    # --- Failed: always warns ---

    def test_failed_pod_warns(self):
        broken = PodInfo(name="api-bad", ready=False, status="Failed")
        report = _make_report([broken])
        warnings = self.svc._build_warnings(report)
        hits = [w for w in warnings if "api-bad" in w and "Failed" in w]
        assert hits, \
            f"Failed pod 必告警,實際 warnings: {warnings}"

    # --- Unknown: always warns ---

    def test_unknown_pod_warns(self):
        lost = PodInfo(name="api-unknown", ready=False, status="Unknown")
        report = _make_report([lost])
        warnings = self.svc._build_warnings(report)
        hits = [w for w in warnings if "api-unknown" in w and "Unknown" in w]
        assert hits

    # --- Pending: fine when brief, warns once it lingers ---

    def test_pending_short_no_warning(self):
        # A pod that has been Pending for one minute must not warn.
        fresh = PodInfo(
            name="api-starting",
            ready=False,
            status="Pending",
            start_time=_start_time(minutes_ago=1),
        )
        report = _make_report([fresh])
        warnings = self.svc._build_warnings(report)
        mentions = [w for w in warnings if "api-starting" in w]
        assert not mentions, \
            f"Pending <5min 不該告警,實際: {warnings}"

    def test_pending_long_warns(self):
        # Pending for ten minutes: image pull / scheduling is stuck.
        stuck = PodInfo(
            name="api-stuck",
            ready=False,
            status="Pending",
            start_time=_start_time(minutes_ago=10),
        )
        report = _make_report([stuck])
        warnings = self.svc._build_warnings(report)
        hits = [w for w in warnings if "api-stuck" in w and "Pending" in w]
        assert hits, \
            f"Pending >5min 必告警,實際: {warnings}"

    def test_pending_no_starttime_warns(self):
        # Without a start_time the service warns conservatively.
        undated = PodInfo(
            name="api-no-time",
            ready=False,
            status="Pending",
            start_time=None,
        )
        report = _make_report([undated])
        warnings = self.svc._build_warnings(report)
        hits = [w for w in warnings if "api-no-time" in w]
        assert hits

    # --- Running ready=True: healthy, no warning ---

    def test_running_ready_no_warning(self):
        healthy = PodInfo(
            name="api-healthy",
            ready=True,
            status="Running",
            start_time=_start_time(minutes_ago=60),
        )
        report = _make_report([healthy])
        warnings = self.svc._build_warnings(report)
        mentions = [w for w in warnings if "api-healthy" in w]
        assert not mentions, \
            f"Running+Ready 不該告警,實際: {warnings}"

    # --- Running ready=False: fine when brief, warns once it lingers ---

    def test_running_not_ready_short_no_warning(self):
        # Running but not ready for one minute (probe has not passed yet).
        warming = PodInfo(
            name="api-warming",
            ready=False,
            status="Running",
            start_time=_start_time(minutes_ago=1),
        )
        report = _make_report([warming])
        warnings = self.svc._build_warnings(report)
        mentions = [w for w in warnings if "api-warming" in w]
        assert not mentions, \
            f"Running NotReady <2min 不該告警,實際: {warnings}"

    def test_running_not_ready_long_warns(self):
        # Running but not ready for five minutes: readiness probe failure.
        stale = PodInfo(
            name="api-stale",
            ready=False,
            status="Running",
            start_time=_start_time(minutes_ago=5),
        )
        report = _make_report([stale])
        warnings = self.svc._build_warnings(report)
        hits = [w for w in warnings if "api-stale" in w and "NotReady" in w]
        assert hits, \
            f"Running NotReady >2min 必告警,實際: {warnings}"

    # --- restarts >= 3: always warns, regardless of phase ---

    def test_high_restarts_warns_even_if_running(self):
        flaky = PodInfo(
            name="api-flaky",
            ready=True,
            status="Running",
            start_time=_start_time(minutes_ago=30),
            restarts=5,
        )
        report = _make_report([flaky])
        warnings = self.svc._build_warnings(report)
        hits = [w for w in warnings if "api-flaky" in w and "重啟" in w]
        assert hits

    def test_high_restarts_warns_for_succeeded_pod(self):
        # Even a Succeeded pod should warn when it restarted too many times.
        crashy = PodInfo(
            name="job-crashy",
            ready=False,
            status="Succeeded",
            restarts=10,
        )
        report = _make_report([crashy])
        warnings = self.svc._build_warnings(report)
        hits = [w for w in warnings if "job-crashy" in w and "重啟" in w]
        assert hits
class TestRegression:
    """Regression guard: the daily report must not repeat the same false positive."""

    def setup_method(self):
        # Each test gets its own service instance.
        self.svc = HeartbeatReportService()

    def test_3_drift_scanner_succeeded_pods_zero_warning(self):
        """Reproduce the 2026-05-02 screenshot case.

        Three Succeeded drift-scanner pods used to raise a daily
        "needs attention: 3 items" false alarm. With the fix, the same
        fixture must produce no warning that mentions drift-scanner.
        """
        pods = []
        for i in range(3):
            pods.append(
                PodInfo(name=f"drift-scanner-{i}-pvbst", ready=False, status="Succeeded")
            )
        report = _make_report(pods)
        warnings = self.svc._build_warnings(report)
        mentions = [w for w in warnings if "drift-scanner" in w]
        assert not mentions, \
            f"Codex stash + 我的修法後 Succeeded CronJob pod 不該告警,實際: {warnings}"