diff --git a/apps/api/src/services/heartbeat_report_service.py b/apps/api/src/services/heartbeat_report_service.py
index 338985e9..4cc58ad8 100644
--- a/apps/api/src/services/heartbeat_report_service.py
+++ b/apps/api/src/services/heartbeat_report_service.py
@@ -72,6 +72,9 @@ class PodInfo:
     ready: bool
     status: str
     restarts: int = 0
+    # 2026-05-03 Claude Opus 4.7 + Commander ogt: P0 #3 complete the K8s pod state machine
+    # start_time tells a Pending/NotReady pod that just started (fine) from one that is stuck (alert)
+    start_time: Optional[str] = None  # ISO 8601 from .status.startTime

 
 @dataclass
@@ -517,7 +520,9 @@ class HeartbeatReportService:
         return s
 
     async def _get_pod_status(self) -> list[PodInfo]:
-        """查 awoooi-prod namespace 的所有 Pod 狀態"""
+        """Query the status of every Pod in the awoooi-prod namespace.
+        2026-05-03 Claude Opus 4.7 + Commander ogt: P0 #3 also fetch STARTTIME for the K8s state machine
+        """
         pods: list[PodInfo] = []
         try:
             import subprocess
@@ -525,7 +530,8 @@ class HeartbeatReportService:
                 ["kubectl", "-n", "awoooi-prod", "get", "pods",
                  "--no-headers", "-o",
                  "custom-columns=NAME:.metadata.name,READY:.status.containerStatuses[0].ready,"
-                 "STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount"],
+                 "STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount,"
+                 "STARTTIME:.status.startTime"],
                 capture_output=True, text=True, timeout=8,
             )
             for line in r.stdout.strip().splitlines():
@@ -535,7 +541,12 @@ class HeartbeatReportService:
                     ready = parts[1].lower() == "true"
                     status = parts[2]
                     restarts = int(parts[3]) if len(parts) >= 4 and parts[3].isdigit() else 0
-                    pods.append(PodInfo(name=name, ready=ready, status=status, restarts=restarts))
+                    # kubectl prints "<none>" for a missing field; str.split() never yields ""
+                    start_time = parts[4] if len(parts) >= 5 and parts[4] != "<none>" else None
+                    pods.append(PodInfo(
+                        name=name, ready=ready, status=status,
+                        restarts=restarts, start_time=start_time,
+                    ))
         except Exception as e:
             logger.debug("heartbeat_pod_status_failed", error=str(e))
         return pods
@@ -700,18 +711,58 @@ class HeartbeatReportService:
         if report.alert_pipeline.pending_approval > 10:
             warnings.append(f"PENDING 積壓 {report.alert_pipeline.pending_approval} 筆,需人工處理")
 
-        # Pod 異常
-        # 2026-05-02 Claude Opus 4.7 + 統帥 ogt:CronJob/Job 跑完的 Pod (Succeeded/Completed)
-        # ready=False 是設計(容器已退出),不是異常。原本邏輯每天推「Pod drift-scanner-* 未就緒
-        # (Succeeded)」3 條 false positive,讓統帥誤以為告警重複。
+        # Pod anomalies — 2026-05-03 Claude Opus 4.7 + Commander ogt: P0 #3 full K8s pod state machine
+        # K8s pod phases (https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/):
+        #   Pending   — created but containers not started (briefly OK; >5min = image pull / scheduling stuck)
+        #   Running   — at least one container running (ready=False briefly OK; >2min = probe failing)
+        #   Succeeded — all containers exited successfully (normal for CronJob/Job; not "not ready")
+        #   Failed    — all containers exited and at least one failed (always alert)
+        #   Unknown   — state could not be obtained (always alert)
+        from datetime import datetime, timezone
+        _now = datetime.now(timezone.utc)
+        _PENDING_THRESHOLD_MIN = 5
+        _NOT_READY_THRESHOLD_MIN = 2
+
+        def _age_minutes(start_time: Optional[str]) -> Optional[float]:
+            """Minutes elapsed since an ISO 8601 startTime; None when missing or unparseable."""
+            if not start_time:
+                return None
+            try:
+                # K8s startTime format: 2026-05-03T12:34:56Z
+                dt = datetime.fromisoformat(start_time.replace("Z", "+00:00"))
+                return (_now - dt).total_seconds() / 60.0
+            except (ValueError, TypeError):
+                return None
+
         for pod in report.pods:
-            if pod.status in ("Succeeded", "Completed"):
-                continue  # CronJob/Job 跑完是成功,不算未就緒
-            if not pod.ready:
-                warnings.append(f"Pod {pod.name} 未就緒({pod.status})")
+            phase = pod.status
+            age_min = _age_minutes(pod.start_time)
+
+            # A high restart count alerts regardless of phase (crash-looping or finished, both worth a look)
+            # Kept first so the `continue` below cannot skip it
             if pod.restarts >= 3:
                 warnings.append(f"Pod {pod.name} 重啟 {pod.restarts} 次")
 
+            if phase in ("Succeeded", "Completed"):
+                # CronJob/Job finished successfully; ready=False is by design, so no phase warning
+                continue
+            elif phase == "Failed":
+                # Genuine failure — always alert
+                warnings.append(f"Pod {pod.name} Failed")
+            elif phase == "Unknown":
+                warnings.append(f"Pod {pod.name} 狀態 Unknown")
+            elif phase == "Pending":
+                # Briefly Pending is OK; 5min or more means image pull / scheduling is stuck
+                if age_min is None or age_min >= _PENDING_THRESHOLD_MIN:
+                    age_str = f"{age_min:.0f}m" if age_min is not None else "未知"
+                    warnings.append(f"Pod {pod.name} 持續 Pending {age_str}(image pull / scheduling 卡住)")
+            elif phase == "Running" and not pod.ready:
+                # Running but not ready: briefly OK (just started); 2min or more means probe failure / slow startup
+                if age_min is None or age_min >= _NOT_READY_THRESHOLD_MIN:
+                    age_str = f"{age_min:.0f}m" if age_min is not None else "未知"
+                    warnings.append(f"Pod {pod.name} NotReady {age_str}(readiness probe fail / 啟動異常)")
+            # Running + ready=True is healthy; nothing to report
+
         return warnings
 
 
diff --git a/apps/api/tests/test_heartbeat_pod_state_machine.py b/apps/api/tests/test_heartbeat_pod_state_machine.py
new file mode 100644
index 00000000..38f437fe
--- /dev/null
+++ b/apps/api/tests/test_heartbeat_pod_state_machine.py
@@ -0,0 +1,172 @@
+"""
+Tests for the P0 #3 K8s pod state machine
+2026-05-03 Claude Opus 4.7 + Commander ogt
+
+Verifies how _build_warnings() judges the five K8s pod phases:
+  Pending — briefly OK, warn after >5min
+  Running — ready=True OK; ready=False briefly OK, warn after >2min
+  Succeeded/Completed — finished CronJob, skipped (not counted as not ready)
+  Failed — always alerts
+  Unknown — always alerts
+  restarts >= 3 — always alerts (regardless of phase)
+
+These tests also keep future refactors from dropping the K8s lifecycle handling.
+"""
+from datetime import datetime, timedelta, timezone
+
+import pytest
+
+from src.services.heartbeat_report_service import (
+    HeartbeatReport,
+    HeartbeatReportService,
+    PodInfo,
+)
+
+
+def _make_report(pods: list[PodInfo]) -> HeartbeatReport:
+    """Build a minimal HeartbeatReport with only the pods field populated."""
+    return HeartbeatReport(
+        timestamp=datetime.now(timezone.utc),
+        pods=pods,
+    )
+
+
+def _start_time(minutes_ago: int) -> str:
+    """Return an ISO 8601 startTime (K8s format) for N minutes ago."""
+    dt = datetime.now(timezone.utc) - timedelta(minutes=minutes_ago)
+    return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+class TestPodStateMachine:
+    """Full state-machine coverage of the K8s pod phases."""
+
+    def setup_method(self):
+        self.svc = HeartbeatReportService()
+
+    # --- Succeeded / Completed: finished CronJob, not counted as not ready ---
+
+    def test_succeeded_pod_no_warning(self):
+        pod = PodInfo(name="drift-scanner-abc", ready=False, status="Succeeded")
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert not any("drift-scanner" in w for w in warnings), \
+            f"Succeeded pod 不該被當未就緒,實際 warnings: {warnings}"
+
+    def test_completed_pod_no_warning(self):
+        pod = PodInfo(name="job-xyz", ready=False, status="Completed")
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert not any("job-xyz" in w for w in warnings)
+
+    # --- Failed: always alerts ---
+
+    def test_failed_pod_warns(self):
+        pod = PodInfo(name="api-bad", ready=False, status="Failed")
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert any("api-bad" in w and "Failed" in w for w in warnings), \
+            f"Failed pod 必告警,實際 warnings: {warnings}"
+
+    # --- Unknown: always alerts ---
+
+    def test_unknown_pod_warns(self):
+        pod = PodInfo(name="api-unknown", ready=False, status="Unknown")
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert any("api-unknown" in w and "Unknown" in w for w in warnings)
+
+    # --- Pending: briefly OK, warn after >5min ---
+
+    def test_pending_short_no_warning(self):
+        # A Pending pod created 1 minute ago must not alert
+        pod = PodInfo(
+            name="api-starting", ready=False, status="Pending",
+            start_time=_start_time(minutes_ago=1),
+        )
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert not any("api-starting" in w for w in warnings), \
+            f"Pending <5min 不該告警,實際: {warnings}"
+
+    def test_pending_long_warns(self):
+        # Pending for 10 minutes → image pull / scheduling is stuck
+        pod = PodInfo(
+            name="api-stuck", ready=False, status="Pending",
+            start_time=_start_time(minutes_ago=10),
+        )
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert any("api-stuck" in w and "Pending" in w for w in warnings), \
+            f"Pending >5min 必告警,實際: {warnings}"
+
+    def test_pending_no_starttime_warns(self):
+        # Without a start_time, alert conservatively
+        pod = PodInfo(
+            name="api-no-time", ready=False, status="Pending",
+            start_time=None,
+        )
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert any("api-no-time" in w for w in warnings)
+
+    # --- Running ready=True: healthy, skipped ---
+
+    def test_running_ready_no_warning(self):
+        pod = PodInfo(
+            name="api-healthy", ready=True, status="Running",
+            start_time=_start_time(minutes_ago=60),
+        )
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert not any("api-healthy" in w for w in warnings), \
+            f"Running+Ready 不該告警,實際: {warnings}"
+
+    # --- Running ready=False: briefly OK, warn after >2min ---
+
+    def test_running_not_ready_short_no_warning(self):
+        # Running but not ready for 1 minute (just started, probe not passing yet)
+        pod = PodInfo(
+            name="api-warming", ready=False, status="Running",
+            start_time=_start_time(minutes_ago=1),
+        )
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert not any("api-warming" in w for w in warnings), \
+            f"Running NotReady <2min 不該告警,實際: {warnings}"
+
+    def test_running_not_ready_long_warns(self):
+        # Running but not ready for 5 minutes → readiness probe failing
+        pod = PodInfo(
+            name="api-stale", ready=False, status="Running",
+            start_time=_start_time(minutes_ago=5),
+        )
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert any("api-stale" in w and "NotReady" in w for w in warnings), \
+            f"Running NotReady >2min 必告警,實際: {warnings}"
+
+    # --- restarts >= 3: always alerts, regardless of phase ---
+
+    def test_high_restarts_warns_even_if_running(self):
+        pod = PodInfo(
+            name="api-flaky", ready=True, status="Running",
+            start_time=_start_time(minutes_ago=30), restarts=5,
+        )
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert any("api-flaky" in w and "重啟" in w for w in warnings)
+
+    def test_high_restarts_warns_for_succeeded_pod(self):
+        # Even a Succeeded pod should alert on excessive restarts (crash-looped before finishing)
+        pod = PodInfo(
+            name="job-crashy", ready=False, status="Succeeded", restarts=10,
+        )
+        warnings = self.svc._build_warnings(_make_report([pod]))
+        assert any("job-crashy" in w and "重啟" in w for w in warnings)
+
+
+class TestRegression:
+    """Regression: keep the daily report free of the same false positive repeating for 24h."""
+
+    def setup_method(self):
+        self.svc = HeartbeatReportService()
+
+    def test_3_drift_scanner_succeeded_pods_zero_warning(self):
+        """2026-05-02, confirmed by the commander's screenshot: 3 Succeeded drift-scanner pods raised
+        a daily "3 items need attention" false alarm. After the fix the same fixture yields 0 warnings."""
+        pods = [
+            PodInfo(name=f"drift-scanner-{i}-pvbst", ready=False, status="Succeeded")
+            for i in range(3)
+        ]
+        warnings = self.svc._build_warnings(_make_report(pods))
+        assert not any("drift-scanner" in w for w in warnings), \
+            f"Codex stash + 我的修法後 Succeeded CronJob pod 不該告警,實際: {warnings}"