diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index f39d91c7..193526d2 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,17 @@ +## 2026-07-01 — 08:54 CD host pressure interrupted / Postgres recovery classifier + +**照主線修正的問題**: +- Live Gitea queue 顯示 sustained-load automation commit 已到 main,但正式 deploy 未完成;CD `#4199` 在 tests stage 的 `wait-host-web-build-pressure.sh` 第 2 次等待後被 `signal: interrupt` 中斷,log 只留下 `k3s-postgres-recovery CPU cores 3.595200 > 2.0`,沒有跑到 fail-hard refusal 結尾。 +- `ops/runner/read-public-gitea-actions-queue.py` 新增 host pressure classifier 欄位:`host_web_build_pressure_interrupted`、`latest_visible_cd_host_pressure_interrupted`、`latest_visible_cd_postgres_recovery_cpu_pressure`、CPU cores / threshold 與對應 rollups。 +- 此補丁不打開 110 capacity gate、不把 pressure gate 改成 warn-only;只把模糊 `host_web_build_pressure_waiting` 轉成可被 AI Agent / PlayBook / RAG 使用的明確 blocker evidence。 + +**驗證**: +- `python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py -q` 通過(30 passed)。 +- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test PYTHONPATH=apps/api python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py apps/api/tests/test_awoooi_priority_work_order_readback_api.py -q` 通過(58 passed)。 +- `python3.11 -m py_compile ops/runner/read-public-gitea-actions-queue.py`、`python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`、`git diff --check` 通過。 + +**邊界**:只改 public Gitea queue readback parser / tests / LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未 SSH 寫入、未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;未恢復 legacy / generic runner label。 + ## 2026-07-01 — 08:37 Host sustained-load AI controlled automation **照主線修正的問題**: diff --git a/ops/runner/read-public-gitea-actions-queue.py b/ops/runner/read-public-gitea-actions-queue.py index 81b1eb8f..8ffa567f 100644 --- 
a/ops/runner/read-public-gitea-actions-queue.py +++ b/ops/runner/read-public-gitea-actions-queue.py @@ -82,9 +82,14 @@ _HOST_PRESSURE_ATTEMPT_RE = re.compile( _HOST_PRESSURE_LOAD_RE = re.compile( r"host load5/core (?P<load>[0-9.]+) > (?P<threshold>[0-9.]+)" ) +_HOST_PRESSURE_POSTGRES_RE = re.compile( + r"(?P<container>k3s-postgres-recovery) CPU cores " + r"(?P<cores>[0-9.]+) > (?P<threshold>[0-9.]+)" +) _HOST_PRESSURE_REFUSAL_RE = re.compile( r"refusing to start AWOOI image build while host web/build/smoke pressure is still active" ) +_HOST_PRESSURE_INTERRUPTED_RE = re.compile(r"signal:\s*interrupt") @dataclass(frozen=True) @@ -631,6 +636,18 @@ def build_readback( "latest_visible_cd_host_pressure_refused": effective_tests_log_classifier[ "host_pressure_refused" ], + "latest_visible_cd_host_pressure_interrupted": effective_tests_log_classifier[ + "host_pressure_interrupted" + ], + "latest_visible_cd_postgres_recovery_cpu_pressure": ( + effective_tests_log_classifier["postgres_recovery_cpu_pressure"] + ), + "latest_visible_cd_postgres_recovery_cpu_cores": effective_tests_log_classifier[ + "postgres_recovery_cpu_cores" + ], + "latest_visible_cd_postgres_recovery_cpu_threshold": ( + effective_tests_log_classifier["postgres_recovery_cpu_threshold"] + ), "latest_visible_cd_host_pressure_log_stale_or_mismatched": ( host_pressure_waiting_from_stale_jobs ), @@ -752,6 +769,12 @@ def build_readback( "current_main_cd_host_pressure_refused": effective_tests_log_classifier[ "host_pressure_refused" ], + "current_main_cd_host_pressure_interrupted": ( + effective_tests_log_classifier["host_pressure_interrupted"] + ), + "current_main_cd_postgres_recovery_cpu_pressure": ( + effective_tests_log_classifier["postgres_recovery_cpu_pressure"] + ), "current_main_cd_host_pressure_log_stale_or_mismatched": ( host_pressure_waiting_from_stale_jobs ), @@ -875,13 +898,24 @@ def classify_cd_tests_log(text: str) -> dict[str, Any]: latest_load = match.group("load") latest_threshold = match.group("threshold") + latest_postgres_cores = "" + 
latest_postgres_threshold = "" + postgres_recovery_cpu_pressure = False + for match in _HOST_PRESSURE_POSTGRES_RE.finditer(text): + postgres_recovery_cpu_pressure = True + latest_postgres_cores = match.group("cores") + latest_postgres_threshold = match.group("threshold") + host_pressure_refused = _HOST_PRESSURE_REFUSAL_RE.search(text) is not None + host_pressure_interrupted = _HOST_PRESSURE_INTERRUPTED_RE.search(text) is not None host_pressure_waiting = bool(attempt_numbers) and not host_pressure_refused host_pressure_blocked_or_waiting = host_pressure_waiting or host_pressure_refused return { "host_pressure_classifier": ( "host_web_build_pressure_refused" if host_pressure_refused + else "host_web_build_pressure_interrupted" + if host_pressure_interrupted else "host_web_build_pressure_waiting" if host_pressure_waiting else "" @@ -894,6 +928,10 @@ def classify_cd_tests_log(text: str) -> dict[str, Any]: "load5_per_core_threshold": latest_threshold, "host_pressure_waiting": host_pressure_waiting, "host_pressure_refused": host_pressure_refused, + "host_pressure_interrupted": host_pressure_interrupted, + "postgres_recovery_cpu_pressure": postgres_recovery_cpu_pressure, + "postgres_recovery_cpu_cores": latest_postgres_cores, + "postgres_recovery_cpu_threshold": latest_postgres_threshold, "host_pressure_blocked_or_waiting": host_pressure_blocked_or_waiting, } diff --git a/ops/runner/test_read_public_gitea_actions_queue.py b/ops/runner/test_read_public_gitea_actions_queue.py index f57b9f9c..26723f9f 100644 --- a/ops/runner/test_read_public_gitea_actions_queue.py +++ b/ops/runner/test_read_public_gitea_actions_queue.py @@ -360,6 +360,16 @@ def _host_pressure_refused_log() -> str: """ +def _host_pressure_interrupted_postgres_log() -> str: + return """ +2026-07-01T00:44:23.7335391Z ⏳ host web/build/smoke pressure detected (attempt 1/60); waiting 10s +2026-07-01T00:44:23.7371812Z k3s-postgres-recovery CPU cores 3.595200 > 2.0 +2026-07-01T00:44:33.8647990Z ⏳ host 
web/build/smoke pressure detected (attempt 2/60); waiting 10s +2026-07-01T00:44:33.8680334Z k3s-postgres-recovery CPU cores 3.595200 > 2.0 +2026-07-01T00:44:43.0684981Z signal: interrupt +""" + + def test_parse_visible_runs_extracts_no_matching_runner_label() -> None: module = _load_module() runs = module.parse_visible_runs(_actions_html()) @@ -896,6 +906,42 @@ def test_stale_waiting_host_pressure_log_does_not_hide_failed_cd_jobs_payload() ) +def test_interrupted_host_pressure_reports_postgres_recovery_cpu() -> None: + module = _load_module() + payload = module.build_readback( + actions_html=_actions_html_failed_cd_run(), + actions_list_http_status=401, + actions_list_payload={"message": "token is required"}, + cd_jobs_http_status=200, + cd_jobs_payload={"jobs": [], "total_count": 0}, + latest_cd_tests_log_http_status=200, + latest_cd_tests_log_text=_host_pressure_interrupted_postgres_log(), + ) + assert payload["status"] == "blocked_host_web_build_pressure" + assert payload["readback"]["latest_visible_cd_host_pressure_classifier"] == ( + "host_web_build_pressure_interrupted" + ) + assert payload["readback"]["latest_visible_cd_host_pressure_attempt_count"] == 2 + assert payload["readback"]["latest_visible_cd_host_pressure_interrupted"] is True + assert ( + payload["readback"]["latest_visible_cd_postgres_recovery_cpu_pressure"] + is True + ) + assert ( + payload["readback"]["latest_visible_cd_postgres_recovery_cpu_cores"] + == "3.595200" + ) + assert ( + payload["readback"]["latest_visible_cd_postgres_recovery_cpu_threshold"] + == "2.0" + ) + assert payload["rollups"]["current_main_cd_host_pressure_interrupted"] is True + assert ( + payload["rollups"]["current_main_cd_postgres_recovery_cpu_pressure"] + is True + ) + + def test_build_readback_flags_stale_cd_jobs_api_payload() -> None: module = _load_module() payload = module.build_readback(