fix(runner): classify interrupted host pressure source
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 43s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 0s
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Has been cancelled

This commit is contained in:
Your Name
2026-07-01 08:54:50 +08:00
parent c3d848ecff
commit 444de0b40c
3 changed files with 98 additions and 0 deletions

View File

@@ -1,3 +1,17 @@
## 2026-07-01 — 08:54 CD host pressure interrupted / Postgres recovery classifier
**照主線修正的問題**
- Live Gitea queue 顯示 sustained-load automation commit 已到 main,但正式 deploy 未完成;CD `#4199` 在 tests stage 的 `wait-host-web-build-pressure.sh` 第 2 次等待後被 `signal: interrupt` 中斷,log 只留下 `k3s-postgres-recovery CPU cores 3.595200 > 2.0`,沒有跑到 fail-hard refusal 結尾。
- `ops/runner/read-public-gitea-actions-queue.py` 新增 host pressure classifier 欄位:`host_web_build_pressure_interrupted`、`latest_visible_cd_host_pressure_interrupted`、`latest_visible_cd_postgres_recovery_cpu_pressure`、CPU cores / threshold 與對應 rollups。
- 此補丁不打開 110 capacity gate、不把 pressure gate 改成 warn-only;只把模糊 `host_web_build_pressure_waiting` 轉成可被 AI Agent / PlayBook / RAG 使用的明確 blocker evidence。
**驗證**
- `python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py -q` 通過(30 passed)。
- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test PYTHONPATH=apps/api python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py apps/api/tests/test_awoooi_priority_work_order_readback_api.py -q` 通過(58 passed)。
- `python3.11 -m py_compile ops/runner/read-public-gitea-actions-queue.py`、`python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`、`git diff --check` 通過。
**邊界**:只改 public Gitea queue readback parser / tests / LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未 SSH 寫入、未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;未恢復 legacy / generic runner label。
## 2026-07-01 — 08:37 Host sustained-load AI controlled automation
**照主線修正的問題**

View File

@@ -82,9 +82,14 @@ _HOST_PRESSURE_ATTEMPT_RE = re.compile(
# Matches a "host load5/core <load> > <threshold>" log line and captures both
# numbers as the named groups ``load`` and ``threshold`` (kept as strings).
_HOST_PRESSURE_LOAD_RE = re.compile(
    r"host load5/core (?P<load>[0-9.]+) > (?P<threshold>[0-9.]+)"
)

# Matches a "k3s-postgres-recovery CPU cores <cores> > <threshold>" log line.
# Named groups: ``container`` (the literal container name), ``cores`` and
# ``threshold`` (both kept as strings).
_HOST_PRESSURE_POSTGRES_RE = re.compile(
    r"(?P<container>k3s-postgres-recovery) CPU cores "
    r"(?P<cores>[0-9.]+) > (?P<threshold>[0-9.]+)"
)

# Matches the fixed refusal sentence naming the image build and the still
# active host web/build/smoke pressure.
_HOST_PRESSURE_REFUSAL_RE = re.compile(
    r"refusing to start AWOOI image build while host web/build/smoke pressure is still active"
)

# Matches "signal: interrupt" (any amount of whitespace after the colon)
# anywhere in the text; the pattern itself is not tied to a pressure line.
_HOST_PRESSURE_INTERRUPTED_RE = re.compile(r"signal:\s*interrupt")
@dataclass(frozen=True)
@@ -631,6 +636,18 @@ def build_readback(
"latest_visible_cd_host_pressure_refused": effective_tests_log_classifier[
"host_pressure_refused"
],
"latest_visible_cd_host_pressure_interrupted": effective_tests_log_classifier[
"host_pressure_interrupted"
],
"latest_visible_cd_postgres_recovery_cpu_pressure": (
effective_tests_log_classifier["postgres_recovery_cpu_pressure"]
),
"latest_visible_cd_postgres_recovery_cpu_cores": effective_tests_log_classifier[
"postgres_recovery_cpu_cores"
],
"latest_visible_cd_postgres_recovery_cpu_threshold": (
effective_tests_log_classifier["postgres_recovery_cpu_threshold"]
),
"latest_visible_cd_host_pressure_log_stale_or_mismatched": (
host_pressure_waiting_from_stale_jobs
),
@@ -752,6 +769,12 @@ def build_readback(
"current_main_cd_host_pressure_refused": effective_tests_log_classifier[
"host_pressure_refused"
],
"current_main_cd_host_pressure_interrupted": (
effective_tests_log_classifier["host_pressure_interrupted"]
),
"current_main_cd_postgres_recovery_cpu_pressure": (
effective_tests_log_classifier["postgres_recovery_cpu_pressure"]
),
"current_main_cd_host_pressure_log_stale_or_mismatched": (
host_pressure_waiting_from_stale_jobs
),
@@ -875,13 +898,24 @@ def classify_cd_tests_log(text: str) -> dict[str, Any]:
latest_load = match.group("load")
latest_threshold = match.group("threshold")
latest_postgres_cores = ""
latest_postgres_threshold = ""
postgres_recovery_cpu_pressure = False
for match in _HOST_PRESSURE_POSTGRES_RE.finditer(text):
postgres_recovery_cpu_pressure = True
latest_postgres_cores = match.group("cores")
latest_postgres_threshold = match.group("threshold")
host_pressure_refused = _HOST_PRESSURE_REFUSAL_RE.search(text) is not None
host_pressure_interrupted = _HOST_PRESSURE_INTERRUPTED_RE.search(text) is not None
host_pressure_waiting = bool(attempt_numbers) and not host_pressure_refused
host_pressure_blocked_or_waiting = host_pressure_waiting or host_pressure_refused
return {
"host_pressure_classifier": (
"host_web_build_pressure_refused"
if host_pressure_refused
else "host_web_build_pressure_interrupted"
if host_pressure_interrupted
else "host_web_build_pressure_waiting"
if host_pressure_waiting
else ""
@@ -894,6 +928,10 @@ def classify_cd_tests_log(text: str) -> dict[str, Any]:
"load5_per_core_threshold": latest_threshold,
"host_pressure_waiting": host_pressure_waiting,
"host_pressure_refused": host_pressure_refused,
"host_pressure_interrupted": host_pressure_interrupted,
"postgres_recovery_cpu_pressure": postgres_recovery_cpu_pressure,
"postgres_recovery_cpu_cores": latest_postgres_cores,
"postgres_recovery_cpu_threshold": latest_postgres_threshold,
"host_pressure_blocked_or_waiting": host_pressure_blocked_or_waiting,
}

View File

@@ -360,6 +360,16 @@ def _host_pressure_refused_log() -> str:
"""
def _host_pressure_interrupted_postgres_log() -> str:
    """Return a log fixture for an interrupted host-pressure wait.

    The text holds two "pressure detected" attempts (1/60 and 2/60), each
    followed by a ``k3s-postgres-recovery CPU cores 3.595200 > 2.0`` line,
    and ends with ``signal: interrupt``. It contains no refusal sentence.
    """
    # The log lines stay at column 0 inside the literal so the returned text
    # is exactly the captured log, with a leading and a trailing newline.
    return """
2026-07-01T00:44:23.7335391Z ⏳ host web/build/smoke pressure detected (attempt 1/60); waiting 10s
2026-07-01T00:44:23.7371812Z k3s-postgres-recovery CPU cores 3.595200 > 2.0
2026-07-01T00:44:33.8647990Z ⏳ host web/build/smoke pressure detected (attempt 2/60); waiting 10s
2026-07-01T00:44:33.8680334Z k3s-postgres-recovery CPU cores 3.595200 > 2.0
2026-07-01T00:44:43.0684981Z signal: interrupt
"""
def test_parse_visible_runs_extracts_no_matching_runner_label() -> None:
module = _load_module()
runs = module.parse_visible_runs(_actions_html())
@@ -896,6 +906,42 @@ def test_stale_waiting_host_pressure_log_does_not_hide_failed_cd_jobs_payload()
)
def test_interrupted_host_pressure_reports_postgres_recovery_cpu() -> None:
    """Check the readback for a tests log that was interrupted mid-wait.

    Feeds the interrupted host-pressure log fixture into ``build_readback``
    and checks the status, the interrupted classifier, the attempt count,
    the Postgres recovery CPU fields, and the matching rollups.
    """
    queue_reader = _load_module()
    payload = queue_reader.build_readback(
        actions_html=_actions_html_failed_cd_run(),
        actions_list_http_status=401,
        actions_list_payload={"message": "token is required"},
        cd_jobs_http_status=200,
        cd_jobs_payload={"jobs": [], "total_count": 0},
        latest_cd_tests_log_http_status=200,
        latest_cd_tests_log_text=_host_pressure_interrupted_postgres_log(),
    )

    assert payload["status"] == "blocked_host_web_build_pressure"

    # Per-run readback fields derived from the tests-stage log.
    readback = payload["readback"]
    classifier = readback["latest_visible_cd_host_pressure_classifier"]
    assert classifier == "host_web_build_pressure_interrupted"
    assert readback["latest_visible_cd_host_pressure_attempt_count"] == 2
    assert readback["latest_visible_cd_host_pressure_interrupted"] is True
    assert readback["latest_visible_cd_postgres_recovery_cpu_pressure"] is True
    assert readback["latest_visible_cd_postgres_recovery_cpu_cores"] == "3.595200"
    assert readback["latest_visible_cd_postgres_recovery_cpu_threshold"] == "2.0"

    # Rollup fields must report the same evidence.
    rollups = payload["rollups"]
    assert rollups["current_main_cd_host_pressure_interrupted"] is True
    assert rollups["current_main_cd_postgres_recovery_cpu_pressure"] is True
def test_build_readback_flags_stale_cd_jobs_api_payload() -> None:
module = _load_module()
payload = module.build_readback(