fix(runner): classify interrupted host pressure source
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 43s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 0s
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 43s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 0s
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Has been cancelled
This commit is contained in:
@@ -1,3 +1,17 @@
|
||||
## 2026-07-01 — 08:54 CD host pressure interrupted / Postgres recovery classifier
|
||||
|
||||
**照主線修正的問題**:
|
||||
- Live Gitea queue 顯示 sustained-load automation commit 已到 main,但正式 deploy 未完成;CD `#4199` 在 tests stage 的 `wait-host-web-build-pressure.sh` 第 2 次等待後被 `signal: interrupt` 中斷,log 只留下 `k3s-postgres-recovery CPU cores 3.595200 > 2.0`,沒有跑到 fail-hard refusal 結尾。
|
||||
- `ops/runner/read-public-gitea-actions-queue.py` 新增 host pressure classifier 欄位:`host_web_build_pressure_interrupted`、`latest_visible_cd_host_pressure_interrupted`、`latest_visible_cd_postgres_recovery_cpu_pressure`、CPU cores / threshold 與對應 rollups。
|
||||
- 此補丁不打開 110 capacity gate、不把 pressure gate 改成 warn-only;只把模糊 `host_web_build_pressure_waiting` 轉成可被 AI Agent / PlayBook / RAG 使用的明確 blocker evidence。
|
||||
|
||||
**驗證**:
|
||||
- `python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py -q` 通過(30 passed)。
|
||||
- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test PYTHONPATH=apps/api python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py apps/api/tests/test_awoooi_priority_work_order_readback_api.py -q` 通過(58 passed)。
|
||||
- `python3.11 -m py_compile ops/runner/read-public-gitea-actions-queue.py`、`python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`、`git diff --check` 通過。
|
||||
|
||||
**邊界**:只改 public Gitea queue readback parser / tests / LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未 SSH 寫入、未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;未恢復 legacy / generic runner label。
|
||||
|
||||
## 2026-07-01 — 08:37 Host sustained-load AI controlled automation
|
||||
|
||||
**照主線修正的問題**:
|
||||
|
||||
@@ -82,9 +82,14 @@ _HOST_PRESSURE_ATTEMPT_RE = re.compile(
|
||||
# Host load line printed by the pressure gate; captures the observed
# load5-per-core value and the threshold it exceeded.
_HOST_PRESSURE_LOAD_RE = re.compile(
    r"host load5/core (?P<load>[0-9.]+) > (?P<threshold>[0-9.]+)"
)
# Per-container CPU line for the Postgres recovery container, e.g.
# "k3s-postgres-recovery CPU cores 3.595200 > 2.0"; captures the container
# name, the observed CPU cores and the threshold as strings.
_HOST_PRESSURE_POSTGRES_RE = re.compile(
    r"(?P<container>k3s-postgres-recovery) CPU cores "
    r"(?P<cores>[0-9.]+) > (?P<threshold>[0-9.]+)"
)
# Final refusal message; its presence means the gate ran to its fail-hard end.
_HOST_PRESSURE_REFUSAL_RE = re.compile(
    r"refusing to start AWOOI image build while host web/build/smoke pressure is still active"
)
# Marker left in the log when the job was interrupted by a signal before the
# refusal message could be printed.
_HOST_PRESSURE_INTERRUPTED_RE = re.compile(r"signal:\s*interrupt")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -631,6 +636,18 @@ def build_readback(
|
||||
"latest_visible_cd_host_pressure_refused": effective_tests_log_classifier[
|
||||
"host_pressure_refused"
|
||||
],
|
||||
"latest_visible_cd_host_pressure_interrupted": effective_tests_log_classifier[
|
||||
"host_pressure_interrupted"
|
||||
],
|
||||
"latest_visible_cd_postgres_recovery_cpu_pressure": (
|
||||
effective_tests_log_classifier["postgres_recovery_cpu_pressure"]
|
||||
),
|
||||
"latest_visible_cd_postgres_recovery_cpu_cores": effective_tests_log_classifier[
|
||||
"postgres_recovery_cpu_cores"
|
||||
],
|
||||
"latest_visible_cd_postgres_recovery_cpu_threshold": (
|
||||
effective_tests_log_classifier["postgres_recovery_cpu_threshold"]
|
||||
),
|
||||
"latest_visible_cd_host_pressure_log_stale_or_mismatched": (
|
||||
host_pressure_waiting_from_stale_jobs
|
||||
),
|
||||
@@ -752,6 +769,12 @@ def build_readback(
|
||||
"current_main_cd_host_pressure_refused": effective_tests_log_classifier[
|
||||
"host_pressure_refused"
|
||||
],
|
||||
"current_main_cd_host_pressure_interrupted": (
|
||||
effective_tests_log_classifier["host_pressure_interrupted"]
|
||||
),
|
||||
"current_main_cd_postgres_recovery_cpu_pressure": (
|
||||
effective_tests_log_classifier["postgres_recovery_cpu_pressure"]
|
||||
),
|
||||
"current_main_cd_host_pressure_log_stale_or_mismatched": (
|
||||
host_pressure_waiting_from_stale_jobs
|
||||
),
|
||||
@@ -875,13 +898,24 @@ def classify_cd_tests_log(text: str) -> dict[str, Any]:
|
||||
latest_load = match.group("load")
|
||||
latest_threshold = match.group("threshold")
|
||||
|
||||
latest_postgres_cores = ""
|
||||
latest_postgres_threshold = ""
|
||||
postgres_recovery_cpu_pressure = False
|
||||
for match in _HOST_PRESSURE_POSTGRES_RE.finditer(text):
|
||||
postgres_recovery_cpu_pressure = True
|
||||
latest_postgres_cores = match.group("cores")
|
||||
latest_postgres_threshold = match.group("threshold")
|
||||
|
||||
host_pressure_refused = _HOST_PRESSURE_REFUSAL_RE.search(text) is not None
|
||||
host_pressure_interrupted = _HOST_PRESSURE_INTERRUPTED_RE.search(text) is not None
|
||||
host_pressure_waiting = bool(attempt_numbers) and not host_pressure_refused
|
||||
host_pressure_blocked_or_waiting = host_pressure_waiting or host_pressure_refused
|
||||
return {
|
||||
"host_pressure_classifier": (
|
||||
"host_web_build_pressure_refused"
|
||||
if host_pressure_refused
|
||||
else "host_web_build_pressure_interrupted"
|
||||
if host_pressure_interrupted
|
||||
else "host_web_build_pressure_waiting"
|
||||
if host_pressure_waiting
|
||||
else ""
|
||||
@@ -894,6 +928,10 @@ def classify_cd_tests_log(text: str) -> dict[str, Any]:
|
||||
"load5_per_core_threshold": latest_threshold,
|
||||
"host_pressure_waiting": host_pressure_waiting,
|
||||
"host_pressure_refused": host_pressure_refused,
|
||||
"host_pressure_interrupted": host_pressure_interrupted,
|
||||
"postgres_recovery_cpu_pressure": postgres_recovery_cpu_pressure,
|
||||
"postgres_recovery_cpu_cores": latest_postgres_cores,
|
||||
"postgres_recovery_cpu_threshold": latest_postgres_threshold,
|
||||
"host_pressure_blocked_or_waiting": host_pressure_blocked_or_waiting,
|
||||
}
|
||||
|
||||
|
||||
@@ -360,6 +360,16 @@ def _host_pressure_refused_log() -> str:
|
||||
"""
|
||||
|
||||
|
||||
def _host_pressure_interrupted_postgres_log() -> str:
    """Return a CD tests-log fixture that is interrupted while waiting.

    The fixture contains two host-pressure wait attempts, each followed by a
    ``k3s-postgres-recovery`` CPU reading above its threshold, and ends with
    ``signal: interrupt`` rather than the refusal message.
    """
    return """
2026-07-01T00:44:23.7335391Z ⏳ host web/build/smoke pressure detected (attempt 1/60); waiting 10s
2026-07-01T00:44:23.7371812Z k3s-postgres-recovery CPU cores 3.595200 > 2.0
2026-07-01T00:44:33.8647990Z ⏳ host web/build/smoke pressure detected (attempt 2/60); waiting 10s
2026-07-01T00:44:33.8680334Z k3s-postgres-recovery CPU cores 3.595200 > 2.0
2026-07-01T00:44:43.0684981Z signal: interrupt
"""
|
||||
|
||||
|
||||
def test_parse_visible_runs_extracts_no_matching_runner_label() -> None:
|
||||
module = _load_module()
|
||||
runs = module.parse_visible_runs(_actions_html())
|
||||
@@ -896,6 +906,42 @@ def test_stale_waiting_host_pressure_log_does_not_hide_failed_cd_jobs_payload()
|
||||
)
|
||||
|
||||
|
||||
def test_interrupted_host_pressure_reports_postgres_recovery_cpu() -> None:
    """Check that an interrupted pressure wait is classified with Postgres CPU evidence.

    Feeds ``build_readback`` a tests log that stops at ``signal: interrupt``
    after two wait attempts and asserts the interrupted classifier, the
    attempt count, and the Postgres recovery CPU fields in both the
    ``readback`` and ``rollups`` sections.
    """
    module = _load_module()
    payload = module.build_readback(
        actions_html=_actions_html_failed_cd_run(),
        actions_list_http_status=401,
        actions_list_payload={"message": "token is required"},
        cd_jobs_http_status=200,
        cd_jobs_payload={"jobs": [], "total_count": 0},
        latest_cd_tests_log_http_status=200,
        latest_cd_tests_log_text=_host_pressure_interrupted_postgres_log(),
    )

    # The overall status stays a host-pressure block even though the log was
    # cut short by the interrupt.
    assert payload["status"] == "blocked_host_web_build_pressure"
    assert payload["readback"]["latest_visible_cd_host_pressure_classifier"] == (
        "host_web_build_pressure_interrupted"
    )
    assert payload["readback"]["latest_visible_cd_host_pressure_attempt_count"] == 2
    assert payload["readback"]["latest_visible_cd_host_pressure_interrupted"] is True
    assert (
        payload["readback"]["latest_visible_cd_postgres_recovery_cpu_pressure"]
        is True
    )
    # CPU cores and threshold are reported as the raw strings from the log.
    assert (
        payload["readback"]["latest_visible_cd_postgres_recovery_cpu_cores"]
        == "3.595200"
    )
    assert (
        payload["readback"]["latest_visible_cd_postgres_recovery_cpu_threshold"]
        == "2.0"
    )
    assert payload["rollups"]["current_main_cd_host_pressure_interrupted"] is True
    assert (
        payload["rollups"]["current_main_cd_postgres_recovery_cpu_pressure"]
        is True
    )
|
||||
|
||||
|
||||
def test_build_readback_flags_stale_cd_jobs_api_payload() -> None:
|
||||
module = _load_module()
|
||||
payload = module.build_readback(
|
||||
|
||||
Reference in New Issue
Block a user