diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 61db5502..16643f6b 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -51922,6 +51922,7 @@ production browser smoke: - 110 live pressure 重新讀回:`load5` 曾回到約 `8.91`、`awoooi_host_load5_per_core=0.8075`,Gitea 即時 `docker stats` 一度 `218.56%`,但既有 `HostLoadAverageSustainedHigh` 門檻是 `load5/core > 1.5 for 15m`,`DockerContainerCpuSustainedHigh` 也是 `>2 core for 10m` pending;因此先前沒有 CPU firing / Telegram 並不是沒有監控,而是門檻太晚且 auto-repair action 指到未部署路徑。 - 已部署 `/home/wooo/scripts/host-sustained-load-controller.py`、`host-sustained-load-evidence.py`、`host-runaway-process-remediation.py` 到 110,備份 suffix `before-host-pressure-controller-20260701-232314`;controller live readback 可執行,且不讀 secret / raw session / runner registration。 - `ops/monitoring/alerts-unified.yml` 新增 `Host110SustainedModeratePressure`:`load5/core > 0.75` 或 Gitea / StockPlatform 關鍵容器 CPU `>2.0 core` 持續 1 分鐘即 warning,auto-repair action 指向 110 實際 controller 路徑。 +- `host-sustained-load-controller.py` 補 `--container-cpu-threshold`,當 Gitea / StockPlatform 關鍵容器貼著 CPU quota 超過門檻時,即使 `load5/core` 尚未達 critical,也會產生 source-specific playbook packet;未超門檻時只回 observing,不亂殺、不重啟。 - 將 Gitea container runtime CPU quota 從 `3` core 收斂到 `2` core:`docker update --cpus=2 gitea`;rollback 為 `docker update --cpus=3 gitea`。post-check:`nanocpus=2000000000`、memory 仍 `3GiB`、Gitea API `/api/v1/version` 回 `1.25.5`,無容器重啟。 - 修正備份噪音:`BackupAggregateRunFailed` 不再因 `backup_all` 舊 aggregate failed_count firing,改成只看 component job failed count;live `backup-status.sh --no-notify` 已回 `每日備份心跳正常`、`component_failed=0`、`core_blockers=0`、`escrow_missing=0`。 - Alertmanager / webhook readback:Alertmanager 仍有 5 個非 CPU active warning;路由預設 `awoooi-webhook`,`telegram-direct` 只給 alert-chain 自身異常。110 到 VIP / 120 / 121 `/api/v1/webhooks/alertmanager` synthetic no-secret smoke 均 HTTP 200,回 `告警已排入背景分析`;`/api/v1/telegram/health` 回 `configured`。 @@ -51929,6 +51930,7 @@ production browser smoke: **live readback**: - Prometheus rule readback:`Host110SustainedModeratePressure=inactive`、`DockerContainerCpuSustainedHigh=inactive`、`BackupAggregateRunFailed=inactive`。 - Node exporter readback:`awoooi_host_load5_per_core{host="110"} 0.536667`、`node_load5 6.52`、`docker_container_cpu_cores{container_name="gitea",host="110"} 1.4917`、`docker_container_cpu_limit_cores{container_name="gitea",host="110"} 2`。 +- 第二輪 controller readback:`load5_per_core=0.473333`、Gitea `1.7221` core、`container_cpu_threshold=2.0`,classification `observing_load_within_threshold`;Prometheus 短暫 pending 由上一輪 Gitea >2 core 樣本造成,live controller 未誤執行。 - Alertmanager active alerts after fix:`DockerContainerMissingResourceLimit` on 188、`HostDiskUsageHigh` on 110/188、`HostOutOfDiskSpace` on 110/188;CPU / backup aggregate alert no longer firing. - Full-stack cold-start after fix:`PASS=96 WARN=0 BLOCKED=0`,Result `GREEN`;110 registry / Gitea / Harbor / Prometheus / Alertmanager OK,runner fail-closed OK,110 docker/systemd/storage/backup textfiles fresh,public routes expected 2xx/3xx,backup aggregate failed_count 僅列 INFO、不再形成 blocker。 diff --git a/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json b/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json index a1a6a34b..c98cdac4 100644 --- a/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json +++ b/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json @@ -92,6 +92,11 @@ "change": "add Host110SustainedModeratePressure and point sustained-load auto_repair_action to the deployed /home/wooo/scripts controller path", "evidence": "Prometheus rule readback shows Host110SustainedModeratePressure loaded; after quota apply it is inactive because load5/core and Gitea CPU are below threshold" }, + { + "path": "scripts/ops/host-sustained-load-controller.py", + "change": "route Gitea / StockPlatform container CPU quota pressure to source-specific playbook packets even when host load is below the critical load5/core threshold", + "evidence": "live controller readback after Gitea dropped below threshold returns observing; test fixture with Gitea 2.08 cores returns blocked_gitea_queue_or_hook_backlog_requires_playbook" + }, { "path": "scripts/ops/backup-alert-label-contract-check.py", "change": "make BackupAggregateRunFailed ignore aggregate-only backup_all noise and require component job failed-count evidence", @@ -140,6 +145,8 @@ "gitea_container_cpu_cores_before_quota": "2.1856", "gitea_container_cpu_cores_after_quota_textfile": "1.4917", "gitea_container_cpu_limit_cores": "2", + "controller_container_cpu_threshold": "2.0", + "controller_latest_classification": "observing_load_within_threshold", "ssh_control_path": "available", "alert_rules": "Host110SustainedModeratePressure loaded; DockerContainerCpuSustainedHigh inactive; BackupAggregateRunFailed inactive", "alert_chain": "110 to VIP/120/121 /api/v1/webhooks/alertmanager synthetic no-secret smoke returned HTTP 200", diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 87f5380f..c3f348a2 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -324,7 +324,7 @@ groups: annotations: summary: "110 sustained pressure needs triage" description: "110 load5/core > 0.75 或 Gitea / StockPlatform 關鍵容器 CPU > 2.0 core 持續 1 分鐘;這是 critical 之前的主動偵測,避免等到 load5/core > 1.5 才反應。" - auto_repair_action: "ssh 192.168.0.110 '/home/wooo/scripts/host-sustained-load-controller.py --host 110 --load5-per-core-threshold 0.75 --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom --json'" + auto_repair_action: "ssh 192.168.0.110 '/home/wooo/scripts/host-sustained-load-controller.py --host 110 --load5-per-core-threshold 0.75 --container-cpu-threshold 2.0 --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom --json'" runbook: "controller 只產生 controlled packet,不讀 secret、不重啟服務。若分類為 gitea_queue_or_hook_backlog,先跑 host-sustained-load-evidence.py 取得脫敏 top family / container,再選 Gitea queue/hook backlog playbook;若是 orphan browser 才允許 gated SIGTERM;若是 StockPlatform postgres/API,轉 Stock hot-query/source freshness playbook。禁止 Docker / systemd / Nginx / DB restart、reboot、firewall。" - alert: HostCiRunnerLoadSaturation diff --git a/scripts/ops/host-sustained-load-controller.py b/scripts/ops/host-sustained-load-controller.py index ce13db51..6c2e751d 100755 --- a/scripts/ops/host-sustained-load-controller.py +++ b/scripts/ops/host-sustained-load-controller.py @@ -49,6 +49,7 @@ def parse_args() -> argparse.Namespace: default=DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS, ) parser.add_argument("--load5-per-core-threshold", type=float, default=1.5) + parser.add_argument("--container-cpu-threshold", type=float, default=2.0) parser.add_argument("--ci-stale-age-seconds", type=int, default=1800) parser.add_argument("--json", action="store_true", help="Print JSON only.") return parser.parse_args() @@ -217,6 +218,7 @@ def build_packet( docker_samples: list[dict[str, Any]], docker_stats_status: dict[str, Any], load5_per_core_threshold: float, + container_cpu_threshold: float, ci_stale_age_seconds: int, ) -> dict[str, Any]: monitor_up = int( @@ -327,19 +329,27 @@ def build_packet( if controlled_apply_allowed else "keep_pressure_gate_fail_closed_until_ci_load_clears" ) - elif ( - load5_per_core > load5_per_core_threshold - and top_container_name == "gitea" - and top_container_cpu >= 2.0 - ): + elif top_container_name == "gitea" and top_container_cpu >= container_cpu_threshold: classification = "blocked_gitea_queue_or_hook_backlog_requires_playbook" - severity = "critical" + severity = "critical" if load5_per_core > load5_per_core_threshold else "warning" dry_run_command = ( "scripts/ops/host-sustained-load-evidence.py " f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} " f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json" ) next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode" + elif ( + top_container_name in {"stockplatform-v2-postgres-1", "stockplatform-v2-api-1"} + and top_container_cpu >= container_cpu_threshold + ): + classification = "blocked_stockplatform_hot_query_or_api_pressure_requires_playbook" + severity = "critical" if load5_per_core > load5_per_core_threshold else "warning" + dry_run_command = ( + "scripts/ops/host-sustained-load-evidence.py " + f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} " + f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json" + ) + next_action = "run_stockplatform_hot_query_or_api_pressure_playbook_check_mode" elif load5_per_core > load5_per_core_threshold and swap_used_ratio >= 0.85: classification = "blocked_memory_or_swap_pressure_requires_service_playbook" severity = "critical" @@ -367,6 +377,7 @@ def build_packet( "monitor_up": monitor_up, "load5_per_core": round(load5_per_core, 6), "load5_per_core_threshold": load5_per_core_threshold, + "container_cpu_threshold": container_cpu_threshold, "swap_used_ratio": round(swap_used_ratio, 6), "remediation_authorized": remediation_authorized, "active_ci_container_count": active_ci_containers, @@ -430,6 +441,7 @@ def main() -> int: max_age_seconds=args.docker_stats_max_age_seconds, ), load5_per_core_threshold=args.load5_per_core_threshold, + container_cpu_threshold=args.container_cpu_threshold, ci_stale_age_seconds=args.ci_stale_age_seconds, ) if args.json: diff --git a/scripts/ops/tests/test_host_pressure_alert_contract.py b/scripts/ops/tests/test_host_pressure_alert_contract.py index a67d923e..ad895d82 100644 --- a/scripts/ops/tests/test_host_pressure_alert_contract.py +++ b/scripts/ops/tests/test_host_pressure_alert_contract.py @@ -36,6 +36,7 @@ def test_110_moderate_pressure_alert_routes_to_live_controller() -> None: assert rule["labels"]["auto_repair"] == "true" assert "/home/wooo/scripts/host-sustained-load-controller.py" in action assert "--load5-per-core-threshold 0.75" in action + assert "--container-cpu-threshold 2.0" in action assert "不讀 secret" in annotations["runbook"] assert "禁止 Docker / systemd / Nginx / DB restart" in annotations["runbook"] diff --git a/scripts/ops/tests/test_host_runaway_process_exporter.py b/scripts/ops/tests/test_host_runaway_process_exporter.py index d977ffa6..b72d1675 100644 --- a/scripts/ops/tests/test_host_runaway_process_exporter.py +++ b/scripts/ops/tests/test_host_runaway_process_exporter.py @@ -425,6 +425,62 @@ def test_sustained_load_controller_routes_gitea_backlog_from_docker_metrics(tmp_ assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"] +def test_sustained_load_controller_routes_gitea_quota_pressure_even_when_load_is_moderate( + tmp_path: Path, +) -> None: + metrics_file = tmp_path / "host.prom" + metrics_file.write_text( + "\n".join( + [ + 'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1', + 'awoooi_host_load5_per_core{host="110"} 0.55', + 'awoooi_host_swap_used_ratio{host="110"} 0.1', + 'awoooi_host_runaway_process_remediation_authorized{host="110"} 0', + 'awoooi_host_gitea_actions_active_container_count{host="110"} 0', + 'awoooi_host_gitea_actions_active_process_group_count{host="110"} 0', + 'awoooi_host_runaway_browser_orphan_group_count{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 0', + ] + ), + encoding="utf-8", + ) + docker_file = tmp_path / "docker.prom" + docker_file.write_text( + "\n".join( + [ + 'docker_container_cpu_cores{host="110",container_name="gitea"} 2.08', + 'docker_container_cpu_cores{host="110",container_name="redis"} 0.2', + ] + ), + encoding="utf-8", + ) + + result = subprocess.run( + [ + sys.executable, + str(CONTROLLER_PATH), + "--host", + "110", + "--load5-per-core-threshold", + "0.75", + "--metrics-file", + str(metrics_file), + "--docker-stats-file", + str(docker_file), + "--json", + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 75 + payload = json.loads(result.stdout) + assert payload["classification"] == "blocked_gitea_queue_or_hook_backlog_requires_playbook" + assert payload["severity"] == "warning" + assert payload["readback"]["container_cpu_threshold"] == 2.0 + assert payload["readback"]["top_container_cpu"]["cpu_cores"] == 2.08 + assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"] + + def test_sustained_load_controller_ignores_stale_docker_stats_attribution(tmp_path: Path) -> None: metrics_file = tmp_path / "host.prom" metrics_file.write_text(