diff --git a/apps/api/migrations/automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql b/apps/api/migrations/automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql new file mode 100644 index 00000000..8dda5bcb --- /dev/null +++ b/apps/api/migrations/automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql @@ -0,0 +1,16 @@ +-- P0 post-reboot CPU pressure: bound truth-chain automation_operation_log lookups. +-- Run outside an explicit transaction because CREATE INDEX CONCURRENTLY requires it. + +CREATE EXTENSION IF NOT EXISTS pg_trgm; + +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_incident_text +ON automation_operation_log ((incident_id::text)) +WHERE incident_id IS NOT NULL; + +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_input_text_trgm +ON automation_operation_log +USING gin ((coalesce(input::text, '')) gin_trgm_ops); + +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_output_text_trgm +ON automation_operation_log +USING gin ((coalesce(output::text, '')) gin_trgm_ops); diff --git a/apps/api/migrations/automation_operation_log_truth_chain_hot_path_indexes_2026-07-01_down.sql b/apps/api/migrations/automation_operation_log_truth_chain_hot_path_indexes_2026-07-01_down.sql new file mode 100644 index 00000000..26cfef13 --- /dev/null +++ b/apps/api/migrations/automation_operation_log_truth_chain_hot_path_indexes_2026-07-01_down.sql @@ -0,0 +1,6 @@ +-- Roll back only indexes owned by the P0 automation_operation_log truth-chain migration. +-- Do not drop pg_trgm; the extension may be shared by other indexes. + +DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_output_text_trgm; +DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_input_text_trgm; +DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_incident_text; diff --git a/apps/api/tests/test_awooop_conversation_event_hot_path_indexes.py b/apps/api/tests/test_awooop_conversation_event_hot_path_indexes.py index 9b745384..adb7ada0 100644 --- a/apps/api/tests/test_awooop_conversation_event_hot_path_indexes.py +++ b/apps/api/tests/test_awooop_conversation_event_hot_path_indexes.py @@ -5,6 +5,8 @@ MIGRATION = "awooop_conversation_event_hot_path_indexes_2026-07-01.sql" ROLLBACK = "awooop_conversation_event_hot_path_indexes_2026-07-01_down.sql" CONTENT_PREVIEW_MIGRATION = "awooop_conversation_event_content_preview_trgm_2026-07-01.sql" CONTENT_PREVIEW_ROLLBACK = "awooop_conversation_event_content_preview_trgm_2026-07-01_down.sql" +AUTOMATION_LOG_MIGRATION = "automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql" +AUTOMATION_LOG_ROLLBACK = "automation_operation_log_truth_chain_hot_path_indexes_2026-07-01_down.sql" def _read(path: str) -> str: @@ -27,6 +29,14 @@ def _content_preview_rollback_sql() -> str: return _read(f"migrations/{CONTENT_PREVIEW_ROLLBACK}") +def _automation_log_migration_sql() -> str: + return _read(f"migrations/{AUTOMATION_LOG_MIGRATION}") + + +def _automation_log_rollback_sql() -> str: + return _read(f"migrations/{AUTOMATION_LOG_ROLLBACK}") + + def test_hot_path_indexes_are_concurrent_and_transactionless() -> None: sql = _migration_sql() @@ -125,3 +135,34 @@ def test_content_preview_trgm_index_bounds_truth_chain_preview_fallback() -> Non assert "DROP INDEX CONCURRENTLY IF EXISTS idx_awooop_conv_event_content_preview_trgm;" in rollback assert "DROP INDEX CONCURRENTLY IF EXISTS idx_awooop_conv_event_project_content_preview_trgm;" in rollback assert "DROP EXTENSION" not in rollback + + +def test_automation_operation_log_indexes_bound_truth_chain_lookups() -> None: + sql = _automation_log_migration_sql() + rollback = _automation_log_rollback_sql() + truth_chain_source = _read("src/services/awooop_truth_chain_service.py") + + assert "BEGIN;" not in sql + assert "COMMIT;" not in sql + assert "CREATE EXTENSION IF NOT EXISTS pg_trgm" in sql + assert "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_incident_text" in sql + assert "ON automation_operation_log ((incident_id::text))" in sql + assert "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_input_text_trgm" in sql + assert "coalesce(input::text, '')" in sql + assert "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_output_text_trgm" in sql + assert "coalesce(output::text, '')" in sql + assert "gin_trgm_ops" in sql + + assert "FROM automation_operation_log" in truth_chain_source + assert "incident_id::text" in truth_chain_source + assert "coalesce(input::text, '') LIKE" in truth_chain_source + assert "coalesce(output::text, '') LIKE" in truth_chain_source + assert "coalesce(array_to_string(tags, ','), '') LIKE" in truth_chain_source + + for index_name in ( + "idx_automation_operation_log_output_text_trgm", + "idx_automation_operation_log_input_text_trgm", + "idx_automation_operation_log_incident_text", + ): + assert f"DROP INDEX CONCURRENTLY IF EXISTS {index_name};" in rollback + assert "DROP EXTENSION" not in rollback diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 193526d2..f903c92c 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -51096,6 +51096,37 @@ production browser smoke: **仍維持**: - 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有使用 GitHub / gh / GitHub API;沒有 runtime write。 +## 2026-07-01 — 08:59 P0 188 CPU 降回可控區 / 110 控制面仍 blocked + +**完成內容**: +- 188 抓到 `awoooi-cd-7391-1-api-tests` 又由 non-110 runner 重生,短採樣時吃約 `103.65%` 並持續打 production PostgreSQL;已再次 `disable --now` `awoooi-non110-runner.service`、`awoooi-non110-runner-autostart.path`、`awoooi-non110-runner-autostart.service`,並停止 `awoooi-cd-*api-tests` / `stockplatform-ubuntu-runner` 壓力來源。 +- runner drain readback:`awoooi-non110-runner.service=inactive`、autostart path `inactive`、autostart service `failed`,沒有 active `awoooi-cd-*api-tests` 容器。 +- 舊 conversation event OR query 在 120 尚未部署 source split 前仍會反覆打 188 DB;因 Gitea CD / source deploy 目前卡 no matching runner / queue,已把 188 temporary DB circuit breaker 收斂成明確 rollback set: + - `ALTER ROLE awoooi SET max_parallel_workers_per_gather=0` + - `ALTER ROLE awoooi SET statement_timeout='750ms'` + - `ALTER ROLE awoooi SET enable_seqscan=off` + - `ALTER ROLE awoooi CONNECTION LIMIT 2` +- live apply `automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql`:`idx_automation_operation_log_incident_text`、`idx_automation_operation_log_input_text_trgm`、`idx_automation_operation_log_output_text_trgm` 均 `indisvalid=true / indisready=true`。原先嘗試的 tags text trigram index 因 `array_to_string(tags, ',')` 非 immutable 被 Postgres 拒絕,已從 migration source 移除;既有 `idx_automation_operation_log_tags_gin` 保留。 +- 188 post-readback:`k3s-postgres-recovery=0.98%`,DB session 僅 `idle 2 from 192.168.0.120`,active old OR=`0`、active automation log=`0`、local parallel worker=`0`,host load 約 `3.48 / 4.43 / 6.10`。這是 188 CPU 降回可控的證據。 +- 110 post-readback:`node_load1=9.97`、`node_load5=14.57`、`node_load15=18.78`、`awoooi_host_load5_per_core=1.2275`、`node_procs_running=366`、`gitea=3.4019` cores;短 SSH 仍 `Connection timed out`,所以 110 不能宣稱恢復,live exporter/systemd drain 尚未套用。 + +**本地驗證結果**: +- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_awooop_conversation_event_hot_path_indexes.py -q`:`6 passed`。 +- live migration verifier:三個 automation operation log indexes 均 valid / ready。 + +**rollback / 後續必做**: +- 120 部署 `abc512a7b` 後,讀回舊 OR query 消失,再執行: + - `ALTER ROLE awoooi RESET max_parallel_workers_per_gather;` + - `ALTER ROLE awoooi RESET statement_timeout;` + - `ALTER ROLE awoooi RESET enable_seqscan;` + - `ALTER ROLE awoooi CONNECTION LIMIT -1;` +- 110 SSH/control path 恢復後第一個 runtime 動作仍是套用新版 systemd exporter 或暫停舊 exporter,再回讀 systemd/logind、Gitea CPU、`systemd_units.prom` 與 load5/core。 + +**仍維持**: +- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有讀 `.runner` 內容。 +- 沒有使用 GitHub / gh / GitHub API / GitHub Actions。 +- 沒有重啟主機,沒有 Docker / Nginx / K3s / DB restart,沒有 workflow_dispatch,沒有 DROP / TRUNCATE / restore / prune。 + **下一步**: - 188 DB CPU 已降;110 仍高,原因仍是 `gitea` / queue / `awoooi-host` control path:110 `load5=27.22`、`gitea=3.4019` cores、Harbor repair `#4176 Waiting`、no matching `awoooi-host`。主線下一步繼續 110 Gitea queue / controlled lane recovery,不恢復 generic runner、不重啟主機。 diff --git a/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json b/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json index c0e6c7bb..1967e949 100644 --- a/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json +++ b/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json @@ -1,7 +1,7 @@ { "schema_version": "awoooi_host_cpu_pressure_drain_readback_v1", - "generated_at": "2026-07-01T08:50:00+08:00", - "status": "partial_188_bounded_source_fix_ready_110_control_path_blocked", + "generated_at": "2026-07-01T08:59:00+08:00", + "status": "partial_188_cpu_recovered_with_temporary_db_circuit_breaker_110_control_path_blocked", "scope": { "hosts": ["188", "110"], "incident_family": "post_reboot_host_cpu_pressure", @@ -39,10 +39,18 @@ { "host": "188", "target_selector": "awoooi PostgreSQL role sessions from 192.168.0.120 running old awooop_conversation_event OR lookup", - "action": "set max_parallel_workers_per_gather=0, statement_timeout=1500ms, and enable_seqscan=off for role; cancel active old OR queries and terminate 192.168.0.120 app DB sessions so new sessions inherit the bound", + "action": "set max_parallel_workers_per_gather=0, statement_timeout=750ms, enable_seqscan=off, and CONNECTION LIMIT 2 for role; cancel active queries and terminate 192.168.0.120 app DB sessions so new sessions inherit the bound", "reason": "the live 120 API was still repeatedly issuing the pre-fix OR query and spawning parallel workers after index apply", - "rollback": "ALTER ROLE awoooi RESET max_parallel_workers_per_gather; ALTER ROLE awoooi RESET statement_timeout; ALTER ROLE awoooi RESET enable_seqscan; then let the app reconnect after source split deploys", - "post_apply_readback": "rolconfig shows max_parallel_workers_per_gather=0, statement_timeout=1500ms, enable_seqscan=off; local parallel workers for the old query stayed 0 and k3s-postgres-recovery dropped to 173.33 percent" + "rollback": "ALTER ROLE awoooi RESET max_parallel_workers_per_gather; ALTER ROLE awoooi RESET statement_timeout; ALTER ROLE awoooi RESET enable_seqscan; ALTER ROLE awoooi CONNECTION LIMIT -1; then let the app reconnect after source split deploys", + "post_apply_readback": "rolconfig shows max_parallel_workers_per_gather=0, statement_timeout=750ms, enable_seqscan=off, rolconnlimit=2; old OR, automation_operation_log, and local parallel worker counts dropped to 0; k3s-postgres-recovery dropped to 0.98 percent" + }, + { + "host": "188", + "target_selector": "automation_operation_log truth-chain hot path indexes", + "action": "CREATE INDEX CONCURRENTLY IF NOT EXISTS for incident_id::text, input::text trigram, and output::text trigram", + "reason": "truth-chain shifted CPU pressure from conversation_event to automation_operation_log after the first circuit breaker", + "rollback": "DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_output_text_trgm; DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_input_text_trgm; DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_incident_text;", + "post_apply_readback": "all three indexes read back indisvalid=true and indisready=true" } ], "source_fixes": [ @@ -65,6 +73,11 @@ "path": "apps/api/migrations/awooop_conversation_event_content_preview_trgm_2026-07-01.sql", "change": "add bounded pg_trgm / btree_gin indexes for content_preview fallback lookup", "evidence": "live verifier read back both trigram indexes as valid and ready" + }, + { + "path": "apps/api/migrations/automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql", + "change": "add automation_operation_log expression/trigram indexes for truth-chain incident/input/output lookups", + "evidence": "live verifier read back incident_text, input_text_trgm, and output_text_trgm indexes as valid and ready" } ], "readback": { @@ -84,11 +97,13 @@ "docker_stats_textfile_k3s_postgres_recovery_cores": "0.406900" }, "post_bounded_db_mitigation_signals": { - "awoooi_role_config": "max_parallel_workers_per_gather=0, statement_timeout=1500ms, enable_seqscan=off", - "active_old_or_queries": "3", + "awoooi_role_connection_limit": "2", + "awoooi_role_config": "max_parallel_workers_per_gather=0, statement_timeout=750ms, enable_seqscan=off", + "active_old_or_queries": "0", + "active_automation_operation_log_queries": "0", "local_parallel_workers_for_old_or_query": "0", - "k3s_postgres_recovery_docker_stats_percent": "173.33", - "host_load_average": "4.50, 5.79, 7.52", + "k3s_postgres_recovery_docker_stats_percent": "0.98", + "host_load_average": "3.48, 4.43, 6.10", "remaining_root_cause": "120 live API still runs the pre-fix OR query until the source split is deployed" }, "deployment_readback": { @@ -101,9 +116,9 @@ "host_110": { "signals": { "load5_initial": "39.14", - "load5_latest": "20.14", - "load5_per_core_latest": "1.9", - "node_procs_running_latest": "39", + "load5_latest": "14.57", + "load5_per_core_latest": "1.2275", + "node_procs_running_latest": "366", "gitea_container_cpu_cores": "3.4019", "ssh_control_path": "timeout", "systemd_dbus_symptom": "systemd-logind pending replies exhausted; systemctl list/show timeout", @@ -123,13 +138,14 @@ "pytest": [ "scripts/ops/tests/test_systemd_units_textfile_exporter.py: 2 passed", "scripts/ops/tests/test_systemd_units_textfile_exporter.py + ops/runner/test_cd_controlled_runtime_profile.py: 34 passed", - "truth-chain source split + content-preview migration focused suite: 7 passed" + "truth-chain source split + content-preview migration focused suite: 7 passed", + "conversation_event + automation_operation_log hot-path migration tests: 6 passed" ], "diff_check": "passed" }, "next_actions": [ "commit and push the truth-chain source split so 120 stops issuing the old OR query", - "after deploy convergence, reset the temporary awoooi role max_parallel_workers_per_gather override", + "after deploy convergence, reset the temporary awoooi role max_parallel_workers_per_gather, statement_timeout, enable_seqscan, and connection limit overrides", "apply the systemd exporter source fix to 110 once SSH/control path is available", "continue 110 Gitea queue / awoooi-host controlled lane recovery without generic runner restore" ]