fix(db): bound automation log truth-chain lookups
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
-- P0 post-reboot CPU pressure mitigation: keep the truth-chain lookups against
-- automation_operation_log bounded by giving them indexes to use.
--
-- Execution notes:
--   * Apply outside an explicit transaction block; CREATE INDEX CONCURRENTLY
--     cannot run inside one.
--   * IF NOT EXISTS only checks the index name. An interrupted concurrent build
--     can leave an invalid index behind under that name, and a re-run of this
--     script would then skip it. After applying, confirm that every index below
--     reports indisvalid = true and indisready = true in pg_index.

-- Provides the gin_trgm_ops operator class used by the two GIN indexes below.
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- Partial expression index on the text form of incident_id.
-- Rows with a NULL incident_id are left out of the index.
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_incident_text
    ON automation_operation_log ((incident_id::text))
    WHERE incident_id IS NOT NULL;

-- Trigram GIN index over the text form of input, with NULL coalesced to ''.
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_input_text_trgm
    ON automation_operation_log
    USING gin ((coalesce(input::text, '')) gin_trgm_ops);

-- Trigram GIN index over the text form of output, with NULL coalesced to ''.
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_output_text_trgm
    ON automation_operation_log
    USING gin ((coalesce(output::text, '')) gin_trgm_ops);
|
||||
@@ -0,0 +1,6 @@
|
||||
-- Rollback for the P0 automation_operation_log truth-chain index migration.
-- Removes only the three indexes that migration owns, in reverse creation order.
-- pg_trgm is intentionally left installed: the extension may be shared by other indexes.
-- Apply outside an explicit transaction block; concurrent index drops cannot run inside one.

DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_output_text_trgm;
DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_input_text_trgm;
DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_incident_text;
|
||||
@@ -5,6 +5,8 @@ MIGRATION = "awooop_conversation_event_hot_path_indexes_2026-07-01.sql"
|
||||
ROLLBACK = "awooop_conversation_event_hot_path_indexes_2026-07-01_down.sql"
|
||||
CONTENT_PREVIEW_MIGRATION = "awooop_conversation_event_content_preview_trgm_2026-07-01.sql"
|
||||
CONTENT_PREVIEW_ROLLBACK = "awooop_conversation_event_content_preview_trgm_2026-07-01_down.sql"
|
||||
AUTOMATION_LOG_MIGRATION = "automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql"
|
||||
AUTOMATION_LOG_ROLLBACK = "automation_operation_log_truth_chain_hot_path_indexes_2026-07-01_down.sql"
|
||||
|
||||
|
||||
def _read(path: str) -> str:
|
||||
@@ -27,6 +29,14 @@ def _content_preview_rollback_sql() -> str:
|
||||
return _read(f"migrations/{CONTENT_PREVIEW_ROLLBACK}")
|
||||
|
||||
|
||||
def _automation_log_migration_sql() -> str:
    """Return the text of the automation_operation_log hot-path index migration."""
    migration_path = f"migrations/{AUTOMATION_LOG_MIGRATION}"
    return _read(migration_path)
|
||||
|
||||
|
||||
def _automation_log_rollback_sql() -> str:
    """Return the text of the rollback script for the automation_operation_log index migration."""
    rollback_path = f"migrations/{AUTOMATION_LOG_ROLLBACK}"
    return _read(rollback_path)
|
||||
|
||||
|
||||
def test_hot_path_indexes_are_concurrent_and_transactionless() -> None:
|
||||
sql = _migration_sql()
|
||||
|
||||
@@ -125,3 +135,34 @@ def test_content_preview_trgm_index_bounds_truth_chain_preview_fallback() -> Non
|
||||
assert "DROP INDEX CONCURRENTLY IF EXISTS idx_awooop_conv_event_content_preview_trgm;" in rollback
|
||||
assert "DROP INDEX CONCURRENTLY IF EXISTS idx_awooop_conv_event_project_content_preview_trgm;" in rollback
|
||||
assert "DROP EXTENSION" not in rollback
|
||||
|
||||
|
||||
def test_automation_operation_log_indexes_bound_truth_chain_lookups() -> None:
    """Pin the automation_operation_log index migration to the truth-chain queries it serves.

    Checks, by substring match on the file contents, that:
      * the migration is transactionless and creates each index concurrently,
      * the indexed expressions are the same text the truth-chain service queries with,
      * the rollback drops exactly the migration's indexes and keeps the extension.
    """
    sql = _automation_log_migration_sql()
    rollback = _automation_log_rollback_sql()
    truth_chain_source = _read("src/services/awooop_truth_chain_service.py")

    # CREATE INDEX CONCURRENTLY cannot run inside a transaction block.
    assert "BEGIN;" not in sql
    assert "COMMIT;" not in sql

    assert "CREATE EXTENSION IF NOT EXISTS pg_trgm" in sql
    assert "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_incident_text" in sql
    assert "ON automation_operation_log ((incident_id::text))" in sql
    # The incident index is partial; pin the predicate so it cannot silently change.
    assert "WHERE incident_id IS NOT NULL" in sql
    assert "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_input_text_trgm" in sql
    assert "coalesce(input::text, '')" in sql
    assert "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_output_text_trgm" in sql
    assert "coalesce(output::text, '')" in sql
    assert "gin_trgm_ops" in sql
    # Regression guard: an index expression built on array_to_string(tags, ',') was
    # rejected by Postgres as non-immutable and removed from the migration source.
    assert "array_to_string" not in sql

    # The service must keep querying with the same expressions the indexes are built on.
    assert "FROM automation_operation_log" in truth_chain_source
    assert "incident_id::text" in truth_chain_source
    assert "coalesce(input::text, '') LIKE" in truth_chain_source
    assert "coalesce(output::text, '') LIKE" in truth_chain_source
    assert "coalesce(array_to_string(tags, ','), '') LIKE" in truth_chain_source

    for index_name in (
        "idx_automation_operation_log_output_text_trgm",
        "idx_automation_operation_log_input_text_trgm",
        "idx_automation_operation_log_incident_text",
    ):
        assert f"DROP INDEX CONCURRENTLY IF EXISTS {index_name};" in rollback
    # pg_trgm may be shared by other indexes, so the rollback must leave it installed.
    assert "DROP EXTENSION" not in rollback
|
||||
|
||||
@@ -51096,6 +51096,37 @@ production browser smoke:
|
||||
**仍維持**:
|
||||
- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有使用 GitHub / gh / GitHub API;沒有 runtime write。
|
||||
|
||||
## 2026-07-01 — 08:59 P0 188 CPU 降回可控區 / 110 控制面仍 blocked
|
||||
|
||||
**完成內容**:
|
||||
- 188 抓到 `awoooi-cd-7391-1-api-tests` 又由 non-110 runner 重生,短採樣時吃約 `103.65%` 並持續打 production PostgreSQL;已再次 `disable --now` `awoooi-non110-runner.service`、`awoooi-non110-runner-autostart.path`、`awoooi-non110-runner-autostart.service`,並停止 `awoooi-cd-*api-tests` / `stockplatform-ubuntu-runner` 壓力來源。
|
||||
- runner drain readback:`awoooi-non110-runner.service=inactive`、autostart path `inactive`、autostart service `failed`,沒有 active `awoooi-cd-*api-tests` 容器。
|
||||
- 舊 conversation event OR query 在 120 尚未部署 source split 前仍會反覆打 188 DB;因 Gitea CD / source deploy 目前卡 no matching runner / queue,已把 188 temporary DB circuit breaker 收斂成明確 rollback set:
|
||||
- `ALTER ROLE awoooi SET max_parallel_workers_per_gather=0`
|
||||
- `ALTER ROLE awoooi SET statement_timeout='750ms'`
|
||||
- `ALTER ROLE awoooi SET enable_seqscan=off`
|
||||
- `ALTER ROLE awoooi CONNECTION LIMIT 2`
|
||||
- live apply `automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql`:`idx_automation_operation_log_incident_text`、`idx_automation_operation_log_input_text_trgm`、`idx_automation_operation_log_output_text_trgm` 均 `indisvalid=true / indisready=true`。原先嘗試的 tags text trigram index 因 `array_to_string(tags, ',')` 非 immutable 被 Postgres 拒絕,已從 migration source 移除;既有 `idx_automation_operation_log_tags_gin` 保留。
|
||||
- 188 post-readback:`k3s-postgres-recovery=0.98%`,DB session 僅 `idle 2 from 192.168.0.120`,active old OR=`0`、active automation log=`0`、local parallel worker=`0`,host load 約 `3.48 / 4.43 / 6.10`。這是 188 CPU 降回可控的證據。
|
||||
- 110 post-readback:`node_load1=9.97`、`node_load5=14.57`、`node_load15=18.78`、`awoooi_host_load5_per_core=1.2275`、`node_procs_running=366`、`gitea=3.4019` cores;短 SSH 仍 `Connection timed out`,所以 110 不能宣稱恢復,live exporter/systemd drain 尚未套用。
|
||||
|
||||
**本地驗證結果**:
|
||||
- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_awooop_conversation_event_hot_path_indexes.py -q`:`6 passed`。
|
||||
- live migration verifier:三個 automation operation log indexes 均 valid / ready。
|
||||
|
||||
**rollback / 後續必做**:
|
||||
- 120 部署 `abc512a7b` 後,讀回舊 OR query 消失,再執行:
|
||||
- `ALTER ROLE awoooi RESET max_parallel_workers_per_gather;`
|
||||
- `ALTER ROLE awoooi RESET statement_timeout;`
|
||||
- `ALTER ROLE awoooi RESET enable_seqscan;`
|
||||
- `ALTER ROLE awoooi CONNECTION LIMIT -1;`
|
||||
- 110 SSH/control path 恢復後第一個 runtime 動作仍是套用新版 systemd exporter 或暫停舊 exporter,再回讀 systemd/logind、Gitea CPU、`systemd_units.prom` 與 load5/core。
|
||||
|
||||
**仍維持**:
|
||||
- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有讀 `.runner` 內容。
|
||||
- 沒有使用 GitHub / gh / GitHub API / GitHub Actions。
|
||||
- 沒有重啟主機,沒有 Docker / Nginx / K3s / DB restart,沒有 workflow_dispatch,沒有 DROP / TRUNCATE / restore / prune。
|
||||
|
||||
**下一步**:
|
||||
- 188 DB CPU 已降;110 仍高,原因仍是 `gitea` / queue / `awoooi-host` control path:110 `load5=27.22`、`gitea=3.4019` cores、Harbor repair `#4176 Waiting`、no matching `awoooi-host`。主線下一步繼續 110 Gitea queue / controlled lane recovery,不恢復 generic runner、不重啟主機。
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"schema_version": "awoooi_host_cpu_pressure_drain_readback_v1",
|
||||
"generated_at": "2026-07-01T08:50:00+08:00",
|
||||
"status": "partial_188_bounded_source_fix_ready_110_control_path_blocked",
|
||||
"generated_at": "2026-07-01T08:59:00+08:00",
|
||||
"status": "partial_188_cpu_recovered_with_temporary_db_circuit_breaker_110_control_path_blocked",
|
||||
"scope": {
|
||||
"hosts": ["188", "110"],
|
||||
"incident_family": "post_reboot_host_cpu_pressure",
|
||||
@@ -39,10 +39,18 @@
|
||||
{
|
||||
"host": "188",
|
||||
"target_selector": "awoooi PostgreSQL role sessions from 192.168.0.120 running old awooop_conversation_event OR lookup",
|
||||
"action": "set max_parallel_workers_per_gather=0, statement_timeout=1500ms, and enable_seqscan=off for role; cancel active old OR queries and terminate 192.168.0.120 app DB sessions so new sessions inherit the bound",
|
||||
"action": "set max_parallel_workers_per_gather=0, statement_timeout=750ms, enable_seqscan=off, and CONNECTION LIMIT 2 for role; cancel active queries and terminate 192.168.0.120 app DB sessions so new sessions inherit the bound",
|
||||
"reason": "the live 120 API was still repeatedly issuing the pre-fix OR query and spawning parallel workers after index apply",
|
||||
"rollback": "ALTER ROLE awoooi RESET max_parallel_workers_per_gather; ALTER ROLE awoooi RESET statement_timeout; ALTER ROLE awoooi RESET enable_seqscan; then let the app reconnect after source split deploys",
|
||||
"post_apply_readback": "rolconfig shows max_parallel_workers_per_gather=0, statement_timeout=1500ms, enable_seqscan=off; local parallel workers for the old query stayed 0 and k3s-postgres-recovery dropped to 173.33 percent"
|
||||
"rollback": "ALTER ROLE awoooi RESET max_parallel_workers_per_gather; ALTER ROLE awoooi RESET statement_timeout; ALTER ROLE awoooi RESET enable_seqscan; ALTER ROLE awoooi CONNECTION LIMIT -1; then let the app reconnect after source split deploys",
|
||||
"post_apply_readback": "rolconfig shows max_parallel_workers_per_gather=0, statement_timeout=750ms, enable_seqscan=off, rolconnlimit=2; old OR, automation_operation_log, and local parallel worker counts dropped to 0; k3s-postgres-recovery dropped to 0.98 percent"
|
||||
},
|
||||
{
|
||||
"host": "188",
|
||||
"target_selector": "automation_operation_log truth-chain hot path indexes",
|
||||
"action": "CREATE INDEX CONCURRENTLY IF NOT EXISTS for incident_id::text, input::text trigram, and output::text trigram",
|
||||
"reason": "truth-chain shifted CPU pressure from conversation_event to automation_operation_log after the first circuit breaker",
|
||||
"rollback": "DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_output_text_trgm; DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_input_text_trgm; DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_incident_text;",
|
||||
"post_apply_readback": "all three indexes read back indisvalid=true and indisready=true"
|
||||
}
|
||||
],
|
||||
"source_fixes": [
|
||||
@@ -65,6 +73,11 @@
|
||||
"path": "apps/api/migrations/awooop_conversation_event_content_preview_trgm_2026-07-01.sql",
|
||||
"change": "add bounded pg_trgm / btree_gin indexes for content_preview fallback lookup",
|
||||
"evidence": "live verifier read back both trigram indexes as valid and ready"
|
||||
},
|
||||
{
|
||||
"path": "apps/api/migrations/automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql",
|
||||
"change": "add automation_operation_log expression/trigram indexes for truth-chain incident/input/output lookups",
|
||||
"evidence": "live verifier read back incident_text, input_text_trgm, and output_text_trgm indexes as valid and ready"
|
||||
}
|
||||
],
|
||||
"readback": {
|
||||
@@ -84,11 +97,13 @@
|
||||
"docker_stats_textfile_k3s_postgres_recovery_cores": "0.406900"
|
||||
},
|
||||
"post_bounded_db_mitigation_signals": {
|
||||
"awoooi_role_config": "max_parallel_workers_per_gather=0, statement_timeout=1500ms, enable_seqscan=off",
|
||||
"active_old_or_queries": "3",
|
||||
"awoooi_role_connection_limit": "2",
|
||||
"awoooi_role_config": "max_parallel_workers_per_gather=0, statement_timeout=750ms, enable_seqscan=off",
|
||||
"active_old_or_queries": "0",
|
||||
"active_automation_operation_log_queries": "0",
|
||||
"local_parallel_workers_for_old_or_query": "0",
|
||||
"k3s_postgres_recovery_docker_stats_percent": "173.33",
|
||||
"host_load_average": "4.50, 5.79, 7.52",
|
||||
"k3s_postgres_recovery_docker_stats_percent": "0.98",
|
||||
"host_load_average": "3.48, 4.43, 6.10",
|
||||
"remaining_root_cause": "120 live API still runs the pre-fix OR query until the source split is deployed"
|
||||
},
|
||||
"deployment_readback": {
|
||||
@@ -101,9 +116,9 @@
|
||||
"host_110": {
|
||||
"signals": {
|
||||
"load5_initial": "39.14",
|
||||
"load5_latest": "20.14",
|
||||
"load5_per_core_latest": "1.9",
|
||||
"node_procs_running_latest": "39",
|
||||
"load5_latest": "14.57",
|
||||
"load5_per_core_latest": "1.2275",
|
||||
"node_procs_running_latest": "366",
|
||||
"gitea_container_cpu_cores": "3.4019",
|
||||
"ssh_control_path": "timeout",
|
||||
"systemd_dbus_symptom": "systemd-logind pending replies exhausted; systemctl list/show timeout",
|
||||
@@ -123,13 +138,14 @@
|
||||
"pytest": [
|
||||
"scripts/ops/tests/test_systemd_units_textfile_exporter.py: 2 passed",
|
||||
"scripts/ops/tests/test_systemd_units_textfile_exporter.py + ops/runner/test_cd_controlled_runtime_profile.py: 34 passed",
|
||||
"truth-chain source split + content-preview migration focused suite: 7 passed"
|
||||
"truth-chain source split + content-preview migration focused suite: 7 passed",
|
||||
"conversation_event + automation_operation_log hot-path migration tests: 6 passed"
|
||||
],
|
||||
"diff_check": "passed"
|
||||
},
|
||||
"next_actions": [
|
||||
"commit and push the truth-chain source split so 120 stops issuing the old OR query",
|
||||
"after deploy convergence, reset the temporary awoooi role max_parallel_workers_per_gather override",
|
||||
"after deploy convergence, reset the temporary awoooi role max_parallel_workers_per_gather, statement_timeout, enable_seqscan, and connection limit overrides",
|
||||
"apply the systemd exporter source fix to 110 once SSH/control path is available",
|
||||
"continue 110 Gitea queue / awoooi-host controlled lane recovery without generic runner restore"
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user