fix(db): bound automation log truth-chain lookups
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled

This commit is contained in:
Your Name
2026-07-01 09:00:34 +08:00
parent 444de0b40c
commit 548127ffb0
5 changed files with 124 additions and 14 deletions

View File

@@ -0,0 +1,16 @@
-- P0 post-reboot CPU pressure: bound truth-chain automation_operation_log lookups.
-- Run outside an explicit transaction because CREATE INDEX CONCURRENTLY requires it.
-- NOTE: an interrupted concurrent build can leave an INVALID index behind, and
-- IF NOT EXISTS will then silently skip it on re-run. After applying, verify
-- pg_index.indisvalid / indisready for all three indexes; drop and re-create any
-- index that is not valid.
-- No trigram index is created for tags: array_to_string(tags, ',') is not
-- immutable, so Postgres rejects it as an index expression. Tag lookups rely on
-- the pre-existing idx_automation_operation_log_tags_gin instead.
-- pg_trgm provides the gin_trgm_ops operator class used by the GIN indexes below.
CREATE EXTENSION IF NOT EXISTS pg_trgm;
-- Expression index on the text form of incident_id, for truth-chain predicates
-- written against incident_id::text. Partial: rows with NULL incident_id are excluded.
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_incident_text
ON automation_operation_log ((incident_id::text))
WHERE incident_id IS NOT NULL;
-- Trigram index for LIKE searches over input. The indexed expression must stay
-- textually identical to the query expression coalesce(input::text, '') or the
-- planner will not consider this index.
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_input_text_trgm
ON automation_operation_log
USING gin ((coalesce(input::text, '')) gin_trgm_ops);
-- Same as above for output: keep in sync with coalesce(output::text, '') in the query.
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_output_text_trgm
ON automation_operation_log
USING gin ((coalesce(output::text, '')) gin_trgm_ops);

View File

@@ -0,0 +1,6 @@
-- Roll back only indexes owned by the P0 automation_operation_log truth-chain migration.
-- Do not drop pg_trgm; the extension may be shared by other indexes.
-- Run outside an explicit transaction: DROP INDEX CONCURRENTLY cannot be executed
-- inside a transaction block. Each statement is independent and IF EXISTS makes
-- the script safe to re-run after a partial rollback.
-- Indexes are dropped in the reverse of their creation order.
DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_output_text_trgm;
DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_input_text_trgm;
DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_incident_text;

View File

@@ -5,6 +5,8 @@ MIGRATION = "awooop_conversation_event_hot_path_indexes_2026-07-01.sql"
ROLLBACK = "awooop_conversation_event_hot_path_indexes_2026-07-01_down.sql"
CONTENT_PREVIEW_MIGRATION = "awooop_conversation_event_content_preview_trgm_2026-07-01.sql"
CONTENT_PREVIEW_ROLLBACK = "awooop_conversation_event_content_preview_trgm_2026-07-01_down.sql"
AUTOMATION_LOG_MIGRATION = "automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql"
AUTOMATION_LOG_ROLLBACK = "automation_operation_log_truth_chain_hot_path_indexes_2026-07-01_down.sql"
def _read(path: str) -> str:
@@ -27,6 +29,14 @@ def _content_preview_rollback_sql() -> str:
return _read(f"migrations/{CONTENT_PREVIEW_ROLLBACK}")
def _automation_log_migration_sql() -> str:
    """Return the automation_operation_log index migration as read by ``_read``."""
    relative_path = f"migrations/{AUTOMATION_LOG_MIGRATION}"
    return _read(relative_path)
def _automation_log_rollback_sql() -> str:
    """Return the automation_operation_log index rollback as read by ``_read``."""
    relative_path = f"migrations/{AUTOMATION_LOG_ROLLBACK}"
    return _read(relative_path)
def test_hot_path_indexes_are_concurrent_and_transactionless() -> None:
sql = _migration_sql()
@@ -125,3 +135,34 @@ def test_content_preview_trgm_index_bounds_truth_chain_preview_fallback() -> Non
assert "DROP INDEX CONCURRENTLY IF EXISTS idx_awooop_conv_event_content_preview_trgm;" in rollback
assert "DROP INDEX CONCURRENTLY IF EXISTS idx_awooop_conv_event_project_content_preview_trgm;" in rollback
assert "DROP EXTENSION" not in rollback
def test_automation_operation_log_indexes_bound_truth_chain_lookups() -> None:
    """Pin the automation_operation_log index migration and its rollback.

    Checks three things: the migration text creates the expected indexes
    without wrapping them in a transaction, the truth-chain service source
    still contains the query expressions the test expects, and the rollback
    drops exactly the indexes this migration owns while leaving the
    extension in place.
    """
    migration = _automation_log_migration_sql()
    down = _automation_log_rollback_sql()
    service_source = _read("src/services/awooop_truth_chain_service.py")

    # The migration must not open or close an explicit transaction.
    for transaction_marker in ("BEGIN;", "COMMIT;"):
        assert transaction_marker not in migration

    # Fragments the migration must contain, in the original assertion order.
    migration_fragments = (
        "CREATE EXTENSION IF NOT EXISTS pg_trgm",
        "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_incident_text",
        "ON automation_operation_log ((incident_id::text))",
        "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_input_text_trgm",
        "coalesce(input::text, '')",
        "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_automation_operation_log_output_text_trgm",
        "coalesce(output::text, '')",
        "gin_trgm_ops",
    )
    for fragment in migration_fragments:
        assert fragment in migration

    # Query expressions expected in the truth-chain service source.
    service_fragments = (
        "FROM automation_operation_log",
        "incident_id::text",
        "coalesce(input::text, '') LIKE",
        "coalesce(output::text, '') LIKE",
        "coalesce(array_to_string(tags, ','), '') LIKE",
    )
    for fragment in service_fragments:
        assert fragment in service_source

    # The rollback drops only the indexes owned by this migration.
    owned_indexes = (
        "idx_automation_operation_log_output_text_trgm",
        "idx_automation_operation_log_input_text_trgm",
        "idx_automation_operation_log_incident_text",
    )
    for index_name in owned_indexes:
        assert f"DROP INDEX CONCURRENTLY IF EXISTS {index_name};" in down
    assert "DROP EXTENSION" not in down

View File

@@ -51096,6 +51096,37 @@ production browser smoke:
**仍維持**
- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth；沒有使用 GitHub / gh / GitHub API；沒有 runtime write。
## 2026-07-01 — 08:59 P0 188 CPU 降回可控區 / 110 控制面仍 blocked
**完成內容**
- 188 抓到 `awoooi-cd-7391-1-api-tests` 又由 non-110 runner 重生，短採樣時吃約 `103.65%` 並持續打 production PostgreSQL；已再次 `disable --now` `awoooi-non110-runner.service`、`awoooi-non110-runner-autostart.path`、`awoooi-non110-runner-autostart.service`，並停止 `awoooi-cd-*api-tests` / `stockplatform-ubuntu-runner` 壓力來源。
- runner drain readback：`awoooi-non110-runner.service=inactive`、autostart path `inactive`、autostart service `failed`，沒有 active `awoooi-cd-*api-tests` 容器。
- 舊 conversation event OR query 在 120 尚未部署 source split 前仍會反覆打 188 DB；因 Gitea CD / source deploy 目前卡 no matching runner / queue，已把 188 temporary DB circuit breaker 收斂成明確 rollback set：
- `ALTER ROLE awoooi SET max_parallel_workers_per_gather=0`
- `ALTER ROLE awoooi SET statement_timeout='750ms'`
- `ALTER ROLE awoooi SET enable_seqscan=off`
- `ALTER ROLE awoooi CONNECTION LIMIT 2`
- live apply `automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql`：`idx_automation_operation_log_incident_text`、`idx_automation_operation_log_input_text_trgm`、`idx_automation_operation_log_output_text_trgm` 均為 `indisvalid=true / indisready=true`。原先嘗試的 tags text trigram index 因 `array_to_string(tags, ',')` 非 immutable 被 Postgres 拒絕，已從 migration source 移除；既有 `idx_automation_operation_log_tags_gin` 保留。
- 188 post-readback：`k3s-postgres-recovery=0.98%`；DB session 僅 `idle 2 from 192.168.0.120`、active old OR=`0`、active automation log=`0`、local parallel worker=`0`；host load 約 `3.48 / 4.43 / 6.10`。這是 188 CPU 降回可控的證據。
- 110 post-readback：`node_load1=9.97`、`node_load5=14.57`、`node_load15=18.78`、`awoooi_host_load5_per_core=1.2275`、`node_procs_running=366`、`gitea=3.4019` cores；短 SSH 仍 `Connection timed out`，所以 110 不能宣稱恢復；live exporter/systemd drain 尚未套用。
**本地驗證結果**
- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_awooop_conversation_event_hot_path_indexes.py -q`：`6 passed`。
- live migration verifier：三個 automation operation log indexes 均 valid / ready。
**rollback / 後續必做**
- 120 部署 `abc512a7b` 後,讀回舊 OR query 消失,再執行:
- `ALTER ROLE awoooi RESET max_parallel_workers_per_gather;`
- `ALTER ROLE awoooi RESET statement_timeout;`
- `ALTER ROLE awoooi RESET enable_seqscan;`
- `ALTER ROLE awoooi CONNECTION LIMIT -1;`
- 110 SSH/control path 恢復後第一個 runtime 動作仍是套用新版 systemd exporter 或暫停舊 exporter再回讀 systemd/logind、Gitea CPU、`systemd_units.prom` 與 load5/core。
**仍維持**
- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth；沒有讀 `.runner` 內容。
- 沒有使用 GitHub / gh / GitHub API / GitHub Actions。
- 沒有重啟主機，沒有 Docker / Nginx / K3s / DB restart；沒有 workflow_dispatch；沒有 DROP / TRUNCATE / restore / prune。
**下一步**
- 188 DB CPU 已降；110 仍高，原因仍是 `gitea` / queue / `awoooi-host` control path：110 `load5=27.22`、`gitea=3.4019` cores、Harbor repair `#4176 Waiting`、no matching `awoooi-host`。主線下一步繼續 110 Gitea queue / controlled lane recovery；不恢復 generic runner、不重啟主機。

View File

@@ -1,7 +1,7 @@
{
"schema_version": "awoooi_host_cpu_pressure_drain_readback_v1",
"generated_at": "2026-07-01T08:50:00+08:00",
"status": "partial_188_bounded_source_fix_ready_110_control_path_blocked",
"generated_at": "2026-07-01T08:59:00+08:00",
"status": "partial_188_cpu_recovered_with_temporary_db_circuit_breaker_110_control_path_blocked",
"scope": {
"hosts": ["188", "110"],
"incident_family": "post_reboot_host_cpu_pressure",
@@ -39,10 +39,18 @@
{
"host": "188",
"target_selector": "awoooi PostgreSQL role sessions from 192.168.0.120 running old awooop_conversation_event OR lookup",
"action": "set max_parallel_workers_per_gather=0, statement_timeout=1500ms, and enable_seqscan=off for role; cancel active old OR queries and terminate 192.168.0.120 app DB sessions so new sessions inherit the bound",
"action": "set max_parallel_workers_per_gather=0, statement_timeout=750ms, enable_seqscan=off, and CONNECTION LIMIT 2 for role; cancel active queries and terminate 192.168.0.120 app DB sessions so new sessions inherit the bound",
"reason": "the live 120 API was still repeatedly issuing the pre-fix OR query and spawning parallel workers after index apply",
"rollback": "ALTER ROLE awoooi RESET max_parallel_workers_per_gather; ALTER ROLE awoooi RESET statement_timeout; ALTER ROLE awoooi RESET enable_seqscan; then let the app reconnect after source split deploys",
"post_apply_readback": "rolconfig shows max_parallel_workers_per_gather=0, statement_timeout=1500ms, enable_seqscan=off; local parallel workers for the old query stayed 0 and k3s-postgres-recovery dropped to 173.33 percent"
"rollback": "ALTER ROLE awoooi RESET max_parallel_workers_per_gather; ALTER ROLE awoooi RESET statement_timeout; ALTER ROLE awoooi RESET enable_seqscan; ALTER ROLE awoooi CONNECTION LIMIT -1; then let the app reconnect after source split deploys",
"post_apply_readback": "rolconfig shows max_parallel_workers_per_gather=0, statement_timeout=750ms, enable_seqscan=off, rolconnlimit=2; old OR, automation_operation_log, and local parallel worker counts dropped to 0; k3s-postgres-recovery dropped to 0.98 percent"
},
{
"host": "188",
"target_selector": "automation_operation_log truth-chain hot path indexes",
"action": "CREATE INDEX CONCURRENTLY IF NOT EXISTS for incident_id::text, input::text trigram, and output::text trigram",
"reason": "truth-chain shifted CPU pressure from conversation_event to automation_operation_log after the first circuit breaker",
"rollback": "DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_output_text_trgm; DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_input_text_trgm; DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_incident_text;",
"post_apply_readback": "all three indexes read back indisvalid=true and indisready=true"
}
],
"source_fixes": [
@@ -65,6 +73,11 @@
"path": "apps/api/migrations/awooop_conversation_event_content_preview_trgm_2026-07-01.sql",
"change": "add bounded pg_trgm / btree_gin indexes for content_preview fallback lookup",
"evidence": "live verifier read back both trigram indexes as valid and ready"
},
{
"path": "apps/api/migrations/automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql",
"change": "add automation_operation_log expression/trigram indexes for truth-chain incident/input/output lookups",
"evidence": "live verifier read back incident_text, input_text_trgm, and output_text_trgm indexes as valid and ready"
}
],
"readback": {
@@ -84,11 +97,13 @@
"docker_stats_textfile_k3s_postgres_recovery_cores": "0.406900"
},
"post_bounded_db_mitigation_signals": {
"awoooi_role_config": "max_parallel_workers_per_gather=0, statement_timeout=1500ms, enable_seqscan=off",
"active_old_or_queries": "3",
"awoooi_role_connection_limit": "2",
"awoooi_role_config": "max_parallel_workers_per_gather=0, statement_timeout=750ms, enable_seqscan=off",
"active_old_or_queries": "0",
"active_automation_operation_log_queries": "0",
"local_parallel_workers_for_old_or_query": "0",
"k3s_postgres_recovery_docker_stats_percent": "173.33",
"host_load_average": "4.50, 5.79, 7.52",
"k3s_postgres_recovery_docker_stats_percent": "0.98",
"host_load_average": "3.48, 4.43, 6.10",
"remaining_root_cause": "120 live API still runs the pre-fix OR query until the source split is deployed"
},
"deployment_readback": {
@@ -101,9 +116,9 @@
"host_110": {
"signals": {
"load5_initial": "39.14",
"load5_latest": "20.14",
"load5_per_core_latest": "1.9",
"node_procs_running_latest": "39",
"load5_latest": "14.57",
"load5_per_core_latest": "1.2275",
"node_procs_running_latest": "366",
"gitea_container_cpu_cores": "3.4019",
"ssh_control_path": "timeout",
"systemd_dbus_symptom": "systemd-logind pending replies exhausted; systemctl list/show timeout",
@@ -123,13 +138,14 @@
"pytest": [
"scripts/ops/tests/test_systemd_units_textfile_exporter.py: 2 passed",
"scripts/ops/tests/test_systemd_units_textfile_exporter.py + ops/runner/test_cd_controlled_runtime_profile.py: 34 passed",
"truth-chain source split + content-preview migration focused suite: 7 passed"
"truth-chain source split + content-preview migration focused suite: 7 passed",
"conversation_event + automation_operation_log hot-path migration tests: 6 passed"
],
"diff_check": "passed"
},
"next_actions": [
"commit and push the truth-chain source split so 120 stops issuing the old OR query",
"after deploy convergence, reset the temporary awoooi role max_parallel_workers_per_gather override",
"after deploy convergence, reset the temporary awoooi role max_parallel_workers_per_gather, statement_timeout, enable_seqscan, and connection limit overrides",
"apply the systemd exporter source fix to 110 once SSH/control path is available",
"continue 110 Gitea queue / awoooi-host controlled lane recovery without generic runner restore"
]