fix(api): route runaway host alerts to ai event packets
Some checks failed
CD Pipeline / tests (push) Successful in 1m44s
Code Review / ai-code-review (push) Successful in 14s
CD Pipeline / build-and-deploy (push) Successful in 7m8s
CD Pipeline / post-deploy-checks (push) Successful in 2m56s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled

This commit is contained in:
Your Name
2026-06-18 14:39:31 +08:00
parent e025cda641
commit f358a0f6c3
6 changed files with 188 additions and 23 deletions

View File

@@ -97,6 +97,9 @@ _HOST_RESOURCE_ALERT_HEADER_RE = re.compile(
# Captures, as ``target``, the token that follows a WARN / CRIT / INFO severity
# word (letters, digits, ``_``, ``.`` and ``-`` only).
_HOST_RESOURCE_TARGET_RE = re.compile(
    r"\b(?:WARN|CRIT|INFO)\s+(?P<target>[A-Za-z0-9_.-]+)\b"
)
# Label extractors for ``key=value`` / ``key="value"`` pairs in alert text.
# Whitespace around ``=`` and the surrounding double quotes are optional; the
# captured value is limited to letters, digits, ``_``, ``.``, ``:`` and ``-``.
_HOST_RESOURCE_ALERTNAME_RE = re.compile(r"\balertname\s*=\s*\"?(?P<alertname>[A-Za-z0-9_.:-]+)\"?")
_HOST_RESOURCE_HOST_LABEL_RE = re.compile(r"\bhost\s*=\s*\"?(?P<host>[A-Za-z0-9_.:-]+)\"?")
_HOST_RESOURCE_RULE_LABEL_RE = re.compile(r"\brule\s*=\s*\"?(?P<rule>[A-Za-z0-9_.:-]+)\"?")
_HOST_PROCESS_LINE_RE = re.compile(
r"^\s*(?P<user>\S+)\s+"
r"(?P<pid>\d+)\s+"
@@ -112,6 +115,10 @@ def _is_host_resource_alert_text(text: str) -> bool:
"CPU 警告" in text
or "容器內 root Node.js 進程" in text
or ("ps aux" in text and ("next build" in text or "npm run build" in text))
or "HostOrphanBrowserSmokeHighCpu" in text
or "HostCiRunnerLoadSaturation" in text
or "awoooi_host_runaway_browser_orphan" in text
or "awoooi_host_gitea_actions_active_container_count" in text
)
@@ -165,10 +172,15 @@ def _parse_host_process_lines(text: str) -> list[dict[str, str | float]]:
def _host_resource_alert_impact(
text: str,
cpu_text: str,
load_text: str,
processes: list[dict[str, str | float]],
) -> str:
if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text:
return "orphan Chrome / Playwright smoke 疑似吃滿 CPU先驗 pgid、age、cmdline 與 active CI 分流"
if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text:
return "CI runner 正在造成主機負載;先確認 Actions run、queue、timeout 與服務 SLO"
try:
load = float(load_text)
except (TypeError, ValueError):
@@ -186,7 +198,20 @@ def _host_resource_alert_impact(
return "資源升高但尚未確認根因;先聚合觀察並補足 owner 判讀"
def _host_resource_automation_lane(processes: list[dict[str, str | float]]) -> tuple[str, str]:
def _host_resource_automation_lane(
text: str,
processes: list[dict[str, str | float]],
) -> tuple[str, str]:
if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text:
return (
"orphan_browser_smoke_runaway_process",
"建立 runaway process triage packet先跑 remediation dry-run待 owner / window / evidence 後才可 SIGTERM",
)
if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text:
return (
"ci_runner_load_saturation",
"建立 CI load evidence packet彙整 Gitea Actions run、runner queue、load/core 與 swap不 kill process",
)
commands = " ".join(str(item.get("command", "")).lower() for item in processes)
if "build" in commands:
return (
@@ -204,6 +229,69 @@ def _host_resource_automation_lane(processes: list[dict[str, str | float]]) -> t
)
def _host_resource_alert_label(name: str, text: str) -> str:
    """Extract a single ``key=value`` label from raw alert text.

    Args:
        name: Label to look up; one of ``"alertname"``, ``"host"`` or ``"rule"``.
        text: Raw alert text to search.

    Returns:
        The value of the first occurrence of the label (without surrounding
        quotes), or ``""`` when the label does not appear in ``text``.

    Raises:
        KeyError: If ``name`` is not one of the supported label names.
    """
    label_regexes = {
        "alertname": _HOST_RESOURCE_ALERTNAME_RE,
        "host": _HOST_RESOURCE_HOST_LABEL_RE,
        "rule": _HOST_RESOURCE_RULE_LABEL_RE,
    }
    found = label_regexes[name].search(text)
    if found is None:
        return ""
    return found.group(name)
def _host_resource_alert_evidence_lines(
text: str,
processes: list[dict[str, str | float]],
) -> list[str]:
if processes:
lines: list[str] = []
for item in processes[:3]:
process_cpu = f"{float(item['cpu']):g}%"
lines.append(
""
f"<code>PID {html.escape(str(item['pid']))}</code> "
f"CPU <code>{html.escape(process_cpu)}</code>"
f"<code>{html.escape(str(item['command']))}</code>"
)
lines[-1] = "" + lines[-1][1:]
return lines
alertname = _host_resource_alert_label("alertname", text)
rule = _host_resource_alert_label("rule", text)
if alertname:
lines = [
f"├ Alert<code>{html.escape(alertname)}</code>",
"├ Metric<code>awoooi_host_runaway_process_*</code>",
]
if rule:
lines.append(f"└ Rule<code>{html.escape(rule)}</code>")
else:
lines[-1] = "" + lines[-1][1:]
return lines
return ["└ 尚未收到可解析的 top process請補只讀 evidence。"]
def _host_resource_recommendation_lines(text: str) -> list[str]:
if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text:
return [
"├ 先讀 Prometheus orphan group / CPU / age / cmdline 指標與 textfile timestamp",
"├ 執行 `host-runaway-process-remediation.py` dry-run 產生候選,不直接 apply",
"└ 若 owner approval、maintenance window、evidence ref 齊全,才可 gated SIGTERM 並回寫 KM / PlayBook / Verifier",
]
if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text:
return [
"├ 確認 Gitea Actions run、runner queue、build timeout、load/core 與 swap trend",
"├ 若是合法 CI標記為 capacity / queue 事件,不做 process remediation",
"└ 若 CI 卡死,產出 owner packet 與 runner cleanup dry-run再進維護窗口",
]
return [
"├ 確認是否為 CI/CD / Actions / runner 正常建置窗口",
"├ 若持續超過門檻,先查 runner queue、build job、容器資源限制與服務 SLO",
"└ 同一 host/service 5 分鐘聚合一次,避免洗版",
]
def format_host_resource_alert_card(text: str) -> str:
"""把 host CPU/load raw dump 轉成值班者可讀的 Telegram HTML 卡。"""
if not _is_host_resource_alert_text(text):
@@ -214,29 +302,21 @@ def format_host_resource_alert_card(text: str) -> str:
target = (
header.group("target")
if header
else (target_match.group("target") if target_match else "unknown-host")
else (
target_match.group("target")
if target_match
else (_host_resource_alert_label("host", text) or "unknown-host")
)
)
cpu = header.group("cpu") if header else "-"
load = header.group("load") if header else "-"
processes = _parse_host_process_lines(text)
impact = _host_resource_alert_impact(cpu, load, processes)
automation_lane, automation_next_step = _host_resource_automation_lane(processes)
impact = _host_resource_alert_impact(text, cpu, load, processes)
automation_lane, automation_next_step = _host_resource_automation_lane(text, processes)
load_bar = _resource_load_bar(load)
severity = "🔴" if load != "-" and load_bar.count("") >= 7 else "⚠️"
evidence_lines: list[str] = []
for item in processes[:3]:
process_cpu = f"{float(item['cpu']):g}%"
evidence_lines.append(
""
f"<code>PID {html.escape(str(item['pid']))}</code> "
f"CPU <code>{html.escape(process_cpu)}</code>"
f"<code>{html.escape(str(item['command']))}</code>"
)
if evidence_lines:
evidence_lines[-1] = "" + evidence_lines[-1][1:]
else:
evidence_lines.append("└ 尚未收到可解析的 top process請補只讀 evidence。")
evidence_lines = _host_resource_alert_evidence_lines(text, processes)
recommendation_lines = _host_resource_recommendation_lines(text)
return "\n".join(
[
@@ -257,9 +337,7 @@ def format_host_resource_alert_card(text: str) -> str:
*evidence_lines,
"",
"<b>建議下一步</b>",
"├ 確認是否為 CI/CD / Actions / runner 正常建置窗口",
"├ 若持續超過門檻,先查 runner queue、build job、容器資源限制與服務 SLO",
"└ 同一 host/service 5 分鐘聚合一次,避免洗版",
*recommendation_lines,
"",
"<b>禁止事項</b>",
"└ 不 kill process、不 restart Docker / Gitea、不 reload Nginx、不改 firewall除非已有維護窗口與 owner 批准。",

View File

@@ -91,6 +91,49 @@ root 364 181 0.7 3491396 494608 ? Rl 05:56 0:18 /opt/hostedto
assert "processChild.js" not in result
def test_orphan_browser_alert_becomes_runaway_process_event_packet() -> None:
    """HostOrphanBrowserSmokeHighCpu must render as the dedicated runaway-process event packet."""
    raw_alert = (
        'alertname="HostOrphanBrowserSmokeHighCpu" host="110" '
        'rule="stockplatform_headless_smoke" '
        "description=\"orphan Chrome smoke group detected\""
    )

    card = format_host_resource_alert_card(raw_alert)

    # Each fragment must appear somewhere in the rendered card.
    expected_fragments = (
        "主機資源告警110",
        "ai_automation_alert_card_v1",
        "orphan_browser_smoke_runaway_process",
        "HostOrphanBrowserSmokeHighCpu",
        "stockplatform_headless_smoke",
        "host-runaway-process-remediation.py",
        "dry-run",
        "gated SIGTERM",
        "KM / PlayBook / Verifier",
        "runtime_write_gate=0",
        "不 kill process",
        "Docker",
    )
    for fragment in expected_fragments:
        assert fragment in card
def test_ci_runner_load_alert_becomes_capacity_event_packet() -> None:
    """HostCiRunnerLoadSaturation must not be presented as a killable runaway process."""
    raw_alert = (
        'alertname="HostCiRunnerLoadSaturation" host="110" '
        "awoooi_host_gitea_actions_active_container_count 2"
    )

    card = format_host_resource_alert_card(raw_alert)

    # Each fragment must appear somewhere in the rendered card.
    expected_fragments = (
        "主機資源告警110",
        "ci_runner_load_saturation",
        "CI load evidence packet",
        "Gitea Actions run",
        "合法 CI",
        "不做 process remediation",
        "runtime_write_gate=0",
        "不 kill process",
    )
    for fragment in expected_fragments:
        assert fragment in card
@pytest.mark.asyncio
async def test_send_alert_notification_normalizes_host_resource_raw_dump(monkeypatch) -> None:
"""send_alert_notification 是最後出口,必須自動套用 AI 自動化事件包。"""

View File

@@ -59,6 +59,19 @@
- Runtime auto-remediation`0%`,這是安全設計;若未來要由 AI 進入修復,必須先產生 triage packet、dry-run evidence、owner approval、maintenance window、evidence ref、post-check 與 KM 回寫,不得由 exporter 自行 kill。
- 目前 110 高 CPU 判讀orphan headless browser 已歸零;剩餘負載應歸因於 active CI 或其他一般 workload不能再被誤判為前一輪 stockPlatform orphan Chrome 事故。
### 2026-06-18 14:38 台北Host runaway alert -> AI event packet 補強
**修補**:`TelegramGateway.send_alert_notification()` 的最後出口已能把新 Prometheus alert text 轉成專屬 AI automation card,而不是只靠泛用 CPU raw dump parser。`HostOrphanBrowserSmokeHighCpu` 會進 `orphan_browser_smoke_runaway_process` lane,顯示 alertname / host / rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫;`HostCiRunnerLoadSaturation` 會進 `ci_runner_load_saturation` lane,要求彙整 Gitea Actions run、runner queue、load/core 與 swap trend,並明確標示合法 CI 不做 process remediation。
**驗證**
- `DATABASE_URL=postgresql+asyncpg://ci:ci@localhost/ci PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/python3.11 -m pytest apps/api/tests/test_telegram_message_templates.py -q -p no:cacheprovider`:`59 passed`。
- `PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/python3.11 -m py_compile apps/api/src/services/telegram_gateway.py`:通過。
**完成度同步**
- Host runaway alert -> AI event packet:`0% -> 100%`。
- Monitoring / alert / PlayBook / Telegram event packet / live scrape:`100%`。
- Runtime remediation / Telegram 實發 / Bot API call / host write:`0 / false`;本段未發 Telegram、未讀 secret、未 kill process、未重啟服務、未改 firewall/K8s。
## 2026-06-18P2-406B Receipt Readback Owner Review 本地完成
**背景**P2-004 已把依賴 / 供應鏈漂移收斂成只讀監控讀回;統帥要求每次推進都不能忘記目標與方向,因此本段把日報 / 週報 / 月報、Telegram receipt owner review、P2-004 drift monitor 與 P2-403J 報表真相串成同一個 owner review surface讓治理頁可以直接看到 AI Agent 分工、互審與仍被關閉的 runtime 邊界。

View File

@@ -55,6 +55,16 @@ read-only exporter -> Prometheus alert -> AI triage packet -> KM / PlayBook evid
| `HostRunawayProcessMonitorMissing` / `Stale` | exporter 缺失或超過 10 分鐘未更新 | 修 exporter / cron / textfile collector |
| `HostRunawayProcessRemediationUnexpectedlyAuthorized` | `remediation_authorized > 0` | 立即回滾;禁止把監控器改成執行器 |
Telegram / AI event packet contract:
| Alert / input | Telegram lane | 必須顯示 |
|---------------|---------------|----------|
| `HostOrphanBrowserSmokeHighCpu` | `orphan_browser_smoke_runaway_process` | alertname、host、rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫 |
| `HostCiRunnerLoadSaturation` | `ci_runner_load_saturation` | Gitea Actions run、runner queue、load/core、swap trend、capacity / queue 判定、不做 process remediation |
| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure` / `host_resource_pressure_triage` | sanitized top process evidence;不顯示 raw workspace path 或完整 process dump |
所有 Telegram 卡片都必須保留 `runtime_write_gate=0`,並不得把 alert/card 轉成直接 kill / restart / reload 指令。
---
## 3. AI Triager 必做判讀

View File

@@ -5088,3 +5088,16 @@ Trigger commit `f5cd37b7` 與 deploy marker `0ba92357` 已把 governance UI 的
- 新增 pytest鎖住 orphan 分類、Linux / BSD `ps` 解析、合法 / 年輕 process 忽略、CI/swap 指標、dry-run 與 apply gate 拒絕行為readiness audit 以 pyenv Python 重跑後 `BLOCKED=0`
**裁決:** 這是 host CPU runaway 的 observe -> classify -> alert -> PlayBook -> KM contract -> gated remediation 閉環,不是 runtime 自動 kill 授權。AI 可以自動診斷、告警、產生 dry-run 修復包與 KM/PlayBook 回寫要求;真正 process termination 仍需 owner approval、maintenance window、evidence ref 與 post-check。Docker restart、systemd restart、Nginx reload、firewall change、secret read、host write 與 production write 仍全部禁止。
### 2026-06-18 14:38 (台北) — §8 / Host CPU AIOps — Host runaway alert 轉 AI event packet
**觸發**:前段已把 110 runaway process 監控、告警、PlayBook 與 live scrape 補齊,但 alert 進 Telegram 最後出口時仍需確認 `HostOrphanBrowserSmokeHighCpu` 與 `HostCiRunnerLoadSaturation` 不會被壓成泛用 CPU 文本。
**已推進:**
- `TelegramGateway.send_alert_notification()` 仍是最後出口;`format_host_resource_alert_card()` 現在可解析 `alertname`、`host`、`rule` label。
- `HostOrphanBrowserSmokeHighCpu` 會轉成 `orphan_browser_smoke_runaway_process` lane,顯示 runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫。
- `HostCiRunnerLoadSaturation` 會轉成 `ci_runner_load_saturation` lane,要求彙整 Gitea Actions run、runner queue、load/core 與 swap,並標示合法 CI 不做 process remediation。
- `docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md` 已補 Telegram / AI event packet contract。
- 精準測試 `apps/api/tests/test_telegram_message_templates.py` 已新增兩條 regression:`59 passed`;`telegram_gateway.py` py_compile 通過。
**裁決:** 這是 alert -> AI event packet 的只讀與訊息模板閉環,不是 Telegram 實發、Bot API call、Gateway queue write、host write 或 process kill 授權。所有卡片仍固定 `runtime_write_gate=0`,真正修復仍必須走 dry-run、owner approval、maintenance window、evidence ref、post-check 與 KM / PlayBook 回寫。

View File

@@ -15,7 +15,7 @@
| P0 host / K3s recovery | DONE | 100% | 120 booted after console fsck at `2026-06-12 15:13`; latest 2026-06-14 18:15 readback shows 120 is reachable, K3s is active, `mon` and `mon1` are both `Ready control-plane`, and cold-start P0/P1 checks are green. |
| P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 92% | 2026-06-15 03:11 `backup-status` shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `escrow_missing=5`, last aggregate `2026-06-15 02:40:13`. Offsite / escrow report shows `SCRIPT_MISSING_COUNT=0`, `OFFSITE_CONFIGURED=1`, `RCLONE_CONFIGURED=1`, `ESCROW_MISSING_COUNT=5`. Owner request package is ready; actual marker write remains blocked on real non-secret evidence IDs. |
| P2 service / data truth | VERIFIED_FULL_STACK_GREEN_FOR_SERVICE | 100% | 2026-06-18 13:43 cold-start verifies public route/TLS, API/Web route, momo health and current-month parity `10936|10936|2026-06-01|2026-06-17|2026-06-01|2026-06-17`, backup exporters, schedules, K3s node readiness, VIP, and 110 / 188 runtime health. K8s active failed Job count is `0`, bad pods are `0`, and cold-start returns `PASS=84 WARN=0 BLOCKED=0`. |
| P3 docs / automation contracts | DONE_WITH_RUNAWAY_PROCESS_AIOPS_LIVE_SCRAPED | 100% | Workplan, SOP v1.26, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, and 2026-06-18 live readback are updated. 14:31-14:32 Prometheus scrape confirms 110 `monitor_up=1`, orphan browser group count `0`, active CI containers `2`, load5/core around `0.79-0.81`, swap ratio around `1.0`, `remediation_authorized=0`, and missing/orphan alerts not firing. Repo-side readiness audit also checks runaway process exporter / remediation helper / alert group; live cold-start remains `PASS=84 WARN=0 BLOCKED=0` from the latest service readiness readback. |
| P3 docs / automation contracts | DONE_WITH_RUNAWAY_PROCESS_AIOPS_LIVE_SCRAPED | 100% | Workplan, SOP v1.26, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, and 2026-06-18 live readback are updated. 14:31-14:32 Prometheus scrape confirms 110 `monitor_up=1`, orphan browser group count `0`, active CI containers `2`, load5/core around `0.79-0.81`, swap ratio around `1.0`, `remediation_authorized=0`, and missing/orphan alerts not firing. Repo-side readiness audit also checks runaway process exporter / remediation helper / alert group; live cold-start remains `PASS=84 WARN=0 BLOCKED=0` from the latest service readiness readback. |
Full cold-start service readiness may be declared green for the latest verified evidence set. As of 2026-06-18 13:43, services are green with `WARN=0` and `BLOCKED=0`; the retained stale `km-vectorize` failed Job remains historical evidence only. Do not declare DR scorecard complete while credential escrow evidence remains blocked.
@@ -214,7 +214,7 @@ Do not run `truncate`, whole DB restore, force-push, DROP, or online root filesy
## 9. Progress Updates
```text
2026-06-18 15:10 Asia/Taipei
2026-06-18 14:20 Asia/Taipei
Phase: P3 AI Ops runaway process automation
Before: 110 CPU 滿載只能靠人工 `ps/top` 判斷;泛用 `HostHighCpuLoad` 無法分辨跨專案 orphan Chrome smoke 與合法 Gitea Actions CI load。
After: 新增 read-only `host-runaway-process-exporter.py`、gated `host-runaway-process-remediation.py`、Prometheus `host_runaway_process_alerts`、Ansible textfile exporter source-of-truth、SOP v1.26 與 `HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md`。Exporter 暴露 orphan browser、active CI、load/core、swap ratio 與 `remediation_authorized=0`;修復器預設 dry-run`SIGTERM` 必須帶 owner approval、maintenance window、evidence ref。
@@ -228,6 +228,14 @@ Blocked: No for live observability; yes for runtime remediation by design until
Next: Keep cron scrape under normal monitoring; if orphan count becomes >0, create AI triage packet and remediation dry-run before any gated `SIGTERM`.
Completion: monitoring / alert / PlayBook / KM contract 100%; runtime auto-remediation remains gated at 0 until a real owner-approved apply is executed.
2026-06-18 14:38 Asia/Taipei
Phase: P3 AI Ops alert-to-event packet
Before: 泛用 CPU raw dump 可被轉成 AI automation card但 `HostOrphanBrowserSmokeHighCpu` / `HostCiRunnerLoadSaturation` alert text 尚未有專屬 lane。
After: Telegram 最後出口可將 `HostOrphanBrowserSmokeHighCpu` 轉成 `orphan_browser_smoke_runaway_process`,將 `HostCiRunnerLoadSaturation` 轉成 `ci_runner_load_saturation`;兩者都保留 `runtime_write_gate=0`,並要求 dry-run / owner / maintenance / evidence / KM / PlayBook / Verifier。
Evidence: `apps/api/src/services/telegram_gateway.py`、`apps/api/tests/test_telegram_message_templates.py`,精準 pytest `59 passed`。
Blocked: No for alert-to-event packet; yes for Telegram live send / runtime remediation by design.
Next: 等 code-review / CD 後做 production readback若未來 alert 實際 firing確認 Telegram card 與 AwoooP Run truth-chain 都能呈現同一 lane。
2026-06-18 13:43 Asia/Taipei
Phase: P1/P2/P3 live readback
Before: live cold-start was `PASS=83 WARN=1 BLOCKED=0`, result `DEGRADED`, because retained stale `km-vectorize-29689620` failed Job evidence was still counted as a service warning.