fix(api): route runaway host alerts to ai event packets
Some checks failed
CD Pipeline / tests (push) Successful in 1m44s
Code Review / ai-code-review (push) Successful in 14s
CD Pipeline / build-and-deploy (push) Successful in 7m8s
CD Pipeline / post-deploy-checks (push) Successful in 2m56s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
Some checks failed
CD Pipeline / tests (push) Successful in 1m44s
Code Review / ai-code-review (push) Successful in 14s
CD Pipeline / build-and-deploy (push) Successful in 7m8s
CD Pipeline / post-deploy-checks (push) Successful in 2m56s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
This commit is contained in:
@@ -97,6 +97,9 @@ _HOST_RESOURCE_ALERT_HEADER_RE = re.compile(
|
||||
_HOST_RESOURCE_TARGET_RE = re.compile(
|
||||
r"\b(?:WARN|CRIT|INFO)\s+(?P<target>[A-Za-z0-9_.-]+)\b"
|
||||
)
|
||||
# Matches an `alertname=<value>` label; the opening and closing double quotes
# are each optional, and the value is captured in the `alertname` group.
_HOST_RESOURCE_ALERTNAME_RE = re.compile(r"\balertname\s*=\s*\"?(?P<alertname>[A-Za-z0-9_.:-]+)\"?")
|
||||
# Matches a `host=<value>` label; the opening and closing double quotes are
# each optional, and the value is captured in the `host` group.
_HOST_RESOURCE_HOST_LABEL_RE = re.compile(r"\bhost\s*=\s*\"?(?P<host>[A-Za-z0-9_.:-]+)\"?")
|
||||
# Matches a `rule=<value>` label; the opening and closing double quotes are
# each optional, and the value is captured in the `rule` group.
_HOST_RESOURCE_RULE_LABEL_RE = re.compile(r"\brule\s*=\s*\"?(?P<rule>[A-Za-z0-9_.:-]+)\"?")
|
||||
_HOST_PROCESS_LINE_RE = re.compile(
|
||||
r"^\s*(?P<user>\S+)\s+"
|
||||
r"(?P<pid>\d+)\s+"
|
||||
@@ -112,6 +115,10 @@ def _is_host_resource_alert_text(text: str) -> bool:
|
||||
"CPU 警告" in text
|
||||
or "容器內 root Node.js 進程" in text
|
||||
or ("ps aux" in text and ("next build" in text or "npm run build" in text))
|
||||
or "HostOrphanBrowserSmokeHighCpu" in text
|
||||
or "HostCiRunnerLoadSaturation" in text
|
||||
or "awoooi_host_runaway_browser_orphan" in text
|
||||
or "awoooi_host_gitea_actions_active_container_count" in text
|
||||
)
|
||||
|
||||
|
||||
@@ -165,10 +172,15 @@ def _parse_host_process_lines(text: str) -> list[dict[str, str | float]]:
|
||||
|
||||
|
||||
def _host_resource_alert_impact(
|
||||
text: str,
|
||||
cpu_text: str,
|
||||
load_text: str,
|
||||
processes: list[dict[str, str | float]],
|
||||
) -> str:
|
||||
if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text:
|
||||
return "orphan Chrome / Playwright smoke 疑似吃滿 CPU;先驗 pgid、age、cmdline 與 active CI 分流"
|
||||
if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text:
|
||||
return "CI runner 正在造成主機負載;先確認 Actions run、queue、timeout 與服務 SLO"
|
||||
try:
|
||||
load = float(load_text)
|
||||
except (TypeError, ValueError):
|
||||
@@ -186,7 +198,20 @@ def _host_resource_alert_impact(
|
||||
return "資源升高但尚未確認根因;先聚合觀察並補足 owner 判讀"
|
||||
|
||||
|
||||
def _host_resource_automation_lane(processes: list[dict[str, str | float]]) -> tuple[str, str]:
|
||||
def _host_resource_automation_lane(
|
||||
text: str,
|
||||
processes: list[dict[str, str | float]],
|
||||
) -> tuple[str, str]:
|
||||
if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text:
|
||||
return (
|
||||
"orphan_browser_smoke_runaway_process",
|
||||
"建立 runaway process triage packet;先跑 remediation dry-run,待 owner / window / evidence 後才可 SIGTERM",
|
||||
)
|
||||
if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text:
|
||||
return (
|
||||
"ci_runner_load_saturation",
|
||||
"建立 CI load evidence packet,彙整 Gitea Actions run、runner queue、load/core 與 swap;不 kill process",
|
||||
)
|
||||
commands = " ".join(str(item.get("command", "")).lower() for item in processes)
|
||||
if "build" in commands:
|
||||
return (
|
||||
@@ -204,6 +229,69 @@ def _host_resource_automation_lane(processes: list[dict[str, str | float]]) -> t
|
||||
)
|
||||
|
||||
|
||||
def _host_resource_alert_label(name: str, text: str) -> str:
    """Return the value of the ``name`` label found in ``text``.

    Args:
        name: Label to look up; one of ``"alertname"``, ``"host"`` or ``"rule"``.
        text: Raw alert text to search.

    Returns:
        The captured label value, or ``""`` when the label is not present.

    Raises:
        KeyError: If ``name`` is not one of the supported labels.
    """
    regex_by_label = dict(
        alertname=_HOST_RESOURCE_ALERTNAME_RE,
        host=_HOST_RESOURCE_HOST_LABEL_RE,
        rule=_HOST_RESOURCE_RULE_LABEL_RE,
    )
    found = regex_by_label[name].search(text)
    if found is None:
        return ""
    # Each pattern names its capture group after the label it extracts.
    return found.group(name)
|
||||
|
||||
|
||||
def _host_resource_alert_evidence_lines(
|
||||
text: str,
|
||||
processes: list[dict[str, str | float]],
|
||||
) -> list[str]:
|
||||
if processes:
|
||||
lines: list[str] = []
|
||||
for item in processes[:3]:
|
||||
process_cpu = f"{float(item['cpu']):g}%"
|
||||
lines.append(
|
||||
"├ "
|
||||
f"<code>PID {html.escape(str(item['pid']))}</code> "
|
||||
f"CPU <code>{html.escape(process_cpu)}</code>:"
|
||||
f"<code>{html.escape(str(item['command']))}</code>"
|
||||
)
|
||||
lines[-1] = "└" + lines[-1][1:]
|
||||
return lines
|
||||
|
||||
alertname = _host_resource_alert_label("alertname", text)
|
||||
rule = _host_resource_alert_label("rule", text)
|
||||
if alertname:
|
||||
lines = [
|
||||
f"├ Alert:<code>{html.escape(alertname)}</code>",
|
||||
"├ Metric:<code>awoooi_host_runaway_process_*</code>",
|
||||
]
|
||||
if rule:
|
||||
lines.append(f"└ Rule:<code>{html.escape(rule)}</code>")
|
||||
else:
|
||||
lines[-1] = "└" + lines[-1][1:]
|
||||
return lines
|
||||
|
||||
return ["└ 尚未收到可解析的 top process,請補只讀 evidence。"]
|
||||
|
||||
|
||||
def _host_resource_recommendation_lines(text: str) -> list[str]:
|
||||
if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text:
|
||||
return [
|
||||
"├ 先讀 Prometheus orphan group / CPU / age / cmdline 指標與 textfile timestamp",
|
||||
"├ 執行 `host-runaway-process-remediation.py` dry-run 產生候選,不直接 apply",
|
||||
"└ 若 owner approval、maintenance window、evidence ref 齊全,才可 gated SIGTERM 並回寫 KM / PlayBook / Verifier",
|
||||
]
|
||||
if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text:
|
||||
return [
|
||||
"├ 確認 Gitea Actions run、runner queue、build timeout、load/core 與 swap trend",
|
||||
"├ 若是合法 CI,標記為 capacity / queue 事件,不做 process remediation",
|
||||
"└ 若 CI 卡死,產出 owner packet 與 runner cleanup dry-run,再進維護窗口",
|
||||
]
|
||||
return [
|
||||
"├ 確認是否為 CI/CD / Actions / runner 正常建置窗口",
|
||||
"├ 若持續超過門檻,先查 runner queue、build job、容器資源限制與服務 SLO",
|
||||
"└ 同一 host/service 5 分鐘聚合一次,避免洗版",
|
||||
]
|
||||
|
||||
|
||||
def format_host_resource_alert_card(text: str) -> str:
|
||||
"""把 host CPU/load raw dump 轉成值班者可讀的 Telegram HTML 卡。"""
|
||||
if not _is_host_resource_alert_text(text):
|
||||
@@ -214,29 +302,21 @@ def format_host_resource_alert_card(text: str) -> str:
|
||||
target = (
|
||||
header.group("target")
|
||||
if header
|
||||
else (target_match.group("target") if target_match else "unknown-host")
|
||||
else (
|
||||
target_match.group("target")
|
||||
if target_match
|
||||
else (_host_resource_alert_label("host", text) or "unknown-host")
|
||||
)
|
||||
)
|
||||
cpu = header.group("cpu") if header else "-"
|
||||
load = header.group("load") if header else "-"
|
||||
processes = _parse_host_process_lines(text)
|
||||
impact = _host_resource_alert_impact(cpu, load, processes)
|
||||
automation_lane, automation_next_step = _host_resource_automation_lane(processes)
|
||||
impact = _host_resource_alert_impact(text, cpu, load, processes)
|
||||
automation_lane, automation_next_step = _host_resource_automation_lane(text, processes)
|
||||
load_bar = _resource_load_bar(load)
|
||||
severity = "🔴" if load != "-" and load_bar.count("■") >= 7 else "⚠️"
|
||||
|
||||
evidence_lines: list[str] = []
|
||||
for item in processes[:3]:
|
||||
process_cpu = f"{float(item['cpu']):g}%"
|
||||
evidence_lines.append(
|
||||
"├ "
|
||||
f"<code>PID {html.escape(str(item['pid']))}</code> "
|
||||
f"CPU <code>{html.escape(process_cpu)}</code>:"
|
||||
f"<code>{html.escape(str(item['command']))}</code>"
|
||||
)
|
||||
if evidence_lines:
|
||||
evidence_lines[-1] = "└" + evidence_lines[-1][1:]
|
||||
else:
|
||||
evidence_lines.append("└ 尚未收到可解析的 top process,請補只讀 evidence。")
|
||||
evidence_lines = _host_resource_alert_evidence_lines(text, processes)
|
||||
recommendation_lines = _host_resource_recommendation_lines(text)
|
||||
|
||||
return "\n".join(
|
||||
[
|
||||
@@ -257,9 +337,7 @@ def format_host_resource_alert_card(text: str) -> str:
|
||||
*evidence_lines,
|
||||
"",
|
||||
"<b>建議下一步</b>",
|
||||
"├ 確認是否為 CI/CD / Actions / runner 正常建置窗口",
|
||||
"├ 若持續超過門檻,先查 runner queue、build job、容器資源限制與服務 SLO",
|
||||
"└ 同一 host/service 5 分鐘聚合一次,避免洗版",
|
||||
*recommendation_lines,
|
||||
"",
|
||||
"<b>禁止事項</b>",
|
||||
"└ 不 kill process、不 restart Docker / Gitea、不 reload Nginx、不改 firewall;除非已有維護窗口與 owner 批准。",
|
||||
|
||||
@@ -91,6 +91,49 @@ root 364 181 0.7 3491396 494608 ? Rl 05:56 0:18 /opt/hostedto
|
||||
assert "processChild.js" not in result
|
||||
|
||||
|
||||
def test_orphan_browser_alert_becomes_runaway_process_event_packet() -> None:
    """HostOrphanBrowserSmokeHighCpu must render as the dedicated runaway-process event packet."""
    alert_text = " ".join(
        (
            'alertname="HostOrphanBrowserSmokeHighCpu"',
            'host="110"',
            'rule="stockplatform_headless_smoke"',
            'description="orphan Chrome smoke group detected"',
        )
    )

    card = format_host_resource_alert_card(alert_text)

    expected_fragments = (
        "主機資源告警|110",
        "ai_automation_alert_card_v1",
        "orphan_browser_smoke_runaway_process",
        "HostOrphanBrowserSmokeHighCpu",
        "stockplatform_headless_smoke",
        "host-runaway-process-remediation.py",
        "dry-run",
        "gated SIGTERM",
        "KM / PlayBook / Verifier",
        "runtime_write_gate=0",
        "不 kill process",
        "Docker",
    )
    for fragment in expected_fragments:
        assert fragment in card
|
||||
|
||||
|
||||
def test_ci_runner_load_alert_becomes_capacity_event_packet() -> None:
    """HostCiRunnerLoadSaturation must not be presented as a killable runaway process."""
    alert_text = " ".join(
        (
            'alertname="HostCiRunnerLoadSaturation"',
            'host="110"',
            "awoooi_host_gitea_actions_active_container_count 2",
        )
    )

    card = format_host_resource_alert_card(alert_text)

    expected_fragments = (
        "主機資源告警|110",
        "ci_runner_load_saturation",
        "CI load evidence packet",
        "Gitea Actions run",
        "合法 CI",
        "不做 process remediation",
        "runtime_write_gate=0",
        "不 kill process",
    )
    for fragment in expected_fragments:
        assert fragment in card
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_send_alert_notification_normalizes_host_resource_raw_dump(monkeypatch) -> None:
|
||||
"""send_alert_notification 是最後出口,必須自動套用 AI 自動化事件包。"""
|
||||
|
||||
@@ -59,6 +59,19 @@
|
||||
- Runtime auto-remediation:仍 `0%`,這是安全設計;若未來要由 AI 進入修復,必須先產生 triage packet、dry-run evidence、owner approval、maintenance window、evidence ref、post-check 與 KM 回寫,不得由 exporter 自行 kill。
|
||||
- 目前 110 高 CPU 判讀:orphan headless browser 已歸零;剩餘負載應歸因於 active CI 或其他一般 workload,不能再被誤判為前一輪 stockPlatform orphan Chrome 事故。
|
||||
|
||||
### 2026-06-18 14:38 台北|Host runaway alert -> AI event packet 補強
|
||||
|
||||
**修補**:`TelegramGateway.send_alert_notification()` 的最後出口已能把新 Prometheus alert text 轉成專屬 AI automation card,而不是只靠泛用 CPU raw dump parser。`HostOrphanBrowserSmokeHighCpu` 會進 `orphan_browser_smoke_runaway_process` lane,顯示 alertname / host / rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫;`HostCiRunnerLoadSaturation` 會進 `ci_runner_load_saturation` lane,要求彙整 Gitea Actions run、runner queue、load/core 與 swap trend,並明確標示合法 CI 不做 process remediation。
|
||||
|
||||
**驗證**:
|
||||
- `DATABASE_URL=postgresql+asyncpg://ci:ci@localhost/ci PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/python3.11 -m pytest apps/api/tests/test_telegram_message_templates.py -q -p no:cacheprovider`:`59 passed`。
|
||||
- `PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/python3.11 -m py_compile apps/api/src/services/telegram_gateway.py`:通過。
|
||||
|
||||
**完成度同步**:
|
||||
- Host runaway alert -> AI event packet:`0% -> 100%`。
|
||||
- Monitoring / alert / PlayBook / Telegram event packet / live scrape:`100%`。
|
||||
- Runtime remediation / Telegram 實發 / Bot API call / host write:仍 `0 / false`;本段未發 Telegram、未讀 secret、未 kill process、未重啟服務、未改 firewall/K8s。
|
||||
|
||||
## 2026-06-18|P2-406B Receipt Readback Owner Review 本地完成
|
||||
|
||||
**背景**:P2-004 已把依賴 / 供應鏈漂移收斂成只讀監控讀回;統帥要求每次推進都不能忘記目標與方向,因此本段把日報 / 週報 / 月報、Telegram receipt owner review、P2-004 drift monitor 與 P2-403J 報表真相串成同一個 owner review surface,讓治理頁可以直接看到 AI Agent 分工、互審與仍被關閉的 runtime 邊界。
|
||||
|
||||
@@ -55,6 +55,16 @@ read-only exporter -> Prometheus alert -> AI triage packet -> KM / PlayBook evid
|
||||
| `HostRunawayProcessMonitorMissing` / `Stale` | exporter 缺失或超過 10 分鐘未更新 | 修 exporter / cron / textfile collector |
|
||||
| `HostRunawayProcessRemediationUnexpectedlyAuthorized` | `remediation_authorized > 0` | 立即回滾;禁止把監控器改成執行器 |
|
||||
|
||||
Telegram / AI event packet contract:
|
||||
|
||||
| Alert / input | Telegram lane | 必須顯示 |
|
||||
|---------------|---------------|----------|
|
||||
| `HostOrphanBrowserSmokeHighCpu` | `orphan_browser_smoke_runaway_process` | alertname、host、rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫 |
|
||||
| `HostCiRunnerLoadSaturation` | `ci_runner_load_saturation` | Gitea Actions run、runner queue、load/core、swap trend、capacity / queue 判定、不做 process remediation |
|
||||
| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure` 或 `host_resource_pressure_triage` | sanitized top process evidence,不顯示 raw workspace path 或完整 process dump |
|
||||
|
||||
所有 Telegram 卡片都必須保留 `runtime_write_gate=0`,並不得把 alert/card 轉成直接 kill / restart / reload 指令。
|
||||
|
||||
---
|
||||
|
||||
## 3. AI Triager 必做判讀
|
||||
|
||||
@@ -5088,3 +5088,16 @@ Trigger commit `f5cd37b7` 與 deploy marker `0ba92357` 已把 governance UI 的
|
||||
- 新增 pytest,鎖住 orphan 分類、Linux / BSD `ps` 解析、合法 / 年輕 process 忽略、CI/swap 指標、dry-run 與 apply gate 拒絕行為;readiness audit 以 pyenv Python 重跑後 `BLOCKED=0`。
|
||||
|
||||
**裁決:** 這是 host CPU runaway 的 observe -> classify -> alert -> PlayBook -> KM contract -> gated remediation 閉環,不是 runtime 自動 kill 授權。AI 可以自動診斷、告警、產生 dry-run 修復包與 KM/PlayBook 回寫要求;真正 process termination 仍需 owner approval、maintenance window、evidence ref 與 post-check。Docker restart、systemd restart、Nginx reload、firewall change、secret read、host write 與 production write 仍全部禁止。
|
||||
|
||||
### 2026-06-18 14:38 (台北) — §8 / Host CPU AIOps — Host runaway alert 轉 AI event packet
|
||||
|
||||
**觸發**:前段已把 110 runaway process 監控、告警、PlayBook 與 live scrape 補齊,但 alert 進 Telegram 最後出口時仍需確認 `HostOrphanBrowserSmokeHighCpu` 與 `HostCiRunnerLoadSaturation` 不會被壓成泛用 CPU 文本。
|
||||
|
||||
**已推進:**
|
||||
- `TelegramGateway.send_alert_notification()` 仍是最後出口;`format_host_resource_alert_card()` 現在可解析 `alertname`、`host`、`rule` label。
|
||||
- `HostOrphanBrowserSmokeHighCpu` 會轉成 `orphan_browser_smoke_runaway_process` lane,顯示 runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫。
|
||||
- `HostCiRunnerLoadSaturation` 會轉成 `ci_runner_load_saturation` lane,要求彙整 Gitea Actions run、runner queue、load/core 與 swap,並標示合法 CI 不做 process remediation。
|
||||
- `docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md` 已補 Telegram / AI event packet contract。
|
||||
- 精準測試 `apps/api/tests/test_telegram_message_templates.py` 已新增兩條 regression,`59 passed`;`telegram_gateway.py` py_compile 通過。
|
||||
|
||||
**裁決:** 這是 alert -> AI event packet 的只讀與訊息模板閉環,不是 Telegram 實發、Bot API call、Gateway queue write、host write 或 process kill 授權。所有卡片仍固定 `runtime_write_gate=0`,真正修復仍必須走 dry-run、owner approval、maintenance window、evidence ref、post-check 與 KM / PlayBook 回寫。
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
| P0 host / K3s recovery | DONE | 100% | 120 booted after console fsck at `2026-06-12 15:13`; latest 2026-06-14 18:15 readback shows 120 is reachable, K3s is active, `mon` and `mon1` are both `Ready control-plane`, and cold-start P0/P1 checks are green. |
|
||||
| P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 92% | 2026-06-15 03:11 `backup-status` shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `escrow_missing=5`, last aggregate `2026-06-15 02:40:13`. Offsite / escrow report shows `SCRIPT_MISSING_COUNT=0`, `OFFSITE_CONFIGURED=1`, `RCLONE_CONFIGURED=1`, `ESCROW_MISSING_COUNT=5`. Owner request package is ready; actual marker write remains blocked on real non-secret evidence IDs. |
|
||||
| P2 service / data truth | VERIFIED_FULL_STACK_GREEN_FOR_SERVICE | 100% | 2026-06-18 13:43 cold-start verifies public route/TLS, API/Web route, momo health and current-month parity `10936|10936|2026-06-01|2026-06-17|2026-06-01|2026-06-17`, backup exporters, schedules, K3s node readiness, VIP, and 110 / 188 runtime health. K8s active failed Job count is `0`, bad pods are `0`, and cold-start returns `PASS=84 WARN=0 BLOCKED=0`. |
|
||||
| P3 docs / automation contracts | DONE_WITH_RUNAWAY_PROCESS_AIOPS_LIVE_SCRAPED | 100% | Workplan, SOP v1.26, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, and 2026-06-18 live readback are updated. 14:31-14:32 Prometheus scrape confirms 110 `monitor_up=1`, orphan browser group count `0`, active CI containers `2`, load5/core around `0.79-0.81`, swap ratio around `1.0`, `remediation_authorized=0`, and missing/orphan alerts not firing. Repo-side readiness audit also checks runaway process exporter / remediation helper / alert group; live cold-start remains `PASS=84 WARN=0 BLOCKED=0` from the latest service readiness readback. |
|
||||
| P3 docs / automation contracts | DONE_WITH_RUNAWAY_PROCESS_AIOPS_LIVE_SCRAPED | 100% | Workplan, SOP v1.26, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, and 2026-06-18 live readback are updated. 14:31-14:32 Prometheus scrape confirms 110 `monitor_up=1`, orphan browser group count `0`, active CI containers `2`, load5/core around `0.79-0.81`, swap ratio around `1.0`, `remediation_authorized=0`, and missing/orphan alerts not firing. Repo-side readiness audit also checks runaway process exporter / remediation helper / alert group; live cold-start remains `PASS=84 WARN=0 BLOCKED=0` from the latest service readiness readback. |
|
||||
|
||||
Full cold-start service readiness may be declared green for the latest verified evidence set. As of 2026-06-18 13:43, services are green with `WARN=0` and `BLOCKED=0`; the retained stale `km-vectorize` failed Job remains historical evidence only. Do not declare DR scorecard complete while credential escrow evidence remains blocked.
|
||||
|
||||
@@ -214,7 +214,7 @@ Do not run `truncate`, whole DB restore, force-push, DROP, or online root filesy
|
||||
## 9. Progress Updates
|
||||
|
||||
```text
|
||||
2026-06-18 15:10 Asia/Taipei
|
||||
2026-06-18 14:20 Asia/Taipei
|
||||
Phase: P3 AI Ops runaway process automation
|
||||
Before: 110 CPU 滿載只能靠人工 `ps/top` 判斷;泛用 `HostHighCpuLoad` 無法分辨跨專案 orphan Chrome smoke 與合法 Gitea Actions CI load。
|
||||
After: 新增 read-only `host-runaway-process-exporter.py`、gated `host-runaway-process-remediation.py`、Prometheus `host_runaway_process_alerts`、Ansible textfile exporter source-of-truth、SOP v1.26 與 `HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md`。Exporter 暴露 orphan browser、active CI、load/core、swap ratio 與 `remediation_authorized=0`;修復器預設 dry-run,`SIGTERM` 必須帶 owner approval、maintenance window、evidence ref。
|
||||
@@ -228,6 +228,14 @@ Blocked: No for live observability; yes for runtime remediation by design until
|
||||
Next: Keep cron scrape under normal monitoring; if orphan count becomes >0, create AI triage packet and remediation dry-run before any gated `SIGTERM`.
|
||||
Completion: monitoring / alert / PlayBook / KM contract 100%; runtime auto-remediation remains gated at 0 until a real owner-approved apply is executed.
|
||||
|
||||
2026-06-18 14:38 Asia/Taipei
|
||||
Phase: P3 AI Ops alert-to-event packet
|
||||
Before: 泛用 CPU raw dump 可被轉成 AI automation card,但 `HostOrphanBrowserSmokeHighCpu` / `HostCiRunnerLoadSaturation` alert text 尚未有專屬 lane。
|
||||
After: Telegram 最後出口可將 `HostOrphanBrowserSmokeHighCpu` 轉成 `orphan_browser_smoke_runaway_process`,將 `HostCiRunnerLoadSaturation` 轉成 `ci_runner_load_saturation`;兩者都保留 `runtime_write_gate=0`,並要求 dry-run / owner / maintenance / evidence / KM / PlayBook / Verifier。
|
||||
Evidence: `apps/api/src/services/telegram_gateway.py`、`apps/api/tests/test_telegram_message_templates.py`,精準 pytest `59 passed`。
|
||||
Blocked: No for alert-to-event packet; yes for Telegram live send / runtime remediation by design.
|
||||
Next: 等 code-review / CD 後做 production readback;若未來 alert 實際 firing,確認 Telegram card 與 AwoooP Run truth-chain 都能呈現同一 lane。
|
||||
|
||||
2026-06-18 13:43 Asia/Taipei
|
||||
Phase: P1/P2/P3 live readback
|
||||
Before: live cold-start was `PASS=83 WARN=1 BLOCKED=0`, result `DEGRADED`, because retained stale `km-vectorize-29689620` failed Job evidence was still counted as a service warning.
|
||||
|
||||
Reference in New Issue
Block a user