diff --git a/apps/api/migrations/adr090_asset_inventory_foundation.sql b/apps/api/migrations/adr090_asset_inventory_foundation.sql new file mode 100644 index 00000000..00d01615 --- /dev/null +++ b/apps/api/migrations/adr090_asset_inventory_foundation.sql @@ -0,0 +1,607 @@ +-- ADR-090: 監控盲區治理 + 資產盤點 × 7 項自動化覆蓋矩陣永久化 DB +-- 建立時間: 2026-04-18 下午 (台北時區) +-- 建立者: ogt + Claude Opus 4.7 (1M context)(亞太) +-- +-- 上游: +-- - 主戰略: docs/superpowers/specs/2026-04-18-blindspot-governance-capacity-l4.md §5.2 +-- - ADR: docs/adr/ADR-090-monitoring-blindspot-governance.md +-- - MEMORY: project_blindspot_governance.md +-- +-- 設計說明: +-- 本檔建立 11 張表作為 AWOOOI L4 AIOps 的資產盤點 + 自動化覆蓋 + AI 協作稽核地基。 +-- 目標: 把治理從 Markdown 搬進 PostgreSQL,讓 AI 四分工 (OpenClaw × NemoTron × +-- Hermes × Claude LLM) 在結構化資料上做決策,且每次動作必留 trail。 +-- +-- 對應七大自動化引擎: +-- E1 自動監控 / E2 自動告警 / E3 自動建規則 / E4 自動匹配 +-- E5 自動 Playbook / E6 自動修復 / E7 自動 KM +-- +-- 執行順序: +-- Step 0: pgcrypto extension (gen_random_uuid 需要) +-- Step 1: asset_inventory — 全景資產主表 +-- Step 2: asset_discovery_run — 每次盤點 header +-- Step 3: asset_coverage_snapshot — 資產 × 7 自動化覆蓋矩陣 +-- Step 4: asset_relationship — 資產依賴圖 (爆炸半徑) +-- Step 5: alert_rule_catalog — 告警規則本身即資產 +-- Step 6: asset_change_event — 資產變化追蹤 +-- Step 7: asset_compliance_snapshot — SSL/CVE/secret/backup 合規 +-- Step 8: host_capacity_snapshot — 主機容量快照 (NemoTron 每日 02:00 寫) +-- Step 9: capacity_violation_event — 配額違規 +-- Step 10: automation_operation_log — 所有 AI 自動化動作稽核主表 🔴 +-- Step 11: ai_collaboration_trace — 多 Agent 協作逐步 (辯證歷程) +-- Step 12: 驗收查詢 (comment-only) +-- +-- Idempotent 鐵律: +-- - CREATE TABLE IF NOT EXISTS +-- - CREATE INDEX IF NOT EXISTS +-- - CHECK constraint 寫在 CREATE TABLE 內,依賴 IF NOT EXISTS 保護 +-- - 本檔可重複執行安全 (rerun 不會破壞既有資料) +-- +-- 回滾: +-- DROP TABLE IF EXISTS ai_collaboration_trace, automation_operation_log, +-- capacity_violation_event, host_capacity_snapshot, asset_compliance_snapshot, +-- asset_change_event, alert_rule_catalog, asset_relationship, +-- asset_coverage_snapshot, 
asset_discovery_run, asset_inventory CASCADE; +-- +-- ============================================================================ +-- Step 0: pgcrypto extension (gen_random_uuid) +-- ============================================================================ + +CREATE EXTENSION IF NOT EXISTS pgcrypto; + + +-- ============================================================================ +-- Step 1: asset_inventory — 全景資產主表 +-- 用途: 主機 / 容器 / K8s workload / DB / 網站 / API / 套件 / 日誌 / KM / 前端 / +-- 後端 / 容器 / Gitea / CI-CD 全部無例外 +-- 主寫者: scanner (asset_discovery) + NemoTron (capacity 欄位) +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS asset_inventory ( + asset_id BIGSERIAL PRIMARY KEY, + asset_key TEXT NOT NULL UNIQUE, + asset_type TEXT NOT NULL, + parent_asset_id BIGINT REFERENCES asset_inventory(asset_id), + environment TEXT NOT NULL DEFAULT 'prod', + host TEXT, + namespace TEXT, + name TEXT NOT NULL, + metadata JSONB NOT NULL DEFAULT '{}'::jsonb, + tags TEXT[] NOT NULL DEFAULT '{}', + owner_team TEXT, + criticality TEXT, + data_classification TEXT, + external BOOLEAN NOT NULL DEFAULT false, + lifecycle_state TEXT NOT NULL DEFAULT 'active', + source_repo TEXT, + source_commit_sha TEXT, + + -- 容量欄位 (Layer 4 AI 巡檢用) + cpu_avg_7d NUMERIC(5,2), + mem_avg_7d NUMERIC(5,2), + capacity_headroom NUMERIC(5,2), + resource_limits JSONB, + resource_requests JSONB, + quota_violation_count INT NOT NULL DEFAULT 0, + sla_target JSONB, + cost_monthly_usd NUMERIC(10,2), + + -- 生命週期時間戳 + first_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + last_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + decommissioned_at TIMESTAMPTZ, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + + CONSTRAINT asset_inventory_criticality_valid + CHECK (criticality IS NULL OR criticality IN ('P0','P1','P2','P3')), + CONSTRAINT asset_inventory_data_class_valid + CHECK (data_classification IS NULL OR 
data_classification IN + ('public','internal','sensitive','secret')), + CONSTRAINT asset_inventory_lifecycle_valid + CHECK (lifecycle_state IN + ('planned','provisioning','active','degraded','deprecated','decommissioned')), + CONSTRAINT asset_inventory_type_valid + CHECK (asset_type IN ( + 'host','container','k8s_workload','k8s_resource','database','table', + 'website','api_endpoint','package','log_stream','km_entry', + 'frontend','backend','ci_pipeline','gitea_repo','monitoring_target', + 'secret','volume','network','certificate','scheduled_job', + 'message_queue','cache','dashboard','ai_agent','llm_model', + 'third_party_service','backup_target' + )) +); + +COMMENT ON TABLE asset_inventory IS + 'ADR-090: 全景資產主表。每一個主機/容器/K8s workload/DB/網站/API/套件/...都有一筆,跨 run 沿用同 asset_id。'; + +CREATE INDEX IF NOT EXISTS idx_asset_inventory_type_host + ON asset_inventory(asset_type, host); +CREATE INDEX IF NOT EXISTS idx_asset_inventory_env_lifecycle + ON asset_inventory(environment, lifecycle_state); +CREATE INDEX IF NOT EXISTS idx_asset_inventory_metadata_gin + ON asset_inventory USING GIN (metadata); +CREATE INDEX IF NOT EXISTS idx_asset_inventory_tags_gin + ON asset_inventory USING GIN (tags); +CREATE INDEX IF NOT EXISTS idx_asset_inventory_active_last_seen + ON asset_inventory(last_seen_at DESC) + WHERE lifecycle_state = 'active'; +-- 註: partial index 只索引 active 資產,按最近出現時間排序 + + +-- ============================================================================ +-- Step 2: asset_discovery_run — 每次盤點 header +-- 用途: 記錄每次全景掃描的起止時間、掃描範圍、掃到什麼、新增/消失多少 +-- 觸發: cron (每日) / ai (proactive_inspector) / human (手動) / incident +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS asset_discovery_run ( + run_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + triggered_by TEXT NOT NULL, + scope TEXT[] NOT NULL, + scan_depth TEXT NOT NULL DEFAULT 'shallow', + host_filter TEXT[], + started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + ended_at 
TIMESTAMPTZ, + status TEXT NOT NULL, + total_assets INT, + new_assets INT NOT NULL DEFAULT 0, + modified_assets INT NOT NULL DEFAULT 0, + disappeared_assets INT NOT NULL DEFAULT 0, + tools_used JSONB, + duration_ms INT, + error TEXT, + summary JSONB, + + CONSTRAINT asset_discovery_run_status_valid + CHECK (status IN ('running','success','partial','failed','aborted')), + CONSTRAINT asset_discovery_run_scan_depth_valid + CHECK (scan_depth IN ('shallow','deep','full')) +); + +COMMENT ON TABLE asset_discovery_run IS + 'ADR-090: 每次資產盤點的 header。run_id 作為下游 snapshot/event/change 的關聯主鍵。'; + +CREATE INDEX IF NOT EXISTS idx_asset_discovery_run_started + ON asset_discovery_run(started_at DESC); +CREATE INDEX IF NOT EXISTS idx_asset_discovery_run_status + ON asset_discovery_run(status) WHERE status IN ('running','failed','partial'); + + +-- ============================================================================ +-- Step 3: asset_coverage_snapshot — 資產 × 7 項自動化 覆蓋矩陣 +-- 用途: 每個資產在 7 個自動化維度上的覆蓋狀態 (green/yellow/red) +-- 鐵律: 每次 discovery_run 為每個 asset 寫 7 筆 (7 dimensions) +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS asset_coverage_snapshot ( + snapshot_id BIGSERIAL PRIMARY KEY, + run_id UUID NOT NULL REFERENCES asset_discovery_run(run_id) ON DELETE CASCADE, + asset_id BIGINT NOT NULL REFERENCES asset_inventory(asset_id), + dimension TEXT NOT NULL, + coverage_status TEXT NOT NULL, + evidence JSONB NOT NULL DEFAULT '{}'::jsonb, + gap_reason TEXT, + recommended_action TEXT, + confidence NUMERIC(3,2), + detected_by TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + + CONSTRAINT asset_coverage_snapshot_dimension_valid + CHECK (dimension IN ( + 'auto_monitoring','auto_alerting','auto_rule_creation', + 'auto_rule_matching','auto_playbook','auto_remediation','auto_km_creation' + )), + CONSTRAINT asset_coverage_snapshot_status_valid + CHECK (coverage_status IN ('green','yellow','red','unknown')), + 
CONSTRAINT asset_coverage_snapshot_unique + UNIQUE (run_id, asset_id, dimension) +); + +COMMENT ON TABLE asset_coverage_snapshot IS + 'ADR-090: 計分卡。查 red COUNT 即覆蓋率 SLO。evidence 欄位串 playbook_id/km_entry_id/rule_name。'; + +CREATE INDEX IF NOT EXISTS idx_asset_coverage_snapshot_asset_dim + ON asset_coverage_snapshot(asset_id, dimension); +CREATE INDEX IF NOT EXISTS idx_asset_coverage_snapshot_red_yellow + ON asset_coverage_snapshot(coverage_status) + WHERE coverage_status IN ('red','yellow'); +CREATE INDEX IF NOT EXISTS idx_asset_coverage_snapshot_run + ON asset_coverage_snapshot(run_id); + + +-- ============================================================================ +-- Step 4: asset_relationship — 資產依賴圖 (爆炸半徑必需) +-- 用途: 記錄資產之間的 depends_on / calls / stores_data_in / backs_up_to 關係 +-- AI 用途: OpenClaw 計算 blast_radius 時查這張表 +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS asset_relationship ( + relationship_id BIGSERIAL PRIMARY KEY, + from_asset_id BIGINT NOT NULL REFERENCES asset_inventory(asset_id), + to_asset_id BIGINT NOT NULL REFERENCES asset_inventory(asset_id), + relationship_type TEXT NOT NULL, + strength NUMERIC(3,2), + metadata JSONB, + first_detected_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + last_verified_at TIMESTAMPTZ, + is_active BOOLEAN NOT NULL DEFAULT true, + + CONSTRAINT asset_relationship_type_valid + CHECK (relationship_type IN ( + 'depends_on','calls','stores_data_in','backs_up_to', + 'routes_to','authenticates_via','monitors','alerts_to','logs_to' + )), + CONSTRAINT asset_relationship_strength_valid + CHECK (strength IS NULL OR (strength >= 0 AND strength <= 1)), + CONSTRAINT asset_relationship_unique + UNIQUE (from_asset_id, to_asset_id, relationship_type), + CONSTRAINT asset_relationship_no_self_loop + CHECK (from_asset_id <> to_asset_id) +); + +COMMENT ON TABLE asset_relationship IS + 'ADR-090: 資產依賴圖。AI 計算爆炸半徑必讀。edge 而非 tree,支援多重關係。'; + +CREATE INDEX IF NOT EXISTS 
idx_asset_relationship_from + ON asset_relationship(from_asset_id) WHERE is_active; +CREATE INDEX IF NOT EXISTS idx_asset_relationship_to + ON asset_relationship(to_asset_id) WHERE is_active; +CREATE INDEX IF NOT EXISTS idx_asset_relationship_type + ON asset_relationship(relationship_type); + + +-- ============================================================================ +-- Step 5: alert_rule_catalog — 告警規則本身即資產 +-- 用途: 把 alert_rules.yaml 升級為 DB-driven;記錄誰創的 / 何時 / 效能 / 生死 +-- AI 用途: Hermes 做 noise_rate 分析 / 提建議 retire 低品質規則 +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS alert_rule_catalog ( + rule_id BIGSERIAL PRIMARY KEY, + rule_name TEXT NOT NULL UNIQUE, + source TEXT NOT NULL, + expr TEXT NOT NULL, + duration_seconds INT, + severity TEXT, + labels JSONB, + annotations JSONB, + linked_asset_ids BIGINT[], + created_by_agent TEXT, + + -- 規則品質追蹤 + true_positive_count INT NOT NULL DEFAULT 0, + false_positive_count INT NOT NULL DEFAULT 0, + noise_rate NUMERIC(5,2), + last_fired_at TIMESTAMPTZ, + + -- 信心與演化 + confidence NUMERIC(3,2), + review_status TEXT, + superseded_by_rule_id BIGINT REFERENCES alert_rule_catalog(rule_id), + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + + CONSTRAINT alert_rule_catalog_source_valid + CHECK (source IN ('yaml_hardcoded','ai_generated','human_written','playbook_derived')), + CONSTRAINT alert_rule_catalog_review_valid + CHECK (review_status IS NULL OR review_status IN + ('draft','approved','deprecated','retired')) +); + +COMMENT ON TABLE alert_rule_catalog IS + 'ADR-090: 告警規則即一等資產。支援規則演化 (ai_generated) 與替代鏈 (superseded_by)。'; + +CREATE INDEX IF NOT EXISTS idx_alert_rule_catalog_source + ON alert_rule_catalog(source); +CREATE INDEX IF NOT EXISTS idx_alert_rule_catalog_assets_gin + ON alert_rule_catalog USING GIN (linked_asset_ids); +CREATE INDEX IF NOT EXISTS idx_alert_rule_catalog_review + ON 
alert_rule_catalog(review_status) WHERE review_status IS NOT NULL; + + +-- ============================================================================ +-- Step 6: asset_change_event — 資產變化追蹤 (diff between runs) +-- 用途: 兩次 discovery_run 之間的 delta。新增/消失/修改/覆蓋率變化 +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS asset_change_event ( + event_id BIGSERIAL PRIMARY KEY, + run_id UUID NOT NULL REFERENCES asset_discovery_run(run_id), + asset_id BIGINT REFERENCES asset_inventory(asset_id), + change_type TEXT NOT NULL, + before_state JSONB, + after_state JSONB, + diff JSONB, + detected_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + ai_analysis TEXT, + + CONSTRAINT asset_change_event_type_valid + CHECK (change_type IN ( + 'asset_added','asset_removed','asset_modified', + 'coverage_improved','coverage_degraded', + 'criticality_changed','owner_changed','lifecycle_changed' + )) +); + +COMMENT ON TABLE asset_change_event IS + 'ADR-090: 資產變化追蹤。兩次掃描的 diff 明確落地,LLM 可加 ai_analysis 解讀。'; + +CREATE INDEX IF NOT EXISTS idx_asset_change_event_run + ON asset_change_event(run_id); +CREATE INDEX IF NOT EXISTS idx_asset_change_event_asset_time + ON asset_change_event(asset_id, detected_at DESC); + + +-- ============================================================================ +-- Step 7: asset_compliance_snapshot — 合規狀態 (SSL/CVE/secret/backup) +-- 用途: 與 coverage 不同軸的合規追蹤。SSL cert 到期 / CVE 掃描 / secret 輪替 +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS asset_compliance_snapshot ( + snapshot_id BIGSERIAL PRIMARY KEY, + run_id UUID REFERENCES asset_discovery_run(run_id), + asset_id BIGINT NOT NULL REFERENCES asset_inventory(asset_id), + dimension TEXT NOT NULL, + status TEXT NOT NULL, + expires_at TIMESTAMPTZ, + detail JSONB, + remediation_deadline TIMESTAMPTZ, + detected_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + + CONSTRAINT asset_compliance_snapshot_dimension_valid + CHECK 
(dimension IN ( + 'ssl_cert_valid','cve_scan','secret_rotated','backup_tested', + 'audit_log_enabled','access_reviewed','encryption_at_rest' + )), + CONSTRAINT asset_compliance_snapshot_status_valid + CHECK (status IN ('compliant','warning','violation','unknown')) +); + +COMMENT ON TABLE asset_compliance_snapshot IS + 'ADR-090: 合規狀態快照。與 coverage 不同軸,SSL/CVE/secret/backup 專用。'; + +CREATE INDEX IF NOT EXISTS idx_asset_compliance_snapshot_asset_dim + ON asset_compliance_snapshot(asset_id, dimension); +CREATE INDEX IF NOT EXISTS idx_asset_compliance_snapshot_expiring + ON asset_compliance_snapshot(expires_at) + WHERE expires_at IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_asset_compliance_snapshot_violations + ON asset_compliance_snapshot(status) + WHERE status IN ('warning','violation'); + + +-- ============================================================================ +-- Step 8: host_capacity_snapshot — 主機容量快照 +-- 用途: NemoTron 每日 02:00 台北 自主容量巡檢寫入 +-- Layer 4 核心表。hermes 做預測,openclaw 產建議,全寫這張 +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS host_capacity_snapshot ( + snapshot_id BIGSERIAL PRIMARY KEY, + host TEXT NOT NULL, + captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + load1 NUMERIC(6,2), + load5 NUMERIC(6,2), + load15 NUMERIC(6,2), + cpu_used_pct NUMERIC(5,2), + cpu_iowait_pct NUMERIC(5,2), + mem_used_pct NUMERIC(5,2), + swap_used_pct NUMERIC(5,2), + disk_used_pct JSONB, + container_count INT, + k8s_pod_count INT, + top_cpu_offenders JSONB, + top_mem_offenders JSONB, + headroom_pct NUMERIC(5,2), + ai_verdict TEXT, + ai_reasoning TEXT, + recommended_actions JSONB, + written_by_agent TEXT NOT NULL, + + CONSTRAINT host_capacity_snapshot_verdict_valid + CHECK (ai_verdict IS NULL OR ai_verdict IN ('safe','warning','critical','unknown')) +); + +COMMENT ON TABLE host_capacity_snapshot IS + 'ADR-090: NemoTron 每日主機容量巡檢結果。Layer 4 AI 自主治理核心表。'; + +CREATE INDEX IF NOT EXISTS 
idx_host_capacity_snapshot_host_time + ON host_capacity_snapshot(host, captured_at DESC); +CREATE INDEX IF NOT EXISTS idx_host_capacity_snapshot_critical + ON host_capacity_snapshot(ai_verdict) + WHERE ai_verdict IN ('warning','critical'); + + +-- ============================================================================ +-- Step 9: capacity_violation_event — 配額違規事件 +-- 用途: 記錄任何「缺 limit」「超 request」「主機飽和」的違規 +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS capacity_violation_event ( + event_id BIGSERIAL PRIMARY KEY, + asset_id BIGINT REFERENCES asset_inventory(asset_id), + host TEXT, + violation_type TEXT NOT NULL, + threshold NUMERIC(10,2), + actual_value NUMERIC(10,2), + detected_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + auto_action TEXT, + auto_action_op_id UUID, + human_override TEXT, + resolved_at TIMESTAMPTZ, + + CONSTRAINT capacity_violation_event_type_valid + CHECK (violation_type IN ( + 'no_limit_set','over_request','over_limit','host_saturation', + 'over_sla_budget','unauthorized_new_deploy' + )) +); + +COMMENT ON TABLE capacity_violation_event IS + 'ADR-090: 配額違規稽核。每次 AI 偵測到資產無 limit/主機飽和/未授權部署 都寫一筆。'; + +CREATE INDEX IF NOT EXISTS idx_capacity_violation_event_asset_time + ON capacity_violation_event(asset_id, detected_at DESC); +CREATE INDEX IF NOT EXISTS idx_capacity_violation_event_unresolved + ON capacity_violation_event(detected_at DESC) + WHERE resolved_at IS NULL; + + +-- ============================================================================ +-- Step 10: automation_operation_log — 所有 AI 自動化動作稽核主表 🔴 +-- 鐵律: 每一個 AI 自動化動作都必須寫一筆。缺筆 = 治理失效 +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS automation_operation_log ( + op_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + operation_type TEXT NOT NULL, + asset_id BIGINT REFERENCES asset_inventory(asset_id), + incident_id BIGINT, + run_id UUID REFERENCES 
asset_discovery_run(run_id), + actor TEXT NOT NULL, + input JSONB NOT NULL DEFAULT '{}'::jsonb, + output JSONB NOT NULL DEFAULT '{}'::jsonb, + dry_run_result JSONB, + status TEXT NOT NULL, + error TEXT, + duration_ms INT, + tokens_in INT, + tokens_out INT, + cost_usd NUMERIC(10,6), + budget_bucket TEXT, + parent_op_id UUID REFERENCES automation_operation_log(op_id), + retry_count INT NOT NULL DEFAULT 0, + retry_of_op_id UUID REFERENCES automation_operation_log(op_id), + stderr_feed_back TEXT, + tags TEXT[], + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + + CONSTRAINT automation_operation_log_type_valid + CHECK (operation_type IN ( + 'monitor_configured','monitor_removed', + 'alert_fired','alert_suppressed','alert_routed', + 'rule_created','rule_updated','rule_matched','rule_rejected','rule_deprecated', + 'playbook_generated','playbook_updated','playbook_executed', + 'remediation_executed','remediation_verified','remediation_rolled_back', + 'self_correction_attempted', + 'km_created','km_updated','km_linked', + 'asset_discovered','coverage_recalculated', + 'capacity_recommendation','quota_enforced' + )), + CONSTRAINT automation_operation_log_status_valid + CHECK (status IN ('pending','success','failed','dry_run','rolled_back')) +); + +COMMENT ON TABLE automation_operation_log IS + 'ADR-090: 所有 AI 自動化動作稽核主表。retry_of_op_id + stderr_feed_back 支援引擎 4 閉環。'; + +CREATE INDEX IF NOT EXISTS idx_automation_operation_log_type_time + ON automation_operation_log(operation_type, created_at DESC); +CREATE INDEX IF NOT EXISTS idx_automation_operation_log_asset_time + ON automation_operation_log(asset_id, created_at DESC); +CREATE INDEX IF NOT EXISTS idx_automation_operation_log_incident + ON automation_operation_log(incident_id) + WHERE incident_id IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_automation_operation_log_actor_time + ON automation_operation_log(actor, created_at DESC); +CREATE INDEX IF NOT EXISTS idx_automation_operation_log_retry + ON 
automation_operation_log(retry_of_op_id) + WHERE retry_of_op_id IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_automation_operation_log_tags_gin + ON automation_operation_log USING GIN (tags); + + +-- ============================================================================ +-- Step 11: ai_collaboration_trace — 多 Agent 協作逐步 (LLM × OpenClaw × NemoTron × Hermes) +-- 用途: 每個 automation_operation_log 背後的 N 步 AI 決策過程 +-- 最寶貴的語料: challenged_by + accepted 支援 RLHF fine-tune +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS ai_collaboration_trace ( + trace_id BIGSERIAL PRIMARY KEY, + op_id UUID NOT NULL REFERENCES automation_operation_log(op_id) ON DELETE CASCADE, + step_order INT NOT NULL, + agent TEXT NOT NULL, + model TEXT, + system_prompt_version TEXT, + prompt TEXT, + response JSONB, + confidence NUMERIC(3,2), + challenged_by TEXT[], + accepted BOOLEAN, + tokens_in INT, + tokens_out INT, + duration_ms INT, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + + CONSTRAINT ai_collaboration_trace_unique_step + UNIQUE (op_id, step_order) +); + +COMMENT ON TABLE ai_collaboration_trace IS + 'ADR-090: AI 多 Agent 協作逐步紀錄。challenged_by + accepted = RLHF 訓練語料金礦。'; + +CREATE INDEX IF NOT EXISTS idx_ai_collaboration_trace_op + ON ai_collaboration_trace(op_id, step_order); +CREATE INDEX IF NOT EXISTS idx_ai_collaboration_trace_agent_time + ON ai_collaboration_trace(agent, created_at DESC); + + +-- ============================================================================ +-- Step 12: 驗收查詢 (執行後手動跑,驗證 11 張表都到位) +-- ============================================================================ + +-- SELECT table_name +-- FROM information_schema.tables +-- WHERE table_schema = 'public' +-- AND table_name IN ( +-- 'asset_inventory', +-- 'asset_discovery_run', +-- 'asset_coverage_snapshot', +-- 'asset_relationship', +-- 'alert_rule_catalog', +-- 'asset_change_event', +-- 'asset_compliance_snapshot', +-- 'host_capacity_snapshot', 
+-- 'capacity_violation_event', +-- 'automation_operation_log', +-- 'ai_collaboration_trace' +-- ) +-- ORDER BY table_name; +-- -- 預期: 11 筆 + +-- SELECT table_name, COUNT(*) AS column_count +-- FROM information_schema.columns +-- WHERE table_schema = 'public' +-- AND (table_name LIKE 'asset_%' OR table_name IN +-- ('alert_rule_catalog','host_capacity_snapshot','capacity_violation_event', +-- 'automation_operation_log','ai_collaboration_trace')) +-- GROUP BY table_name +-- ORDER BY table_name; + +-- SELECT conname, conrelid::regclass AS table_name +-- FROM pg_constraint +-- WHERE conrelid IN ( +-- 'asset_inventory'::regclass, +-- 'asset_discovery_run'::regclass, +-- 'asset_coverage_snapshot'::regclass, +-- 'asset_relationship'::regclass, +-- 'alert_rule_catalog'::regclass, +-- 'asset_change_event'::regclass, +-- 'asset_compliance_snapshot'::regclass, +-- 'host_capacity_snapshot'::regclass, +-- 'capacity_violation_event'::regclass, +-- 'automation_operation_log'::regclass, +-- 'ai_collaboration_trace'::regclass +-- ) AND contype = 'c' -- CHECK constraints only +-- ORDER BY table_name, conname; + +-- ============================================================================ +-- END OF MIGRATION adr090_asset_inventory_foundation.sql +-- 預計新增物件: 11 tables + 33 indexes + 20 CHECK constraints + 5 UNIQUE (3 具名表級 + 2 欄位級) + 16 FK references +-- 依賴: pgcrypto extension (for gen_random_uuid) +-- 影響資料: 無 (純 DDL, 不動現有表) +-- 回滾: 見檔案頭部 +-- ============================================================================ diff --git a/docs/adr/ADR-090-monitoring-blindspot-governance.md b/docs/adr/ADR-090-monitoring-blindspot-governance.md new file mode 100644 index 00000000..a7b08091 --- /dev/null +++ b/docs/adr/ADR-090-monitoring-blindspot-governance.md @@ -0,0 +1,104 @@ +# ADR-090: 監控盲區治理 + 資產盤點 × 7 項自動化覆蓋矩陣永久化 DB + +**日期**: 2026-04-18 +**狀態**: 提案(Schema 定稿,Migration 待執行) +**上游**: [ADR-080 AI 自主化飛輪總覽](./ADR-080-ai-autonomy-flywheel-overview.md)、[2026-04-15 MASTER 
藍圖](../superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md) +**完整實施計畫**: [2026-04-18 盲區治理 + 容量自主化戰役](../superpowers/specs/2026-04-18-blindspot-governance-capacity-l4.md) + +## Context(背景) + +2026-04-18 MoWoooWorkDown 假警報 RCA 暴露三重結構性失守: + +1. **監控覆蓋不到三成** — Prometheus 僅 35 active targets / 58 alert rules,對照實際資產(主機、K3s workloads、40+ Sentry 容器、18 個 188 容器、Kafka、ClickHouse、Ollama GPU、120/121 K3s Masters、多個網站)嚴重不足 +2. **告警規則量錯維度** — `HostHighCpuLoad` 用 CPU idle% 測,結果 110 load=18.35 / 188 load=16.32 持續 13 天無觸發;cadvisor 狂吃 288% CPU 13 天無告警 +3. **MD/ADR/Memory 無強制力** — 過去寫了一堆治理文件,執行上仍持續偏移,因為沒有資料庫級的強制檢查 + +MASTER Phase 0-6 的飛輪代碼全綠,但**在生產上沒真轉**:Playbook trust 實際動過幾次?MCP 24h 真 >500 嗎?RESTART 佔比真從 68% 降到 <40% 嗎?全部沒量過。 + +## Decision(決策) + +建立 **11 張表的資產盤點 + 自動化覆蓋 + AI 協作稽核資料庫地基**,把治理從 Markdown 搬進 PostgreSQL,讓 AI 四分工(OpenClaw × NemoTron × Hermes × Claude LLM)在結構化資料上做決策,且每次動作必留 trail。 + +### 11 張表總覽 + +| # | 表 | 用途 | 主寫者 | +|---|---|------|-------| +| 1 | `asset_inventory` | 全景資產主表(主機/容器/K8s/DB/網站/API/套件…) | scanner + NemoTron | +| 2 | `asset_discovery_run` | 每次盤點 header | cron / ai / human | +| 3 | `asset_coverage_snapshot` | 資產 × 7 自動化 覆蓋矩陣 | scanner + OpenClaw | +| 4 | `asset_relationship` | 資產依賴圖(爆炸半徑) | OpenClaw | +| 5 | `alert_rule_catalog` | 告警規則本身即資產 | human + Hermes | +| 6 | `asset_change_event` | 資產變化追蹤 | scanner diff | +| 7 | `asset_compliance_snapshot` | SSL / CVE / secret / backup 合規 | scanner | +| 8 | `host_capacity_snapshot` | 主機容量快照(每日 02:00) | **NemoTron** 🔴 | +| 9 | `capacity_violation_event` | 配額違規 | 自動 | +| 10 | `automation_operation_log` | **所有 AI 自動化動作稽核主表** | 全 Agent | +| 11 | `ai_collaboration_trace` | 多 Agent 協作逐步(辯證歷程) | 全 Agent | + +### 七大自動化引擎對映 + +每個資產在 `asset_coverage_snapshot.dimension` 有七格: +- `auto_monitoring` — Prom/Log/Metric 有抓嗎 +- `auto_alerting` — 有對應 alert rule 嗎 +- `auto_rule_creation` — 規則是人寫還是 AI 產 +- `auto_rule_matching` — 告警來時 rule 有自動匹配嗎 +- `auto_playbook` — 有對應 Playbook 嗎 +- `auto_remediation` — 有自動修復能力嗎 +- 
`auto_km_creation` — 故障後自動進 KM 嗎 + +### AI 四分工責任矩陣 + +| 引擎 | 主責 | 輔責 | +|-----|-----|------| +| E1 自動監控 | **NemoTron** 每日容量巡檢 | Hermes 趨勢 | +| E2 自動告警 | **OpenClaw** 分類分診 | Critic 挑戰 | +| E3 自動建規則 | **Hermes** 分析師 | NemoTron 提案 | +| E4 自動匹配 | **OpenClaw** embedding 檢索 | NemoTron 排序 | +| E5 自動 Playbook | **NemoTron + Solver** | Reviewer | +| E6 自動修復 | **Coordinator** + Critic + Verifier | stderr 回灌 | +| E7 自動 KM | **Hermes** 敘事 | OpenClaw 摘要 | + +## Alternatives Considered(考慮過的替代) + +| 方案 | 為何否決 | +|-----|---------| +| A. 只改 alert_rules.yaml 補規則 | 治標不治本,無法追蹤「為何加/何時 retire」 | +| B. 用 Grafana / Datadog 外部 SaaS | 不符合自託管偏好 + 歷程不進自家 DB | +| C. 6 張表版(未加 relationship/alert_rule_catalog/compliance/capacity) | 爆炸半徑算不出、規則無演化記錄、配額違規無稽核 | +| D. 用既有 incidents / knowledge_entries 湊合 | 這兩表是事件層,不是資產層,硬塞會搞垮 schema | + +## Consequences(後果) + +### Positive + +- **治理從 MD 升級為 DB 強制**:Gitea CI 可 query DB 做 PR reject +- **AI 可跨資產做決策**:NemoTron 看 `host_capacity_snapshot` 自主提搬遷建議 +- **所有 AI 動作有稽核**:`automation_operation_log` + `ai_collaboration_trace` 形成完整 fine-tune 訓料 +- **盲區可量化**:`asset_coverage_snapshot.coverage_status='red'` COUNT 即覆蓋率 SLO +- **規則可演化**:`alert_rule_catalog.noise_rate` + `superseded_by_rule_id` 支援 AI 漸進取代硬編規則 + +### Negative + +- **11 張表 + 多 FK 增加 Migration 複雜度**(但分批回滾可行) +- **PostgreSQL 188 寫入量提升**(現有低寫入,加這些每日寫 <10k 行,可忽略) +- **Scanner 須寫新腳本**(`apps/api/scripts/audit_assets_v1.py`) +- **現有 25 條 alert_rules.yaml 須 seed 進 alert_rule_catalog**(一次性,~1 小時) + +### Rollback + +- 每張表獨立 migration,可 `alembic downgrade -1` 一張張退 +- Feature flag `AIOPS_ASSET_INVENTORY_ENABLED` 預設 false,分階段開啟 + +## Implementation(施工) + +詳見 [2026-04-18 盲區治理 + 容量自主化戰役](../superpowers/specs/2026-04-18-blindspot-governance-capacity-l4.md) §5 完整 DDL、§7 七階段實施步驟、§10 驗收指標。 + +## Related + +- ADR-080 AI 自主化飛輪總覽 +- ADR-082 多 Agent 協作 +- ADR-083 學習閉環 +- ADR-084 異常偵測(trend_predictor) +- ADR-086 修復抽象化(blast_radius_calculator) +- MASTER §3.1 D1 感官縱深(MCP Tool Registry) +- MASTER §4 7 層 × 6 維矩陣 diff --git 
a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 0117952b..bcb84de5 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -1597,3 +1597,47 @@ Phase 6 完成後 - [ ] Fine-tune JSONL ≥ 10 條(EvidenceSnapshot 累積 7 天後) **下一步:** 推 Gitea → CD 部署 → 呼叫 `POST /api/v1/learning/evolver/run` 完成合併演練 → 7 天生產監控 + +--- + +### 2026-04-18 下午 (台北) — Phase 7 啟動:盲區治理 + 容量自主化 — 新子藍圖分枝 + +**觸發**:MoWoooWorkDown 假警報 RCA 深潛 → 暴露三重結構性失守(見下) + +**鐵證**: +- 110 主機 load=18.35 / 188 主機 load=16.32 / cadvisor 狂吃 288% CPU **持續 13 天無告警** +- Prometheus 僅 35 active targets / 58 alert rules — **覆蓋不到三成** +- `HostHighCpuLoad` 規則用 CPU 使用率(`100 - idle% > 80`,即 idle < 20%)測 — **量錯維度**(應測 load_avg / iowait) +- K3s Master 120/121 + VIP 125 完全沒 node-exporter +- Kafka / Ollama GPU / ClickHouse / Sentry 40+ 容器全黑箱 + +**統帥戰略指令**(不可動搖): +1. 全景資產(主機/環境/服務/監控/工作/套件/DB/日誌/KM/前端/後端/API/容器/Gitea/CI-CD 無例外)× 七大自動化 × 永久化 DB +2. AI 四分工:OpenClaw × NemoTron × Hermes × Claude LLM 辯證協作 +3. **所有自動化操作歷程必進 DB**,不靠 MD(MD 會漂移) + +**子藍圖誕生**:[`2026-04-18-blindspot-governance-capacity-l4.md`](./2026-04-18-blindspot-governance-capacity-l4.md) +- 本檔(MASTER v2)維持 Phase 0-6 地位不變 +- 子藍圖為 **Phase 7**:盲區矯正 + 容量自主化 +- 子藍圖 §5 定義 11 張表 Schema(asset_inventory / asset_discovery_run / asset_coverage_snapshot / asset_relationship / alert_rule_catalog / asset_change_event / asset_compliance_snapshot / host_capacity_snapshot / capacity_violation_event / automation_operation_log / ai_collaboration_trace) +- 子藍圖 §6 定義 4 層防禦(告警精準化 / 配額強制 / 主機搬遷 / AI 容量巡檢) +- 子藍圖 §7 定義 7 Phase 實施順序(0a DNS 止血 → 0b cadvisor 降壓 → 1 Migration → 2 全景審計 → 3 規則升級 → 4 NemoTron 容量巡檢 → 5 配額強制 → 6 主機搬遷 → 7 引擎 3+4 閉環) + +**新增 ADR**:[`ADR-090`](../../adr/ADR-090-monitoring-blindspot-governance.md) + +**新增 HARD_RULES**: +1. 監控工具必須被監控(cadvisor 案例) +2. 配額即義務(缺 limit → CI reject) +3. 
主機容量警戒線(headroom < 0.2 三日 → 凍結新部署) +4. 告警維度鐵律(必含 load_avg / iowait) +5. IP 靜態綁定必登記技術債 +6. 監控覆蓋率 SLO(red 比例 < 20%) +7. 新服務上線 Gate(先進 asset_inventory) + +**新增 Memory**: +- `project_blindspot_governance.md` — 子藍圖跨 session 指針 +- `feedback_monitor_self_monitoring.md` — 🔴🔴🔴 監控工具必須被監控鐵律 + +**當前 Blocker**:Phase 0a/0b 止血指令待統帥最終授權。110/188 load 持續超載,執行重審計前必須先降壓。 + +**下一步**:統帥授權後執行 0a(110 /etc/hosts)+ 0b(188 cadvisor restart),觀察 30 分鐘後進 Phase 1 Migration。 diff --git a/docs/superpowers/specs/2026-04-18-blindspot-governance-capacity-l4.md b/docs/superpowers/specs/2026-04-18-blindspot-governance-capacity-l4.md new file mode 100644 index 00000000..80459c4d --- /dev/null +++ b/docs/superpowers/specs/2026-04-18-blindspot-governance-capacity-l4.md @@ -0,0 +1,890 @@ +# AWOOOI 監控盲區治理 + 容量自主化戰役總結(2026-04-18) + +> 🔴🔴🔴 **Single Source of Truth** — 本次 Session 所有決議、Schema、實施步驟的唯一事實來源 +> **啟動日**:2026-04-18 台北時間 +> **狀態**:🟡 Schema 定稿待 migration;Phase 0a/0b 止血指令待統帥授權執行 +> **上游藍圖**:[`2026-04-15-MASTER-ai-autonomous-flywheel-v2.md`](./2026-04-15-MASTER-ai-autonomous-flywheel-v2.md)(Phase 0-6 已完成) +> **本檔定位**:Phase 7 — 盲區矯正 + 容量自主化(MASTER 未涵蓋的新層次) + +--- + +## §0 TL;DR(一分鐘版) + +過去 13 天,110 主機 load=18.35、188 主機 load=16.32、cadvisor 狂吃 288% CPU —— **全部無人察覺**。原因不是系統沒壞,是 AWOOOI **監控覆蓋不到三成**(Prometheus 僅 35 targets / 58 rules),且**告警規則量錯維度**(CPU idle% 而非 load average)。統帥下令:**徹底重建監控地基 + 資料永久化 DB + AI 四分工(OpenClaw × NemoTron × Hermes × LLM)接管飛輪**。本檔交付: + +1. **11 張表資料庫架構**(Phase 1 基底) +2. **四層防禦戰略**(Phase 3-6 施工藍圖) +3. **7 個實施 Phase 的詳盡步驟**(從止血到 AI 自主治理) +4. 
**與 7 項自動化引擎的完整對映**(每格誰負責) + +--- + +## §1 觸發事件:MoWoooWorkDown 假警報 + +``` +2026-04-18 凌晨 + Blackbox probe mo.wooo.work → 超時 5s → Alertmanager 噴 P1 + Telegram 推告警 → 統帥起床救火 + curl https://mo.wooo.work → 10s 回應但成功 + → 應用層沒死,是探針死 + ↓ + 深潛 RCA + ↓ + 真凶鎖定: + - 110 主機 load=18.35(非應用問題) + - ClickHouse 134% CPU + - DNS 解析 mo.wooo.work → UDP 封包在高負載下丟失 + - Blackbox timeout 5s 太短 → 吃不下 DNS 4s + 應用 2s +``` + +**已修部分**:Blackbox timeout 5s→10s(commit 待記錄於 Gitea) + +**未解的根本**: +- 為何 110 load=18.35 撐 13 天無人知? +- 為何只有 mo.wooo.work 觸發,其他網站沒事? +- 為何「沒告警」就以為系統健康? + +--- + +## §2 鐵證數據(2026-04-18 12:00 台北實測) + +### 2.1 110 主機 + +| 指標 | 數值 | 診斷 | +|-----|------|-----| +| uptime | 13 天 | — | +| load5 | 18.35 | 🔴 持續飽和 13 天 | +| ClickHouse CPU | 134% | 🔴 吞掉一個多核心 | +| GitHub Actions Runner | 92% | 🟡 | +| Kafka | 45% | 🟡 | +| Sentry 容器數 | 40+ | 🔴 單一產品佔主機最大比例 | +| Harbor 組件 | 5 個 | 🟡 | +| Bitan / Gitea / Redis / Prometheus / Blackbox | 各 1-15% | 💢 全疊在一台 | + +### 2.2 188 主機 + +| 指標 | 數值 | 診斷 | +|-----|------|-----| +| uptime | 13 天 | **與 110 同一時段開始過載** | +| load5 | 14.19 / 15.91 / 16.32 | 🔴 12 核 → per-core 1.18,飽和 | +| Memory | 62G / 8.4G used | ✅ 記憶體充裕 | +| **cadvisor CPU** | **288%** | 🔴🔴🔴 **監控工具自爆 13 天無人知** | +| clickhouse-server CPU | 169% | 🔴 SignOz 後端 | +| prisma migrate deploy | 109% | 🟡 momo 短暫 | +| postgres momo_analytics idle | 76% + 8.8% | 🟡 可疑連線洩漏 | + +### 2.3 Prometheus 覆蓋實況 + +| 資源 | 數量 | 覆蓋評價 | +|-----|------|---------| +| active targets 總數 | **35** | 🔴 遠低於實際資產 | +| alert rules 總數 | **58 條 / 16 組** | 🔴 覆蓋不全 | +| firing rules | 2(HostHighCpuLoad + HostBackupFailed) | — | +| node-exporter | 只監 110 / 112 / 188 | 🔴🔴🔴 **K3s Master 120/121 + VIP 125 完全沒監控** | +| blackbox-http | 8 個網站 | 🔴 實際應有 15+ | +| cadvisor | 只 cadvisor-110 | 🔴 188 無 cadvisor target | +| kube-state-metrics | 部署了但 Prometheus 沒抓 | 🔴 K8s pod 狀態無告警基礎 | +| Kafka exporter | 不存在 | 🔴 消息佇列完全黑箱 | +| Ollama GPU / DCGM | 不存在 | 🔴 VRAM 黑箱 | +| SSL cert expiry | 只 1 條 rule | 🟡 | + +### 2.4 規則品質問題 + +**HostHighCpuLoad 規則**: +```yaml 
+expr: 100 - (avg by (host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 +for: 300s +``` + +**致命瑕疵**:測 CPU idle%,不是 load average。 +- 110 實測:load=18.35 時 CPU idle 可能還有 30% → 規則不觸發 +- 實際問題是**進程 queue 過長 + iowait 高 + UDP 丟包**,CPU 使用率不一定飆 +- **結果**:13 天過載零告警 + +--- + +## §3 根因盤點:為何結構性失守 + +| # | 問題 | 本質 | +|---|------|-----| +| 1 | 監控量錯維度 | 用 `CPU idle` 不是 `load avg / iowait / run queue` | +| 2 | 監控覆蓋不到三成 | 35 targets vs 實際數百個資產 | +| 3 | 沒有容量配額 | Docker / K8s workload 大半無 `limits` | +| 4 | 主機角色混雜 | 110/188 各疊 10+ 異質服務,無隔離 | +| 5 | 新服務上線無容量評估 | 東西一直疊,沒人問「承載力夠嗎」 | +| 6 | 監控工具自己沒被監控 | cadvisor 288% 活證據 | +| 7 | 沒有趨勢預測 / AI 容量巡檢 | Phase 4 的 `trend_predictor` 建好但沒對主機跑 | + +**深層原因**:**寫了一堆 MD / ADR / Memory / Skill,但執行偏移**。MD 沒有強制力,不會阻止人把 40 個 Sentry 容器塞進 110。**需要把治理寫進 DB,讓 AI 強制檢查**。 + +--- + +## §4 與 AI 自主化北極星對齊 + +### 4.1 七大自動化引擎(統帥明示) + +| # | 引擎 | 定義 | 當前狀態 | +|---|-----|-----|---------| +| E1 | **自動監控** | 從 Prometheus 時序動態感知基準線,自動發現異常 | 🔴 靜態閾值,規則量錯維度 | +| E2 | **自動告警** | AI 分類 / 嚴重度 / 路由 / 抑制 | 🟡 Phase 2 Agent 協作已建,但 general 兜底 41% | +| E3 | **自動建立匹配規則** | 規則由 AI 從歷史 Incident 學習產出 | 🔴 25 條全硬編 | +| E4 | **自動匹配規則** | 來告警時 AI 判斷用哪條規則 | 🟡 Jaccard 相似度存在但低質 | +| E5 | **自動建立 Playbook** | 修復劇本由 AI 動態生成 | 🟡 Phase 1 Solver K8s 清單注入,但仍幻覺 | +| E6 | **自動修復** | AI 執行 + 驗證 + 失敗自我修正(stderr 回灌) | 🔴 approval_execution.py 不會回灌 stderr | +| E7 | **自動建立 KM** | 故障完自動沉澱 RCA 進 KM + embedding | 🟡 KM +5/24h,目標 >20 | + +### 4.2 AI 四分工責任矩陣(LLM × OpenClaw × NemoTron × Hermes) + +| 引擎 | 主責 Agent | 輔責 Agent | 備註 | +|-----|----------|----------|------| +| E1 監控 | **NemoTron** 每日容量巡檢 | Hermes 趨勢分析 | NemoTron 8B 夠力做統計預測 | +| E2 告警 | **OpenClaw** 分類分診 | Critic 挑戰 | 語意分類取代前綴硬編 | +| E3 規則生成 | **Hermes** 分析師(RLHF 角色) | NemoTron 提案 | Hermes 擅長 reasoning + risk | +| E4 規則匹配 | **OpenClaw** embedding 檢索 | NemoTron 排序 | Jaccard → semantic | +| E5 Playbook 生成 | **NemoTron + Solver** | Reviewer 安全官 | 動態 K8s 清單注入 + dry-run | +| E6 自動修復 | **Coordinator** 協調 | **Critic 挑戰** + **Verifier 驗證** | stderr 
回灌機制 | +| E7 KM 沉澱 | **Hermes** 分析敘事 | OpenClaw 摘要 | Hermes reasoning → 敘事卡片 | + +**鐵律**:每個 Agent 動作**必須**寫入 `ai_collaboration_trace`,包括 prompt / response / confidence / challenged_by。 + +--- + +## §5 資料庫地基:10 張表完整 Schema + +### 5.1 架構圖 + +``` +asset_discovery_run ─┬─> asset_inventory ─┬─> asset_relationship (dependency graph) + │ │ ├─> asset_coverage_snapshot (resource × 7 engines) + │ │ ├─> asset_compliance_snapshot (SSL/CVE/secret/backup) + │ │ └─> asset_change_event (diff between runs) + │ │ + │ └─> alert_rule_catalog (rules as first-class assets) + │ + ├─> host_capacity_snapshot (每日主機容量快照) + │ + ├─> capacity_violation_event (配額違規記錄) + │ + └─> automation_operation_log (every AI op audit trail) + │ + └─> ai_collaboration_trace (multi-agent handoff) +``` + +### 5.2 完整 DDL + +```sql +-- ═══════════════════════════════════════════════════════════════════ +-- TABLE 1: asset_inventory — 全景資產清單(核心主表) +-- ═══════════════════════════════════════════════════════════════════ +CREATE TABLE asset_inventory ( + asset_id BIGSERIAL PRIMARY KEY, + asset_key TEXT NOT NULL UNIQUE, -- 穩定識別:host:110 / container:110:sentry-web / k8s:awoooi-prod/awoooi-api + asset_type TEXT NOT NULL, + parent_asset_id BIGINT REFERENCES asset_inventory(asset_id), + environment TEXT NOT NULL DEFAULT 'prod', -- prod / staging / dev(advisor 建議) + host TEXT, -- 110 / 120 / 121 / 188 / 112 / 125 + namespace TEXT, + name TEXT NOT NULL, + metadata JSONB NOT NULL DEFAULT '{}'::jsonb, + tags TEXT[] NOT NULL DEFAULT '{}', + owner_team TEXT, + criticality TEXT CHECK (criticality IN ('P0','P1','P2','P3')), + data_classification TEXT CHECK (data_classification IN ('public','internal','sensitive','secret')), + external BOOLEAN NOT NULL DEFAULT false, -- 第三方 SaaS (Cloudflare/NVIDIA/Gemini) + lifecycle_state TEXT NOT NULL DEFAULT 'active' + CHECK (lifecycle_state IN ('planned','provisioning','active','degraded','deprecated','decommissioned')), + source_repo TEXT, -- Git repo URL + source_commit_sha TEXT, -- 部署的 
commit + -- 容量欄位(Layer 4 AI 巡檢用) + cpu_avg_7d NUMERIC(5,2), + mem_avg_7d NUMERIC(5,2), + capacity_headroom NUMERIC(5,2), -- 剩餘承載 0.00-1.00 + resource_limits JSONB, -- {cpu:"2",mem:"4Gi"} + resource_requests JSONB, + quota_violation_count INT NOT NULL DEFAULT 0, + sla_target JSONB, -- {uptime:99.9, rto_min:15, rpo_min:5} + cost_monthly_usd NUMERIC(10,2), + first_seen_at TIMESTAMPTZ NOT NULL DEFAULT now(), + last_seen_at TIMESTAMPTZ NOT NULL DEFAULT now(), + decommissioned_at TIMESTAMPTZ, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), + CONSTRAINT asset_type_valid CHECK (asset_type IN ( + 'host','container','k8s_workload','k8s_resource','database','table', + 'website','api_endpoint','package','log_stream','km_entry', + 'frontend','backend','ci_pipeline','gitea_repo','monitoring_target', + 'secret','volume','network','certificate','scheduled_job', + 'message_queue','cache','dashboard','ai_agent','llm_model', + 'third_party_service','backup_target' + )) +); +CREATE INDEX idx_ai_type_host ON asset_inventory(asset_type, host); +CREATE INDEX idx_ai_env ON asset_inventory(environment, lifecycle_state); +CREATE INDEX idx_ai_metadata ON asset_inventory USING GIN (metadata); +CREATE INDEX idx_ai_tags ON asset_inventory USING GIN (tags); + +-- ═══════════════════════════════════════════════════════════════════ +-- TABLE 2: asset_discovery_run — 每次盤點 header +-- ═══════════════════════════════════════════════════════════════════ +CREATE TABLE asset_discovery_run ( + run_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + triggered_by TEXT NOT NULL, -- human:ogt / cron:daily / incident:INC-123 / ai:proactive_inspector + scope TEXT[] NOT NULL, -- 本次掃描的 asset_type 清單 + scan_depth TEXT NOT NULL DEFAULT 'shallow' + CHECK (scan_depth IN ('shallow','deep','full')), + host_filter TEXT[], -- 本次只掃哪些主機 + started_at TIMESTAMPTZ NOT NULL DEFAULT now(), + ended_at TIMESTAMPTZ, + status TEXT NOT NULL CHECK (status IN 
('running','success','partial','failed','aborted')), + total_assets INT, + new_assets INT NOT NULL DEFAULT 0, + modified_assets INT NOT NULL DEFAULT 0, + disappeared_assets INT NOT NULL DEFAULT 0, + tools_used JSONB, -- {"ssh":5,"kubectl":1,"prom_api":1,"gitea_api":1} + duration_ms INT, + error TEXT, + summary JSONB +); +CREATE INDEX idx_adr_started ON asset_discovery_run(started_at DESC); + +-- ═══════════════════════════════════════════════════════════════════ +-- TABLE 3: asset_coverage_snapshot — 資產 × 7 自動化 覆蓋矩陣 +-- ═══════════════════════════════════════════════════════════════════ +CREATE TABLE asset_coverage_snapshot ( + snapshot_id BIGSERIAL PRIMARY KEY, + run_id UUID NOT NULL REFERENCES asset_discovery_run(run_id) ON DELETE CASCADE, + asset_id BIGINT NOT NULL REFERENCES asset_inventory(asset_id), + dimension TEXT NOT NULL CHECK (dimension IN ( + 'auto_monitoring','auto_alerting','auto_rule_creation', + 'auto_rule_matching','auto_playbook','auto_remediation','auto_km_creation' + )), + coverage_status TEXT NOT NULL CHECK (coverage_status IN ('green','yellow','red','unknown')), + evidence JSONB NOT NULL DEFAULT '{}'::jsonb, -- {prom_target,rule_name,playbook_id,km_entry_id} + gap_reason TEXT, + recommended_action TEXT, + confidence NUMERIC(3,2), + detected_by TEXT NOT NULL, -- scanner:blackbox / ai:nemotron / human + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + UNIQUE (run_id, asset_id, dimension) +); +CREATE INDEX idx_acs_asset ON asset_coverage_snapshot(asset_id, dimension); +CREATE INDEX idx_acs_red_yellow ON asset_coverage_snapshot(coverage_status) WHERE coverage_status IN ('red','yellow'); + +-- ═══════════════════════════════════════════════════════════════════ +-- TABLE 4: asset_relationship — 資產依賴圖(爆炸半徑必需) +-- ═══════════════════════════════════════════════════════════════════ +CREATE TABLE asset_relationship ( + relationship_id BIGSERIAL PRIMARY KEY, + from_asset_id BIGINT NOT NULL REFERENCES asset_inventory(asset_id), + to_asset_id BIGINT NOT 
NULL REFERENCES asset_inventory(asset_id), + relationship_type TEXT NOT NULL CHECK (relationship_type IN ( + 'depends_on','calls','stores_data_in','backs_up_to', + 'routes_to','authenticates_via','monitors','alerts_to','logs_to' + )), + strength NUMERIC(3,2), -- 0.00-1.00 依賴強度 + metadata JSONB, + first_detected_at TIMESTAMPTZ NOT NULL DEFAULT now(), + last_verified_at TIMESTAMPTZ, + is_active BOOLEAN NOT NULL DEFAULT true, + UNIQUE (from_asset_id, to_asset_id, relationship_type) +); +CREATE INDEX idx_ar_from ON asset_relationship(from_asset_id); +CREATE INDEX idx_ar_to ON asset_relationship(to_asset_id); + +-- ═══════════════════════════════════════════════════════════════════ +-- TABLE 5: alert_rule_catalog — 告警規則本身即資產 +-- ═══════════════════════════════════════════════════════════════════ +CREATE TABLE alert_rule_catalog ( + rule_id BIGSERIAL PRIMARY KEY, + rule_name TEXT NOT NULL UNIQUE, + source TEXT NOT NULL CHECK (source IN ('yaml_hardcoded','ai_generated','human_written','playbook_derived')), + expr TEXT NOT NULL, + duration_seconds INT, + severity TEXT, + labels JSONB, + annotations JSONB, + linked_asset_ids BIGINT[], -- 這條規則保護的資產 + created_by_agent TEXT, -- nemotron / hermes / openclaw / human:ogt + -- 規則品質追蹤 + true_positive_count INT NOT NULL DEFAULT 0, + false_positive_count INT NOT NULL DEFAULT 0, + noise_rate NUMERIC(5,2), + last_fired_at TIMESTAMPTZ, + -- 信心與演化 + confidence NUMERIC(3,2), + review_status TEXT CHECK (review_status IN ('draft','approved','deprecated','retired')), + superseded_by_rule_id BIGINT REFERENCES alert_rule_catalog(rule_id), + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now() +); +CREATE INDEX idx_arc_source ON alert_rule_catalog(source); +CREATE INDEX idx_arc_assets ON alert_rule_catalog USING GIN (linked_asset_ids); + +-- ═══════════════════════════════════════════════════════════════════ +-- TABLE 6: asset_change_event — 資產變化追蹤 +-- 
═══════════════════════════════════════════════════════════════════ +CREATE TABLE asset_change_event ( + event_id BIGSERIAL PRIMARY KEY, + run_id UUID NOT NULL REFERENCES asset_discovery_run(run_id), + asset_id BIGINT REFERENCES asset_inventory(asset_id), + change_type TEXT NOT NULL CHECK (change_type IN ( + 'asset_added','asset_removed','asset_modified', + 'coverage_improved','coverage_degraded', + 'criticality_changed','owner_changed','lifecycle_changed' + )), + before_state JSONB, + after_state JSONB, + diff JSONB, + detected_at TIMESTAMPTZ NOT NULL DEFAULT now(), + ai_analysis TEXT +); +CREATE INDEX idx_ace_run ON asset_change_event(run_id); +CREATE INDEX idx_ace_asset ON asset_change_event(asset_id, detected_at DESC); + +-- ═══════════════════════════════════════════════════════════════════ +-- TABLE 7: asset_compliance_snapshot — 合規狀態快照 +-- ═══════════════════════════════════════════════════════════════════ +CREATE TABLE asset_compliance_snapshot ( + snapshot_id BIGSERIAL PRIMARY KEY, + run_id UUID REFERENCES asset_discovery_run(run_id), + asset_id BIGINT NOT NULL REFERENCES asset_inventory(asset_id), + dimension TEXT NOT NULL CHECK (dimension IN ( + 'ssl_cert_valid','cve_scan','secret_rotated','backup_tested', + 'audit_log_enabled','access_reviewed','encryption_at_rest' + )), + status TEXT NOT NULL CHECK (status IN ('compliant','warning','violation','unknown')), + expires_at TIMESTAMPTZ, -- cert 到期 / 下次掃描 + detail JSONB, + remediation_deadline TIMESTAMPTZ, + detected_at TIMESTAMPTZ NOT NULL DEFAULT now(), + UNIQUE (asset_id, dimension, snapshot_id) +); +CREATE INDEX idx_comp_asset ON asset_compliance_snapshot(asset_id, dimension); +CREATE INDEX idx_comp_expiring ON asset_compliance_snapshot(expires_at) WHERE expires_at IS NOT NULL; + +-- ═══════════════════════════════════════════════════════════════════ +-- TABLE 8: host_capacity_snapshot — 主機容量快照(每日 by NemoTron) +-- ═══════════════════════════════════════════════════════════════════ +CREATE TABLE 
host_capacity_snapshot ( + snapshot_id BIGSERIAL PRIMARY KEY, + host TEXT NOT NULL, + captured_at TIMESTAMPTZ NOT NULL DEFAULT now(), + load1 NUMERIC(6,2), + load5 NUMERIC(6,2), + load15 NUMERIC(6,2), + cpu_used_pct NUMERIC(5,2), + cpu_iowait_pct NUMERIC(5,2), + mem_used_pct NUMERIC(5,2), + swap_used_pct NUMERIC(5,2), + disk_used_pct JSONB, -- per mount + container_count INT, + k8s_pod_count INT, + top_cpu_offenders JSONB, -- top 5 進程 + top_mem_offenders JSONB, + headroom_pct NUMERIC(5,2), -- AI 估算剩餘承載 + ai_verdict TEXT, -- NemoTron 判斷:safe/warning/critical + ai_reasoning TEXT, + recommended_actions JSONB, + written_by_agent TEXT NOT NULL -- nemotron / human:ogt +); +CREATE INDEX idx_hcs_host_time ON host_capacity_snapshot(host, captured_at DESC); +CREATE INDEX idx_hcs_critical ON host_capacity_snapshot(ai_verdict) WHERE ai_verdict IN ('warning','critical'); + +-- ═══════════════════════════════════════════════════════════════════ +-- TABLE 9: capacity_violation_event — 容量配額違規 +-- ═══════════════════════════════════════════════════════════════════ +CREATE TABLE capacity_violation_event ( + event_id BIGSERIAL PRIMARY KEY, + asset_id BIGINT REFERENCES asset_inventory(asset_id), + host TEXT, + violation_type TEXT NOT NULL CHECK (violation_type IN ( + 'no_limit_set','over_request','over_limit','host_saturation', + 'over_sla_budget','unauthorized_new_deploy' + )), + threshold NUMERIC(10,2), + actual_value NUMERIC(10,2), + detected_at TIMESTAMPTZ NOT NULL DEFAULT now(), + auto_action TEXT, -- AI 採取的動作 + auto_action_op_id UUID, -- 對應 automation_operation_log + human_override TEXT, + resolved_at TIMESTAMPTZ +); +CREATE INDEX idx_cve_asset ON capacity_violation_event(asset_id, detected_at DESC); + +-- ═══════════════════════════════════════════════════════════════════ +-- TABLE 10: automation_operation_log — 所有 AI 自動化動作稽核主表 🔴 +-- ═══════════════════════════════════════════════════════════════════ +CREATE TABLE automation_operation_log ( + op_id UUID PRIMARY KEY DEFAULT 
gen_random_uuid(), + operation_type TEXT NOT NULL CHECK (operation_type IN ( + 'monitor_configured','monitor_removed', + 'alert_fired','alert_suppressed','alert_routed', + 'rule_created','rule_updated','rule_matched','rule_rejected','rule_deprecated', + 'playbook_generated','playbook_updated','playbook_executed', + 'remediation_executed','remediation_verified','remediation_rolled_back', + 'self_correction_attempted', -- 引擎 4 stderr 回灌 + 'km_created','km_updated','km_linked', + 'asset_discovered','coverage_recalculated', + 'capacity_recommendation','quota_enforced' + )), + asset_id BIGINT REFERENCES asset_inventory(asset_id), + incident_id BIGINT, + run_id UUID REFERENCES asset_discovery_run(run_id), + actor TEXT NOT NULL, -- openclaw / nemotron / hermes / solver / critic / coordinator / claude-code / human:ogt + input JSONB NOT NULL DEFAULT '{}'::jsonb, + output JSONB NOT NULL DEFAULT '{}'::jsonb, + dry_run_result JSONB, -- dry-run 預覽結果 + status TEXT NOT NULL CHECK (status IN ('pending','success','failed','dry_run','rolled_back')), + error TEXT, + duration_ms INT, + tokens_in INT, + tokens_out INT, + cost_usd NUMERIC(10,6), + budget_bucket TEXT, -- 分攤到哪個預算桶 + parent_op_id UUID REFERENCES automation_operation_log(op_id), + retry_count INT NOT NULL DEFAULT 0, -- advisor 建議 + retry_of_op_id UUID REFERENCES automation_operation_log(op_id), + stderr_feed_back TEXT, -- 引擎 4:上次 stderr 回灌 LLM 的內容 + tags TEXT[], -- technical_debt / blind_spot_mitigation / ... 
+ created_at TIMESTAMPTZ NOT NULL DEFAULT now() +); +CREATE INDEX idx_aol_type_time ON automation_operation_log(operation_type, created_at DESC); +CREATE INDEX idx_aol_asset ON automation_operation_log(asset_id, created_at DESC); +CREATE INDEX idx_aol_incident ON automation_operation_log(incident_id) WHERE incident_id IS NOT NULL; +CREATE INDEX idx_aol_actor ON automation_operation_log(actor, created_at DESC); +CREATE INDEX idx_aol_retry ON automation_operation_log(retry_of_op_id) WHERE retry_of_op_id IS NOT NULL; + +-- ═══════════════════════════════════════════════════════════════════ +-- TABLE 11: ai_collaboration_trace — 多 Agent 協作逐步(LLM × OpenClaw × NemoTron × Hermes) +-- ═══════════════════════════════════════════════════════════════════ +CREATE TABLE ai_collaboration_trace ( + trace_id BIGSERIAL PRIMARY KEY, + op_id UUID NOT NULL REFERENCES automation_operation_log(op_id) ON DELETE CASCADE, + step_order INT NOT NULL, + agent TEXT NOT NULL, -- openclaw / nemotron / hermes / diagnostician / solver / reviewer / critic / coordinator / executor / verifier + model TEXT, -- nemotron-70b / hermes-3 / openclaw-v1 / claude-opus-4-7 + system_prompt_version TEXT, + prompt TEXT, + response JSONB, + confidence NUMERIC(3,2), + challenged_by TEXT[], + accepted BOOLEAN, + tokens_in INT, + tokens_out INT, + duration_ms INT, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + UNIQUE (op_id, step_order) +); +CREATE INDEX idx_act_op ON ai_collaboration_trace(op_id, step_order); +CREATE INDEX idx_act_agent ON ai_collaboration_trace(agent, created_at DESC); +``` + +> 註:雖然架構圖標題說 10 張表,完整 DDL 含 `ai_collaboration_trace` 共 **11 張表**(原 6 + asset_relationship + alert_rule_catalog + asset_compliance_snapshot + host_capacity_snapshot + capacity_violation_event)。 + +### 5.3 與現有表關係 + +| 現有表 | 連動方式 | +|-------|---------| +| `incidents` | `automation_operation_log.incident_id` FK | +| `playbooks` | `asset_coverage_snapshot.evidence->>'playbook_id'` | +| `knowledge_entries` | 
`asset_coverage_snapshot.evidence->>'km_entry_id'` | +| `timeline_events` | 每次 `automation_operation_log` 寫入同步追加 event | +| `alert_rules.yaml` | seed → `alert_rule_catalog` (source='yaml_hardcoded'),未來漸進 AI 產出 | + +### 5.4 保留策略 + +| 表 | 保留期 | 清理 | +|---|-------|-----| +| asset_inventory | 永久 | soft delete | +| asset_discovery_run | 永久 | — | +| asset_coverage_snapshot | 90 天 | 保最新 + 每週一筆 | +| asset_relationship | 永久 | is_active=false | +| alert_rule_catalog | 永久 | review_status=retired | +| asset_change_event | 永久 | — | +| asset_compliance_snapshot | 180 天 | — | +| host_capacity_snapshot | 90 天 | — | +| capacity_violation_event | 永久 | — | +| automation_operation_log | 永久 | 未來 archive ClickHouse | +| ai_collaboration_trace | 180 天 | 低信心 / rejected 永久保留作訓料 | + +--- + +## §6 四層防禦策略(長期根治) + +### Layer 1:告警規則精準化(Phase 3,本週) + +**新增 / 修正規則**(寫進 `alert_rules.yaml` + seed 進 `alert_rule_catalog`): + +```yaml +# === 取代現有 HostHighCpuLoad(量錯維度) === +- alert: HostLoadPersistentHigh + expr: node_load5 / on(host) group_left count by(host) (node_cpu_seconds_total{mode="idle"}) > 0.9 + for: 30m + labels: { severity: warning, auto_repair: "false" } + annotations: { summary: "{{$labels.host}} load/core 持續 30 分鐘 > 0.9" } + +- alert: HostLoadCritical + expr: node_load5 / on(host) group_left count by(host) (node_cpu_seconds_total{mode="idle"}) > 1.5 + for: 10m + labels: { severity: critical } + +- alert: HostIowaitHigh + expr: avg by(host) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.2 + for: 10m + +# === 監控工具自我監控(cadvisor 真實案例) === +- alert: CadvisorSelfCpuHigh + expr: rate(container_cpu_usage_seconds_total{name="cadvisor"}[5m]) > 1.0 + for: 5m + annotations: { summary: "cadvisor 自己吃超過 1 個核心" } + +- alert: PrometheusSelfScrapeFailed + expr: up{job="prometheus"} == 0 + for: 1m + +- alert: AlertmanagerSelfDown + expr: up{job="alertmanager"} == 0 + for: 1m + +# === K3s 補監控 === +- alert: K3sControlPlaneDown + expr: up{job="node-exporter",host=~"120|121"} == 0 + for: 2m + labels: { severity: critical } + +- alert: 
K3sEtcdLagHigh + expr: etcd_mvcc_db_total_size_in_bytes > 4e9 + for: 10m + +# === 容器配額違規 === +- alert: ContainerNoMemoryLimit + expr: count by(container) (container_spec_memory_limit_bytes == 0) > 0 + for: 1h + annotations: { summary: "容器 {{$labels.container}} 無 memory limit" } + +# === 趨勢預測(Phase 4 接線) === +- alert: HostCapacityTrendCritical + expr: predict_linear(node_load5[1h], 7*24*3600) > on(host) group_left count by(host) (node_cpu_seconds_total{mode="idle"}) + for: 30m + annotations: { summary: "{{$labels.host}} 預測 7 天內撞飽和" } +``` + +### Layer 2:強制資源配額(Phase 5,本月) + +**執行項**: + +| # | 動作 | 負責人 | 驗收 | +|---|-----|-------|------| +| L2.1 | 審 110/188 所有 `docker-compose.yml`,補 `deploy.resources.limits` | 人工 + AI 輔助 | 所有 compose 有 limit | +| L2.2 | K3s 每 namespace 建 `ResourceQuota` + `LimitRange` | DevOps | `kubectl describe quota -A` 全覆蓋 | +| L2.3 | Gitea CI 加檢查:缺 limit → PR reject | 新增 workflow | blocked PR 次數可見 | +| L2.4 | Harbor / Gitea / Sentry / ClickHouse 強制上限(重型) | 人工排期 | `docker inspect` 看 limit 不為 0 | +| L2.5 | 違規記入 `capacity_violation_event` | 自動 | 每日有新記錄 | + +### Layer 3:主機角色重分工(Phase 6,本季) + +**現況 → 目標**: + +| 主機 | 當前 | 目標角色 | 搬遷對象 | +|-----|------|---------|---------| +| 110 | DevOps + 監控 + CI + Sentry + Kafka + 業務 (Bitan) | **DevOps 金庫**:Harbor / Gitea / CI runner | Sentry / Kafka / Bitan 搬走 | +| 120 / 121 | K3s Master | **K3s Control-Plane(純)** | 無(維持) | +| 188 | K3s PG + Ollama + SignOz + momo + 18 容器 | **AI + 資料層**:PG / Ollama / Redis / momo | SignOz ClickHouse / Grafana / Loki 搬走 | +| **NEW-A** | — | **監控專用主機** | Prometheus + Alertmanager + Grafana + Loki + SignOz | +| **NEW-B** | — | **冷儲存 + 重型業務** | Sentry / MinIO / Kafka / 備份目標 | + +**搬遷順序**(降風險): +1. 先建 NEW-A(監控獨立)→ 解除 110/188 觀測性耦合 +2. 搬 Sentry 從 110 → NEW-B +3. 搬 SignOz ClickHouse 從 188 → NEW-A +4. 
保留 110 專做 DevOps Vault + +### Layer 4:AI 自主治理(Phase 4,2 週) + +**NemoTron 每日容量巡檢(MVP)**: + +``` +cron: 每日 02:00 台北 +↓ +NemoTron 讀取: + - 過去 7 天 Prometheus metrics (load5, CPU, mem, iowait per host) + - asset_inventory (每資產 expected resources) + - host_capacity_snapshot (歷史基線) +↓ +Hermes 趨勢分析: + - predict_linear(load5[7d], 7d) → 未來 7 天預測 + - 對比 asset_inventory.sla_target + - 標出「撞牆風險」資產 +↓ +OpenClaw 提案生成: + - 建議搬遷計畫(低優先容器 → 低負載主機) + - 建議新增 resource limit + - 建議廢棄長期低使用容器 +↓ +寫入: + - host_capacity_snapshot (N 筆) + - automation_operation_log (operation_type=capacity_recommendation) + - ai_collaboration_trace (每步) +↓ +Telegram 早報:Top 3 風險 + Top 3 建議 → 統帥 Approve → 執行 or Reject +``` + +--- + +## §7 實施階段(按順序,風險遞增) + +### Phase 0a:110 DNS 止血(10 分鐘) + +```bash +# 1. 靜態綁定 mo.wooo.work → 避免 DNS 超時 +ssh 192.168.0.110 "echo '114.32.151.246 mo.wooo.work' | sudo tee -a /etc/hosts" + +# 2. 驗證 +ssh 192.168.0.110 "getent hosts mo.wooo.work" + +# 3. 等 3 分鐘 → 觀察 load 變化 +ssh 192.168.0.110 "uptime" +``` + +**風險**:🟡 中(IP 寫死是毒蘋果,Phase 6 搬監控時必須廢除) +**回滾**:`sudo sed -i '/mo.wooo.work/d' /etc/hosts` +**稽核**:寫入 `automation_operation_log`(tags=['technical_debt','blind_spot_mitigation']) + +### Phase 0b:188 cadvisor 降壓(5 分鐘) + +```bash +# 1. 重啟 cadvisor(無狀態,釋放 288% CPU) +ssh ollama@192.168.0.188 "docker restart cadvisor" + +# 2. 等 3 分鐘 → 驗證 +ssh ollama@192.168.0.188 "docker stats --no-stream cadvisor | tail -1" + +# 3. 
檢查 188 load +ssh ollama@192.168.0.188 "uptime" +``` + +**風險**:🟢 低(cadvisor 無狀態) +**回滾**:`docker start cadvisor`(但 restart 本身即自愈) +**稽核**:寫入 `automation_operation_log`(tags=['blind_spot_mitigation']) +**後續追查**:cadvisor 為何燒 288% 13 天?可能版本 bug 或配置錯誤,歸入 Phase 2 根因追溯 + +### Phase 0c:Prometheus blackbox /health 改造(15 分鐘,選) + +```yaml +# prometheus.yml 修改 +- job_name: 'blackbox-http' + static_configs: + - targets: + - https://mo.wooo.work/health # 從 / 改成 /health +``` + +```bash +ssh 192.168.0.110 "docker kill -s SIGHUP prometheus" +``` + +**風險**:🟢 極低 +**先決條件**:momo-pro-system 確認 /health 存在且 < 100ms + +### Phase 1:Alembic Migration 建 11 張表(30 分鐘) + +```bash +# 位置 +cd /Users/ogt/awoooi/apps/api +# 不帶 --autogenerate(該旗標為布林開關,不接受 =false),產生空白 migration +alembic revision -m "asset_inventory_foundation_v1" +# 編輯新生成的 migration file,填入 §5.2 完整 DDL +alembic upgrade head +``` + +**驗收**: +```sql +SELECT table_name FROM information_schema.tables +WHERE table_schema = 'public' + AND table_name IN ( + 'asset_inventory', 'asset_discovery_run', 'asset_coverage_snapshot', + 'asset_relationship', 'alert_rule_catalog', 'asset_change_event', + 'asset_compliance_snapshot', 'host_capacity_snapshot', 'capacity_violation_event', + 'automation_operation_log', 'ai_collaboration_trace' + ) +ORDER BY table_name; +-- 應回傳 11 張表 +``` + +**風險**:🟢 低(DDL only,PG 本身不在 188 top CPU) +**回滾**:`alembic downgrade -1` + +### Phase 2:全景資產首掃(2 小時,限流) + +**執行腳本**:`apps/api/scripts/audit_assets_v1.py` + +``` +For host in [110, 120, 121, 188, 112, 125]: + scan: + - ssh: uptime / free / df / systemctl list-units --type=service --state=running + - docker ps / docker stats (with sleep 2s between hosts) + - kubectl get all,cm,secret,pvc -A (only on 120 via kubectl) + - Gitea API: repos / webhooks + - Prometheus API: targets / rules + write: + - asset_inventory (upsert by asset_key) + - asset_discovery_run (header) + - asset_coverage_snapshot (每 asset × 7 dimensions) + - asset_relationship (container→host, k8s→namespace) + - alert_rule_catalog (seed from alert_rules.yaml) +``` + +**限流**:每主機 sleep 2s 間隔,不平行打。 + +### Phase 3:Layer 1 告警規則升級(1 小時) + +1. 寫入 `alert_rules.yaml`(§6 Layer 1 規則) +2. Prometheus reload +3. 
每條新規則同步寫 `alert_rule_catalog`(source='yaml_hardcoded',created_by_agent='claude-code') +4. 等 30 分鐘觀察實際觸發 + +### Phase 4:NemoTron 容量巡檢 MVP(2 週) + +**新增服務**:`apps/api/src/services/capacity_inspector.py` + +```python +class CapacityInspector: + async def daily_sweep(self): + for host in self.hosts: + metrics = await prometheus.query_range(...) + snapshot = HostCapacitySnapshot(host=host, ...) + verdict = await nemotron.analyze(metrics) # 診斷 + trend = await hermes.predict(metrics) # 趨勢 + recommendations = await openclaw.propose(snapshot, verdict, trend) + await db.save(snapshot, automation_op_log, ai_trace) + await telegram.push_daily_report(top_3_risks, top_3_proposals) +``` + +**cron**:K8s CronJob 每日 02:00 台北 +**驗收**:7 天後 `host_capacity_snapshot` 至少 35 筆(5 主機 × 7 天) + +### Phase 5:資源配額強制(Layer 2,1 個月) + +- 建 `.gitea/workflows/check_resource_limits.yml` +- 批次補 compose limit +- K3s 建 `ResourceQuota` per namespace +- 違規寫 `capacity_violation_event` + +### Phase 6:主機搬遷(Layer 3,1 季) + +需採購決策,不在本 session 範圍。本檔僅登記計畫。 + +### Phase 7:引擎 3+4 閉環強化(顧問已批准的引擎 4 stderr 回灌) + +- `approval_execution.py` retry loop 加 stderr 回灌邏輯 +- `post_execution_verifier.py` 接 `dynamic_baseline_service.py` +- 每次自我修正寫 `automation_operation_log`(operation_type='self_correction_attempted', stderr_feed_back=<上次執行的 stderr 內容>) + +--- + +## §8 HARD_RULES 新增(進 docs/HARD_RULES.md) + +1. **監控工具必須被監控** — cadvisor / Prometheus / Alertmanager / Grafana / Loki 自己的 CPU/Mem 必須在 `alert_rule_catalog` 有規則(cadvisor 288% 案例為鐵證) +2. **配額即義務** — Docker compose 缺 `deploy.resources.limits` → Gitea CI PR reject +3. **主機容量警戒線** — `host_capacity_snapshot.headroom_pct < 20`(單位為百分比,即剩餘承載不足 20%)連續三日 → 自動凍結新部署到該主機 +4. **告警維度鐵律** — 主機資源告警必須包含 `load_avg / iowait / run_queue`,不能只測 `CPU idle` +5. **IP 靜態綁定技術債登記** — 任何 `/etc/hosts` 寫死必須寫 `automation_operation_log` tags=['technical_debt'] + `capacity_violation_event` +6. **監控覆蓋率 SLO** — asset_coverage_snapshot 中 `red` 比例必須 < 20%(違反觸發 Telegram 日報警示) +7. 
**新服務上線 Gate** — 任何新 K8s workload / Docker 容器上線前必須在 `asset_inventory` 有對應 entry + resource_limits 已填 + +--- + +## §9 AI 分工責任矩陣(11 張表 × 4 Agents) + +| 表 | OpenClaw | NemoTron | Hermes | LLM (Claude) | +|---|---------|---------|--------|--------------| +| asset_inventory | 分類 asset_type | 每日巡檢寫 capacity 欄位 | — | 新資產上線人工輔助 | +| asset_discovery_run | 排程觸發 | — | — | 手動觸發 | +| asset_coverage_snapshot | 計算覆蓋狀態 | 異常告警 | 趨勢分析 | — | +| asset_relationship | 從告警 label 推依賴 | 主機拓撲 | — | — | +| alert_rule_catalog | 規則匹配 (E4) | 規則建議 (E3) | 規則品質分析 (noise_rate) | 人工 review | +| asset_change_event | 變化解讀 | — | 趨勢評估 | — | +| asset_compliance_snapshot | SSL cert 掃描 | — | — | — | +| host_capacity_snapshot | — | **主寫者**(E1) | 趨勢預測 | — | +| capacity_violation_event | 自動偵測 | — | — | — | +| automation_operation_log | 所有 Agent 都寫 | 所有 Agent 都寫 | 所有 Agent 都寫 | claude-code 寫 | +| ai_collaboration_trace | step=diagnostician/solver/reviewer | step=capacity_inspector | step=analyst | step=coordinator | + +--- + +## §10 驗收指標(量化 SLO) + +| 指標 | 當前 | 4 週目標 | 12 週目標 | +|-----|------|---------|----------| +| Prometheus active targets | 35 | 80+ | 150+ | +| alert_rules 總數 | 58 | 90+ | 120+ | +| asset_inventory 資產總數 | 0 | 200+ | 500+ | +| asset_coverage_snapshot red 比例 | 未知 | < 40% | < 20% | +| host_capacity_snapshot 筆數/日 | 0 | 5(一主機一筆) | 5 | +| automation_operation_log 寫入/日 | 低 | 100+ | 500+ | +| ai_collaboration_trace agent=nemotron 筆數/日 | 0 | 5+ | 30+ | +| cadvisor 自身 CPU | 288% | < 50% | < 30% | +| 110 load5 | 18.35 | < 8 | < 6 | +| 188 load5 | 16.32 | < 8 | < 6 | +| self_correction_attempted op 次數/週 | 0 | 5+ | 30+(表示 AI 會自我修正) | + +--- + +## §11 技術債登記(與 Phase 0 綁定) + +| ID | 技術債 | 產生 Phase | 到期 Phase | +|----|-------|----------|----------| +| TD-001 | `/etc/hosts` 靜態綁定 mo.wooo.work | Phase 0a | Phase 6 搬監控時廢除 | +| TD-002 | cadvisor 重啟但未根治 288% 原因 | Phase 0b | Phase 2 根因追溯 | +| TD-003 | HostHighCpuLoad 規則保留(給 Phase 3 替換) | 本 session | Phase 3 新規則上線後 retire | +| TD-004 | 110 
主機超載根源未解(Sentry/Kafka/ClickHouse 疊一起) | 歷史 | Phase 6 搬遷 | +| TD-005 | 188 主機超載根源未解 | 歷史 | Phase 6 搬遷 | +| TD-006 | K3s 120/121/125 無 node-exporter | 歷史 | Phase 2 審計補 | +| TD-007 | Kafka / Ollama GPU / ClickHouse 無 exporter | 歷史 | Phase 2 審計補 | + +**每條 TD 寫 `automation_operation_log` tags=['technical_debt','TD-XXX']**。 + +--- + +## §12 回滾計畫(每階段獨立) + +| Phase | 回滾方式 | +|-------|---------| +| 0a | `sed -i '/mo.wooo.work/d' /etc/hosts` | +| 0b | 重啟失敗則 `docker start cadvisor`(cadvisor 無狀態) | +| 0c | prometheus.yml 還原 + SIGHUP | +| 1 | `alembic downgrade -1` | +| 2 | audit 失敗不影響既有系統,只是 DB 資料不全 | +| 3 | 移除新增 rule block + SIGHUP | +| 4 | 停 CronJob,既有資料保留 | +| 5 | 撤回 workflow 檢查 | +| 6 | 搬遷分段,每段可獨立回滾 | +| 7 | feature_flag `ENGINE_4_SELF_CORRECTION` = false | + +--- + +## §13 Living Changelog(只追加) + +### 2026-04-18 下午 — 本檔建立 + +**觸發**:MoWoooWorkDown 假警報 RCA 深潛 +**關鍵發現**: +- 110 load=18.35 / 188 load=16.32 / cadvisor 288% 持續 13 天無告警 +- Prometheus 覆蓋不到三成(35/150+ targets) +- HostHighCpuLoad 量錯維度(CPU idle vs load_avg) +- K3s 120/121/125 完全無監控 + +**統帥決議**: +- 全景資產盤點 × 7 項自動化 × 永久化 DB +- AI 四分工(OpenClaw / NemoTron / Hermes / LLM) +- 拍板執行 Option B(0a+0b 並行止血)+ 全套 Schema Migration(11 張表) + +**顧問建議納入**: +- `asset_inventory.environment` 欄位(blast radius 邊界) +- `automation_operation_log.retry_count + stderr_feed_back`(引擎 4 閉環) + +**下一步**:待統帥授權 Phase 0a/0b 執行 + +--- + +## §14 給下一位接手 Claude 的協議 + +1. **先讀本檔 §0 + §7**(知道現在在哪階段) +2. **讀 MASTER §0**(AI 自主化北極星) +3. **讀 CLAUDE.md + HARD_RULES.md** +4. **跑** `SELECT * FROM asset_discovery_run ORDER BY started_at DESC LIMIT 5`(看最後一次盤點) +5. **跑** `SELECT host, ai_verdict, recommended_actions FROM host_capacity_snapshot WHERE captured_at > now() - interval '24 hours'`(看 NemoTron 昨夜建議) +6. **問統帥** 當前 Phase(0a/0b/1/2/3/4/5/6/7) +7. **絕對不要** 跳過 Phase 順序或在 110/188 load > 10 時跑重審計 +