-- ADR-090-D: create the data sources for the MASTER §7.1 North Star KPIs
-- Created: 2026-04-18 evening (Asia/Taipei)
-- Author: ogt + Claude Opus 4.7 (1M)
--
-- Background:
--   Benchmarking the 15 KPIs of MASTER §7.1 showed that key tables had never
--   been created, so the following KPIs could never be measured:
--     #3  fine-tune JSONL / week            → finetune_exports table
--     #6  declarative remediation usage     → remediation_events table
--     #10 notification_outcomes             → notification_outcomes table
--   NOTE(review): the original note said "4 key tables", but only the 3 tables
--   listed above are created by this migration — confirm where the 4th lives.
--
-- This migration adds the 3 data-source tables. Every statement uses
-- IF NOT EXISTS (or is a COMMENT ON), so it can be re-run safely.
--
-- Related MASTER § metrics:
--   §3.3 D3 remediation abstraction (Imperative → Declarative)
--   §3.4 D4 learning depth (Fine-tune)
--   §3.6 D6 self-governance (notification quality)

-- ═══════════════════════════════════════════════════════════════════
-- 1. finetune_exports — Phase 3 fine-tune JSONL export tracking
-- ═══════════════════════════════════════════════════════════════════

CREATE TABLE IF NOT EXISTS finetune_exports (
    export_id        BIGSERIAL PRIMARY KEY,
    export_type      TEXT NOT NULL,              -- e.g. 'evidence_snapshot'; full list in finetune_export_type_valid below
    source_table     TEXT,                       -- name of the source table (incidents / agent_sessions ...)
    source_ids       TEXT[],                     -- ids of the source records covered by this export
    file_path        TEXT,                       -- path of the exported JSONL file
    record_count     INT NOT NULL DEFAULT 0,
    size_bytes       BIGINT,
    checksum_sha256  TEXT,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    metadata         JSONB NOT NULL DEFAULT '{}'::jsonb,
    CONSTRAINT finetune_export_type_valid CHECK (export_type IN (
        'evidence_snapshot','agent_session','decision_outcome',
        'incident_rca','playbook_outcome','rlhf_trace'
    ))
);

COMMENT ON TABLE finetune_exports IS
    'ADR-090-D: MASTER §7.1 #3 Fine-tune JSONL 產出追蹤。每次 finetune_exporter 匯出寫一筆。';

-- Supports "most recent exports" queries (e.g. a per-week KPI window).
CREATE INDEX IF NOT EXISTS idx_finetune_exports_created
    ON finetune_exports(created_at DESC);
CREATE INDEX IF NOT EXISTS idx_finetune_exports_type
    ON finetune_exports(export_type);


-- ═══════════════════════════════════════════════════════════════════
-- 2. remediation_events — Phase 5 declarative remediation tracking
-- ═══════════════════════════════════════════════════════════════════

CREATE TABLE IF NOT EXISTS remediation_events (
    event_id            BIGSERIAL PRIMARY KEY,
    incident_id         TEXT,
    approval_id         TEXT,
    remediation_type    TEXT NOT NULL,           -- e.g. 'declarative' | 'imperative'; full list in remediation_type_valid below
    action_name         TEXT,
    target_resource     TEXT,                    -- e.g. deployment/awoooi-api
    namespace           TEXT,
    dry_run             BOOLEAN NOT NULL DEFAULT false,
    status              TEXT NOT NULL,           -- e.g. 'pending' | 'success'; full list in remediation_status_valid below
    error_message       TEXT,
    blast_radius_score  INT,
    duration_ms         INT,
    executed_by         TEXT,                    -- 'ai_agent' | 'human:ogt' | 'cron'
    triggered_by_op_id  UUID,                    -- points at automation_operation_log.op_id (no FK is declared here)
    created_at          TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    completed_at        TIMESTAMPTZ,
    metadata            JSONB NOT NULL DEFAULT '{}'::jsonb,
    CONSTRAINT remediation_type_valid CHECK (remediation_type IN (
        'declarative','imperative','gitops_pr','kubectl','ansible','helm','argocd_sync'
    )),
    CONSTRAINT remediation_status_valid CHECK (status IN (
        'pending','success','failed','rolled_back','dry_run_ok','dry_run_failed'
    ))
);

COMMENT ON TABLE remediation_events IS
    'ADR-090-D: MASTER §7.1 #6 Declarative 修復使用率。每次 declarative_remediation 執行寫一筆。';

CREATE INDEX IF NOT EXISTS idx_remediation_events_time
    ON remediation_events(created_at DESC);
CREATE INDEX IF NOT EXISTS idx_remediation_events_type
    ON remediation_events(remediation_type);
-- Partial index: rows without an incident are left out of the index.
CREATE INDEX IF NOT EXISTS idx_remediation_events_incident
    ON remediation_events(incident_id) WHERE incident_id IS NOT NULL;


-- ═══════════════════════════════════════════════════════════════════
-- 3. notification_outcomes — notification outcome tracking
-- ═══════════════════════════════════════════════════════════════════

CREATE TABLE IF NOT EXISTS notification_outcomes (
    outcome_id          BIGSERIAL PRIMARY KEY,
    incident_id         TEXT,
    approval_id         TEXT,
    channel             TEXT NOT NULL,           -- e.g. 'telegram' | 'email'; full list in notif_channel_valid below
    notification_type   TEXT,                    -- TYPE-1/2/3/4/4D/5S/6B/7E/8M (free text, not constrained)
    recipient           TEXT,                    -- chat_id / email / user
    message_id          TEXT,                    -- e.g. the telegram message_id
    sent_at             TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    delivery_status     TEXT NOT NULL,           -- e.g. 'delivered' | 'failed'; full list in notif_delivery_valid below
    delivery_error      TEXT,
    -- Human interaction tracking (high-value RLHF corpus)
    user_action         TEXT,                    -- 'approved' | 'rejected' | 'silenced' | 'ignored' | 'no_response' (not constrained)
    user_action_at      TIMESTAMPTZ,
    user_comment        TEXT,
    -- Notification quality
    snoozed_count       INT NOT NULL DEFAULT 0,
    time_to_action_sec  INT,                     -- seconds from receipt until a button was pressed
    metadata            JSONB NOT NULL DEFAULT '{}'::jsonb,
    CONSTRAINT notif_channel_valid CHECK (channel IN (
        'telegram','email','slack','webhook','sms','discord'
    )),
    CONSTRAINT notif_delivery_valid CHECK (delivery_status IN (
        'delivered','failed','pending','rate_limited'
    ))
);

COMMENT ON TABLE notification_outcomes IS
    'ADR-090-D: MASTER §7.1 #10 notification_outcomes 追蹤。每次 telegram_gateway 推送寫一筆,用戶按鈕觸發時 update user_action。';

CREATE INDEX IF NOT EXISTS idx_notification_outcomes_sent
    ON notification_outcomes(sent_at DESC);
CREATE INDEX IF NOT EXISTS idx_notification_outcomes_incident
    ON notification_outcomes(incident_id) WHERE incident_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_notification_outcomes_approval
    ON notification_outcomes(approval_id) WHERE approval_id IS NOT NULL;
-- Partial index over delivered notifications that nobody has acted on yet.
CREATE INDEX IF NOT EXISTS idx_notification_outcomes_pending_action
    ON notification_outcomes(sent_at DESC)
    WHERE user_action IS NULL AND delivery_status='delivered';


-- ═══════════════════════════════════════════════════════════════════
-- Acceptance checks (run manually after applying the migration)
-- ═══════════════════════════════════════════════════════════════════

-- SELECT table_name FROM information_schema.tables
--   WHERE table_schema='public'
--     AND table_name IN ('finetune_exports','remediation_events','notification_outcomes')
--   ORDER BY table_name;
-- Expected: 3 rows

-- SELECT conname FROM pg_constraint WHERE conrelid IN (
--   'finetune_exports'::regclass,
--   'remediation_events'::regclass,
--   'notification_outcomes'::regclass
-- ) AND contype='c' ORDER BY conname;
def _remediation_event_params(
    spec: "DeclarativeSpec",
    action: str,
    target: str,
    namespace: str,
) -> dict:
    """Build the bind parameters for one ``remediation_events`` INSERT.

    Pure function (no I/O) so the mapping can be checked without a database.

    Args:
        spec: Evaluation result; reads ``can_auto_execute``,
            ``requires_gitops_pr``, ``dry_run_required``,
            ``blast_radius_score`` and ``tier``.
        action: Action name, clipped to 200 characters.
        target: Target resource, clipped to 100 characters; empty/None → NULL.
        namespace: Namespace, clipped to 50 characters; None → NULL.

    Returns:
        Dict keyed by the placeholder names used in the INSERT statement.
    """
    import json

    # gitops_pr takes precedence over the declarative/imperative split.
    if spec.requires_gitops_pr:
        remediation_type = "gitops_pr"
    elif spec.can_auto_execute:
        remediation_type = "declarative"
    else:
        remediation_type = "imperative"

    return {
        "rt": remediation_type,
        # A missing action/namespace becomes NULL instead of raising on
        # ``None[:n]`` and losing the whole KPI row.
        "an": action[:200] if action is not None else None,
        "tr": target[:100] if target else None,
        "ns": namespace[:50] if namespace is not None else None,
        "dr": spec.dry_run_required,
        "br": spec.blast_radius_score,
        # json.dumps escapes quotes/backslashes, so the CAST to jsonb cannot
        # fail on an unusual tier value. default=str is deliberate: this is a
        # best-effort audit field and a non-JSON tier (e.g. an Enum) should
        # still be recorded rather than dropped.
        "md": json.dumps({"tier": spec.tier}, default=str),
    }


async def _log_remediation_event(
    spec: "DeclarativeSpec",
    action: str,
    target: str,
    namespace: str,
) -> None:
    """Record one evaluation in ``remediation_events`` (MASTER §7.1 #6 KPI source).

    Added 2026-04-18 for ADR-090-D. Inserts a row with status ``'pending'`` and
    ``executed_by = 'ai_agent'``. The original note says the real execution
    status is to be updated later by approval_execution.py (future iteration).

    This is best-effort telemetry: any failure (import, connection, constraint)
    is logged as a warning and never propagated to the caller.

    Args:
        spec: The evaluated declarative spec.
        action: Action name that was evaluated.
        target: Target resource (may be empty).
        namespace: Namespace of the target.
    """
    try:
        # Imported lazily, as in the original, to keep module import light.
        from sqlalchemy import text as _sql
        from src.db.base import get_db_context

        params = _remediation_event_params(spec, action, target, namespace)

        # NOTE(review): assumes get_db_context() commits on a clean exit —
        # confirm, otherwise this INSERT is rolled back when the context closes.
        async with get_db_context() as db:
            await db.execute(
                _sql("""
                    INSERT INTO remediation_events (
                        remediation_type, action_name, target_resource, namespace,
                        dry_run, status, blast_radius_score, executed_by,
                        metadata
                    ) VALUES (
                        :rt, :an, :tr, :ns,
                        :dr, 'pending', :br, 'ai_agent',
                        CAST(:md AS jsonb)
                    )
                """),
                params,
            )
    except Exception as _e:
        # Deliberate catch-all: KPI logging must not break remediation itself.
        logger.warning("remediation_events_db_write_failed", error=str(_e))
_f: + _checksum = hashlib.sha256(_f.read()).hexdigest() + _ids = [str(ev.id) for ev in evidences] + async with session_factory() as _db: + await _db.execute( + sql_text(""" + INSERT INTO finetune_exports ( + export_type, source_table, source_ids, + file_path, record_count, size_bytes, checksum_sha256, + metadata + ) VALUES ( + 'evidence_snapshot', 'incident_evidence', :ids, + :fp, :rc, :sz, :cs, CAST(:md AS jsonb) + ) + """), + { + "ids": _ids, + "fp": output_path, + "rc": len(rows), + "sz": _size, + "cs": _checksum, + "md": json.dumps({"lookback_days": EXPORT_LOOKBACK_DAYS}), + }, + ) + except Exception as _db_e: + logger.warning("finetune_exports_db_write_failed", error=str(_db_e)) + return output_path, len(rows) async def _build_row(self, db, ev: IncidentEvidence) -> dict | None: diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index faa81a16..feabbf29 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -184,6 +184,40 @@ def classify_alert_early(alertname: str, severity: str, labels: dict | None = No ): return "backup", "TYPE-1" + # 2026-04-18 ogt + Claude Opus 4.7: 擴規則降 general 兜底(MASTER §7.1 #7 <10%) + # 根據 7d 實測 general 17 種 alertname 整理: + # + # 5.1 測試告警攔截(避免污染生產指標) + # TestAlert / FingerprintTest / E2ETestAlert / ADR089Test / L4ClosureLoop + # FP[A-Z]... 
/ *FreshUniq* → test category (TYPE-1 純通知) + if ( + alertname.startswith(("Test", "FingerprintTest", "ADR089", "L4Closure", "FPTest")) + or "FreshUniq" in alertname + or alertname in ("E2ETestAlert",) + or alertname.startswith("FP") and alertname[2:3].isupper() # FPTestB, FPTestA + ): + return "test", "TYPE-1" + + # 5.2 HighCPU / HighMemory / 其他 High* 主機資源類 + if alertname.startswith(("HighCPU", "HighMemory", "HighMem", "HighDisk", "HighLoad")): + return "host_resource", "TYPE-3" + + # 5.3 TLS / SSL / ProbeFailure → ssl_cert 或 external_site + if ( + alertname.startswith(("TLS", "SSL", "Certificate")) + or "ProbeFailure" in alertname + or alertname in ("TestConnectivity",) # ProbeFailure 同義 + ): + return "ssl_cert", "TYPE-3" + + # 5.4 PostgreSQL 詳盡(補 PostgreSQL* 變體,原 rule 用 startswith("Postgres") + # 按理涵蓋 PostgreSQLDiskGrowthRate 但實測落 general → 加保險規則) + if ( + alertname.startswith(("PostgreSQL", "MySQL", "MongoDB")) + or "DiskGrowthRate" in alertname + ): + return "database", "TYPE-3" + # 6. 
主機資源(從 infrastructure 分離,ADR-075 統帥決議) if alertname.startswith("Host"): return "host_resource", "TYPE-3" diff --git a/apps/api/src/services/pre_decision_investigator.py b/apps/api/src/services/pre_decision_investigator.py index 1a0a64f7..e8f6eae5 100644 --- a/apps/api/src/services/pre_decision_investigator.py +++ b/apps/api/src/services/pre_decision_investigator.py @@ -265,6 +265,9 @@ class PreDecisionInvestigator: tool_name = reg.tool.name snapshot.mcp_health[tool_name] = False # 預設失敗,成功後覆蓋 + _started = asyncio.get_event_loop().time() + _mcp_status = "failed" + _mcp_error = None try: result = await asyncio.wait_for( reg.provider.execute(tool_name, params), @@ -277,10 +280,12 @@ class PreDecisionInvestigator: tool=tool_name, error=result.error, ) + _mcp_error = str(result.error)[:200] if result.error else "unknown" return snapshot.mcp_health[tool_name] = True snapshot.sensors_succeeded += 1 + _mcp_status = "success" # 依感官維度填入對應欄位 raw = result.output @@ -288,8 +293,73 @@ class PreDecisionInvestigator: except asyncio.TimeoutError: logger.warning("investigator_tool_timeout", tool=tool_name, timeout=MCP_TOOL_TIMEOUT_SEC) - except Exception: + _mcp_status = "timeout" + _mcp_error = f"timeout {MCP_TOOL_TIMEOUT_SEC}s" + except Exception as _e: logger.exception("investigator_tool_error", tool=tool_name) + _mcp_status = "error" + _mcp_error = str(_e)[:200] + finally: + # 2026-04-18 ADR-090-D: MCP 呼叫入 timeline_events(MASTER §7.1 #4 KPI) + try: + _duration_ms = int((asyncio.get_event_loop().time() - _started) * 1000) + asyncio.create_task(_log_mcp_call_to_timeline( + snapshot_incident_id=getattr(snapshot, "incident_id", None), + provider_name=reg.provider.name, + tool_name=tool_name, + status=_mcp_status, + error=_mcp_error, + duration_ms=_duration_ms, + )) + except Exception: + pass + + +async def _log_mcp_call_to_timeline( + snapshot_incident_id: str | None, + provider_name: str, + tool_name: str, + status: str, + error: str | None, + duration_ms: int, +) -> None: + """ + 
2026-04-18 ADR-090-D: MCP 呼叫寫入 timeline_events,支援 MASTER §7.1 #4 + "MCP 呼叫次數/24h > 0" KPI 量測。 + """ + try: + from sqlalchemy import text as _sql + from src.db.base import get_db_context + import json as _json + _description = _json.dumps({ + "provider": provider_name, + "tool": tool_name, + "status": status, + "error": error, + "duration_ms": duration_ms, + }, ensure_ascii=False) + async with get_db_context() as _db: + await _db.execute( + _sql(""" + INSERT INTO timeline_events ( + incident_id, event_type, status, title, description, actor, + actor_role, created_at + ) VALUES ( + :iid, 'mcp_call', :st, :tl, :desc, :actor, + 'mcp', NOW() + ) + """), + { + "iid": snapshot_incident_id or "unknown", + "st": status, + "tl": f"MCP {provider_name}.{tool_name}"[:100], + "desc": _description[:500], + "actor": provider_name[:50], + }, + ) + except Exception: + # 靜默失敗,timeline_events 是稽核,不能反噬 MCP 主流程 + pass # ───────────────────────────────────────────────────────────────────────────── diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 788c65db..e8d8013e 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -1688,6 +1688,35 @@ class TelegramGateway: message_id=_msg_id, ) + # 2026-04-18 ADR-090-D: 寫入 notification_outcomes (MASTER §7.1 #10 KPI) + try: + from sqlalchemy import text as _sql + from src.db.base import get_db_context + _delivered = "delivered" if _msg_id else "failed" + _notif_type = f"TYPE-3-{alert_category}" if alert_category else "TYPE-3" + async with get_db_context() as _db: + await _db.execute( + _sql(""" + INSERT INTO notification_outcomes ( + approval_id, channel, notification_type, recipient, + message_id, delivery_status, metadata + ) VALUES ( + :aid, 'telegram', :nt, :rp, + :mid, :ds, CAST(:md AS jsonb) + ) + """), + { + "aid": approval_id, + "nt": _notif_type, + "rp": str(settings.OPENCLAW_TG_CHAT_ID), + "mid": str(_msg_id) if _msg_id else None, + 
"ds": _delivered, + "md": '{"risk_level":"' + str(risk_level) + '"}', + }, + ) + except Exception as _db_e: + logger.warning("notification_outcomes_db_write_failed", error=str(_db_e)) + # 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 儲存 message_id 供自動修復後更新卡片 # key: tg_approval:{approval_id},TTL 24h if _msg_id: