diff --git a/apps/api/migrations/awooop_phase2_budget_ledger_2026-05-04.sql b/apps/api/migrations/awooop_phase2_budget_ledger_2026-05-04.sql
new file mode 100644
index 00000000..2a849e6a
--- /dev/null
+++ b/apps/api/migrations/awooop_phase2_budget_ledger_2026-05-04.sql
@@ -0,0 +1,66 @@
+-- AwoooP Phase 2.6: create the budget_ledger table and define its columns
+-- 2026-05-04 ogt + Claude Sonnet 4.6 (ADR-120 D5 implementation)
+--
+-- Accounting layer of the three-layer Hard Kill architecture that guards against a repeat of the $47k incident:
+-- - one ledger record is written after every completed LLM call
+-- - feeds the Tenant Budget Cache calculation, dashboard spend statistics and alert-threshold triggers
+--
+-- The Phase 1 Control Plane migration must run first (awooop_projects table exists)
+-- awooop_run_state columns are to be added after the Phase 3 SAGA implementation
+
+-- =========================================================
+-- STEP 1: create the budget_ledger table
+-- =========================================================
+CREATE TABLE IF NOT EXISTS budget_ledger (
+    id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
+    project_id VARCHAR(64) NOT NULL DEFAULT 'awoooi',  -- tenant key compared by the RLS policy; defaults to 'awoooi' when omitted
+    agent_id VARCHAR(128),
+    run_id UUID,  -- no foreign key is declared on this column
+    model VARCHAR(64),
+    provider VARCHAR(32),
+    prompt_tokens INT,
+    completion_tokens INT,
+    cost_usd NUMERIC(14, 8) NOT NULL DEFAULT 0,  -- 8 decimals: a scale of 4 rounds per-call costs below $0.00005 to zero, so sums under-report spend
+    recorded_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+COMMENT ON TABLE budget_ledger IS 'ADR-120: 每次 LLM call 的 token/cost accounting 記錄';
+COMMENT ON COLUMN budget_ledger.cost_usd IS 'prompt + completion token 的估算費用(USD)';
+
+-- =========================================================
+-- STEP 2: indexes (analytics + query efficiency)
+-- =========================================================
+CREATE INDEX IF NOT EXISTS idx_budget_ledger_project_date
+    ON budget_ledger(project_id, recorded_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_budget_ledger_run
+    ON budget_ledger(run_id)
+    WHERE run_id IS NOT NULL;
+
+CREATE INDEX IF NOT EXISTS idx_budget_ledger_agent
+    ON budget_ledger(project_id, agent_id, recorded_at DESC)
+    WHERE agent_id IS NOT NULL;
+
+-- =========================================================
+-- STEP 3: RLS (ADR-118 multi-tenant isolation)
+-- 
=========================================================
+ALTER TABLE budget_ledger ENABLE ROW LEVEL SECURITY;
+ALTER TABLE budget_ledger FORCE ROW LEVEL SECURITY;  -- FORCE makes the table owner subject to the policies as well
+
+DROP POLICY IF EXISTS budget_ledger_tenant_isolation ON budget_ledger;
+CREATE POLICY budget_ledger_tenant_isolation ON budget_ledger
+    FOR ALL TO awooop_app
+    USING (project_id = current_setting('app.project_id', TRUE))
+    WITH CHECK (project_id = current_setting('app.project_id', TRUE));
+
+-- =========================================================
+-- STEP 4: GRANT
+-- =========================================================
+GRANT SELECT, INSERT ON budget_ledger TO awooop_app;
+
+-- =========================================================
+-- Acceptance queries
+-- =========================================================
+-- SELECT tablename, rowsecurity FROM pg_tables WHERE tablename = 'budget_ledger';
+-- -- expected: rowsecurity = true
+-- SELECT count(*) FROM budget_ledger; -- = 0 (freshly created)
diff --git a/apps/api/migrations/awooop_phase4_run_state_2026-05-04.sql b/apps/api/migrations/awooop_phase4_run_state_2026-05-04.sql
new file mode 100644
index 00000000..12b3c410
--- /dev/null
+++ b/apps/api/migrations/awooop_phase4_run_state_2026-05-04.sql
@@ -0,0 +1,200 @@
+-- AwoooP Phase 4: Platform Shell in Shadow Mode
+-- Persistence tables for the Run State Machine
+-- 2026-05-04 ogt + Claude Sonnet 4.6 (ADR-114/ADR-119)
+--
+-- Prerequisite: the Phase 1 control plane (awooop_projects) must already be applied
+--
+-- Three tables:
+-- awooop_run_state — Run FSM main table (lease + heartbeat + SKIP LOCKED)
+-- awooop_run_step_journal — SAGA step journal (tool call + compensation instruction, ADR-119)
+-- awooop_run_idempotency — de-duplication / idempotency table (ADR-114)
+
+-- =========================================================
+-- STEP 1: awooop_run_state
+-- =========================================================
+CREATE TABLE IF NOT EXISTS awooop_run_state (
+    run_id UUID PRIMARY KEY,  -- no DEFAULT: the inserting caller must supply the id
+    project_id VARCHAR(64) NOT NULL REFERENCES awooop_projects(project_id),
+    agent_id VARCHAR(128) NOT NULL,
+
+    -- FSM state
+    state VARCHAR(32) NOT NULL DEFAULT 'pending'
+        CHECK (state IN (
+            'pending','running','waiting_tool',
+            'waiting_approval','completed','failed',
+            'cancelled','timeout'
+        )),
+
+    -- Worker lease (SKIP LOCKED prevents double pickup)
+    lease_until TIMESTAMPTZ,
+    heartbeat_at TIMESTAMPTZ,
+    worker_id VARCHAR(128),
+
+    -- Retry counters
+    attempt_count SMALLINT NOT NULL DEFAULT 0,
+    max_attempts SMALLINT NOT NULL DEFAULT 3,
+
+    -- Observability
+    trace_id VARCHAR(128),
+
+    -- Trigger source
+    trigger_type VARCHAR(32),
+    trigger_ref VARCHAR(256),  -- channel_event_id / schedule_id / etc.
+
+    -- Shadow mode flag
+    is_shadow BOOLEAN NOT NULL DEFAULT TRUE,
+
+    -- Artifact integrity (ADR-112)
+    input_sha256 CHAR(64),
+    output_sha256 CHAR(64),
+
+    -- Budget
+    cost_usd NUMERIC(10, 4) NOT NULL DEFAULT 0.0000,
+    step_count SMALLINT NOT NULL DEFAULT 0,
+
+    -- Outcome
+    error_code VARCHAR(64),
+    error_detail TEXT,
+
+    -- Timestamps
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    started_at TIMESTAMPTZ,
+    completed_at TIMESTAMPTZ,
+    timeout_at TIMESTAMPTZ
+);
+
+COMMENT ON TABLE awooop_run_state IS
+    'ADR-114: Run FSM 主表,SKIP LOCKED worker lease';
+COMMENT ON COLUMN awooop_run_state.is_shadow IS
+    'Phase 4 shadow mode:TRUE = 不產生 user response,不執行 destructive tool';
+
+-- Index: workers scanning for pending runs (used with SKIP LOCKED)
+CREATE INDEX IF NOT EXISTS idx_run_state_pending
+    ON awooop_run_state (project_id, created_at)
+    WHERE state = 'pending' AND lease_until IS NULL;
+
+-- Index: stale-run reaper (finds running runs by lease expiry)
+CREATE INDEX IF NOT EXISTS idx_run_state_stale
+    ON awooop_run_state (lease_until)
+    WHERE state = 'running' AND lease_until IS NOT NULL;
+
+-- Index: project timeline (dashboard queries)
+CREATE INDEX IF NOT EXISTS idx_run_state_project_timeline
+    ON awooop_run_state (project_id, created_at DESC);
+
+-- Index: trace_id (cross-system tracing)
+CREATE INDEX IF NOT EXISTS idx_run_state_trace_id
+    ON awooop_run_state (trace_id)
+    WHERE trace_id IS NOT NULL;
+
+-- =========================================================
+-- STEP 2: 
awooop_run_step_journal(SAGA step journal,ADR-119)
+-- =========================================================
+CREATE TABLE IF NOT EXISTS awooop_run_step_journal (
+    step_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    run_id UUID NOT NULL REFERENCES awooop_run_state(run_id) ON DELETE CASCADE,
+    project_id VARCHAR(64) NOT NULL,  -- tenant key for RLS; no FK ties it to the parent run's project_id
+
+    -- Step order (increasing within each run)
+    step_seq SMALLINT NOT NULL,
+
+    -- Tool call information
+    tool_name VARCHAR(128) NOT NULL,
+    mcp_gateway_id VARCHAR(128),
+
+    -- Artifact integrity (ADR-112)
+    input_hash CHAR(64),
+    output_hash CHAR(64),
+
+    -- SAGA compensation instruction (JSON)
+    compensation_json JSONB,
+
+    -- Execution result
+    result_status VARCHAR(16) NOT NULL DEFAULT 'pending'
+        CHECK (result_status IN ('pending','success','failed','compensated')),
+    error_code VARCHAR(64),
+
+    -- Shadow-mode interception record
+    was_blocked BOOLEAN NOT NULL DEFAULT FALSE,
+    block_reason VARCHAR(128),
+
+    -- Timing
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    completed_at TIMESTAMPTZ,
+    latency_ms INTEGER
+);
+
+COMMENT ON TABLE awooop_run_step_journal IS
+    'ADR-119 SAGA step journal:每個 tool call 獨立記錄 + 補償指令';
+
+CREATE UNIQUE INDEX IF NOT EXISTS uix_run_step_seq
+    ON awooop_run_step_journal (run_id, step_seq);
+
+-- (run_id, step_seq) lookups are served by uix_run_step_seq above; the former idx_run_step_run_id repeated the
+-- same key columns and only added write cost. NOTE(review): drop the matching Index from the ORM model if one is declared.
+
+-- =========================================================
+-- STEP 3: awooop_run_idempotency (ADR-114 de-duplication / idempotency)
+-- =========================================================
+CREATE TABLE IF NOT EXISTS awooop_run_idempotency (
+    idempotency_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    project_id VARCHAR(64) NOT NULL,
+    channel_type VARCHAR(32) NOT NULL,
+    provider_event_id VARCHAR(256) NOT NULL,
+
+    -- The run this key maps to (no ON DELETE action, unlike the step journal's CASCADE)
+    run_id UUID NOT NULL REFERENCES awooop_run_state(run_id),
+
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+COMMENT ON TABLE awooop_run_idempotency IS
+    'ADR-114: (project_id, channel_type, provider_event_id) → run_id 去重';
+
+CREATE UNIQUE INDEX IF NOT EXISTS uix_run_idempotency_key
+    ON awooop_run_idempotency (project_id, channel_type, provider_event_id);
+
+CREATE INDEX IF NOT EXISTS idx_run_idempotency_run_id
+    ON awooop_run_idempotency (run_id);
+
+-- =========================================================
+-- STEP 4: RLS (ADR-118 multi-tenant isolation)
+-- =========================================================
+ALTER TABLE awooop_run_state ENABLE ROW LEVEL SECURITY;
+ALTER TABLE awooop_run_state FORCE ROW LEVEL SECURITY;
+ALTER TABLE awooop_run_step_journal ENABLE ROW LEVEL SECURITY;
+ALTER TABLE awooop_run_step_journal FORCE ROW LEVEL SECURITY;
+ALTER TABLE awooop_run_idempotency ENABLE ROW LEVEL SECURITY;
+ALTER TABLE awooop_run_idempotency FORCE ROW LEVEL SECURITY;
+
+DROP POLICY IF EXISTS run_state_tenant_isolation ON awooop_run_state;
+CREATE POLICY run_state_tenant_isolation ON awooop_run_state
+    FOR ALL TO awooop_app
+    USING (project_id = current_setting('app.project_id', TRUE))
+    WITH CHECK (project_id = current_setting('app.project_id', TRUE));
+
+DROP POLICY IF EXISTS run_step_journal_tenant_isolation ON awooop_run_step_journal;
+CREATE POLICY run_step_journal_tenant_isolation ON awooop_run_step_journal
+    FOR ALL TO awooop_app
+    USING (project_id = current_setting('app.project_id', TRUE))
+    WITH CHECK (project_id = current_setting('app.project_id', TRUE));
+
+DROP POLICY IF EXISTS run_idempotency_tenant_isolation ON awooop_run_idempotency;
+CREATE POLICY run_idempotency_tenant_isolation ON awooop_run_idempotency
+    FOR ALL TO awooop_app
+    USING (project_id = current_setting('app.project_id', TRUE))
+    WITH CHECK (project_id = current_setting('app.project_id', TRUE));
+
+-- =========================================================
+-- STEP 5: GRANT
+-- =========================================================
+GRANT SELECT, INSERT, UPDATE ON awooop_run_state TO awooop_app;
+GRANT SELECT, INSERT, UPDATE ON awooop_run_step_journal TO awooop_app;
+GRANT SELECT, INSERT ON awooop_run_idempotency TO awooop_app;
+
+-- 
=========================================================
+-- Acceptance queries
+-- =========================================================
+-- SELECT tablename, rowsecurity FROM pg_tables
+-- WHERE tablename IN ('awooop_run_state','awooop_run_step_journal','awooop_run_idempotency');
+-- expected: rowsecurity = true for every row
diff --git a/apps/api/migrations/awooop_phase5_mcp_gateway_2026-05-04.sql b/apps/api/migrations/awooop_phase5_mcp_gateway_2026-05-04.sql
new file mode 100644
index 00000000..d6955ca1
--- /dev/null
+++ b/apps/api/migrations/awooop_phase5_mcp_gateway_2026-05-04.sql
@@ -0,0 +1,198 @@
+-- =============================================================================
+-- AwoooP Phase 5: the four MCP Gateway tables
+-- ADR-116 (five-gate enforcement) + ADR-118 (credential isolation)
+-- 2026-05-04 ogt + Claude Sonnet 4.6
+-- =============================================================================
+-- Execution order:
+-- 1. awooop_mcp_tool_registry — tool allowlist
+-- 2. awooop_mcp_grants — Agent × Tool authorization records
+-- 3. awooop_mcp_credential_refs — k8s Secret references (no plaintext stored)
+-- 4. awooop_mcp_gateway_audit — audit record for every gateway call
+-- =============================================================================
+
+BEGIN;
+
+-- ---------------------------------------------------------------------------
+-- 1. 
awooop_mcp_tool_registry — tool allowlist (Gate 3: Tool)
+-- ---------------------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS awooop_mcp_tool_registry (
+    tool_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    project_id VARCHAR(64) NOT NULL
+        REFERENCES awooop_projects(project_id) ON DELETE CASCADE,
+    tool_name VARCHAR(128) NOT NULL,
+    tool_type VARCHAR(32) NOT NULL,  -- 'builtin' | 'mcp_server' | 'custom'
+    description TEXT,
+    allowed_scopes JSONB NOT NULL DEFAULT '[]'::jsonb,  -- ["read","write","admin"]
+    environment_tags JSONB NOT NULL DEFAULT '{}'::jsonb,  -- {"env": "prod"}, used by gate 4
+    is_active BOOLEAN NOT NULL DEFAULT TRUE,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),  -- no trigger is defined in this file, so writers must refresh it themselves
+
+    CONSTRAINT chk_tool_type
+        CHECK (tool_type IN ('builtin','mcp_server','custom')),
+    CONSTRAINT chk_allowed_scopes_array
+        CHECK (jsonb_typeof(allowed_scopes) = 'array'),
+    CONSTRAINT uix_tool_registry_project_name
+        UNIQUE (project_id, tool_name)
+);
+
+CREATE INDEX IF NOT EXISTS idx_mcp_tool_registry_project
+    ON awooop_mcp_tool_registry (project_id, is_active);
+
+-- ---------------------------------------------------------------------------
+-- 2. 
awooop_mcp_grants — Agent × Tool authorization (Gate 2: Agent + Gate 3: Tool)
+-- ---------------------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS awooop_mcp_grants (
+    grant_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    project_id VARCHAR(64) NOT NULL
+        REFERENCES awooop_projects(project_id) ON DELETE CASCADE,
+    agent_id VARCHAR(128) NOT NULL,  -- awooop_agents.agent_id (no FK declared here)
+    tool_id UUID NOT NULL
+        REFERENCES awooop_mcp_tool_registry(tool_id) ON DELETE CASCADE,
+    granted_by VARCHAR(128) NOT NULL,  -- principal (human user / system)
+    granted_scopes JSONB NOT NULL DEFAULT '[]'::jsonb,  -- subset of tool.allowed_scopes (not enforced by a constraint here)
+    expires_at TIMESTAMPTZ,  -- NULL = never expires
+    is_revoked BOOLEAN NOT NULL DEFAULT FALSE,
+    revoked_at TIMESTAMPTZ,
+    revoked_by VARCHAR(128),
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+
+    CONSTRAINT chk_grant_scopes_array
+        CHECK (jsonb_typeof(granted_scopes) = 'array'),
+    CONSTRAINT chk_revoke_consistency  -- active grants carry no revoke fields; revoked grants must have revoked_at (revoked_by stays optional)
+        CHECK (
+            (is_revoked = FALSE AND revoked_at IS NULL AND revoked_by IS NULL)
+            OR
+            (is_revoked = TRUE AND revoked_at IS NOT NULL)
+        ),
+    CONSTRAINT uix_mcp_grant_agent_tool
+        UNIQUE (project_id, agent_id, tool_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_mcp_grants_lookup
+    ON awooop_mcp_grants (project_id, agent_id, tool_id)
+    WHERE is_revoked = FALSE;
+
+CREATE INDEX IF NOT EXISTS idx_mcp_grants_expiry
+    ON awooop_mcp_grants (expires_at)
+    WHERE is_revoked = FALSE AND expires_at IS NOT NULL;
+
+-- ---------------------------------------------------------------------------
+-- 3. 
awooop_mcp_credential_refs — k8s Secret references (ADR-118 credential isolation)
+-- Stores only the ref path + sha256 fingerprint; plaintext never enters the database
+-- ---------------------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS awooop_mcp_credential_refs (
+    ref_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    tool_id UUID NOT NULL
+        REFERENCES awooop_mcp_tool_registry(tool_id) ON DELETE CASCADE,
+    project_id VARCHAR(64) NOT NULL
+        REFERENCES awooop_projects(project_id) ON DELETE CASCADE,
+    -- k8s secret ref, format "namespace/secret-name#key"
+    k8s_secret_ref VARCHAR(256) NOT NULL,
+    -- sha256(actual_secret_value) — kept for audit; the original value is not stored
+    value_sha256 VARCHAR(64),
+    description TEXT,
+    is_active BOOLEAN NOT NULL DEFAULT TRUE,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    rotated_at TIMESTAMPTZ,
+
+    CONSTRAINT chk_k8s_ref_format  -- '.' is accepted in the secret name and in the key (e.g. "tls.crt"); the namespace part stays a plain DNS label
+        CHECK (k8s_secret_ref ~ '^[a-z0-9-]+/[a-z0-9.-]+#[a-zA-Z0-9._-]+$'),
+    CONSTRAINT chk_value_sha256_hex
+        CHECK (value_sha256 IS NULL OR value_sha256 ~ '^[0-9a-f]{64}$'),
+    CONSTRAINT uix_credential_ref_tool
+        UNIQUE (tool_id, k8s_secret_ref)
+);
+
+CREATE INDEX IF NOT EXISTS idx_mcp_cred_refs_tool
+    ON awooop_mcp_credential_refs (tool_id)
+    WHERE is_active = TRUE;
+
+-- ---------------------------------------------------------------------------
+-- 4. 
awooop_mcp_gateway_audit — gateway call audit log (ADR-116 P1-09)
+-- Raw input/output is not stored; only hashes + the result status
+-- ---------------------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS awooop_mcp_gateway_audit (
+    call_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    project_id VARCHAR(64) NOT NULL,
+    run_id UUID,  -- soft FK (the run may not exist)
+    trace_id VARCHAR(128),
+    agent_id VARCHAR(128),
+    tool_id UUID NOT NULL  -- NOTE(review): NOT NULL + FK means a call for a tool missing from the registry cannot be recorded — confirm intended
+        REFERENCES awooop_mcp_tool_registry(tool_id),  -- NOTE(review): no ON DELETE action, so a registry row with audit rows cannot be deleted — confirm intended
+    tool_name VARCHAR(128) NOT NULL,
+    credential_ref VARCHAR(256),  -- k8s_secret_ref path (does not contain the key value)
+    input_hash VARCHAR(64),  -- sha256(canonical input JSON)
+    output_hash VARCHAR(64),  -- sha256(canonical output JSON)
+    gate_result JSONB NOT NULL DEFAULT '{}'::jsonb,
+    -- {"gate1_project": true, "gate2_agent": true, "gate3_tool": true,
+    -- "gate4_env": true, "gate5_approval": true}
+    result_status VARCHAR(16) NOT NULL,  -- 'success' | 'blocked' | 'failed' | 'timeout'
+    block_gate SMALLINT,  -- which gate blocked the call (1-5, NULL = not blocked)
+    block_reason VARCHAR(256),
+    latency_ms INTEGER,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+
+    CONSTRAINT chk_gateway_result_status
+        CHECK (result_status IN ('success','blocked','failed','timeout')),
+    CONSTRAINT chk_block_gate_range
+        CHECK (block_gate IS NULL OR (block_gate >= 1 AND block_gate <= 5)),
+    CONSTRAINT chk_input_hash_hex
+        CHECK (input_hash IS NULL OR input_hash ~ '^[0-9a-f]{64}$'),
+    CONSTRAINT chk_output_hash_hex
+        CHECK (output_hash IS NULL OR output_hash ~ '^[0-9a-f]{64}$')
+);
+
+-- Hot query path: by project + run
+CREATE INDEX IF NOT EXISTS idx_mcp_audit_run
+    ON awooop_mcp_gateway_audit (project_id, run_id, created_at DESC);
+
+-- Hot query path: analysis of blocked calls
+CREATE INDEX IF NOT EXISTS idx_mcp_audit_blocked
+    ON awooop_mcp_gateway_audit (project_id, block_gate, created_at DESC)
+    WHERE result_status = 'blocked';
+
+-- Time-ordered hot path (recent calls)
+CREATE INDEX IF NOT EXISTS idx_mcp_audit_recent
+    ON awooop_mcp_gateway_audit (project_id, created_at DESC);
+
+-- 
=============================================================================
+-- Row Level Security
+-- =============================================================================
+
+ALTER TABLE awooop_mcp_tool_registry ENABLE ROW LEVEL SECURITY;
+ALTER TABLE awooop_mcp_grants ENABLE ROW LEVEL SECURITY;
+ALTER TABLE awooop_mcp_credential_refs ENABLE ROW LEVEL SECURITY;
+ALTER TABLE awooop_mcp_gateway_audit ENABLE ROW LEVEL SECURITY;
+
+ALTER TABLE awooop_mcp_tool_registry FORCE ROW LEVEL SECURITY;
+ALTER TABLE awooop_mcp_grants FORCE ROW LEVEL SECURITY;
+ALTER TABLE awooop_mcp_credential_refs FORCE ROW LEVEL SECURITY;
+ALTER TABLE awooop_mcp_gateway_audit FORCE ROW LEVEL SECURITY;
+
+-- awooop_app may only read/write rows of its own project. There is deliberately no "setting IS NULL" branch: an unset app.project_id must match no row instead of every tenant's rows.
+DROP POLICY IF EXISTS mcp_tool_registry_tenant_isolation ON awooop_mcp_tool_registry;  -- makes the policy step re-runnable, like the IF NOT EXISTS DDL above
+CREATE POLICY mcp_tool_registry_tenant_isolation ON awooop_mcp_tool_registry
+    FOR ALL TO awooop_app  -- NOTE(review): other roles get no policy (default deny under FORCE RLS); seeding such as Phase 6 then needs a superuser/BYPASSRLS role — confirm the migration role
+    USING (project_id = current_setting('app.project_id', TRUE))
+    WITH CHECK (project_id = current_setting('app.project_id', TRUE));
+
+DROP POLICY IF EXISTS mcp_grants_tenant_isolation ON awooop_mcp_grants;
+CREATE POLICY mcp_grants_tenant_isolation ON awooop_mcp_grants
+    FOR ALL TO awooop_app
+    USING (project_id = current_setting('app.project_id', TRUE))
+    WITH CHECK (project_id = current_setting('app.project_id', TRUE));
+
+DROP POLICY IF EXISTS mcp_credential_refs_tenant_isolation ON awooop_mcp_credential_refs;
+CREATE POLICY mcp_credential_refs_tenant_isolation ON awooop_mcp_credential_refs
+    FOR ALL TO awooop_app
+    USING (project_id = current_setting('app.project_id', TRUE))
+    WITH CHECK (project_id = current_setting('app.project_id', TRUE));
+
+DROP POLICY IF EXISTS mcp_gateway_audit_tenant_isolation ON awooop_mcp_gateway_audit;
+CREATE POLICY mcp_gateway_audit_tenant_isolation ON awooop_mcp_gateway_audit
+    FOR ALL TO awooop_app  -- NOTE(review): this file issues no GRANT on the four tables to awooop_app — confirm privileges are granted elsewhere
+    USING (project_id = current_setting('app.project_id', TRUE))
+    WITH CHECK (project_id = current_setting('app.project_id', TRUE));
+
+COMMIT;
diff --git a/apps/api/migrations/awooop_phase6_ewoooc_onboarding_2026-05-04.sql b/apps/api/migrations/awooop_phase6_ewoooc_onboarding_2026-05-04.sql
new file mode 100644
index 00000000..6286f5e5
--- /dev/null
+++ b/apps/api/migrations/awooop_phase6_ewoooc_onboarding_2026-05-04.sql
@@ -0,0 +1,93 @@
+-- =============================================================================
+-- AwoooP Phase 6: EwoooC Tenant Onboarding
+-- ADR-115(Tenant Onboarding 
template)
+-- 2026-05-04 ogt + Claude Sonnet 4.6
+-- =============================================================================
+-- Prerequisite: the Phase 1 migration (awooop_phase1_control_plane_2026-05-04.sql) has been applied
+-- Notes:
+-- EwoooC is the second tenant onboarded to AwoooP (awoooi is the first)
+-- starts with migration_mode = 'shadow'; shadow-run validation must pass before moving to canary
+-- budget_limit_usd = 50.0 (initial limit, adjustable)
+-- 4 read-only MCP tools are pre-registered in the allowlist (no approval needed)
+-- =============================================================================
+
+BEGIN;
+
+-- ---------------------------------------------------------------------------
+-- Step 1: INSERT awooop_projects (the EwoooC tenant)
+-- ---------------------------------------------------------------------------
+INSERT INTO awooop_projects (
+    project_id,
+    display_name,
+    migration_mode,
+    budget_limit_usd,
+    allowed_channels,
+    metadata
+) VALUES (
+    'ewoooc',
+    'EwoooC Business Platform',
+    'shadow',  -- Phase 6 start-up mode; promoted to canary once validation passes
+    50.00,  -- initial budget ceiling in USD
+    '["telegram","api"]'::jsonb,
+    '{
+        "onboarded_at": "2026-05-04",
+        "tier": "business",
+        "ollama_topology": "gcp_three_tier",
+        "note": "ADR-115 EwoooC 接入,共用 GCP Ollama 三層拓撲"
+    }'::jsonb
+) ON CONFLICT (project_id) DO NOTHING;  -- re-running leaves an existing 'ewoooc' row untouched
+
+-- ---------------------------------------------------------------------------
+-- Step 2: awooop_mcp_tool_registry — 4 read-only MCP tools
+-- (ewoooc initially gets read-only tools only; write/admin requires a separate grant)
+-- ---------------------------------------------------------------------------
+
+-- Tool 1: k8s_get — query k8s resources (read-only)
+INSERT INTO awooop_mcp_tool_registry (
+    project_id, tool_name, tool_type, description, allowed_scopes, environment_tags
+) VALUES (
+    'ewoooc',
+    'k8s_get',
+    'builtin',
+    'kubectl get 唯讀查詢(pod/deployment/service 狀態)',
+    '["read"]'::jsonb,
+    '{"env": "any"}'::jsonb
+) ON CONFLICT (project_id, tool_name) DO NOTHING;
+
+-- Tool 2: signoz_query — query SigNoz metrics/traces (read-only)
+INSERT INTO awooop_mcp_tool_registry (
+    project_id, tool_name, tool_type, description, allowed_scopes, environment_tags
+) VALUES (
+    'ewoooc',
+    'signoz_query',
+    'builtin',
+    'SigNoz metrics/traces 查詢(唯讀,無告警修改)',
+    '["read"]'::jsonb,
+    '{"env": "any"}'::jsonb
+) ON CONFLICT (project_id, tool_name) DO NOTHING;
+
+-- Tool 3: incident_read — read EwoooC incident records (read-only, RLS-isolated)
+INSERT INTO awooop_mcp_tool_registry (
+    project_id, tool_name, tool_type, description, allowed_scopes, environment_tags
+) VALUES (
+    'ewoooc',
+    'incident_read',
+    'builtin',
+    'Incident 查詢(僅限 ewoooc 租戶資料,RLS 強制隔離)',
+    '["read"]'::jsonb,
+    '{"env": "any"}'::jsonb
+) ON CONFLICT (project_id, tool_name) DO NOTHING;
+
+-- Tool 4: km_read — read Knowledge Management entries (read-only)
+INSERT INTO awooop_mcp_tool_registry (
+    project_id, tool_name, tool_type, description, allowed_scopes, environment_tags
+) VALUES (
+    'ewoooc',
+    'km_read',
+    'builtin',
+    'Knowledge Management 讀取(ewoooc 租戶 KM,RLS 隔離)',
+    '["read"]'::jsonb,
+    '{"env": "any"}'::jsonb
+) ON CONFLICT (project_id, tool_name) DO NOTHING;
+
+COMMIT;
diff --git a/apps/api/migrations/awooop_phase7_channel_hub_2026-05-04.sql b/apps/api/migrations/awooop_phase7_channel_hub_2026-05-04.sql
new file mode 100644
index 00000000..e65c5227
--- /dev/null
+++ b/apps/api/migrations/awooop_phase7_channel_hub_2026-05-04.sql
@@ -0,0 +1,131 @@
+-- =============================================================================
+-- AwoooP Phase 7: the two Channel Hub tables
+-- ADR-106 (channel_event family) + Progressive Feedback Policy
+-- 2026-05-04 ogt + Claude Sonnet 4.6
+-- =============================================================================
+-- Two tables:
+-- awooop_conversation_event — mirror of inbound events (Telegram/LINE inbound)
+-- awooop_outbound_message — outbound message records (interim + final reply)
+-- =============================================================================
+
+BEGIN;
+
+-- ---------------------------------------------------------------------------
+-- 1. 
awooop_conversation_event — mirror of inbound channel events
+-- Purpose: the AwoooP platform keeps an immutable record of every inbound event, decoupled from the legacy system
+-- ---------------------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS awooop_conversation_event (
+    event_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    project_id VARCHAR(64) NOT NULL
+        REFERENCES awooop_projects(project_id) ON DELETE CASCADE,
+    -- Original channel identity
+    channel_type VARCHAR(32) NOT NULL,  -- 'telegram' | 'line' | 'slack' | 'api' | 'internal'
+    provider_event_id VARCHAR(256) NOT NULL,  -- Telegram: message_id, LINE: webhook event_id. NOTE(review): presumably message_id is unique only per chat — verify against the dedup key below
+    -- Unified identity (injected by ProviderProxy)
+    platform_subject_id VARCHAR(128),
+    channel_user_id VARCHAR(256),
+    channel_chat_id VARCHAR(256),
+    -- Associated run (if already created)
+    run_id UUID,  -- soft FK (the run may be created after the event)
+    -- Event content (digest/hash only, no plaintext)
+    content_type VARCHAR(32) NOT NULL DEFAULT 'text',  -- 'text' | 'photo' | 'document' | 'command' | 'callback_query'
+    content_hash VARCHAR(64),  -- sha256(raw_content); plaintext is not stored
+    content_preview VARCHAR(256),  -- first 256 characters (must contain no PII/secret)
+    attachment_sha256 VARCHAR(64),  -- sha256 of the attachment
+    -- De-duplication (corresponds to awooop_run_idempotency)
+    is_duplicate BOOLEAN NOT NULL DEFAULT FALSE,  -- NOTE(review): uix_conv_event_dedup rejects a second row with the same key — confirm how this flag gets set
+    -- Timing
+    provider_ts TIMESTAMPTZ,  -- original provider timestamp
+    received_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+
+    CONSTRAINT chk_conv_event_channel_type
+        CHECK (channel_type IN ('telegram','line','slack','api','internal')),
+    CONSTRAINT chk_conv_event_content_type
+        CHECK (content_type IN ('text','photo','document','command','callback_query')),
+    CONSTRAINT uix_conv_event_dedup
+        UNIQUE (project_id, channel_type, provider_event_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_conv_event_run
+    ON awooop_conversation_event (project_id, run_id, received_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_conv_event_subject
+    ON awooop_conversation_event (project_id, platform_subject_id, received_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_conv_event_recent
+    ON awooop_conversation_event (project_id, channel_type, received_at DESC);
+
+-- 
---------------------------------------------------------------------------
+-- 2. awooop_outbound_message — outbound message records (interim + final reply)
+-- Purpose: track every message AwoooP emits (not sent in shadow mode, sent in canary/active)
+-- Progressive Feedback Policy: WAITING_TOOL longer than 30s → send an interim message
+-- ---------------------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS awooop_outbound_message (
+    message_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    project_id VARCHAR(64) NOT NULL
+        REFERENCES awooop_projects(project_id) ON DELETE CASCADE,
+    run_id UUID NOT NULL,  -- soft FK
+    conversation_event_id UUID,  -- inbound event that triggered this message
+    -- Outbound destination
+    channel_type VARCHAR(32) NOT NULL,
+    channel_chat_id VARCHAR(256) NOT NULL,
+    -- Message classification
+    message_type VARCHAR(32) NOT NULL,  -- 'interim' | 'final' | 'error' | 'approval_request'
+    -- Content (hash only, no plaintext)
+    content_hash VARCHAR(64),  -- sha256(rendered_content)
+    content_preview VARCHAR(256),  -- first 256 characters (must contain no PII/secret)
+    -- message_id reported back by the provider (Telegram: message.message_id)
+    provider_message_id VARCHAR(64),
+    -- Status
+    send_status VARCHAR(16) NOT NULL DEFAULT 'pending',  -- 'pending'|'sent'|'failed'|'shadow'
+    send_error TEXT,
+    -- Timing
+    queued_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    sent_at TIMESTAMPTZ,
+    -- Progressive Feedback Policy (WAITING_TOOL over 30s triggers an interim message)
+    triggered_by_state VARCHAR(32),  -- run state that triggered this message ('waiting_tool', etc.)
+    waiting_since TIMESTAMPTZ,  -- when the wait started (used for the 30s timeout calculation)
+
+    CONSTRAINT chk_outbound_channel_type
+        CHECK (channel_type IN ('telegram','line','slack','api','internal')),
+    CONSTRAINT chk_outbound_message_type
+        CHECK (message_type IN ('interim','final','error','approval_request')),
+    CONSTRAINT chk_outbound_send_status
+        CHECK (send_status IN ('pending','sent','failed','shadow'))
+);
+
+CREATE INDEX IF NOT EXISTS idx_outbound_msg_run
+    ON awooop_outbound_message (project_id, run_id, queued_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_outbound_msg_pending
+    ON awooop_outbound_message (project_id, channel_type, queued_at)
+    WHERE send_status = 'pending';
+
+-- Progressive Feedback Policy query: find runs that have been waiting longer than 30s
+CREATE INDEX IF NOT EXISTS idx_outbound_msg_waiting
+    ON awooop_outbound_message (project_id, triggered_by_state, waiting_since)
+    WHERE triggered_by_state = 'waiting_tool' AND send_status = 'pending';
+
+-- =============================================================================
+-- Row Level Security
+-- =============================================================================
+
+ALTER TABLE awooop_conversation_event ENABLE ROW LEVEL SECURITY;
+ALTER TABLE awooop_outbound_message ENABLE ROW LEVEL SECURITY;
+
+ALTER TABLE awooop_conversation_event FORCE ROW LEVEL SECURITY;
+ALTER TABLE awooop_outbound_message FORCE ROW LEVEL SECURITY;
+
+DROP POLICY IF EXISTS conv_event_tenant_isolation ON awooop_conversation_event;  -- makes the policy step re-runnable, like the IF NOT EXISTS DDL above
+CREATE POLICY conv_event_tenant_isolation ON awooop_conversation_event
+    FOR ALL TO awooop_app  -- no "setting IS NULL" branch: an unset app.project_id must match no row instead of every tenant's rows
+    USING (project_id = current_setting('app.project_id', TRUE))
+    WITH CHECK (project_id = current_setting('app.project_id', TRUE));
+
+DROP POLICY IF EXISTS outbound_msg_tenant_isolation ON awooop_outbound_message;
+CREATE POLICY outbound_msg_tenant_isolation ON awooop_outbound_message
+    FOR ALL TO awooop_app  -- NOTE(review): this file issues no GRANT on these tables to awooop_app — confirm privileges are granted elsewhere
+    USING (project_id = current_setting('app.project_id', TRUE))
+    WITH CHECK (project_id = current_setting('app.project_id', TRUE));
+
+COMMIT;
diff --git a/apps/api/src/api/v1/platform/__init__.py b/apps/api/src/api/v1/platform/__init__.py
new file mode 100644
index 00000000..b5d0d0e4
--- /dev/null
+++ b/apps/api/src/api/v1/platform/__init__.py
@@ -0,0 +1,4 @@
+"""AwoooP Platform API — Phase 4 Shadow Mode Shell"""
+from src.api.v1.platform.runs import router
+
+__all__ = ["router"]
diff --git a/apps/api/src/api/v1/platform/runs.py b/apps/api/src/api/v1/platform/runs.py
new file mode 100644
index 00000000..8d317932
--- /dev/null
+++ b/apps/api/src/api/v1/platform/runs.py
@@ -0,0 +1,149 @@
+"""
+Platform Runs API
+==================
+AwoooP Phase 4: POST /v1/platform/runs — creates a shadow-mode run
+2026-05-04 ogt + Claude Sonnet 4.6 (ADR-106/ADR-114)
+
+Do not touch:
+- /v1/incidents/ — legacy route
+- /v1/webhooks/ — legacy route
+- Telegram bot handler — legacy, kept as is
+
+Shadow mode guarantees (Phase 4):
+- every run created here has is_shadow=True
+- no user-visible response is sent
+- no destructive tool call is executed
+"""
+
+from __future__ import annotations
+
+import uuid
+from typing import Any
+
+from fastapi import APIRouter, HTTPException, status
+from pydantic import BaseModel, Field
+
+from src.services.audit_sink import write_audit
+from src.services.platform_runtime import create_run
+
+router = APIRouter()
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Request / Response models (class docstrings below are left untouched: Pydantic publishes them as schema descriptions)
+# ─────────────────────────────────────────────────────────────────────────────
+
+class CreateRunRequest(BaseModel):
+    """POST /v1/platform/runs request body"""
+
+    project_id: str = Field(..., description="租戶 ID")  # NOTE(review): tenant id comes from the request body; no auth dependency is visible here — confirm it is verified upstream
+    agent_id: str = Field(..., description="執行此 run 的 agent ID")
+    trigger_type: str = Field(
+        ...,
+        pattern="^(channel_event|schedule|api|sub_agent|retry)$",
+        description="觸發來源類型",
+    )
+    trigger_ref: str | None = Field(None, description="觸發來源 ref(channel_event_id 等)")
+    input_payload: dict[str, Any] | None = Field(None, description="Run 輸入 payload")
+    channel_type: str | None = Field(None, description="Channel 類型(idempotency 用)")
+    provider_event_id: str | None = Field(
+        None, max_length=256,
+        description="Channel provider 原始事件 ID(idempotency 去重用)",
+    )
+    timeout_seconds: int = Field(600, ge=30, le=3600, description="Run 超時秒數")
+
+
+class CreateRunResponse(BaseModel):
+    """POST /v1/platform/runs response"""
+
+    run_id: str
+    is_duplicate: bool = Field(description="True = 冪等命中,返回既有 run_id")
+    is_shadow: bool = Field(True, description="Phase 4 固定 True")
+    message: str
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Routes (route docstrings are left untouched: FastAPI may publish them in the OpenAPI schema)
+# ─────────────────────────────────────────────────────────────────────────────
+
+@router.post(
+    "/runs",
+    response_model=CreateRunResponse,
+    status_code=status.HTTP_202_ACCEPTED,
+    summary="建立 Platform Run(Shadow Mode)",
+    description=(
+        "AwoooP Phase 4 Shadow Mode:建立新 run,非同步執行。\n\n"
+        "- `is_shadow=true`:不產生任何 user-visible response\n"
+        "- `is_duplicate=true`:冪等命中,返回既有 run_id(不建立新 run)\n"
+        "- provider_event_id + channel_type 構成冪等 key(24h 視窗)"
+    ),
+)
+async def create_platform_run(
+    request: CreateRunRequest,
+) -> CreateRunResponse:
+    """建立 shadow run。"""
+    try:
+        run_id, is_duplicate = await create_run(  # returns the run id plus a flag telling whether an existing run was matched
+            project_id=request.project_id,
+            agent_id=request.agent_id,
+            trigger_type=request.trigger_type,
+            trigger_ref=request.trigger_ref,
+            input_payload=request.input_payload,
+            channel_type=request.channel_type,
+            provider_event_id=request.provider_event_id,
+            timeout_seconds=request.timeout_seconds,
+        )
+    except Exception as exc:  # any failure in create_run is reported as HTTP 500
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Run 建立失敗: {exc}",  # NOTE(review): returns the raw exception text to the API client — confirm that is acceptable
+        ) from exc
+
+    # Audit log (described as non-blocking). NOTE(review): awaited inline with no error guard here — verify write_audit swallows its own failures
+    await write_audit(
+        project_id=request.project_id,
+        action="run.created",
+        resource_type="run",
+        resource_id=str(run_id),
+        details={
+            "agent_id": request.agent_id,
+            "trigger_type": request.trigger_type,
+            "is_duplicate": is_duplicate,
+            "is_shadow": True,
+        },
+    )
+
+    return CreateRunResponse(
+        run_id=str(run_id),
+        is_duplicate=is_duplicate,
+        is_shadow=True,  # hard-coded: this endpoint always reports shadow mode
+        message="Run 已接受(shadow mode)" if not is_duplicate else "冪等命中,返回既有 run_id",
+    )
+
+
+@router.get(
+    "/runs/{run_id}",
+    summary="查詢 Run 狀態",
+)
+async def get_run_status(
+    run_id: str,
+    project_id: str,  # not part of the path, so FastAPI reads it as a query parameter
+) -> dict[str, Any]:
+    """查詢單一 run 的 FSM 狀態。"""
+    from src.services.platform_runtime import get_run_status as _svc_get_run_status  # aliased: this route function has the same name
+
+    try:
+        uid = uuid.UUID(run_id)  # reject malformed ids with 422 before calling the service
+    except ValueError as exc:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=f"run_id 格式錯誤: {exc}",
+        ) from exc
+
+    result = await _svc_get_run_status(uid, project_id)
+    if result is None:  # the service returning None is mapped to 404
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=f"run {run_id!r} 不存在",
+        )
+    return result
diff --git a/apps/api/src/core/context.py b/apps/api/src/core/context.py
new file mode 100644
index 
# Context variable holding the project_id of the current async task.
# The "awoooi" default means a lookup still succeeds when nothing was set.
PROJECT_ID: ContextVar[str] = ContextVar("project_id", default="awoooi")


def get_current_project_id() -> str:
    """Return the project_id bound to the current context.

    Returns:
        The value most recently set on ``PROJECT_ID`` in this context, or
        ``"awoooi"`` when it has never been set.
    """
    active_project: str = PROJECT_ID.get()
    return active_project
+ Index("idx_run_state_project_timeline", "project_id", "created_at"), + Index("idx_run_state_trace_id", "trace_id", + postgresql_where=text("trace_id IS NOT NULL")), + ) + + run_id: Mapped[UUID] = mapped_column(primary_key=True) + project_id: Mapped[str] = mapped_column( + String(64), ForeignKey("awooop_projects.project_id"), nullable=False + ) + agent_id: Mapped[str] = mapped_column(String(128), nullable=False) + state: Mapped[str] = mapped_column(String(32), nullable=False, default="pending") + lease_until: Mapped[datetime | None] = mapped_column(nullable=True) + heartbeat_at: Mapped[datetime | None] = mapped_column(nullable=True) + worker_id: Mapped[str | None] = mapped_column(String(128), nullable=True) + attempt_count: Mapped[int] = mapped_column(SmallInteger, nullable=False, default=0) + max_attempts: Mapped[int] = mapped_column(SmallInteger, nullable=False, default=3) + trace_id: Mapped[str | None] = mapped_column(String(128), nullable=True) + trigger_type: Mapped[str | None] = mapped_column(String(32), nullable=True) + trigger_ref: Mapped[str | None] = mapped_column(String(256), nullable=True) + is_shadow: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True) + input_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True) + output_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True) + cost_usd: Mapped[Decimal] = mapped_column( + Numeric(10, 4), nullable=False, default=Decimal("0.0000") + ) + step_count: Mapped[int] = mapped_column(SmallInteger, nullable=False, default=0) + error_code: Mapped[str | None] = mapped_column(String(64), nullable=True) + error_detail: Mapped[str | None] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column( + nullable=False, server_default=text("NOW()") + ) + started_at: Mapped[datetime | None] = mapped_column(nullable=True) + completed_at: Mapped[datetime | None] = mapped_column(nullable=True) + timeout_at: Mapped[datetime | None] = 
class AwoooPRunStepJournal(Base):
    """SAGA step journal (ADR-119) — one row recorded per tool call.

    Rows belong to a run in ``awooop_run_state`` and are removed with it
    (``ondelete="CASCADE"`` on ``run_id``). Within a run, ``step_seq`` is
    unique, and ``result_status`` is restricted to
    pending / success / failed / compensated.
    """

    __tablename__ = "awooop_run_step_journal"
    __table_args__ = (
        # A run cannot contain two steps with the same sequence number.
        UniqueConstraint("run_id", "step_seq", name="uix_run_step_seq"),
        CheckConstraint(
            "result_status IN ('pending','success','failed','compensated')",
            name="chk_step_result_status",
        ),
        # NOTE(review): covers the same columns as uix_run_step_seq above —
        # confirm the migration really wants both.
        Index("idx_run_step_run_id", "run_id", "step_seq"),
    )

    # Primary key, generated by the database.
    step_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    # Owning run; deleting the run deletes its journal rows.
    run_id: Mapped[UUID] = mapped_column(
        ForeignKey("awooop_run_state.run_id", ondelete="CASCADE"), nullable=False
    )
    # Tenant identifier (no foreign key is declared on this column here).
    project_id: Mapped[str] = mapped_column(String(64), nullable=False)
    # Position of the step inside its run.
    step_seq: Mapped[int] = mapped_column(SmallInteger, nullable=False)
    tool_name: Mapped[str] = mapped_column(String(128), nullable=False)
    mcp_gateway_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    # 64-character columns; presumably hex SHA-256 digests — TODO confirm.
    input_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)
    output_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)
    # Presumably the compensation instruction used to undo this step — verify against ADR-119.
    compensation_json: Mapped[dict[str, Any] | None] = mapped_column(JSONB, nullable=True)
    # Constrained by chk_step_result_status; new rows start as "pending".
    result_status: Mapped[str] = mapped_column(String(16), nullable=False, default="pending")
    error_code: Mapped[str | None] = mapped_column(String(64), nullable=True)
    was_blocked: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    block_reason: Mapped[str | None] = mapped_column(String(128), nullable=True)
    # Set by the database at insert time.
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
    completed_at: Mapped[datetime | None] = mapped_column(nullable=True)
    latency_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)
"channel_type", "provider_event_id", + name="uix_run_idempotency_key", + ), + Index("idx_run_idempotency_run_id", "run_id"), + ) + + idempotency_id: Mapped[UUID] = mapped_column( + primary_key=True, server_default=text("gen_random_uuid()") + ) + project_id: Mapped[str] = mapped_column(String(64), nullable=False) + channel_type: Mapped[str] = mapped_column(String(32), nullable=False) + provider_event_id: Mapped[str] = mapped_column(String(256), nullable=False) + run_id: Mapped[UUID] = mapped_column( + ForeignKey("awooop_run_state.run_id"), nullable=False + ) + created_at: Mapped[datetime] = mapped_column( + nullable=False, server_default=text("NOW()") + ) + + +# ============================================================================= +# Phase 5: MCP Gateway 四表(ADR-116/ADR-118,2026-05-04) +# ============================================================================= + + +class AwoooPMcpToolRegistry(Base): + """MCP Tool 白名單(Gate 3: Tool)""" + + __tablename__ = "awooop_mcp_tool_registry" + __table_args__ = ( + CheckConstraint( + "tool_type IN ('builtin','mcp_server','custom')", + name="chk_tool_type", + ), + CheckConstraint( + "jsonb_typeof(allowed_scopes) = 'array'", + name="chk_allowed_scopes_array", + ), + UniqueConstraint("project_id", "tool_name", name="uix_tool_registry_project_name"), + Index("idx_mcp_tool_registry_project", "project_id", "is_active"), + ) + + tool_id: Mapped[UUID] = mapped_column( + primary_key=True, server_default=text("gen_random_uuid()") + ) + project_id: Mapped[str] = mapped_column( + String(64), ForeignKey("awooop_projects.project_id", ondelete="CASCADE"), nullable=False + ) + tool_name: Mapped[str] = mapped_column(String(128), nullable=False) + tool_type: Mapped[str] = mapped_column(String(32), nullable=False) + description: Mapped[str | None] = mapped_column(Text, nullable=True) + allowed_scopes: Mapped[list[Any]] = mapped_column(JSONB, nullable=False, default=list) + environment_tags: Mapped[dict[str, Any]] = mapped_column(JSONB, 
class AwoooPMcpGrant(Base):
    """Agent × Tool grant record (Gate 2 + Gate 3).

    One row per (project_id, agent_id, tool_id). A grant is either active
    (``is_revoked`` false, no revocation fields set) or revoked
    (``is_revoked`` true with ``revoked_at`` filled in); the
    ``chk_revoke_consistency`` constraint enforces that pairing.
    """

    __tablename__ = "awooop_mcp_grants"
    __table_args__ = (
        # granted_scopes must be a JSON array.
        CheckConstraint(
            "jsonb_typeof(granted_scopes) = 'array'",
            name="chk_grant_scopes_array",
        ),
        # Active grants carry no revocation data; revoked grants must have a timestamp.
        CheckConstraint(
            "(is_revoked = FALSE AND revoked_at IS NULL AND revoked_by IS NULL)"
            " OR (is_revoked = TRUE AND revoked_at IS NOT NULL)",
            name="chk_revoke_consistency",
        ),
        UniqueConstraint("project_id", "agent_id", "tool_id", name="uix_mcp_grant_agent_tool"),
        # Partial index restricted to grants that have not been revoked.
        Index(
            "idx_mcp_grants_lookup", "project_id", "agent_id", "tool_id",
            postgresql_where=text("is_revoked = FALSE"),
        ),
    )

    # Primary key, generated by the database.
    grant_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    # Owning project; grants are deleted together with the project.
    project_id: Mapped[str] = mapped_column(
        String(64), ForeignKey("awooop_projects.project_id", ondelete="CASCADE"), nullable=False
    )
    agent_id: Mapped[str] = mapped_column(String(128), nullable=False)
    # Granted tool; grants are deleted together with the registry entry.
    tool_id: Mapped[UUID] = mapped_column(
        ForeignKey("awooop_mcp_tool_registry.tool_id", ondelete="CASCADE"), nullable=False
    )
    granted_by: Mapped[str] = mapped_column(String(128), nullable=False)
    # JSON array (see chk_grant_scopes_array); defaults to an empty list.
    granted_scopes: Mapped[list[Any]] = mapped_column(JSONB, nullable=False, default=list)
    # Nullable; presumably NULL means the grant never expires — TODO confirm.
    expires_at: Mapped[datetime | None] = mapped_column(nullable=True)
    is_revoked: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    revoked_at: Mapped[datetime | None] = mapped_column(nullable=True)
    revoked_by: Mapped[str | None] = mapped_column(String(128), nullable=True)
    # Set by the database at insert time.
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
"""k8s Secret 參照(ADR-118 credential isolation)— 只存路徑,不存明文""" + + __tablename__ = "awooop_mcp_credential_refs" + __table_args__ = ( + CheckConstraint( + r"k8s_secret_ref ~ '^[a-z0-9-]+/[a-z0-9-]+#[a-zA-Z0-9_-]+$'", + name="chk_k8s_ref_format", + ), + CheckConstraint( + r"value_sha256 IS NULL OR value_sha256 ~ '^[0-9a-f]{64}$'", + name="chk_value_sha256_hex", + ), + UniqueConstraint("tool_id", "k8s_secret_ref", name="uix_credential_ref_tool"), + Index("idx_mcp_cred_refs_tool", "tool_id", postgresql_where=text("is_active = TRUE")), + ) + + ref_id: Mapped[UUID] = mapped_column( + primary_key=True, server_default=text("gen_random_uuid()") + ) + tool_id: Mapped[UUID] = mapped_column( + ForeignKey("awooop_mcp_tool_registry.tool_id", ondelete="CASCADE"), nullable=False + ) + project_id: Mapped[str] = mapped_column( + String(64), ForeignKey("awooop_projects.project_id", ondelete="CASCADE"), nullable=False + ) + k8s_secret_ref: Mapped[str] = mapped_column(String(256), nullable=False) + value_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True) + description: Mapped[str | None] = mapped_column(Text, nullable=True) + is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True) + created_at: Mapped[datetime] = mapped_column( + nullable=False, server_default=text("NOW()") + ) + rotated_at: Mapped[datetime | None] = mapped_column(nullable=True) + + +class AwoooPMcpGatewayAudit(Base): + """MCP Gateway call 稽核日誌(ADR-116 P1-09)""" + + __tablename__ = "awooop_mcp_gateway_audit" + __table_args__ = ( + CheckConstraint( + "result_status IN ('success','blocked','failed','timeout')", + name="chk_gateway_result_status", + ), + CheckConstraint( + "block_gate IS NULL OR (block_gate >= 1 AND block_gate <= 5)", + name="chk_block_gate_range", + ), + Index("idx_mcp_audit_run", "project_id", "run_id", "created_at"), + Index( + "idx_mcp_audit_blocked", "project_id", "block_gate", "created_at", + postgresql_where=text("result_status = 'blocked'"), + ), + ) + + 
class AwoooPConversationEvent(Base):
    """Mirror of inbound channel events (Telegram/LINE inbound; no plaintext stored).

    Events are de-duplicated per (project_id, channel_type, provider_event_id)
    by the ``uix_conv_event_dedup`` unique constraint.
    """

    __tablename__ = "awooop_conversation_event"
    __table_args__ = (
        # NOTE(review): the ChannelType enum in awooop_contracts.py lists
        # telegram/slack/webhook/api, which differs from this set
        # (no 'line'/'internal' there, no 'webhook' here) — confirm which is intended.
        CheckConstraint(
            "channel_type IN ('telegram','line','slack','api','internal')",
            name="chk_conv_event_channel_type",
        ),
        CheckConstraint(
            "content_type IN ('text','photo','document','command','callback_query')",
            name="chk_conv_event_content_type",
        ),
        # De-duplication key for provider events.
        UniqueConstraint(
            "project_id", "channel_type", "provider_event_id",
            name="uix_conv_event_dedup",
        ),
        Index("idx_conv_event_run", "project_id", "run_id", "received_at"),
        Index("idx_conv_event_subject", "project_id", "platform_subject_id", "received_at"),
    )

    # Primary key, generated by the database.
    event_id: Mapped[UUID] = mapped_column(
        primary_key=True, server_default=text("gen_random_uuid()")
    )
    # Owning project; events are deleted together with the project.
    project_id: Mapped[str] = mapped_column(
        String(64), ForeignKey("awooop_projects.project_id", ondelete="CASCADE"), nullable=False
    )
    channel_type: Mapped[str] = mapped_column(String(32), nullable=False)
    provider_event_id: Mapped[str] = mapped_column(String(256), nullable=False)
    platform_subject_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    channel_user_id: Mapped[str | None] = mapped_column(String(256), nullable=True)
    channel_chat_id: Mapped[str | None] = mapped_column(String(256), nullable=True)
    # Nullable and declared without a foreign key on this model.
    run_id: Mapped[UUID | None] = mapped_column(nullable=True)
    content_type: Mapped[str] = mapped_column(String(32), nullable=False, default="text")
    # 64-character column; presumably a hex SHA-256 of the content — TODO confirm.
    content_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)
    # NOTE(review): up to 256 characters of preview text are kept here —
    # confirm this is consistent with the "no plaintext stored" statement above.
    content_preview: Mapped[str | None] = mapped_column(String(256), nullable=True)
    attachment_sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
    is_duplicate: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    # Presumably the provider's own timestamp for the event — verify against the writer.
    provider_ts: Mapped[datetime | None] = mapped_column(nullable=True)
    # Set by the database at insert time.
    received_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=text("NOW()")
    )
('pending','sent','failed','shadow')", + name="chk_outbound_send_status", + ), + Index("idx_outbound_msg_run", "project_id", "run_id", "queued_at"), + Index( + "idx_outbound_msg_pending", "project_id", "channel_type", "queued_at", + postgresql_where=text("send_status = 'pending'"), + ), + ) + + message_id: Mapped[UUID] = mapped_column( + primary_key=True, server_default=text("gen_random_uuid()") + ) + project_id: Mapped[str] = mapped_column( + String(64), ForeignKey("awooop_projects.project_id", ondelete="CASCADE"), nullable=False + ) + run_id: Mapped[UUID] = mapped_column(nullable=False) + conversation_event_id: Mapped[UUID | None] = mapped_column(nullable=True) + channel_type: Mapped[str] = mapped_column(String(32), nullable=False) + channel_chat_id: Mapped[str] = mapped_column(String(256), nullable=False) + message_type: Mapped[str] = mapped_column(String(32), nullable=False) + content_hash: Mapped[str | None] = mapped_column(String(64), nullable=True) + content_preview: Mapped[str | None] = mapped_column(String(256), nullable=True) + provider_message_id: Mapped[str | None] = mapped_column(String(64), nullable=True) + send_status: Mapped[str] = mapped_column(String(16), nullable=False, default="pending") + send_error: Mapped[str | None] = mapped_column(Text, nullable=True) + queued_at: Mapped[datetime] = mapped_column( + nullable=False, server_default=text("NOW()") + ) + sent_at: Mapped[datetime | None] = mapped_column(nullable=True) + triggered_by_state: Mapped[str | None] = mapped_column(String(32), nullable=True) + waiting_since: Mapped[datetime | None] = mapped_column(nullable=True) diff --git a/apps/api/src/db/base.py b/apps/api/src/db/base.py index 1c92ceeb..b02b7660 100644 --- a/apps/api/src/db/base.py +++ b/apps/api/src/db/base.py @@ -106,6 +106,11 @@ async def get_db() -> AsyncGenerator[AsyncSession, None]: factory = get_session_factory() async with factory() as session: try: + # AwoooP Phase 2.3 (2026-05-04 ogt): SET LOCAL app.project_id 讓 RLS Policy 
生效 + # 預設 'awoooi',多租戶路由將在 middleware 注入實際 project_id + await session.execute( + text("SELECT set_config('app.project_id', 'awoooi', TRUE)") + ) yield session await session.commit() except Exception: @@ -114,17 +119,30 @@ async def get_db() -> AsyncGenerator[AsyncSession, None]: @asynccontextmanager -async def get_db_context() -> AsyncGenerator[AsyncSession, None]: +async def get_db_context(project_id: str | None = None) -> AsyncGenerator[AsyncSession, None]: """ Context manager for database session (non-FastAPI usage) + AwoooP Phase 2.3/2.4: 優先序 — 明確參數 > contextvar > "awoooi" + - Phase 2.3: 啟用 RLS tenant isolation(SET LOCAL app.project_id) + - Phase 2.4: 從 asyncio contextvar 讀取 background loop 的 project_id + Usage: - async with get_db_context() as db: + async with get_db_context() as db: # 繼承 contextvar 或預設 awoooi + ... + async with get_db_context("other-tenant") as db: # 明確指定 tenant ... """ + from src.core.context import get_current_project_id + effective_pid = project_id if project_id is not None else get_current_project_id() + factory = get_session_factory() async with factory() as session: try: + await session.execute( + text("SELECT set_config('app.project_id', :pid, TRUE)"), + {"pid": effective_pid}, + ) yield session await session.commit() except Exception: @@ -299,6 +317,62 @@ async def init_db() -> None: "ON timeline_events(incident_id);" )) + # AwoooP Phase 2.6 (2026-05-04 ogt): budget_ledger 建表(ADR-120 Token Budget Hard Kill) + await conn.execute(text(""" + CREATE TABLE IF NOT EXISTS budget_ledger ( + id UUID DEFAULT gen_random_uuid() PRIMARY KEY, + project_id VARCHAR(64) NOT NULL DEFAULT 'awoooi', + agent_id VARCHAR(128), + run_id UUID, + model VARCHAR(64), + provider VARCHAR(32), + prompt_tokens INT, + completion_tokens INT, + cost_usd NUMERIC(10, 4) NOT NULL DEFAULT 0.0000, + recorded_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ); + """)) + await conn.execute(text( + "CREATE INDEX IF NOT EXISTS idx_budget_ledger_project_date " + "ON 
budget_ledger(project_id, recorded_at DESC);" + )) + + # AwoooP Phase 2.3 (2026-05-04 ogt): 四表加 project_id(RLS 多租戶隔離) + # 防禦性 ALTER — 已存在欄位為 no-op,安全。 + # Batch 1 RLS migration 執行後,app.project_id 由 get_db_context() 自動設置。 + await conn.execute(text( + "ALTER TABLE incidents " + "ADD COLUMN IF NOT EXISTS project_id VARCHAR(64) NOT NULL DEFAULT 'awoooi';" + )) + await conn.execute(text( + "CREATE INDEX IF NOT EXISTS idx_incidents_project_id " + "ON incidents (project_id);" + )) + await conn.execute(text( + "ALTER TABLE knowledge_entries " + "ADD COLUMN IF NOT EXISTS project_id VARCHAR(64) NOT NULL DEFAULT 'awoooi';" + )) + await conn.execute(text( + "CREATE INDEX IF NOT EXISTS idx_knowledge_entries_project_id " + "ON knowledge_entries (project_id);" + )) + await conn.execute(text( + "ALTER TABLE playbooks " + "ADD COLUMN IF NOT EXISTS project_id VARCHAR(64) NOT NULL DEFAULT 'awoooi';" + )) + await conn.execute(text( + "CREATE INDEX IF NOT EXISTS idx_playbooks_project_id " + "ON playbooks (project_id);" + )) + await conn.execute(text( + "ALTER TABLE audit_logs " + "ADD COLUMN IF NOT EXISTS project_id VARCHAR(64) NOT NULL DEFAULT 'awoooi';" + )) + await conn.execute(text( + "CREATE INDEX IF NOT EXISTS idx_audit_logs_project_id " + "ON audit_logs (project_id);" + )) + # 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 自我治理閉環 # ADR-087: ai_governance_events 不可變 Event Sourcing 表 # asyncpg 不允許 prepared statement 內多條指令,必須分開 execute diff --git a/apps/api/src/db/models.py b/apps/api/src/db/models.py index 5e97cfa1..6601aed5 100644 --- a/apps/api/src/db/models.py +++ b/apps/api/src/db/models.py @@ -11,8 +11,9 @@ Schema 設計原則: """ from datetime import datetime +from decimal import Decimal from typing import Any -from uuid import uuid4 +from uuid import UUID, uuid4 from sqlalchemy import ( JSON, @@ -25,6 +26,7 @@ from sqlalchemy import ( ForeignKey, Index, Integer, + Numeric, String, Text, text, @@ -34,6 +36,7 @@ from sqlalchemy import ( ) from sqlalchemy.dialects.postgresql import 
ENUM as PgEnum from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.dialects.postgresql import UUID as pg_UUID from sqlalchemy.orm import Mapped, mapped_column from src.db.base import Base @@ -368,6 +371,13 @@ class AuditLog(Base): default="default", nullable=False, ) + # AwoooP Phase 2.3 (2026-05-04 ogt): 多租戶隔離欄位,配合 Batch 1 RLS migration + project_id: Mapped[str] = mapped_column( + String(64), + default="awoooi", + nullable=False, + index=True, + ) # Execution Result success: Mapped[bool] = mapped_column(default=False, nullable=False) @@ -671,6 +681,13 @@ class IncidentRecord(Base): primary_key=True, comment="事件唯一識別碼 (如 INC-20260322-A1B2C3)", ) + # AwoooP Phase 2.3 (2026-05-04 ogt): 多租戶隔離欄位,配合 Batch 1 RLS migration + project_id: Mapped[str] = mapped_column( + String(64), + default="awoooi", + nullable=False, + index=True, + ) # === 狀態與嚴重度 === status: Mapped[str] = mapped_column( @@ -813,6 +830,13 @@ class KnowledgeEntryRecord(Base): primary_key=True, default=generate_uuid, ) + # AwoooP Phase 2.3 (2026-05-04 ogt): 多租戶隔離欄位,配合 Batch 1 RLS migration + project_id: Mapped[str] = mapped_column( + String(64), + default="awoooi", + nullable=False, + index=True, + ) # Core Fields title: Mapped[str] = mapped_column(String(255), nullable=False) @@ -1075,6 +1099,13 @@ class PlaybookRecord(Base): String(36), primary_key=True, comment="Playbook 唯一識別碼 (PB-YYYYMMDD-XXXXXX)", ) + # AwoooP Phase 2.3 (2026-05-04 ogt): 多租戶隔離欄位,配合 Batch 1 RLS migration + project_id: Mapped[str] = mapped_column( + String(64), + default="awoooi", + nullable=False, + index=True, + ) # Core Fields name: Mapped[str] = mapped_column(String(256), nullable=False) @@ -1612,3 +1643,45 @@ class AIProviderVersionHistory(Base): __table_args__ = ( Index("ix_provider_version_captured", "provider", "captured_at"), ) + + +# ============================================================================= +# BudgetLedgerRecord — ADR-120 Token Budget Hard Kill(Phase 2.6) +# 2026-05-04 ogt + Claude Sonnet 4.6 +# 
class BudgetLedgerRecord(Base):
    """LLM call cost ledger (ADR-120 D5).

    One row is inserted after each LLM call completes. The table feeds:
    - tenant budget accumulation (Redis cache, synced from this table every minute)
    - dashboard consumption statistics
    - alert threshold triggers (80% / 95% / 100%)
    """
    __tablename__ = "budget_ledger"

    # Primary key, generated by the database.
    id: Mapped[UUID] = mapped_column(
        pg_UUID(as_uuid=True),
        primary_key=True,
        server_default=text("gen_random_uuid()"),
    )
    # Tenant identifier.
    # NOTE(review): index=True adds a single-column index that the migration
    # DDL does not create, and the composite index below already leads with
    # project_id — confirm whether it is wanted.
    project_id: Mapped[str] = mapped_column(
        String(64), nullable=False, default="awoooi", index=True
    )
    agent_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    run_id: Mapped[UUID | None] = mapped_column(pg_UUID(as_uuid=True), nullable=True)
    model: Mapped[str | None] = mapped_column(String(64), nullable=True)
    provider: Mapped[str | None] = mapped_column(String(32), nullable=True)
    prompt_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
    completion_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
    # Exact decimal with four fractional digits, in USD.
    cost_usd: Mapped[Decimal] = mapped_column(
        Numeric(10, 4), nullable=False, default=Decimal("0.0000")
    )
    # Timezone-aware; set by the database at insert time.
    recorded_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=text("NOW()")
    )

    __table_args__ = (
        # Matches the migration / init_db DDL, which create this index as
        # (project_id, recorded_at DESC); the ascending form drifted from it.
        Index(
            "idx_budget_ledger_project_date",
            "project_id",
            text("recorded_at DESC"),
        ),
    )
count = await redis.incr(key) if count == 1: await redis.expire(key, _RATE_LIMIT_WINDOW_SEC) @@ -156,12 +156,15 @@ async def _check_rate_limit(chat_id: str) -> bool: # T3:Multi-turn session(Redis Hash TTL=300s,ADR-094) # ───────────────────────────────────────────────────────────────────────────── -async def _load_session_context(chat_id: str, user_id: int) -> str: +async def _load_session_context(chat_id: str, user_id: int, project_id: str = "awoooi") -> str: """載入最近 3 輪對話歷史(最多 600 字),組成 context prefix。Redis 不可用時回空字串。""" try: redis = get_redis() - key = f"hermes:session:{chat_id}:{user_id}" + key = f"{project_id}:hermes:session:{chat_id}:{user_id}" data = await redis.hgetall(key) + if not data: + # Phase A: fallback 到舊 key(滾動部署相容) + data = await redis.hgetall(f"hermes:session:{chat_id}:{user_id}") if not data: return "" turns = sorted( @@ -175,16 +178,19 @@ async def _load_session_context(chat_id: str, user_id: int) -> str: async def _save_session_turn( - chat_id: str, user_id: int, user_msg: str, assistant_reply: str + chat_id: str, user_id: int, user_msg: str, assistant_reply: str, project_id: str = "awoooi" ) -> None: """將本輪對話存入 Redis Hash,並重置 TTL=300s。Redis 不可用時靜默忽略。""" try: redis = get_redis() - key = f"hermes:session:{chat_id}:{user_id}" + key = f"{project_id}:hermes:session:{chat_id}:{user_id}" + legacy_key = f"hermes:session:{chat_id}:{user_id}" # Phase A dual-write turn_key = f"turn_{int(time.time())}" value = f"用戶:{user_msg[:100]}\nHermes:{assistant_reply[:200]}" await redis.hset(key, turn_key, value) await redis.expire(key, 300) + await redis.hset(legacy_key, turn_key, value) + await redis.expire(legacy_key, 300) except Exception: pass @@ -199,6 +205,7 @@ async def process_nl_message( chat_id: str, user_id: int, username: str = "", + project_id: str = "awoooi", ) -> str: """ 處理 NL 訊息,回傳 Telegram 格式的回覆文字。 @@ -231,7 +238,7 @@ async def process_nl_message( ) # T2:速率限制 - if not await _check_rate_limit(chat_id): + if not await _check_rate_limit(chat_id, 
project_id): return "⚠️ 請求太頻繁,請稍後再試(每分鐘上限 20 次)。" # Layer 1 意圖路由 @@ -249,7 +256,7 @@ async def process_nl_message( system_prompt = get_agent_system_prompt(agent_name) or "" # T3:載入 session context(最近 3 輪) - session_ctx = await _load_session_context(chat_id, user_id) + session_ctx = await _load_session_context(chat_id, user_id, project_id) prompt_with_ctx = f"{session_ctx}{user_message}" if session_ctx else user_message t0 = time.monotonic() @@ -306,7 +313,7 @@ async def process_nl_message( # T3:儲存本輪對話(只在成功時存) if success: - await _save_session_turn(chat_id, user_id, user_message, result_text) + await _save_session_turn(chat_id, user_id, user_message, result_text, project_id) # T1:非阻擋寫入 hermes_dispatch_log(失敗不影響回覆) asyncio.create_task( diff --git a/apps/api/src/main.py b/apps/api/src/main.py index 68287d6a..139010a9 100644 --- a/apps/api/src/main.py +++ b/apps/api/src/main.py @@ -65,6 +65,7 @@ from src.api.v1 import ( signoz_webhook as signoz_webhook_v1, # Phase 21: SignOz → Telegram (ADR-037) ) from src.api.v1 import drift as drift_v1 # Phase 25 P2: Config Drift Detection +from src.api.v1 import platform as platform_v1 # AwoooP Phase 4: Platform Shell(Shadow Mode) from src.api.v1 import rag as rag_v1 # Phase 33 ADR-067: RAG 知識庫 from src.api.v1 import monitoring as monitoring_v1 # 2026-04-03: 監控工具狀態 from src.api.v1 import notifications as notifications_v1 # 2026-04-10: 通知頻道狀態 @@ -185,6 +186,11 @@ else: @asynccontextmanager async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: """Application lifespan events""" + # AwoooP Phase 2.4 (2026-05-04 ogt): 設定 startup handler 的 project_id context + # asyncio.create_task() 自動繼承父任務的 ContextVar → 31 個 background loop 全部標記為 awoooi + from src.core.context import PROJECT_ID + PROJECT_ID.set("awoooi") + # Startup logger.info( "api_startup", @@ -703,6 +709,16 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: except Exception as e: logger.warning("model_version_tracker_schedule_failed", error=str(e)) + # 
AwoooP Phase 4 (2026-05-04 ogt + Claude Sonnet 4.6): Platform Worker(Shadow Mode Shell) + # ADR-106 Strangler Fig Phase 4:SKIP LOCKED run worker + stale run reaper + # Shadow mode:is_shadow=True,0 user-visible response,0 destructive tool call + try: + from src.workers.platform_worker import start_platform_worker + await start_platform_worker() + logger.info("platform_worker_started", mode="shadow") + except Exception as e: + logger.warning("platform_worker_start_failed", error=str(e)) + yield # Shutdown @@ -727,6 +743,14 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: except Exception as e: logger.warning("auto_repair_drain_failed", error=str(e)) + # AwoooP Phase 4: Platform Worker 優雅停機(2026-05-04 ogt) + try: + from src.workers.platform_worker import stop_platform_worker + await stop_platform_worker() + logger.info("platform_worker_stopped") + except Exception as e: + logger.warning("platform_worker_stop_failed", error=str(e)) + # Phase 6.1: 關閉 Signal Worker (先關閉 Consumer) await close_signal_worker() await publisher.stop() @@ -968,6 +992,8 @@ app.include_router(agent.router, prefix="/api/v1/agent", tags=["Agent"]) app.include_router( notifications.router, prefix="/api/v1/notifications", tags=["Notifications"] ) +# AwoooP Phase 4 (2026-05-04 ogt): Platform Shell — Shadow Mode Run API +app.include_router(platform_v1.router, prefix="/api/v1/platform", tags=["AwoooP Platform"]) # ============================================================================= diff --git a/apps/api/src/models/awooop_contracts.py b/apps/api/src/models/awooop_contracts.py new file mode 100644 index 00000000..abb938f0 --- /dev/null +++ b/apps/api/src/models/awooop_contracts.py @@ -0,0 +1,437 @@ +""" +AwoooP Contract Pydantic Models +================================ +Phase 3: 六合約家族 Pydantic v2 驗證模型(ADR-112) +2026-05-04 ogt + Claude Sonnet 4.6 + +六合約家族: + 1. ProjectTenantContract — 租戶/專案能力邊界 + 2. AgentContract — Agent 模型、工具、治理 + 3. MCPGatewayContract — MCP 工具閘道 + 4. 
PolicyRoutingContract — LLM 路由規則 + 5. RuntimeRunStateContract — Run FSM 狀態 + 6. ChannelEventContract — Channel 事件(冪等) + +所有含 artifact ref 的欄位都附 sha256(ADR-112 artifact integrity)。 +""" + +from __future__ import annotations + +import re +from datetime import datetime +from enum import Enum +from typing import Any +from uuid import UUID + +from pydantic import BaseModel, Field, field_validator, model_validator + + +# ───────────────────────────────────────────────────────────────────────────── +# 共用型別 +# ───────────────────────────────────────────────────────────────────────────── + +_SHA256_RE = re.compile(r"^[0-9a-f]{64}$") +_PROJECT_ID_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{1,63}$") +_AGENT_ID_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{1,127}$") +_UUID_RE = re.compile( + r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" +) + + +def _validate_sha256(v: str | None, field_name: str = "sha256") -> str | None: + if v is None: + return v + if not _SHA256_RE.match(v): + raise ValueError(f"{field_name} 必須為 64 位 hex 字串") + return v + + +class MigrationMode(str, Enum): + LEGACY = "legacy_awoooi_default" + SHADOW = "shadow" + CANARY = "canary" + ACTIVE = "active" + + +class ChannelType(str, Enum): + TELEGRAM = "telegram" + SLACK = "slack" + WEBHOOK = "webhook" + API = "api" + + +class Provider(str, Enum): + ANTHROPIC = "anthropic" + OPENAI = "openai" + OLLAMA = "ollama" + GEMINI = "gemini" + NVIDIA = "nvidia" + OPENROUTER = "openrouter" + + +class RunState(str, Enum): + PENDING = "pending" + RUNNING = "running" + WAITING_APPROVAL = "waiting_approval" + WAITING_TOOL = "waiting_tool" + COMPLETED = "completed" + FAILED = "failed" + CANCELLED = "cancelled" + TIMEOUT = "timeout" + + +class AuthScheme(str, Enum): + NONE = "none" + BEARER = "bearer" + HMAC = "hmac" + + +class Transport(str, Enum): + STDIO = "stdio" + HTTP = "http" + SSE = "sse" + + +class EventType(str, Enum): + MESSAGE_RECEIVED = "message_received" + CALLBACK_QUERY = "callback_query" + COMMAND_INVOKED = 
# Tenant / project contract (ADR-111/115).  Unknown fields are rejected.
# The class docstring is kept verbatim because pydantic publishes it as the
# JSON-schema description of the model.
class ProjectTenantContract(BaseModel):
    """租戶/專案合約(ADR-111/115)"""

    model_config = {"extra": "forbid"}

    # Tenant identifier; its format is enforced by validate_project_id below.
    project_id: str = Field(..., description="全局唯一租戶識別符")
    display_name: str = Field(..., min_length=1, max_length=256)
    migration_mode: MigrationMode = MigrationMode.LEGACY
    # Optional, non-negative spending cap in USD.
    budget_limit_usd: float | None = Field(None, ge=0)
    allowed_channels: list[ChannelType] = Field(default_factory=list)
    is_active: bool = True
    metadata: dict[str, Any] = Field(default_factory=dict)

    @field_validator("project_id")
    @classmethod
    def validate_project_id(cls, v: str) -> str:
        """Return ``v`` when it matches ``_PROJECT_ID_RE``; otherwise raise ValueError."""
        if _PROJECT_ID_RE.match(v) is None:
            raise ValueError("project_id 只允許 a-z, 0-9, _, -,長度 2-64")
        return v

    @field_validator("allowed_channels")
    @classmethod
    def validate_unique_channels(cls, v: list[ChannelType]) -> list[ChannelType]:
        """Return ``v`` unchanged when it holds no repeated channel; otherwise raise ValueError."""
        distinct = set(v)
        if len(distinct) < len(v):
            raise ValueError("allowed_channels 不可包含重複項目")
        return v
# Artifact reference that always carries a SHA-256 digest (ADR-112 artifact
# integrity).  Class docstrings in this group are kept verbatim because
# pydantic publishes them as the JSON-schema description of each model.
class ArtifactRef(BaseModel):
    """含 SHA-256 的 artifact 參照(ADR-112 artifact integrity)"""

    model_config = {"extra": "forbid"}

    artifact_id: str
    sha256: str = Field(..., description="SHA-256 hex digest(64 位)")

    @field_validator("sha256")
    @classmethod
    def validate_sha256(cls, v: str) -> str:
        # Delegates the format check to the module-level helper.
        return _validate_sha256(v, "sha256")  # type: ignore[return-value]


# Reference from an agent to one of its tools.  Unlike the other contracts in
# this group, unknown fields are accepted here (extra="allow").
class ToolRef(BaseModel):
    """Agent 工具參照"""

    model_config = {"extra": "allow"}

    tool_name: str
    mcp_gateway_id: str | None = None
    # Optional digest; validated only when present.
    sha256: str | None = None

    @field_validator("sha256")
    @classmethod
    def validate_sha256(cls, v: str | None) -> str | None:
        # None passes through; a non-None value must be a 64-char hex digest.
        return _validate_sha256(v, "tool sha256")


# Agent contract (ADR-112): model selection, limits, tools and approval policy.
class AgentContract(BaseModel):
    """Agent 合約(ADR-112)"""

    model_config = {"extra": "forbid"}

    agent_id: str = Field(..., description="Agent 識別符")
    agent_name: str = Field(..., min_length=1, max_length=256)
    model: str = Field(..., min_length=1, max_length=128)
    provider: Provider
    max_tokens: int | None = Field(None, ge=1, le=200000)
    temperature: float | None = Field(None, ge=0.0, le=2.0)
    system_prompt_ref: ArtifactRef | None = None
    tools: list[ToolRef] = Field(default_factory=list)
    budget_limit_usd_per_run: float | None = Field(None, ge=0)
    require_approval: bool = False
    # Bounded to 60..86400 seconds when given.
    approval_timeout_seconds: int | None = Field(None, ge=60, le=86400)
    max_parallel_runs: int = Field(1, ge=1, le=100)
    tags: list[str] = Field(default_factory=list)

    @field_validator("agent_id")
    @classmethod
    def validate_agent_id(cls, v: str) -> str:
        # Rejects any agent_id that does not match _AGENT_ID_RE.
        if not _AGENT_ID_RE.match(v):
            raise ValueError("agent_id 只允許 a-z, 0-9, _, -,長度 2-128")
        return v

    @model_validator(mode="after")
    def validate_approval_config(self) -> AgentContract:
        # When approval is required but no timeout was supplied, default it to
        # 300 seconds.  This mutates the validated instance in place.
        if self.require_approval and self.approval_timeout_seconds is None:
            self.approval_timeout_seconds = 300
        return self
# Definition of one tool exposed by a gateway.  Class docstrings in this group
# are kept verbatim because pydantic publishes them as the JSON-schema
# description of each model.
class ToolExposed(BaseModel):
    """Gateway 暴露的工具定義"""

    model_config = {"extra": "forbid"}

    tool_name: str
    description: str | None = None
    # Required digest of the tool's input schema.
    schema_sha256: str = Field(..., description="工具 input schema SHA-256")
    is_destructive: bool = False

    @field_validator("schema_sha256")
    @classmethod
    def validate_schema_sha256(cls, v: str) -> str:
        # Delegates the format check to the module-level helper.
        return _validate_sha256(v, "schema_sha256")  # type: ignore[return-value]


# MCP gateway contract (ADR-113).
class MCPGatewayContract(BaseModel):
    """MCP Gateway 合約(ADR-113)"""

    model_config = {"extra": "forbid"}

    gateway_id: str
    gateway_name: str = Field(..., min_length=1, max_length=256)
    transport: Transport
    # Required for HTTP/SSE transports (enforced by validate_http_endpoint).
    endpoint: str | None = None
    auth_scheme: AuthScheme = AuthScheme.NONE
    # NOTE(review): nothing here requires hmac_secret_ref when auth_scheme is
    # HMAC — confirm whether that is enforced elsewhere.
    hmac_secret_ref: str | None = None
    tools_exposed: list[ToolExposed] = Field(default_factory=list)
    rate_limit_rpm: int | None = Field(None, ge=1)
    timeout_seconds: int = Field(30, ge=1, le=300)
    is_enabled: bool = True

    @model_validator(mode="after")
    def validate_http_endpoint(self) -> MCPGatewayContract:
        # HTTP and SSE transports need a non-empty endpoint; STDIO does not.
        if self.transport in (Transport.HTTP, Transport.SSE) and not self.endpoint:
            raise ValueError(f"transport={self.transport} 時 endpoint 為必填")
        return self
class TimeRange(BaseModel):
    # UTC time-of-day window expressed as "HH:MM" strings.
    # The hour part is restricted to 00-23; the earlier pattern
    # `[0-2][0-9]` also accepted impossible times such as "27:30".
    model_config = {"extra": "forbid"}

    start_utc: str = Field(..., pattern=r"^([01][0-9]|2[0-3]):[0-5][0-9]$")
    end_utc: str = Field(..., pattern=r"^([01][0-9]|2[0-3]):[0-5][0-9]$")


class RoutingCondition(BaseModel):
    # Optional conditions under which a routing rule applies.
    model_config = {"extra": "forbid"}

    task_types: list[str] = Field(default_factory=list)
    max_prompt_tokens: int | None = Field(None, ge=1)
    time_range: TimeRange | None = None


class RoutingRule(BaseModel):
    # One routing rule: which provider/model to use, with a priority (0..9999)
    # and a weight (1..100, default 100).
    model_config = {"extra": "forbid"}

    rule_id: str
    priority: int = Field(..., ge=0, le=9999)
    provider: Provider
    model: str
    condition: RoutingCondition | None = None
    weight: int = Field(100, ge=1, le=100)


class RetryPolicy(BaseModel):
    # Retry settings; every field has a default so the policy can be omitted.
    model_config = {"extra": "forbid"}

    max_retries: int = Field(3, ge=0, le=10)
    backoff_base_seconds: float = Field(1.0, ge=0.1, le=60)
    retry_on_provider_errors: bool = True


# Routing / policy contract.  The class docstring is kept verbatim because
# pydantic publishes it as the JSON-schema description of the model.
class PolicyRoutingContract(BaseModel):
    """路由/政策合約"""

    model_config = {"extra": "forbid"}

    policy_id: str
    policy_name: str = Field(..., min_length=1, max_length=256)
    # At least one rule is required.
    routing_rules: list[RoutingRule] = Field(..., min_length=1)
    fallback_provider: Provider | None = None
    fallback_model: str | None = None
    max_cost_per_run_usd: float | None = Field(None, ge=0)
    retry_policy: RetryPolicy = Field(default_factory=RetryPolicy)
    # NOTE(review): no check here that effective_from precedes effective_to —
    # confirm whether callers rely on that ordering.
    effective_from: datetime | None = None
    effective_to: datetime | None = None
class RunTrigger(BaseModel):
    # What started a run.  trigger_type is limited to the five values in the
    # pattern below; the remaining fields are optional references.
    model_config = {"extra": "forbid"}

    trigger_type: str = Field(
        ..., pattern="^(channel_event|schedule|api|sub_agent|retry)$"
    )
    channel_event_id: str | None = None
    schedule_id: str | None = None
    triggered_by: str | None = None


# Run state-machine contract (ADR-106 Phase 3).  The class docstring is kept
# verbatim because pydantic publishes it as the JSON-schema description.
class RuntimeRunStateContract(BaseModel):
    """Run 狀態機合約(ADR-106 Phase 3)"""

    model_config = {"extra": "forbid"}

    # NOTE(review): the description says "UUID v7", but validate_uuid below
    # only checks the generic 8-4-4-4-12 lowercase-hex shape, not the version.
    run_id: str = Field(..., description="UUID v7")
    project_id: str
    agent_id: str
    state: RunState
    trace_id: str | None = None
    parent_run_id: str | None = None
    trigger: RunTrigger | None = None
    input_sha256: str | None = None
    output_sha256: str | None = None
    started_at: datetime | None = None
    completed_at: datetime | None = None
    timeout_at: datetime | None = None
    error_code: str | None = None
    cost_usd: float | None = Field(None, ge=0)
    step_count: int = Field(0, ge=0)

    @field_validator("run_id", "parent_run_id")
    @classmethod
    def validate_uuid(cls, v: str | None) -> str | None:
        # None passes through; otherwise the value must match _UUID_RE.
        if v is None:
            return v
        if not _UUID_RE.match(v):
            raise ValueError("必須為標準 UUID 格式")
        return v

    @field_validator("input_sha256", "output_sha256")
    @classmethod
    def validate_sha256_fields(cls, v: str | None) -> str | None:
        # Optional digests; validated only when present.
        return _validate_sha256(v)

    @field_validator("project_id")
    @classmethod
    def validate_project_id(cls, v: str) -> str:
        # Same format rule as the tenant contract (_PROJECT_ID_RE).
        if not _PROJECT_ID_RE.match(v):
            raise ValueError("project_id 格式不合法")
        return v
class AttachmentRef(BaseModel):
    # Attachment reference: one of four media kinds, a file id and an
    # optional SHA-256 digest.
    model_config = {"extra": "forbid"}

    attachment_type: str = Field(..., pattern="^(photo|document|audio|video)$")
    file_id: str
    sha256: str | None = None

    @field_validator("sha256")
    @classmethod
    def validate_sha256(cls, v: str | None) -> str | None:
        # None passes through; a non-None value must be a 64-char hex digest.
        return _validate_sha256(v, "attachment sha256")


# Channel event contract (ADR-114, idempotent de-duplication).  The class
# docstring is kept verbatim because pydantic publishes it as the JSON-schema
# description of the model.
class ChannelEventContract(BaseModel):
    """Channel Event 合約(ADR-114 冪等去重)"""

    model_config = {"extra": "forbid"}

    event_id: str = Field(..., description="Platform 生成的 UUID")
    project_id: str
    channel_type: ChannelType
    event_type: EventType
    provider_event_id: str | None = Field(None, max_length=256)
    user_id: str | None = None
    chat_id: str | None = None
    # Must contain at least one key.
    payload: dict[str, Any] = Field(..., min_length=1)
    text: str | None = Field(None, max_length=4096)
    attachments: list[AttachmentRef] = Field(default_factory=list)
    run_id: str | None = None
    is_duplicate: bool = False
    received_at: datetime

    @field_validator("event_id", "run_id")
    @classmethod
    def validate_uuid(cls, v: str | None) -> str | None:
        # None passes through; otherwise the value must match _UUID_RE.
        if v is None:
            return v
        if not _UUID_RE.match(v):
            raise ValueError("必須為標準 UUID 格式")
        return v

    @field_validator("project_id")
    @classmethod
    def validate_project_id(cls, v: str) -> str:
        # Same format rule as the tenant contract (_PROJECT_ID_RE).
        if not _PROJECT_ID_RE.match(v):
            raise ValueError("project_id 格式不合法")
        return v


# Dispatcher table: contract_family name -> pydantic model used to validate it.
CONTRACT_FAMILY_MODELS: dict[str, type[BaseModel]] = {
    "project_tenant": ProjectTenantContract,
    "agent": AgentContract,
    "mcp_gateway": MCPGatewayContract,
    "policy_routing": PolicyRoutingContract,
    "runtime_run_state": RuntimeRunStateContract,
    "channel_event": ChannelEventContract,
}
def validate_contract_body(family: str, body: dict[str, Any]) -> BaseModel:
    """Validate ``body`` against the model registered for ``family``.

    Args:
        family: Contract family name; must be a key of CONTRACT_FAMILY_MODELS.
        body: Raw contract body to validate.

    Returns:
        The validated model instance.

    Raises:
        ValueError: ``family`` is not a known contract family.
        pydantic.ValidationError: ``body`` fails the family's schema.
    """
    if family not in CONTRACT_FAMILY_MODELS:
        # List the legal values so the caller can see what was expected.
        message = (
            f"未知 contract_family: {family!r}。"
            f"合法值:{sorted(VALID_CONTRACT_FAMILIES)}"
        )
        raise ValueError(message)

    contract_model = CONTRACT_FAMILY_MODELS[family]
    return contract_model.model_validate(body)
本機開發 fallback:讀 AWOOOP_DEV_SECRETS_JSON 環境變數(dev only) +""" + +from __future__ import annotations + +import hashlib +import os +import re + +import structlog + +logger = structlog.get_logger(__name__) + +# k8s secret ref 格式正則(與 DB CHECK 一致) +_K8S_REF_RE = re.compile(r"^([a-z0-9-]+)/([a-z0-9-]+)#([a-zA-Z0-9_-]+)$") + +# dev fallback:JSON 格式 {"namespace/secret-name#key": "actual_value"} +_DEV_SECRETS_ENV = "AWOOOP_DEV_SECRETS_JSON" + + +class CredentialResolutionError(Exception): + error_code = "E-MCP-GATE-009" + + +def _mask_secret(value: str) -> str: + """回傳遮罩版:前 4 + *** + 後 4(若長度 < 8 則全遮罩)""" + if len(value) < 8: + return "***" + return f"{value[:4]}***{value[-4:]}" + + +def _sha256_secret(value: str) -> str: + return hashlib.sha256(value.encode()).hexdigest() + + +async def resolve_k8s_secret(ref: str) -> tuple[str, str, str]: + """ + 解析 k8s secret ref,回傳 (actual_value, masked_value, sha256)。 + + actual_value:明文,caller 必須在使用後清除(不可存入任何持久化層) + masked_value:供 log / response 使用 + sha256:供 awooop_mcp_credential_refs.value_sha256 記錄 + + Raises: + CredentialResolutionError: ref 格式錯誤或 secret 不存在 + """ + m = _K8S_REF_RE.match(ref) + if not m: + raise CredentialResolutionError( + f"k8s secret ref 格式錯誤(期望 'namespace/secret-name#key'):{ref!r}" + ) + + namespace, secret_name, key = m.group(1), m.group(2), m.group(3) + + # Dev fallback:讀環境變數 + dev_json = os.environ.get(_DEV_SECRETS_ENV) + if dev_json: + try: + import json + dev_secrets: dict[str, str] = json.loads(dev_json) + value = dev_secrets.get(ref) + if value is None: + raise CredentialResolutionError( + f"dev secrets 中找不到 ref={ref!r}" + ) + logger.debug("credential_resolved_dev", ref=ref) + return value, _mask_secret(value), _sha256_secret(value) + except CredentialResolutionError: + raise + except Exception as exc: + raise CredentialResolutionError( + f"AWOOOP_DEV_SECRETS_JSON 解析失敗: {exc}" + ) from exc + + # Production:k8s in-cluster + try: + from kubernetes_asyncio import client, config # type: ignore[import] + from 
kubernetes_asyncio.client import CoreV1Api # type: ignore[import] + + await config.load_incluster_config() + async with client.ApiClient() as api: + v1 = CoreV1Api(api) + secret = await v1.read_namespaced_secret(secret_name, namespace) + + if secret.data is None or key not in secret.data: + raise CredentialResolutionError( + f"k8s secret '{namespace}/{secret_name}' 中找不到 key='{key}'" + ) + + import base64 + encoded = secret.data[key] + value = base64.b64decode(encoded).decode() + + logger.info( + "credential_resolved_k8s", + namespace=namespace, + secret_name=secret_name, + key=key, + masked=_mask_secret(value), + ) + return value, _mask_secret(value), _sha256_secret(value) + + except CredentialResolutionError: + raise + except ImportError: + raise CredentialResolutionError( + "kubernetes_asyncio 未安裝,且未設定 AWOOOP_DEV_SECRETS_JSON(dev fallback)" + ) + except Exception as exc: + logger.exception( + "credential_resolution_k8s_failed", + ref=ref, + error=str(exc), + ) + raise CredentialResolutionError( + f"k8s secret 解析失敗({namespace}/{secret_name}#{key}): {exc}" + ) from exc diff --git a/apps/api/src/plugins/mcp/gateway.py b/apps/api/src/plugins/mcp/gateway.py new file mode 100644 index 00000000..45dae66e --- /dev/null +++ b/apps/api/src/plugins/mcp/gateway.py @@ -0,0 +1,507 @@ +""" +MCP Gateway — 五閘門 Enforcement Service +========================================= +AwoooP Phase 5.2: ADR-116 五閘門強制執行 +2026-05-04 ogt + Claude Sonnet 4.6 + +五閘門定義(依序,任一失敗即阻斷): + Gate 1 — Project:project_id 在 awooop_projects 且 migration_mode != 'legacy_awoooi_default' + Gate 2 — Agent:agent_id 在 awooop_agents 且 status = 'active' + Gate 3 — Tool:tool_id 在 awooop_mcp_tool_registry 且 grant 存在且未到期 + Gate 4 — Environment:tool.environment_tags 與 run context 匹配(shadow mode 強制放行) + Gate 5 — Approval:工具 scope 需要 approval 時,檢查 multi_sig 是否已核准 + +錯誤碼(E-MCP-GATE-XXX): + E-MCP-GATE-001 Gate 1 project 不存在或 migration_mode 不符 + E-MCP-GATE-002 Gate 2 agent 不存在或未啟用 + E-MCP-GATE-003 Gate 3 tool 不在白名單或 grant 
class McpGatewayError(Exception):
    """Base class for every error raised when the gateway blocks a call.

    Attributes are ``error_code`` (an "E-MCP-GATE-XXX" string) and ``gate``
    (the number of the gate that rejected the call); ``str(exc)`` is the
    message.
    """

    def __init__(self, error_code: str, message: str, gate: int) -> None:
        self.gate = gate
        self.error_code = error_code
        Exception.__init__(self, message)


class GateProjectError(McpGatewayError):
    """Gate 1 rejection (E-MCP-GATE-001)."""

    def __init__(self, msg: str = "project 不存在或 migration_mode 不符") -> None:
        super().__init__(error_code="E-MCP-GATE-001", message=msg, gate=1)


class GateAgentError(McpGatewayError):
    """Gate 2 rejection (E-MCP-GATE-002)."""

    def __init__(self, msg: str = "agent 不存在或未啟用") -> None:
        super().__init__(error_code="E-MCP-GATE-002", message=msg, gate=2)


class GateToolError(McpGatewayError):
    """Gate 3 rejection (E-MCP-GATE-003)."""

    def __init__(self, msg: str = "tool 不在白名單或 grant 失效") -> None:
        super().__init__(error_code="E-MCP-GATE-003", message=msg, gate=3)
@dataclass
class GatewayContext:
    """Per-call context handed to the gateway (one instance per call)."""

    project_id: str
    agent_id: str
    tool_name: str
    run_id: UUID | None = None
    trace_id: str | None = None
    # Shadow mode: Gates 4 and 5 let the call through.  The original note also
    # says destructive tools are not executed in this mode — TODO confirm
    # where that is enforced.
    is_shadow: bool = True
    # Environment tags of the run, e.g. {"env": "prod"}.
    environment: dict[str, str] = field(default_factory=dict)
    # One of "read" | "write" | "admin".
    required_scope: str = "read"


@dataclass
class GateCheckResult:
    """Pass/fail flag for each of the five gates; all start as False."""

    gate1_project: bool = False
    gate2_agent: bool = False
    gate3_tool: bool = False
    gate4_env: bool = False
    gate5_approval: bool = False

    def as_dict(self) -> dict[str, bool]:
        """Return the five flags as a dict keyed by gate name, in gate order."""
        gate_names = (
            "gate1_project",
            "gate2_agent",
            "gate3_tool",
            "gate4_env",
            "gate5_approval",
        )
        return {gate: getattr(self, gate) for gate in gate_names}

    @property
    def all_passed(self) -> bool:
        """True only when every gate flag is truthy."""
        return all(self.as_dict().values())
    async def call(
        self,
        ctx: GatewayContext,
        parameters: dict[str, Any],
    ) -> MCPToolResult:
        """
        Run the five gate checks in order, then execute the tool.

        Args:
            ctx: Per-call gateway context.
            parameters: Tool input parameters.

        Returns:
            The result returned by ``_execute_tool``.

        Raises:
            McpGatewayError: a gate rejected the call; a "blocked" audit row
                is attempted before the error is re-raised.
            Exception: anything raised by tool execution is logged and
                re-raised after a "failed" audit row is attempted.
        """
        started = time.monotonic()
        gate_result = GateCheckResult()
        tool_row: AwoooPMcpToolRegistry | None = None
        grant_row: AwoooPMcpGrant | None = None

        try:
            # Gate 1 — Project
            tool_row, grant_row = await self._gate1_project(ctx, gate_result)

            # Gate 2 — Agent
            await self._gate2_agent(ctx, gate_result)

            # Gate 3 — Tool + Grant (supplies the tool and grant rows used below)
            tool_row, grant_row = await self._gate3_tool(ctx, gate_result)

            # Gate 4 — Environment (passes straight through in shadow mode)
            await self._gate4_environment(ctx, tool_row, gate_result)

            # Gate 5 — Approval (passes straight through in shadow mode or for scope=read)
            await self._gate5_approval(ctx, grant_row, gate_result)

        except McpGatewayError as exc:
            # NOTE(review): only McpGatewayError is handled here; any other
            # exception raised inside a gate propagates without an audit row.
            latency = int((time.monotonic() - started) * 1000)
            await self._write_audit(
                ctx=ctx,
                tool_row=tool_row,
                parameters=parameters,
                result=None,
                gate_result=gate_result,
                result_status="blocked",
                block_gate=exc.gate,
                block_reason=f"{exc.error_code}: {exc}",
                latency_ms=latency,
            )
            raise

        # All five gates passed -> execute the tool.
        result: MCPToolResult | None = None
        # Stays "failed" unless execution returns a result with success=True.
        result_status = "failed"
        try:
            result = await self._execute_tool(ctx, tool_row, parameters)
            result_status = "success" if result.success else "failed"
            return result
        except Exception as exc:
            logger.exception(
                "mcp_gateway_execution_error",
                project_id=ctx.project_id,
                tool_name=ctx.tool_name,
                error=str(exc),
            )
            raise
        finally:
            # Runs on both the return and the raise path, so the audit write
            # is attempted for every executed call.
            latency = int((time.monotonic() - started) * 1000)
            await self._write_audit(
                ctx=ctx,
                tool_row=tool_row,
                parameters=parameters,
                result=result,
                gate_result=gate_result,
                result_status=result_status,
                block_gate=None,
                block_reason=None,
                latency_ms=latency,
            )
gate_result: GateCheckResult + ) -> tuple[AwoooPMcpToolRegistry | None, AwoooPMcpGrant | None]: + """Gate 1:project 必須存在且 migration_mode != 'legacy_awoooi_default'""" + result = await self._db.execute( + select(AwoooPProject).where( + AwoooPProject.project_id == ctx.project_id, + AwoooPProject.migration_mode != "legacy_awoooi_default", + ) + ) + project = result.scalar_one_or_none() + if project is None: + raise GateProjectError( + f"project '{ctx.project_id}' 不存在或 migration_mode=legacy_awoooi_default" + ) + gate_result.gate1_project = True + return None, None + + async def _gate2_agent( + self, ctx: GatewayContext, gate_result: GateCheckResult + ) -> None: + """Gate 2:agent 必須在 awooop_active_revisions 中有 active contract(family='agent')""" + result = await self._db.execute( + select(AwoooPActiveRevision).where( + AwoooPActiveRevision.project_id == ctx.project_id, + AwoooPActiveRevision.contract_family == "agent", + AwoooPActiveRevision.contract_id == ctx.agent_id, + ) + ) + active = result.scalar_one_or_none() + if active is None: + raise GateAgentError( + f"agent '{ctx.agent_id}' 在 '{ctx.project_id}' 無 active contract" + ) + gate_result.gate2_agent = True + + async def _gate3_tool( + self, ctx: GatewayContext, gate_result: GateCheckResult + ) -> tuple[AwoooPMcpToolRegistry, AwoooPMcpGrant]: + """Gate 3:tool 在白名單 + grant 有效(未到期、未撤銷)""" + now = datetime.now(timezone.utc) + + # 查 tool registry + tool_result = await self._db.execute( + select(AwoooPMcpToolRegistry).where( + AwoooPMcpToolRegistry.project_id == ctx.project_id, + AwoooPMcpToolRegistry.tool_name == ctx.tool_name, + AwoooPMcpToolRegistry.is_active.is_(True), + ) + ) + tool_row = tool_result.scalar_one_or_none() + if tool_row is None: + raise GateToolError(f"tool '{ctx.tool_name}' 不在白名單") + + # 查 grant(scope 必須包含 required_scope) + grant_result = await self._db.execute( + select(AwoooPMcpGrant).where( + AwoooPMcpGrant.project_id == ctx.project_id, + AwoooPMcpGrant.agent_id == ctx.agent_id, + 
AwoooPMcpGrant.tool_id == tool_row.tool_id, + AwoooPMcpGrant.is_revoked.is_(False), + ) + ) + grant_row = grant_result.scalar_one_or_none() + if grant_row is None: + raise GateToolError( + f"agent '{ctx.agent_id}' 對 tool '{ctx.tool_name}' 無有效 grant" + ) + if grant_row.expires_at is not None and grant_row.expires_at < now: + raise GateToolError( + f"agent '{ctx.agent_id}' 對 tool '{ctx.tool_name}' 的 grant 已到期" + ) + # scope 檢查:required_scope 必須在 granted_scopes 中 + granted_scopes: list[str] = grant_row.granted_scopes or [] + if ctx.required_scope not in granted_scopes: + raise GateToolError( + f"grant 未包含所需 scope '{ctx.required_scope}'(有:{granted_scopes})" + ) + + gate_result.gate3_tool = True + return tool_row, grant_row + + async def _gate4_environment( + self, + ctx: GatewayContext, + tool_row: AwoooPMcpToolRegistry | None, + gate_result: GateCheckResult, + ) -> None: + """Gate 4:environment 標籤匹配(shadow mode 強制放行)""" + if ctx.is_shadow: + gate_result.gate4_env = True + return + + if tool_row is None: + gate_result.gate4_env = True + return + + required_tags: dict[str, str] = tool_row.environment_tags or {} + for k, v in required_tags.items(): + if ctx.environment.get(k) != v: + raise GateEnvironmentError( + f"environment tag '{k}' 期望 '{v}',實際 '{ctx.environment.get(k)}'" + ) + gate_result.gate4_env = True + + async def _gate5_approval( + self, + ctx: GatewayContext, + grant_row: AwoooPMcpGrant | None, + gate_result: GateCheckResult, + ) -> None: + """Gate 5:需要 approval 時,檢查 Redis multi_sig(shadow + read scope 直接放行)""" + # shadow mode 或 read scope 不需 approval + if ctx.is_shadow or ctx.required_scope == "read": + gate_result.gate5_approval = True + return + + # write/admin scope 需要檢查 approval + if ctx.run_id is None: + raise GateApprovalError("write/admin 操作需要 run_id(approval 追蹤用)") + + try: + import aioredis + + from src.core.config import settings + + redis = aioredis.from_url(settings.REDIS_URL) + approval_key = 
f"mcp_approval:{ctx.project_id}:{ctx.agent_id}:{ctx.tool_name}:{ctx.run_id}" + approved = await redis.get(approval_key) + await redis.aclose() + except Exception as exc: + logger.warning( + "mcp_gate5_redis_error", + project_id=ctx.project_id, + tool_name=ctx.tool_name, + error=str(exc), + ) + # Redis 失敗時 fail-closed(不放行) + raise GateApprovalError(f"approval Redis 查詢失敗: {exc}") from exc + + if not approved: + raise GateApprovalError( + f"tool '{ctx.tool_name}' 需要 approval(key={approval_key})" + ) + gate_result.gate5_approval = True + + # ── 執行層 ─────────────────────────────────────────────────────────────── + + async def _execute_tool( + self, + ctx: GatewayContext, + tool_row: AwoooPMcpToolRegistry | None, + parameters: dict[str, Any], + ) -> MCPToolResult: + """呼叫底層 MCP provider 執行工具""" + registry = get_provider_registry() + provider = registry.get(ctx.tool_name) or registry.get( + tool_row.tool_name if tool_row else ctx.tool_name + ) + + # 找不到 provider → 回傳 shadow no-op + if provider is None: + logger.warning( + "mcp_gateway_no_provider", + tool_name=ctx.tool_name, + is_shadow=ctx.is_shadow, + ) + return MCPToolResult( + success=True, + execution_id=f"shadow-noop-{ctx.tool_name}", + output={"shadow": True, "message": "no provider registered, shadow no-op"}, + ) + + audit_params = dict(parameters) + audit_params["_mcp_audit"] = { + "project_id": ctx.project_id, + "agent_id": ctx.agent_id, + "run_id": str(ctx.run_id) if ctx.run_id else None, + "trace_id": ctx.trace_id, + } + return await provider.execute(ctx.tool_name, audit_params) + + # ── Audit log ───────────────────────────────────────────────────────────── + + async def _write_audit( + self, + *, + ctx: GatewayContext, + tool_row: AwoooPMcpToolRegistry | None, + parameters: dict[str, Any], + result: MCPToolResult | None, + gate_result: GateCheckResult, + result_status: str, + block_gate: int | None, + block_reason: str | None, + latency_ms: int, + ) -> None: + """寫 awooop_mcp_gateway_audit — 只寫 hash,不寫明文 
async def gateway_call(
    db: AsyncSession,
    *,
    project_id: str,
    agent_id: str,
    tool_name: str,
    parameters: dict[str, Any],
    run_id: UUID | None = None,
    trace_id: str | None = None,
    is_shadow: bool = True,
    required_scope: str = "read",
    environment: dict[str, str] | None = None,
) -> MCPToolResult:
    """
    Stateless convenience wrapper around ``McpGateway.call()``.

    Builds a ``GatewayContext`` from the keyword arguments (a missing or empty
    ``environment`` becomes a new empty dict), creates a gateway bound to
    ``db`` and returns the result of the call.
    """
    context_fields = {
        "project_id": project_id,
        "agent_id": agent_id,
        "tool_name": tool_name,
        "run_id": run_id,
        "trace_id": trace_id,
        "is_shadow": is_shadow,
        "required_scope": required_scope,
        "environment": environment if environment else {},
    }
    gateway = McpGateway(db)
    return await gateway.call(GatewayContext(**context_fields), parameters)
# Maximum number of characters of MCP output allowed into the LLM context
# (guards against prompt stuffing).
_MCP_OUTPUT_MAX_CHARS = 16_000

# Key under which the gateway injects its audit context.
_MCP_AUDIT_KEY = "_mcp_audit"

# Parameter names treated as credentials and blanked in inputs.
_MCP_CREDENTIAL_FIELDS = frozenset({
    "k8s_value", "secret_value", "credential", "credential_value",
    "token_value", "api_key_value", "private_key_value",
})

# Maximum nesting handled by the output redactor before it stops descending.
_MAX_REDACTION_DEPTH = 8


def _redact_input_list(items: list[Any]) -> list[Any]:
    """Redact every element of a list from tool input, descending into nested lists."""
    cleaned: list[Any] = []
    for item in items:
        if isinstance(item, dict):
            cleaned.append(redact_mcp_input(item))
        elif isinstance(item, str):
            cleaned.append(_redact_string(item))
        elif isinstance(item, list):
            # Lists nested inside lists used to be copied through untouched,
            # so strings inside them skipped redaction.
            cleaned.append(_redact_input_list(item))
        else:
            cleaned.append(item)
    return cleaned


def redact_mcp_input(parameters: dict[str, Any]) -> dict[str, Any]:
    """
    Layer 2 input redaction: clean the input parameters of an MCP tool call.

    1. Drop the ``_mcp_audit`` entry (audit context, not for the provider).
    2. Replace credential fields with a placeholder (credential isolation).
    3. Replace fields whose name is on the audit_sink block list.
    4. Apply the audit_sink string patterns to every remaining string,
       recursing through nested dicts and lists.

    Returns a new dict; ``parameters`` is not modified.
    """
    cleaned: dict[str, Any] = {}
    for key, value in parameters.items():
        # Drop the injected audit context.
        if key == _MCP_AUDIT_KEY:
            continue

        # Credential isolation: plaintext credentials never reach the provider.
        if key.lower() in _MCP_CREDENTIAL_FIELDS:
            cleaned[key] = "[REDACTED:CREDENTIAL_ISOLATION]"
            continue

        # Field-name block list (aligned with audit_sink).
        if key.lower() in _BLOCKED_FIELD_NAMES:
            cleaned[key] = "[REDACTED:BLOCKED_FIELD]"
            continue

        if isinstance(value, str):
            cleaned[key] = _redact_string(value)
        elif isinstance(value, dict):
            cleaned[key] = redact_mcp_input(value)
        elif isinstance(value, list):
            cleaned[key] = _redact_input_list(value)
        else:
            cleaned[key] = value

    return cleaned


def redact_mcp_output(output: Any) -> Any:
    """
    Layer 2 output redaction: clean the output of an MCP tool call.

    - Strings are pattern-redacted and truncated to ``_MCP_OUTPUT_MAX_CHARS``.
    - Dicts are redacted recursively.
    - Lists are redacted element by element; once the serialized size of the
      processed elements exceeds the limit, a truncation marker is appended
      and the rest is dropped.
    - Any other value (including None) is returned unchanged.

    NOTE(review): dict outputs are redacted but not size-limited here —
    confirm whether a cap is applied elsewhere.
    """
    if output is None:
        return None

    if isinstance(output, str):
        redacted = _redact_string(output)
        if len(redacted) > _MCP_OUTPUT_MAX_CHARS:
            redacted = redacted[:_MCP_OUTPUT_MAX_CHARS] + f"\n[TRUNCATED:{len(output)} chars]"
        return redacted

    if isinstance(output, dict):
        return _redact_output_dict(output)

    if isinstance(output, list):
        result = []
        total = 0
        for item in output:
            if total > _MCP_OUTPUT_MAX_CHARS:
                result.append(f"[TRUNCATED:{len(output)} items total]")
                break
            cleaned = redact_mcp_output(item)
            serialized = json.dumps(cleaned, ensure_ascii=False, default=str)
            total += len(serialized)
            result.append(cleaned)
        return result

    return output


def _redact_output_list(items: list[Any], depth: int, nesting: int = 0) -> list[Any]:
    """Redact a list found inside an output dict.

    ``depth`` is the dict depth passed to dict elements; ``nesting`` counts
    list-inside-list levels and is capped so recursion stays bounded.
    """
    if nesting > _MAX_REDACTION_DEPTH:
        return ["[MAX_DEPTH]"]

    cleaned: list[Any] = []
    for item in items:
        if isinstance(item, dict):
            cleaned.append(_redact_output_dict(item, depth))
        elif isinstance(item, str):
            cleaned.append(_redact_string(item))
        elif isinstance(item, list):
            # Lists nested inside lists used to be copied through untouched,
            # letting secrets in e.g. table rows reach the LLM context.
            cleaned.append(_redact_output_list(item, depth, nesting + 1))
        else:
            cleaned.append(item)
    return cleaned


def _redact_output_dict(d: dict[str, Any], depth: int = 0) -> dict[str, Any]:
    """Recursively redact an output dict, stopping below ``_MAX_REDACTION_DEPTH``."""
    if depth > _MAX_REDACTION_DEPTH:
        return {"[MAX_DEPTH]": True}

    result: dict[str, Any] = {}
    for key, value in d.items():
        # Field-name block list.
        if key.lower() in _BLOCKED_FIELD_NAMES:
            result[key] = "[REDACTED:BLOCKED_FIELD]"
            continue

        if isinstance(value, str):
            result[key] = _redact_string(value)
        elif isinstance(value, dict):
            result[key] = _redact_output_dict(value, depth + 1)
        elif isinstance(value, list):
            result[key] = _redact_output_list(value, depth + 1)
        else:
            result[key] = value

    return result


def compute_safe_hash(data: Any) -> str:
    """Return the SHA-256 hex digest of ``data`` serialized as key-sorted JSON."""
    serialized = json.dumps(data, sort_keys=True, ensure_ascii=False, default=str)
    return hashlib.sha256(serialized.encode()).hexdigest()
wrapper that writes every MCP tool call to the audit subsystem.""" def __init__(self, provider: MCPToolProvider) -> None: - self._provider = provider + # __provider 使用 Python name mangling(_AuditedMCPToolProvider__provider) + # 防止 caller 透過 wrapper._provider 直接存取 inner provider(ADR-116 封裝要求) + self.__provider = provider # noqa: SLF001 — intentional name mangling @property def name(self) -> str: - return self._provider.name + return self.__provider.name @property def enabled(self) -> bool: - return self._provider.enabled + return self.__provider.enabled async def list_tools(self) -> list[MCPTool]: - return await self._provider.list_tools() + return await self.__provider.list_tools() async def execute( self, @@ -49,7 +51,7 @@ class AuditedMCPToolProvider(MCPToolProvider): started = monotonic_ms() result: MCPToolResult | None = None try: - result = await self._provider.execute(tool_name, provider_parameters) + result = await self.__provider.execute(tool_name, provider_parameters) return result finally: duration_ms = monotonic_ms() - started @@ -68,7 +70,7 @@ class AuditedMCPToolProvider(MCPToolProvider): ) async def health_check(self) -> bool: - return await self._provider.health_check() + return await self.__provider.health_check() class ProviderRegistry: diff --git a/apps/api/src/repositories/contract_repository.py b/apps/api/src/repositories/contract_repository.py new file mode 100644 index 00000000..a37036ba --- /dev/null +++ b/apps/api/src/repositories/contract_repository.py @@ -0,0 +1,261 @@ +""" +Contract Repository +=================== +AwoooP Phase 3: contract revision CRUD(append-only) +2026-05-04 ogt + Claude Sonnet 4.6(ADR-107/ADR-112) + +設計原則: +- append-only:已 published 的 revision 不可修改 +- active pointer 以 UPSERT 維護(awooop_active_revisions) +- outbox 事件在同一 transaction 寫入(ADR-113) +- RLS 透過 get_db_context() 自動套用 +""" + +from __future__ import annotations + +from typing import Any +from uuid import UUID + +import structlog +from sqlalchemy import select, 
class ContractTransitionError(ValueError):
    """Raised when a lifecycle transition finds no revision in the required source state."""


# ─────────────────────────────────────────────────────────────────────────────
# Read
# ─────────────────────────────────────────────────────────────────────────────

async def get_revision(
    revision_id: UUID,
    project_id: str = "awoooi",
) -> AwoooPContractRevision | None:
    """
    Load a single revision by id within one project.

    Returns:
        The matching revision, or ``None`` when no row has this
        ``revision_id`` and ``project_id``.
    """
    async with get_db_context(project_id) as db:
        result = await db.execute(
            select(AwoooPContractRevision).where(
                AwoooPContractRevision.revision_id == revision_id,
                AwoooPContractRevision.project_id == project_id,
            )
        )
        return result.scalar_one_or_none()


async def get_active_revision(
    project_id: str,
    contract_family: str,
    contract_id: str,
) -> AwoooPContractRevision | None:
    """
    Load the active revision of a contract (runtime read path).

    The revision is resolved through the active pointer table and is only
    returned when its ``lifecycle_status`` is ``'active'``.

    Returns:
        The active revision, or ``None`` when there is no pointer or the
        pointed-to revision is not active.
    """
    async with get_db_context(project_id) as db:
        result = await db.execute(
            select(AwoooPContractRevision)
            .join(
                AwoooPActiveRevision,
                AwoooPActiveRevision.active_revision_id == AwoooPContractRevision.revision_id,
            )
            .where(
                AwoooPActiveRevision.project_id == project_id,
                AwoooPActiveRevision.contract_family == contract_family,
                AwoooPActiveRevision.contract_id == contract_id,
                # Explicit tenant filter on the revision side too, matching
                # the other queries in this module.
                AwoooPContractRevision.project_id == project_id,
                AwoooPContractRevision.lifecycle_status == "active",
            )
        )
        return result.scalar_one_or_none()


async def list_revisions(
    project_id: str,
    contract_family: str,
    contract_id: str,
    lifecycle_status: str | None = None,
) -> list[AwoooPContractRevision]:
    """
    List the revisions of one contract, newest version first.

    Args:
        lifecycle_status: When given (non-empty), only revisions in this
            status are returned.

    Returns:
        Revisions ordered by ``version_major`` then ``version_minor``, both descending.
    """
    async with get_db_context(project_id) as db:
        q = select(AwoooPContractRevision).where(
            AwoooPContractRevision.project_id == project_id,
            AwoooPContractRevision.contract_family == contract_family,
            AwoooPContractRevision.contract_id == contract_id,
        )
        if lifecycle_status:
            q = q.where(AwoooPContractRevision.lifecycle_status == lifecycle_status)
        q = q.order_by(
            AwoooPContractRevision.version_major.desc(),
            AwoooPContractRevision.version_minor.desc(),
        )
        result = await db.execute(q)
        return list(result.scalars().all())


# ─────────────────────────────────────────────────────────────────────────────
# Write (append-only)
# ─────────────────────────────────────────────────────────────────────────────

async def create_draft(
    *,
    project_id: str,
    contract_family: str,
    contract_id: str,
    version_major: int,
    version_minor: int,
    body_json: dict[str, Any],
    body_hash: str,
    body_schema_version: str = "v1.0",
) -> AwoooPContractRevision:
    """
    Insert a new revision in ``draft`` status.

    Drafts are not visible to ``get_active_revision`` because that query
    only returns revisions whose status is ``'active'``.

    Returns:
        The inserted revision, refreshed after flush.
    """
    async with get_db_context(project_id) as db:
        revision = AwoooPContractRevision(
            project_id=project_id,
            contract_family=contract_family,
            contract_id=contract_id,
            version_major=version_major,
            version_minor=version_minor,
            lifecycle_status="draft",
            body_json=body_json,
            body_hash=body_hash,
            body_schema_version=body_schema_version,
        )
        db.add(revision)
        await db.flush()
        await db.refresh(revision)

        logger.info(
            "contract_draft_created",
            revision_id=str(revision.revision_id),
            project_id=project_id,
            contract_family=contract_family,
            contract_id=contract_id,
        )
        return revision


async def mark_published(
    *,
    revision_id: UUID,
    project_id: str,
    publisher_id: str,
    publish_signature: str,
    published_at: Any,  # datetime
) -> AwoooPContractRevision:
    """
    Transition a revision from ``draft`` to ``published``.

    Per the original notes this is called by the service layer after the
    HMAC signature has been verified; no verification happens here.

    Returns:
        The revision re-read after the update.

    Raises:
        ContractTransitionError: No revision with this id and project is in
            ``draft`` status, so nothing was published. Previously this case
            returned the unchanged row and logged a successful publish.
    """
    async with get_db_context(project_id) as db:
        outcome = await db.execute(
            update(AwoooPContractRevision)
            .where(
                AwoooPContractRevision.revision_id == revision_id,
                AwoooPContractRevision.project_id == project_id,
                AwoooPContractRevision.lifecycle_status == "draft",
            )
            .values(
                lifecycle_status="published",
                publisher_id=publisher_id,
                publish_signature=publish_signature,
                published_at=published_at,
            )
        )
        # The status guard lives in the WHERE clause, so a zero row count is
        # the only signal that the transition did not happen.
        if outcome.rowcount == 0:
            raise ContractTransitionError(
                f"revision {revision_id} of project {project_id!r} is not a draft; nothing published"
            )

        result = await db.execute(
            select(AwoooPContractRevision).where(
                AwoooPContractRevision.revision_id == revision_id,
                AwoooPContractRevision.project_id == project_id,
            )
        )
        revision = result.scalar_one()
        logger.info(
            "contract_published",
            revision_id=str(revision_id),
            project_id=project_id,
            publisher_id=publisher_id,
        )
        return revision


async def mark_active(
    *,
    revision_id: UUID,
    project_id: str,
    contract_family: str,
    contract_id: str,
    old_revision_id: UUID | None,
) -> AwoooPContractRevision:
    """
    Transition a revision from ``published`` to ``active``.

    Inside one ``get_db_context`` block (ADR-113 transactional outbox) this:
      1. sets the revision's status to ``active``,
      2. upserts the active pointer for the contract,
      3. adds a ``contract.activated`` outbox event,
      4. marks ``old_revision_id`` (if given and still active) as ``revoked``.

    Returns:
        The activated revision re-read after the updates.

    Raises:
        ContractTransitionError: No revision with this id and project is in
            ``published`` status. Raised before the pointer, outbox or old
            revision are touched, so a failed activation can no longer point
            the runtime at a non-active revision.
    """
    async with get_db_context(project_id) as db:
        # 1. Flip the revision's lifecycle status.
        outcome = await db.execute(
            update(AwoooPContractRevision)
            .where(
                AwoooPContractRevision.revision_id == revision_id,
                AwoooPContractRevision.project_id == project_id,
                AwoooPContractRevision.lifecycle_status == "published",
            )
            .values(lifecycle_status="active")
        )
        if outcome.rowcount == 0:
            raise ContractTransitionError(
                f"revision {revision_id} of project {project_id!r} is not published; nothing activated"
            )

        # 2. Upsert the active pointer.
        stmt = pg_insert(AwoooPActiveRevision).values(
            project_id=project_id,
            contract_family=contract_family,
            contract_id=contract_id,
            active_revision_id=revision_id,
        )
        stmt = stmt.on_conflict_do_update(
            constraint="uq_active_pointer",
            set_={
                "active_revision_id": revision_id,
            },
        )
        await db.execute(stmt)

        # 3. Record the outbox event (ADR-113).
        outbox_event = AwoooPContractOutbox(
            event_type="contract.activated",
            project_id=project_id,
            contract_family=contract_family,
            contract_id=contract_id,
            old_revision_id=old_revision_id,
            new_revision_id=revision_id,
        )
        db.add(outbox_event)

        # 4. Revoke the previously active revision, scoped to this project
        #    like every other statement here.
        if old_revision_id:
            await db.execute(
                update(AwoooPContractRevision)
                .where(
                    AwoooPContractRevision.revision_id == old_revision_id,
                    AwoooPContractRevision.project_id == project_id,
                    AwoooPContractRevision.lifecycle_status == "active",
                )
                .values(lifecycle_status="revoked")
            )

        result = await db.execute(
            select(AwoooPContractRevision).where(
                AwoooPContractRevision.revision_id == revision_id,
                AwoooPContractRevision.project_id == project_id,
            )
        )
        revision = result.scalar_one()

        logger.info(
            "contract_activated",
            revision_id=str(revision_id),
            old_revision_id=str(old_revision_id) if old_revision_id else None,
            project_id=project_id,
            contract_family=contract_family,
            contract_id=contract_id,
        )
        return revision
if status is not None: stmt = stmt.where(PlaybookRecord.status == status.value) @@ -356,8 +355,7 @@ class PlaybookRepository: """ try: # 使用 SELECT FOR UPDATE 確保並行 update_stats 不會 lost update - factory = get_session_factory() - async with factory() as session: + async with get_db_context() as session: async with session.begin(): stmt = ( select(PlaybookRecord) @@ -411,8 +409,7 @@ class PlaybookRepository: async def find_by_source_incident(self, incident_id: str) -> list[Playbook]: """根據來源 Incident ID 找 Playbook(從 PG 查詢)""" try: - factory = get_session_factory() - async with factory() as session: + async with get_db_context() as session: # PG JSONB contains 查詢 stmt = select(PlaybookRecord).where( PlaybookRecord.source_incident_ids.contains([incident_id]) @@ -529,10 +526,12 @@ class PlaybookRepository: try: from sqlalchemy.dialects.postgresql import insert as pg_insert - factory = get_session_factory() - async with factory() as session: + async with get_db_context( + getattr(playbook, "project_id", "awoooi") + ) as session: stmt = pg_insert(PlaybookRecord).values( playbook_id=playbook.playbook_id, + project_id=getattr(playbook, "project_id", "awoooi"), # AwoooP Phase 2.3 name=playbook.name, description=playbook.description, status=playbook.status.value, @@ -600,8 +599,7 @@ class PlaybookRepository: async def _pg_get(self, playbook_id: str) -> Playbook | None: """從 PostgreSQL 載入 Playbook""" try: - factory = get_session_factory() - async with factory() as session: + async with get_db_context() as session: result = await session.get(PlaybookRecord, playbook_id) if result is None: return None diff --git a/apps/api/src/services/anomaly_counter.py b/apps/api/src/services/anomaly_counter.py index a216856a..e0dcc723 100644 --- a/apps/api/src/services/anomaly_counter.py +++ b/apps/api/src/services/anomaly_counter.py @@ -115,8 +115,20 @@ class AnomalyCounter: # TTL 設定 (35 天,比清理週期長一點) TTL_SECONDS = 35 * 24 * 3600 - def __init__(self, redis_client: redis.Redis) -> None: + def 
__init__(self, redis_client: redis.Redis, project_id: str = "awoooi") -> None: self.redis = redis_client + self.project_id = project_id + + def _pkey(self, prefix: str, key: str) -> str: + """新格式 key: {project_id}:{prefix}{key}(Phase A 多租戶)""" + return f"{self.project_id}:{prefix}{key}" + + async def _redis_get_with_fallback(self, prefix: str, key: str) -> bytes | None: + """Phase A: 讀新 key,fallback 到舊 key。""" + val = await self.redis.get(self._pkey(prefix, key)) + if val is None: + val = await self.redis.get(f"{prefix}{key}") + return val @staticmethod def derive_key_from_incident(incident) -> str | None: @@ -217,7 +229,7 @@ class AnomalyCounter: ) -> AnomalyFrequency: """實際的異常記錄邏輯(可能拋出 Redis 異常)""" timestamp = now.timestamp() - timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}" + timeline_key = self._pkey(self.PREFIX_TIMELINE, anomaly_key) # 1. 添加到 Sorted Set (score = timestamp, member = timestamp string) await self.redis.zadd(timeline_key, {str(timestamp): timestamp}) @@ -270,27 +282,22 @@ class AnomalyCounter: else now ) - # 6. 讀取修復統計 - repair_count_str = await self.redis.get( - f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}" - ) + # 6. 讀取修復統計(Phase A: 讀新 key,fallback 到舊 key) + repair_count_str = await self._redis_get_with_fallback(self.PREFIX_REPAIR_COUNT, anomaly_key) auto_repair_count = int(repair_count_str) if repair_count_str else 0 - permanent_fix_str = await self.redis.get( - f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}" - ) - permanent_fix = permanent_fix_str == "1" + permanent_fix_str = await self._redis_get_with_fallback(self.PREFIX_PERMANENT_FIX, anomaly_key) + permanent_fix = permanent_fix_str == b"1" or permanent_fix_str == "1" # 7. 
儲存 metadata (首次記錄時) - metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}" - if not await self.redis.exists(metadata_key): - await self.redis.hset( - metadata_key, - mapping={ - "signature": json.dumps(anomaly_signature), - "first_seen": now.isoformat(), - }, - ) + metadata_key = self._pkey(self.PREFIX_METADATA, anomaly_key) + legacy_metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}" + if not await self.redis.exists(metadata_key) and not await self.redis.exists(legacy_metadata_key): + metadata_payload = { + "signature": json.dumps(anomaly_signature), + "first_seen": now.isoformat(), + } + await self.redis.hset(metadata_key, mapping=metadata_payload) await self.redis.expire(metadata_key, self.TTL_SECONDS) # 8. 判斷升級等級 @@ -353,14 +360,14 @@ class AnomalyCounter: success: 是否成功 """ try: - repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}" + repair_key = self._pkey(self.PREFIX_REPAIR_COUNT, anomaly_key) # 遞增修復嘗試次數 await self.redis.incr(repair_key) await self.redis.expire(repair_key, self.TTL_SECONDS) # 記錄修復歷史 (用於學習) - history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}" + history_key = self._pkey(self.PREFIX_REPAIR_HISTORY, anomaly_key) await self.redis.lpush( history_key, json.dumps( @@ -411,7 +418,7 @@ class AnomalyCounter: return try: - key = f"{self.PREFIX_DISPOSITION}{anomaly_key}" + key = self._pkey(self.PREFIX_DISPOSITION, anomaly_key) await self.redis.hincrby(key, disposition_type, 1) await self.redis.hincrby(key, "total", 1) await self.redis.expire(key, self.TTL_SECONDS) @@ -434,8 +441,11 @@ class AnomalyCounter: "cold_start_trust": N, "total": N} """ try: - key = f"{self.PREFIX_DISPOSITION}{anomaly_key}" + key = self._pkey(self.PREFIX_DISPOSITION, anomaly_key) raw = await self.redis.hgetall(key) + if not raw: + # Phase A: fallback 到舊 key + raw = await self.redis.hgetall(f"{self.PREFIX_DISPOSITION}{anomaly_key}") return { "auto_repair": int(raw.get(b"auto_repair", raw.get("auto_repair", 0))), "human_approved": int(raw.get(b"human_approved", 
raw.get("human_approved", 0))), @@ -471,11 +481,25 @@ class AnomalyCounter: try: # S2 Fix: 使用 Pipeline 批次查詢,消除 N+1 問題 - pattern = f"{self.PREFIX_DISPOSITION}*" + # Phase A: 先掃新前綴,若無資料 fallback 到舊前綴 + new_pattern = f"{self.project_id}:{self.PREFIX_DISPOSITION}*" + new_strip = f"{self.project_id}:{self.PREFIX_DISPOSITION}" + legacy_pattern = f"{self.PREFIX_DISPOSITION}*" + legacy_strip = self.PREFIX_DISPOSITION + keys: list = [] - async for key in self.redis.scan_iter(match=pattern, count=100): + async for key in self.redis.scan_iter(match=new_pattern, count=100): keys.append(key) + if keys: + strip_prefix = new_strip + meta_prefix = f"{self.project_id}:{self.PREFIX_METADATA}" + else: + async for key in self.redis.scan_iter(match=legacy_pattern, count=100): + keys.append(key) + strip_prefix = legacy_strip + meta_prefix = self.PREFIX_METADATA + if not keys: return total_summary, by_anomaly @@ -489,11 +513,11 @@ class AnomalyCounter: anomaly_keys_str = [] for key in keys: key_str = key.decode() if isinstance(key, bytes) else key - anomaly_keys_str.append(key_str.replace(self.PREFIX_DISPOSITION, "")) + anomaly_keys_str.append(key_str.replace(strip_prefix, "")) meta_pipe = self.redis.pipeline(transaction=False) for ak in anomaly_keys_str: - meta_pipe.hget(f"{self.PREFIX_METADATA}{ak}", "signature") + meta_pipe.hget(f"{meta_prefix}{ak}", "signature") meta_results = await meta_pipe.execute() for i, raw in enumerate(results): @@ -547,13 +571,13 @@ class AnomalyCounter: """ try: await self.redis.set( - f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}", + self._pkey(self.PREFIX_PERMANENT_FIX, anomaly_key), "1", ex=90 * 24 * 3600, # 90 天 ) # 記錄修復詳情 - metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}" + metadata_key = self._pkey(self.PREFIX_METADATA, anomaly_key) await self.redis.hset( metadata_key, mapping={ @@ -588,8 +612,11 @@ class AnomalyCounter: } """ try: - history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}" + history_key = self._pkey(self.PREFIX_REPAIR_HISTORY, 
anomaly_key) history = await self.redis.lrange(history_key, 0, -1) + if not history: + # Phase A: fallback 到舊 key + history = await self.redis.lrange(f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}", 0, -1) total = 0 success_count = 0 @@ -627,8 +654,11 @@ class AnomalyCounter: } """ try: - history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}" + history_key = self._pkey(self.PREFIX_REPAIR_HISTORY, anomaly_key) history = await self.redis.lrange(history_key, 0, -1) + if not history: + # Phase A: fallback 到舊 key + history = await self.redis.lrange(f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}", 0, -1) stats: dict[str, dict] = {} @@ -666,11 +696,14 @@ class AnomalyCounter: AnomalyFrequency 或 None (若無記錄 或 Redis 重連失敗) """ try: - timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}" + timeline_key = self._pkey(self.PREFIX_TIMELINE, anomaly_key) + legacy_timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}" - # 檢查是否有記錄 + # Phase A: 若新 key 無資料,改用舊 key if not await self.redis.exists(timeline_key): - return None + if not await self.redis.exists(legacy_timeline_key): + return None + timeline_key = legacy_timeline_key now = datetime.now() cutoff_30d = (now - timedelta(days=30)).timestamp() @@ -716,16 +749,12 @@ class AnomalyCounter: else now ) - # 讀取修復統計 - repair_count_str = await self.redis.get( - f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}" - ) + # 讀取修復統計(Phase A: 讀新 key,fallback 到舊 key) + repair_count_str = await self._redis_get_with_fallback(self.PREFIX_REPAIR_COUNT, anomaly_key) auto_repair_count = int(repair_count_str) if repair_count_str else 0 - permanent_fix_str = await self.redis.get( - f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}" - ) - permanent_fix = permanent_fix_str == "1" + permanent_fix_str = await self._redis_get_with_fallback(self.PREFIX_PERMANENT_FIX, anomaly_key) + permanent_fix = permanent_fix_str in (b"1", "1") escalation_level = self._get_escalation_level(count_24h) @@ -797,7 +826,7 @@ def get_anomaly_counter() -> AnomalyCounter: if _anomaly_counter is None: 
# ─────────────────────────────────────────────────────────────────────────────
# Redaction patterns (ADR-116 P1-08)
# ─────────────────────────────────────────────────────────────────────────────

# Each entry: (compiled regex, tag used in the "[REDACTED:<tag>]" replacement).
# Patterns are applied in list order.
_REDACTION_PATTERNS: list[tuple[re.Pattern[str], str]] = [
    # Telegram bot token: 8-12 digits, a colon, then 32-64 token characters.
    (re.compile(r"\d{8,12}:[A-Za-z0-9_-]{32,64}"), "TELEGRAM_TOKEN"),

    # PostgreSQL connection string with embedded credentials.
    (re.compile(r"postgresql(?:\+asyncpg)?://[^:]+:[^@]+@[^/\s]+"), "PG_DSN"),

    # Generic "password = value" / "pwd: value" in URLs or config text.
    (re.compile(r"(?i)(?:password|passwd|pwd)\s*[:=]\s*\S+"), "PASSWORD"),

    # Bearer / Authorization header value.
    (re.compile(r"(?i)(?:bearer|token)\s+[A-Za-z0-9\-._~+/]+=*"), "BEARER_TOKEN"),

    # "api_key = value" style API keys (value of 20+ characters).
    (re.compile(r"(?i)(?:api[_-]?key|apikey)\s*[:=]\s*[A-Za-z0-9\-._]{20,}"), "API_KEY"),

    # Private-range IPv4 addresses (ADR-116 keeps internal IPs out of logs).
    (re.compile(r"\b10\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"), "INTERNAL_IP"),
    (re.compile(r"\b172\.(?:1[6-9]|2\d|3[0-1])\.\d{1,3}\.\d{1,3}\b"), "INTERNAL_IP"),
    (re.compile(r"\b192\.168\.\d{1,3}\.\d{1,3}\b"), "INTERNAL_IP"),

    # PEM private key block. Any (or no) algorithm label is accepted so that
    # PKCS#8 "-----BEGIN PRIVATE KEY-----" blocks are caught as well as the
    # RSA / EC / OPENSSH variants.
    (
        re.compile(
            r"-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-----[\s\S]*?-----END (?:[A-Z0-9]+ )*PRIVATE KEY-----"
        ),
        "SSH_PRIVATE_KEY",
    ),

    # JWT: three base64url segments separated by dots.
    (re.compile(r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+"), "JWT_TOKEN"),

    # Exactly 64 lowercase hex characters (e.g. an HMAC key or session token).
    (re.compile(r"\b[0-9a-f]{64}\b"), "HEX_SECRET_64"),
]

# Field-name blocklist: the value of any key listed here is replaced outright,
# without pattern scanning.
_BLOCKED_FIELD_NAMES = frozenset({
    "password", "passwd", "pwd", "secret", "token", "api_key", "apikey",
    "private_key", "private_key_pem", "bot_token", "telegram_token",
    "hmac_key", "jwt", "authorization", "cookie", "session",
})

# Field names holding raw LLM input/output: only a hash of the value is kept.
_LLM_RAW_FIELDS = frozenset({
    "raw_input", "raw_output", "llm_input", "llm_output",
    "prompt", "completion", "system_prompt",
})

# Strong references to in-flight audit write tasks. The event loop itself only
# keeps weak references, so an unreferenced task may be garbage-collected
# before it finishes.
_PENDING_AUDIT_TASKS: set[Any] = set()


# ─────────────────────────────────────────────────────────────────────────────
# Sanitization pipeline
# ─────────────────────────────────────────────────────────────────────────────

def _redact_string(value: str) -> str:
    """Apply every redaction pattern to ``value``, in list order."""
    for pattern, tag in _REDACTION_PATTERNS:
        value = pattern.sub(f"[REDACTED:{tag}]", value)
    return value


def sanitize(details: dict[str, Any]) -> dict[str, Any]:
    """
    Recursively redact an audit ``details`` dict.

    Rule precedence per entry:
      1. key in ``_BLOCKED_FIELD_NAMES`` -> value becomes ``[REDACTED:BLOCKED_FIELD]``
      2. key in ``_LLM_RAW_FIELDS``      -> value becomes a truncated SHA-256 marker
      3. string value                    -> pattern redaction
      4. nested dict / list              -> processed recursively

    Returns:
        A new structure; the input is not modified.
    """
    return _sanitize_value(details, depth=0)


def _sanitize_value(value: Any, depth: int = 0) -> Any:
    """Redact one value; anything nested deeper than 10 levels is replaced by a marker."""
    if depth > 10:
        return "[REDACTED:MAX_DEPTH]"

    if isinstance(value, dict):
        return {k: _sanitize_dict_entry(k, v, depth) for k, v in value.items()}
    if isinstance(value, list):
        return [_sanitize_value(item, depth + 1) for item in value]
    if isinstance(value, str):
        return _redact_string(value)
    return value


def _sanitize_dict_entry(key: str, value: Any, depth: int) -> Any:
    """Apply the key-based rules to one dict entry, falling back to value redaction."""
    # Non-string keys cannot be on either name list; do not crash on them.
    key_lower = key.lower() if isinstance(key, str) else key

    if key_lower in _BLOCKED_FIELD_NAMES:
        return "[REDACTED:BLOCKED_FIELD]"

    if key_lower in _LLM_RAW_FIELDS:
        # Keep only the first 16 hex characters of the SHA-256, never the raw
        # content. default=str keeps non-JSON values from raising.
        raw_str = (
            value if isinstance(value, str)
            else json.dumps(value, ensure_ascii=False, default=str)
        )
        return f"[LLM_RAW_HASH:{hashlib.sha256(raw_str.encode()).hexdigest()[:16]}]"

    return _sanitize_value(value, depth + 1)


# ─────────────────────────────────────────────────────────────────────────────
# Audit write
# ─────────────────────────────────────────────────────────────────────────────

async def write_audit(
    *,
    project_id: str,
    action: str,
    resource_type: str,
    resource_id: str,
    details: dict[str, Any] | None = None,
    run_id: str | None = None,
    trace_id: str | None = None,
) -> None:
    """
    Single entry point for audit log writes.

    Schedules a background task that:
      1. sanitizes ``details`` (PII / secret redaction),
      2. attaches ``run_id`` / ``trace_id`` when given,
      3. inserts one row into ``audit_logs``.

    Returns immediately after scheduling; failures of the background write
    are logged by the task and never raised to the caller.
    """
    import asyncio

    task = asyncio.create_task(
        _write_audit_impl(
            project_id=project_id,
            action=action,
            resource_type=resource_type,
            resource_id=resource_id,
            details=details,
            run_id=run_id,
            trace_id=trace_id,
        ),
        name="audit_sink_write",
    )
    # Hold a reference until the task completes so it cannot be collected
    # mid-flight; the callback drops it again afterwards.
    _PENDING_AUDIT_TASKS.add(task)
    task.add_done_callback(_PENDING_AUDIT_TASKS.discard)


async def _write_audit_impl(
    *,
    project_id: str,
    action: str,
    resource_type: str,
    resource_id: str,
    details: dict[str, Any] | None,
    run_id: str | None,
    trace_id: str | None,
) -> None:
    """
    Sanitize and insert one audit row. Best-effort: any exception is logged
    as ``audit_sink_write_failed`` and swallowed.
    """
    try:
        from sqlalchemy import text as sa_text
        from src.db.base import get_db_context

        clean_details: dict[str, Any] = sanitize(details or {})
        if run_id:
            clean_details["_run_id"] = run_id
        if trace_id:
            clean_details["_trace_id"] = trace_id

        async with get_db_context(project_id) as db:
            await db.execute(
                # CAST(... AS jsonb) instead of ":details::jsonb": a bind
                # parameter written directly against the "::" cast operator
                # is not parsed reliably by text(); CAST is equivalent.
                sa_text("""
                    INSERT INTO audit_logs
                        (project_id, action, resource_type, resource_id, details)
                    VALUES
                        (:project_id, :action, :resource_type, :resource_id, CAST(:details AS jsonb))
                """),
                {
                    "project_id": project_id,
                    "action": action,
                    "resource_type": resource_type,
                    "resource_id": resource_id,
                    # default=str: a UUID/datetime in details must not cost
                    # us the whole audit row.
                    "details": json.dumps(clean_details, default=str),
                },
            )
    except Exception as exc:
        logger.warning(
            "audit_sink_write_failed",
            action=action,
            resource_id=resource_id,
            error=str(exc),
        )


# ─────────────────────────────────────────────────────────────────────────────
# Convenience: lets tests check the sanitization result
# ─────────────────────────────────────────────────────────────────────────────

def sanitize_for_test(details: dict[str, Any]) -> dict[str, Any]:
    """Synchronous alias of ``sanitize`` for use in tests."""
    return sanitize(details)
# ─────────────────────────────────────────────────────────────────────────────
# Constants
# ─────────────────────────────────────────────────────────────────────────────

APPROVAL_TOKEN_TTL = 900  # default token lifetime in seconds (15 minutes)
_JTI_KEY_PREFIX = "awooop_appr:jti:"    # Redis key prefix: consumed token ids
_SIG_SET_PREFIX = "awooop_appr:sigs:"   # Redis key prefix: approver sets
_SIG_TTL_SECONDS = 3600                 # approver set lifetime in seconds (1 hour)
_SUGGEST_MODE_ENV = "AWOOOP_SUGGEST_MODE"


# ─────────────────────────────────────────────────────────────────────────────
# Errors
# ─────────────────────────────────────────────────────────────────────────────

class InvalidApprovalTokenError(Exception):
    """Token is malformed, wrongly signed, expired, or does not match the request."""
    error_code = "E-APPR-001"


class TokenReplayError(Exception):
    """The token's jti has already been consumed."""
    error_code = "E-APPR-002"


class QuorumNotMetError(Exception):
    """Fewer approvals recorded than required."""
    error_code = "E-APPR-003"


class DuplicateApproverError(Exception):
    """The approver has already signed this run/tool."""
    error_code = "E-APPR-004"


# ─────────────────────────────────────────────────────────────────────────────
# HS256 token implementation
# ─────────────────────────────────────────────────────────────────────────────

def _b64url_encode(data: bytes) -> str:
    """Base64url-encode ``data`` without trailing ``=`` padding."""
    return base64.urlsafe_b64encode(data).rstrip(b"=").decode()


def _b64url_decode(s: str) -> bytes:
    """Decode unpadded base64url text by restoring the stripped padding first."""
    return base64.urlsafe_b64decode(s + "=" * (-len(s) % 4))


def _get_hmac_key() -> bytes:
    """
    Resolve the HMAC signing key.

    Lookup order: ``settings.APPROVAL_HMAC_KEY``, then the
    ``APPROVAL_HMAC_KEY`` environment variable, then a fixed development
    fallback (with a warning logged).
    """
    try:
        from src.core.config import settings
        key = getattr(settings, "APPROVAL_HMAC_KEY", None) or ""
    except Exception:
        key = ""
    key = key or os.environ.get("APPROVAL_HMAC_KEY", "")
    if not key:
        # NOTE(review): this fallback key is hard-coded in source, so any
        # deployment reaching this branch issues forgeable tokens. Consider
        # failing closed outside development.
        logger.warning("approval_hmac_key_not_set_using_dev_fallback")
        key = "dev-awooop-approval-hmac-fallback"
    return key.encode()


def _sign(signing_input: str) -> str:
    """Return the base64url HMAC-SHA256 signature of ``signing_input``."""
    digest = _hmac_module.new(
        _get_hmac_key(),
        signing_input.encode(),
        hashlib.sha256,
    ).digest()
    return _b64url_encode(digest)


def issue_approval_token(
    *,
    project_id: str,
    run_id: str,
    tool_name: str,
    approver_id: str,
    ttl_seconds: int = APPROVAL_TOKEN_TTL,
) -> str:
    """
    Create an HS256-signed approval token (three base64url segments).

    Payload claims:
        jti       unique token id (``uuid4().hex``), used for replay protection
        iss       ``"awooop-approval"``
        sub       ``"{project_id}:{run_id}:{tool_name}"``
        approver  ``approver_id``
        iat/exp   issue time and expiry (``iat + ttl_seconds``), epoch seconds

    Returns:
        ``"<header>.<payload>.<signature>"``.
    """
    now = int(time.time())

    header = {"alg": "HS256", "typ": "JWT"}
    payload = {
        "jti": uuid.uuid4().hex,
        "iss": "awooop-approval",
        "sub": f"{project_id}:{run_id}:{tool_name}",
        "approver": approver_id,
        "iat": now,
        "exp": now + ttl_seconds,
    }

    h_b64 = _b64url_encode(json.dumps(header, separators=(",", ":")).encode())
    p_b64 = _b64url_encode(json.dumps(payload, separators=(",", ":")).encode())
    signing_input = f"{h_b64}.{p_b64}"
    return f"{signing_input}.{_sign(signing_input)}"


def verify_approval_token(token: str) -> dict[str, Any]:
    """
    Verify an approval token's signature and expiry and return its payload.

    Raises:
        InvalidApprovalTokenError: The token is not three segments, the
            signature does not match, the payload is not a JSON object, it has
            expired (a missing ``exp`` counts as expired), or parsing failed.
    """
    try:
        parts = token.split(".")
        if len(parts) != 3:
            raise InvalidApprovalTokenError("token 非 3 段格式")

        h_b64, p_b64, sig_b64 = parts

        # Constant-time comparison of the presented and expected signatures.
        if not _hmac_module.compare_digest(sig_b64, _sign(f"{h_b64}.{p_b64}")):
            raise InvalidApprovalTokenError("token 簽名無效")

        payload = json.loads(_b64url_decode(p_b64))
        if not isinstance(payload, dict):
            raise InvalidApprovalTokenError("token payload is not a JSON object")

        if int(time.time()) > payload.get("exp", 0):
            raise InvalidApprovalTokenError("token 已過期")

        return payload

    except InvalidApprovalTokenError:
        raise
    except Exception as exc:
        raise InvalidApprovalTokenError(f"token 解析失敗: {exc}") from exc


# ─────────────────────────────────────────────────────────────────────────────
# Multi-sig Redis approval
# ─────────────────────────────────────────────────────────────────────────────

async def _close_redis_quietly(client: Any) -> None:
    """Close the Redis client; a close failure is logged, never raised."""
    try:
        # NOTE(review): `aclose()` exists on redis-py >= 5 asyncio clients;
        # confirm the installed `aioredis` client exposes it.
        await client.aclose()
    except Exception as exc:
        logger.warning("awooop_approval_redis_close_failed", error=str(exc))


async def record_approval(
    *,
    project_id: str,
    run_id: str,
    tool_name: str,
    approver_id: str,
    token: str,
) -> int:
    """
    Record one approval signature.

    Steps:
      1. verify the token (HS256 signature + expiry),
      2. check that its ``sub`` matches this project/run/tool,
      3. check that its ``approver`` claim matches ``approver_id``,
      4. consume the jti in Redis with SET NX (replay protection),
      5. SADD the approver to the signature set (duplicate protection),
      6. return the current number of distinct approvers.

    Returns:
        Number of approvers recorded for this project/run/tool.

    Raises:
        InvalidApprovalTokenError: Token invalid, claim mismatch, or a Redis
            failure (wrapped, as before).
        TokenReplayError: The jti was already consumed.
        DuplicateApproverError: This approver already signed.
    """
    payload = verify_approval_token(token)

    expected_sub = f"{project_id}:{run_id}:{tool_name}"
    actual_sub = payload.get("sub")
    if actual_sub != expected_sub:
        raise InvalidApprovalTokenError(
            f"token sub 不符(期望 '{expected_sub}',實際 '{actual_sub}')"
        )

    # The signature is only meaningful for the approver the token was issued
    # to. Without this check, one token holder could record approvals under
    # other approver ids and satisfy a multi-sig quorum alone.
    if payload.get("approver") != approver_id:
        raise InvalidApprovalTokenError(
            f"token approver mismatch (token issued to {payload.get('approver')!r}, "
            f"request claims {approver_id!r})"
        )

    jti = payload.get("jti")
    if not jti:
        raise InvalidApprovalTokenError("token has no jti claim")
    exp = payload["exp"]  # present: verify_approval_token rejects tokens without exp

    redis = None
    try:
        import aioredis
        from src.core.config import settings

        redis = aioredis.from_url(settings.REDIS_URL)

        # Consume the jti; the key lives until the token would expire anyway.
        jti_key = f"{_JTI_KEY_PREFIX}{jti}"
        ttl_remaining = max(exp - int(time.time()), 1)
        ok = await redis.set(jti_key, "1", nx=True, ex=ttl_remaining)
        if not ok:
            raise TokenReplayError(f"jti={jti!r} 已使用")

        # Register the approver; SADD returns 0 when already a member.
        sig_key = f"{_SIG_SET_PREFIX}{project_id}:{run_id}:{tool_name}"
        added = await redis.sadd(sig_key, approver_id)
        if added == 0:
            raise DuplicateApproverError(f"approver '{approver_id}' 已簽核")

        await redis.expire(sig_key, _SIG_TTL_SECONDS)
        count = int(await redis.scard(sig_key))

        logger.info(
            "awooop_approval_recorded",
            project_id=project_id,
            run_id=run_id,
            tool_name=tool_name,
            approver_id=approver_id,
            count=count,
        )
        return count

    except (InvalidApprovalTokenError, TokenReplayError, DuplicateApproverError):
        raise
    except Exception as exc:
        logger.exception("awooop_approval_redis_error", error=str(exc))
        raise InvalidApprovalTokenError(f"Redis 錯誤: {exc}") from exc
    finally:
        # Always release the connection, including on unexpected errors.
        if redis is not None:
            await _close_redis_quietly(redis)
QuorumNotMetError(f"簽核數不足({count}/{required_count})") + return True + + except QuorumNotMetError: + raise + except Exception as exc: + raise QuorumNotMetError(f"Redis 查詢失敗: {exc}") from exc + + +# ───────────────────────────────────────────────────────────────────────────── +# Suggest Mode +# ───────────────────────────────────────────────────────────────────────────── + +@dataclass +class SuggestedAction: + """Suggest mode dry-run 結果(不真正執行)""" + action_type: str # 'rollback' | 'scale' | 'restart' + target: str + suggested_command: str + rollback_evidence: dict[str, Any] = field(default_factory=dict) + dry_run: bool = True + approval_required: bool = True + + +def is_suggest_mode_enabled() -> bool: + return os.environ.get(_SUGGEST_MODE_ENV, "").lower() in ("true", "1", "yes") + + +async def build_suggest_action( + action_type: str, + *, + target: str, + run_id: str, + project_id: str, +) -> SuggestedAction: + """ + Suggest mode:返回 dry-run 建議,不執行真實操作。 + 支援 rollback / scale / restart 三個 SRE flow。 + """ + if action_type not in ("rollback", "scale", "restart"): + raise ValueError(f"不支援的 action_type: {action_type!r}") + + if action_type == "rollback": + command = f"kubectl rollout undo deployment/{target}" + evidence: dict[str, Any] = { + "note": f"需確認 deployment/{target} 當前 image 與 rollout history", + "suggested_verification": f"kubectl rollout history deployment/{target}", + } + elif action_type == "scale": + command = f"kubectl scale deployment/{target} --replicas=" + evidence = { + "note": f"需確認 deployment/{target} 當前 replicas 數量", + "suggested_verification": f"kubectl get deployment/{target} -o json | jq .spec.replicas", + } + else: # restart + command = f"kubectl rollout restart deployment/{target}" + evidence = { + "note": f"需確認 deployment/{target} 當前 pod 狀態", + "suggested_verification": f"kubectl get pods -l app={target}", + } + + logger.info( + "suggest_action_built", + project_id=project_id, + run_id=run_id, + action_type=action_type, + target=target, + ) + + 
# ─────────────────────────────────────────────────────────────────────────────
# Alert thresholds (ADR-120 D4), as fractions of the budget limit
# ─────────────────────────────────────────────────────────────────────────────
BUDGET_ALERT_THRESHOLDS = {
    "warn": Decimal("0.80"),
    "critical": Decimal("0.95"),
    "hard_kill": Decimal("1.00"),
}

# Redis keys
_EMERGENCY_KILL_KEY = "platform:budget:emergency_kill"
_TENANT_BUDGET_KEY_PREFIX = "budget:tenant:"  # followed by the project_id
_PLATFORM_BUDGET_KEY = "budget:platform:daily_used_usd"
_BUDGET_CACHE_TTL = 300  # seconds


class BudgetExhaustedError(Exception):
    """Raised when a hard-kill layer blocks an LLM call."""

    def __init__(self, error_code: str, message: str) -> None:
        self.error_code = error_code
        super().__init__(f"[{error_code}] {message}")


# ─────────────────────────────────────────────────────────────────────────────
# Cost estimation (per-model list prices)
# ─────────────────────────────────────────────────────────────────────────────

# USD per one million tokens: (prompt_rate, completion_rate)
_COST_PER_MILLION_TOKENS: dict[str, tuple[float, float]] = {
    "claude-opus-4-7": (15.0, 75.0),
    "claude-sonnet-4-6": (3.0, 15.0),
    "claude-haiku-4-5": (0.8, 4.0),
    "gpt-4o": (5.0, 15.0),
    "gpt-4o-mini": (0.15, 0.6),
    "gemini-2.0-flash": (0.075, 0.3),
    "deepseek-r1:14b": (0.0, 0.0),  # local Ollama model, no charge
    "qwen3:8b": (0.0, 0.0),  # local Ollama model, no charge
}
_DEFAULT_COST_PER_M = (3.0, 15.0)  # unknown models are priced like claude-sonnet

_TOKENS_PER_MILLION = Decimal(1_000_000)
_COST_QUANTUM = Decimal("0.000001")  # costs are reported to six decimal places


def estimate_cost(
    prompt_tokens: int,
    completion_tokens: int,
    model: str,
) -> Decimal:
    """
    Estimate the USD cost of one LLM call.

    The arithmetic is done entirely in ``Decimal`` so no binary-float error
    reaches the budget counters; the result always has six decimal places.
    Models missing from the price table use ``_DEFAULT_COST_PER_M``.
    """
    prompt_rate, completion_rate = _COST_PER_MILLION_TOKENS.get(
        model, _DEFAULT_COST_PER_M
    )
    # str() first: Decimal(0.15) would capture the float's binary expansion.
    total = (
        Decimal(prompt_tokens) * Decimal(str(prompt_rate))
        + Decimal(completion_tokens) * Decimal(str(completion_rate))
    ) / _TOKENS_PER_MILLION
    return total.quantize(_COST_QUANTUM)


def _is_free_model(model: str) -> bool:
    """True for ``ollama/`` models and models priced at zero in the table."""
    if model.startswith("ollama/"):
        return True
    rates = _COST_PER_MILLION_TOKENS.get(model)
    return rates is not None and not any(rates)


# ─────────────────────────────────────────────────────────────────────────────
# Pre-call budget check (ADR-120 D2, line of defence 1)
# ─────────────────────────────────────────────────────────────────────────────

async def check_budget_before_llm_call(
    project_id: str,
    model: str,
    estimated_prompt_tokens: int = 4000,
    *,
    agent_id: str | None = None,
) -> None:
    """
    Run the budget checks that must pass before an LLM call is made.

    Order: emergency kill switch, then (for billable models only) the tenant
    budget and the platform budget. The individual checks raise
    ``BudgetExhaustedError`` when a limit would be exceeded.

    Args:
        project_id: tenant ID.
        model: model name, used for cost estimation.
        estimated_prompt_tokens: prompt size to price; completion tokens are
            not included in the pre-call estimate.
        agent_id: accepted for interface compatibility; not used here.
    """
    # Layer 3: emergency kill switch, checked first.
    await check_emergency_kill()

    # Free local models cannot consume budget, so skip the spend checks.
    if _is_free_model(model):
        return

    estimated_cost = estimate_cost(estimated_prompt_tokens, 0, model)

    # Layer 1: tenant budget.
    await _check_tenant_budget(project_id, estimated_cost)

    # Layer 2: platform budget.
    await _check_platform_budget(estimated_cost)
def _decode_used_usd(raw) -> Decimal:
    """Convert a Redis counter value (bytes, str or None) to Decimal; empty means 0."""
    if raw is None:
        return Decimal("0")
    if isinstance(raw, bytes):
        raw = raw.decode()
    return Decimal(raw or "0")


async def _check_tenant_budget(project_id: str, estimated_cost: Decimal) -> None:
    """
    Tenant budget check: Redis usage counter against awooop_projects.budget_limit_usd.

    Raises ``BudgetExhaustedError`` (E-BUDGET-002) when usage plus the
    estimate exceeds the limit, and logs a warning at the warn / critical
    thresholds. Any other failure is logged and the call is allowed
    (fail-open).
    """
    try:
        from src.core.redis_client import get_redis
        redis = get_redis()

        cache_key = f"{_TENANT_BUDGET_KEY_PREFIX}{project_id}"
        used_usd = _decode_used_usd(await redis.get(cache_key))

        limit_usd = await _get_tenant_budget_limit(project_id)
        if limit_usd is None:
            return  # no limit configured, allow

        projected_usd = used_usd + estimated_cost
        if projected_usd > limit_usd:
            raise BudgetExhaustedError(
                "E-BUDGET-002",
                f"Tenant {project_id} budget exhausted: "
                f"used ${used_usd:.4f} / ${limit_usd:.4f}",
            )

        if limit_usd <= 0:
            # A zero limit with zero spend passes the check above; there is no
            # meaningful usage ratio and dividing would raise.
            return

        usage_pct = projected_usd / limit_usd
        if usage_pct >= BUDGET_ALERT_THRESHOLDS["critical"]:
            event = "budget_tenant_critical"
        elif usage_pct >= BUDGET_ALERT_THRESHOLDS["warn"]:
            event = "budget_tenant_warn"
        else:
            event = None

        if event is not None:
            logger.warning(
                event,
                project_id=project_id,
                usage_pct=float(usage_pct),
                used_usd=float(used_usd),
                limit_usd=float(limit_usd),
            )

    except BudgetExhaustedError:
        raise
    except Exception as exc:
        logger.warning("budget_tenant_check_failed", project_id=project_id, error=str(exc))


async def _check_platform_budget(estimated_cost: Decimal) -> None:
    """
    Platform budget check: Redis usage counter against settings.PLATFORM_DAILY_BUDGET_USD.

    Does nothing when the setting is absent or falsy. Raises
    ``BudgetExhaustedError`` (E-BUDGET-003) when the limit would be exceeded;
    any other failure is logged and the call is allowed (fail-open).
    """
    platform_limit = getattr(settings, "PLATFORM_DAILY_BUDGET_USD", None)
    if not platform_limit:
        return  # not configured, allow

    try:
        from src.core.redis_client import get_redis
        redis = get_redis()

        used_usd = _decode_used_usd(await redis.get(_PLATFORM_BUDGET_KEY))
        limit_usd = Decimal(str(platform_limit))

        if used_usd + estimated_cost > limit_usd:
            raise BudgetExhaustedError(
                "E-BUDGET-003",
                f"Platform budget exhausted: used ${used_usd:.4f} / ${limit_usd:.4f} — "
                "all LLM calls suspended",
            )
    except BudgetExhaustedError:
        raise
    except Exception as exc:
        logger.warning("budget_platform_check_failed", error=str(exc))
async def _write_budget_ledger(
    *,
    project_id: str,
    agent_id: str | None,
    run_id,  # UUID | None
    model: str,
    provider: str,
    prompt_tokens: int,
    completion_tokens: int,
    cost_usd: Decimal,
) -> None:
    """
    Insert one accounting row into budget_ledger.

    Failures are logged and swallowed so accounting never breaks the caller.
    """
    try:
        from sqlalchemy import text
        from src.db.base import get_db_context

        # NOTE(review): no explicit commit here; presumably get_db_context
        # commits on exit. Confirm against src.db.base.
        async with get_db_context(project_id) as db:
            await db.execute(
                text("""
                    INSERT INTO budget_ledger
                        (project_id, agent_id, run_id, model, provider,
                         prompt_tokens, completion_tokens, cost_usd)
                    VALUES
                        (:project_id, :agent_id, :run_id, :model, :provider,
                         :prompt_tokens, :completion_tokens, :cost_usd)
                """),
                {
                    "project_id": project_id,
                    "agent_id": agent_id,
                    "run_id": run_id,
                    "model": model,
                    "provider": provider,
                    "prompt_tokens": prompt_tokens,
                    "completion_tokens": completion_tokens,
                    "cost_usd": cost_usd,
                },
            )
    except Exception as exc:
        logger.warning("budget_ledger_write_failed", project_id=project_id, error=str(exc))


_DAILY_WINDOW_SECONDS = 86400  # lifetime of one daily usage counter


async def _bump_daily_counter(redis, key: str, amount: float) -> None:
    """Add ``amount`` to a usage counter, starting its 24h window only once."""
    await redis.incrbyfloat(key, amount)
    # TTL is negative when the key has no expiry yet. Setting the expiry only
    # then keeps the window fixed; refreshing it on every call (as before)
    # meant the counter never reset while traffic kept flowing.
    if await redis.ttl(key) < 0:
        await redis.expire(key, _DAILY_WINDOW_SECONDS)


async def _update_budget_cache(project_id: str, cost: Decimal) -> None:
    """
    Add ``cost`` to the tenant and platform daily usage counters in Redis.

    Failures are logged and swallowed.
    """
    try:
        from src.core.redis_client import get_redis
        redis = get_redis()
        cost_f = float(cost)

        await _bump_daily_counter(redis, f"{_TENANT_BUDGET_KEY_PREFIX}{project_id}", cost_f)
        await _bump_daily_counter(redis, _PLATFORM_BUDGET_KEY, cost_f)

    except Exception as exc:
        logger.warning("budget_cache_update_failed", project_id=project_id, error=str(exc))


# ─────────────────────────────────────────────────────────────────────────────
# Helper: read the tenant budget limit from the database
# ─────────────────────────────────────────────────────────────────────────────

async def _get_tenant_budget_limit(project_id: str) -> Decimal | None:
    """
    Read awooop_projects.budget_limit_usd for ``project_id``.

    Returns None when the column is NULL or no row matches (no limit), and
    also when the lookup fails, so a database outage disables the tenant
    limit rather than blocking calls.
    """
    try:
        from sqlalchemy import text
        from src.db.base import get_db_context

        # NOTE(review): called without a project_id, unlike the ledger write.
        # Presumably deliberate for this table; confirm.
        async with get_db_context() as db:
            row = await db.execute(
                text("SELECT budget_limit_usd FROM awooop_projects WHERE project_id = :pid"),
                {"pid": project_id},
            )
            result = row.scalar_one_or_none()
            return Decimal(str(result)) if result is not None else None
    except Exception as exc:
        logger.warning("get_tenant_budget_limit_failed", project_id=project_id, error=str(exc))
        return None


# ─────────────────────────────────────────────────────────────────────────────
# Emergency kill switch management (admin tooling)
# ─────────────────────────────────────────────────────────────────────────────

async def activate_emergency_kill(reason: str = "") -> None:
    """Set the emergency kill key; it expires by itself after seven days."""
    from src.core.redis_client import get_redis
    redis = get_redis()
    await redis.set(_EMERGENCY_KILL_KEY, reason or "activated", ex=86400 * 7)
    logger.warning("budget_emergency_kill_activated", reason=reason)


async def deactivate_emergency_kill() -> None:
    """Delete the emergency kill key."""
    from src.core.redis_client import get_redis
    redis = get_redis()
    await redis.delete(_EMERGENCY_KILL_KEY)
    logger.info("budget_emergency_kill_deactivated")


async def is_emergency_kill_active() -> bool:
    """
    Report whether the emergency kill key exists.

    Returns False when Redis cannot be queried; the failure is logged so an
    unreachable Redis is not mistaken for "switch is off".
    """
    try:
        from src.core.redis_client import get_redis
        redis = get_redis()
        return bool(await redis.exists(_EMERGENCY_KILL_KEY))
    except Exception as exc:
        logger.warning("budget_emergency_kill_status_failed", error=str(exc))
        return False
# Progressive Feedback Policy: seconds to wait before sending an interim message
_INTERIM_WAIT_SECONDS = 30

_PREVIEW_MAX_CHARS = 256


def _hash_and_preview(content: str | None) -> tuple[str | None, str | None]:
    """
    Return (sha256 hex digest of the raw text, redacted preview).

    The preview is the redacted text cut to ``_PREVIEW_MAX_CHARS``; both
    values are None when there is no content.
    """
    if content is None:
        return None, None
    digest = hashlib.sha256(content.encode()).hexdigest()
    return digest, _redact_string(content)[:_PREVIEW_MAX_CHARS]


# ─────────────────────────────────────────────────────────────────────────────
# Inbound event recording
# ─────────────────────────────────────────────────────────────────────────────

async def mirror_inbound_event(
    db: AsyncSession,
    *,
    project_id: str,
    channel_type: str,
    provider_event_id: str,
    platform_subject_id: str | None = None,
    channel_user_id: str | None = None,
    channel_chat_id: str | None = None,
    content_type: str = "text",
    raw_content: str | None = None,
    attachment_sha256: str | None = None,
    provider_ts: datetime | None = None,
    run_id: UUID | None = None,
    is_duplicate: bool = False,
) -> UUID:
    """
    Upsert one inbound channel event into awooop_conversation_event.

    ``raw_content`` is only hashed and previewed; the plaintext is not stored.
    When a row with the same (project_id, channel_type, provider_event_id)
    exists, it is marked as a duplicate and keeps its run_id unless a new one
    is supplied.

    Returns:
        The event_id of the inserted or updated row.
    """
    content_hash, content_preview = _hash_and_preview(raw_content)

    result = await db.execute(
        text("""
            INSERT INTO awooop_conversation_event (
                project_id, channel_type, provider_event_id,
                platform_subject_id, channel_user_id, channel_chat_id,
                run_id, content_type, content_hash, content_preview,
                attachment_sha256, is_duplicate, provider_ts, received_at
            ) VALUES (
                :project_id, :channel_type, :provider_event_id,
                :platform_subject_id, :channel_user_id, :channel_chat_id,
                :run_id, :content_type, :content_hash, :content_preview,
                :attachment_sha256, :is_duplicate, :provider_ts, NOW()
            )
            ON CONFLICT (project_id, channel_type, provider_event_id) DO UPDATE SET
                is_duplicate = TRUE,
                run_id = COALESCE(EXCLUDED.run_id, awooop_conversation_event.run_id)
            RETURNING event_id, is_duplicate
        """),
        {
            "project_id": project_id,
            "channel_type": channel_type,
            "provider_event_id": provider_event_id,
            "platform_subject_id": platform_subject_id,
            "channel_user_id": channel_user_id,
            "channel_chat_id": channel_chat_id,
            "run_id": run_id,
            "content_type": content_type,
            "content_hash": content_hash,
            "content_preview": content_preview,
            "attachment_sha256": attachment_sha256,
            "is_duplicate": is_duplicate,
            "provider_ts": provider_ts,
        },
    )
    row = result.fetchone()
    event_id: UUID = row[0]
    # Log what the database stored: a conflict sets the flag even when the
    # caller passed is_duplicate=False.
    logger.info(
        "channel_event_mirrored",
        project_id=project_id,
        channel_type=channel_type,
        event_id=str(event_id),
        is_duplicate=bool(row[1]),
    )
    return event_id


# ─────────────────────────────────────────────────────────────────────────────
# Outbound message recording
# ─────────────────────────────────────────────────────────────────────────────

async def record_outbound_message(
    db: AsyncSession,
    *,
    project_id: str,
    run_id: UUID,
    channel_type: str,
    channel_chat_id: str,
    message_type: str,  # 'interim' | 'final' | 'error' | 'approval_request'
    content: str | None = None,
    provider_message_id: str | None = None,
    send_status: str = "pending",
    conversation_event_id: UUID | None = None,
    triggered_by_state: str | None = None,
    waiting_since: datetime | None = None,
    is_shadow: bool = True,
) -> UUID:
    """
    Insert one outbound message row into awooop_outbound_message.

    With ``is_shadow=True`` the row is stored with status 'shadow' regardless
    of ``send_status``. Only a hash and a redacted preview of ``content`` are
    stored.

    Returns:
        The message_id of the new row.
    """
    content_hash, content_preview = _hash_and_preview(content)
    actual_status = "shadow" if is_shadow else send_status

    result = await db.execute(
        text("""
            INSERT INTO awooop_outbound_message (
                project_id, run_id, conversation_event_id,
                channel_type, channel_chat_id, message_type,
                content_hash, content_preview, provider_message_id,
                send_status, queued_at,
                triggered_by_state, waiting_since
            ) VALUES (
                :project_id, :run_id, :conversation_event_id,
                :channel_type, :channel_chat_id, :message_type,
                :content_hash, :content_preview, :provider_message_id,
                :send_status, NOW(),
                :triggered_by_state, :waiting_since
            )
            RETURNING message_id
        """),
        {
            "project_id": project_id,
            "run_id": run_id,
            "conversation_event_id": conversation_event_id,
            "channel_type": channel_type,
            "channel_chat_id": channel_chat_id,
            "message_type": message_type,
            "content_hash": content_hash,
            "content_preview": content_preview,
            "provider_message_id": provider_message_id,
            "send_status": actual_status,
            "triggered_by_state": triggered_by_state,
            "waiting_since": waiting_since,
        },
    )
    row = result.fetchone()
    message_id: UUID = row[0]
    logger.info(
        "outbound_message_recorded",
        project_id=project_id,
        run_id=str(run_id),
        message_type=message_type,
        send_status=actual_status,
        message_id=str(message_id),
    )
    return message_id
# ─────────────────────────────────────────────────────────────────────────────
# Progressive Feedback Policy
# ─────────────────────────────────────────────────────────────────────────────

# The event loop keeps only weak references to tasks, so fire-and-forget
# tasks are held here until they finish.
_BACKGROUND_TASKS: set[asyncio.Task] = set()


async def schedule_interim_feedback(
    *,
    project_id: str,
    run_id: UUID,
    channel_type: str,
    channel_chat_id: str,
    conversation_event_id: UUID | None = None,
    is_shadow: bool = True,
    wait_seconds: int = _INTERIM_WAIT_SECONDS,
) -> None:
    """
    Start the background timer for interim feedback and return immediately.

    After ``wait_seconds`` the task checks the run; if it is still in
    ``waiting_tool`` an interim message is recorded (and sent, when neither
    this call nor the run is in shadow mode).
    """
    task = asyncio.create_task(
        _interim_feedback_task(
            project_id=project_id,
            run_id=run_id,
            channel_type=channel_type,
            channel_chat_id=channel_chat_id,
            conversation_event_id=conversation_event_id,
            is_shadow=is_shadow,
            wait_seconds=wait_seconds,
        ),
        name=f"interim_feedback_{str(run_id)[:8]}",
    )
    _BACKGROUND_TASKS.add(task)
    task.add_done_callback(_BACKGROUND_TASKS.discard)


async def _interim_feedback_task(
    *,
    project_id: str,
    run_id: UUID,
    channel_type: str,
    channel_chat_id: str,
    conversation_event_id: UUID | None,
    is_shadow: bool,
    wait_seconds: int,
) -> None:
    """Sleep, then record/send an interim message if the run is still waiting_tool."""
    await asyncio.sleep(wait_seconds)

    try:
        from src.db.base import get_db_context

        async with get_db_context(project_id) as db:
            result = await db.execute(
                select(AwoooPRunState.state, AwoooPRunState.is_shadow).where(
                    AwoooPRunState.run_id == run_id,
                    AwoooPRunState.project_id == project_id,
                )
            )
            row = result.first()

            if row is None:
                logger.warning(
                    "interim_feedback_run_not_found",
                    run_id=str(run_id),
                )
                return

            state, run_is_shadow = row
            if state != "waiting_tool":
                # The run has moved on, so no interim message is needed.
                return

            # Shadow if either the caller or the run itself says so.
            effective_shadow = is_shadow or run_is_shadow
            # NOTE(review): this is the time the timer fired, not the time the
            # run entered waiting_tool, which is not available here.
            waiting_since = datetime.now(timezone.utc)
            interim_content = "AI 正在分析中,請稍候... ⏳"

            await record_outbound_message(
                db,
                project_id=project_id,
                run_id=run_id,
                channel_type=channel_type,
                channel_chat_id=channel_chat_id,
                message_type="interim",
                content=interim_content,
                send_status="pending",
                conversation_event_id=conversation_event_id,
                triggered_by_state="waiting_tool",
                waiting_since=waiting_since,
                is_shadow=effective_shadow,
            )

            delivered = False
            if not effective_shadow:
                delivered = await _send_telegram_interim(
                    channel_chat_id=channel_chat_id,
                    content=interim_content,
                    run_id=run_id,
                )

            logger.info(
                "interim_feedback_sent",
                project_id=project_id,
                run_id=str(run_id),
                is_shadow=effective_shadow,
                delivered=delivered,
            )

    except Exception as exc:
        logger.exception(
            "interim_feedback_task_error",
            run_id=str(run_id),
            error=str(exc),
        )


async def _send_telegram_interim(
    *,
    channel_chat_id: str,
    content: str,
    run_id: UUID,
) -> bool:
    """
    Send the interim message through the Telegram Bot API (non-shadow only).

    Returns:
        True when Telegram answered with a non-error status, False otherwise.
        Failures are logged, never raised.
    """
    bot_token = None
    try:
        import os

        import httpx

        bot_token = os.environ.get("TELEGRAM_BOT_TOKEN")
        if not bot_token:
            logger.warning("interim_telegram_no_token", run_id=str(run_id))
            return False

        async with httpx.AsyncClient(timeout=10) as client:
            response = await client.post(
                f"https://api.telegram.org/bot{bot_token}/sendMessage",
                json={
                    "chat_id": channel_chat_id,
                    "text": content,
                    "parse_mode": "HTML",
                },
            )

        if response.status_code >= 400:
            # Log the status only; the request URL embeds the bot token.
            logger.warning(
                "interim_telegram_send_failed",
                run_id=str(run_id),
                error=f"HTTP {response.status_code}",
            )
            return False
        return True

    except Exception as exc:
        detail = str(exc)
        if bot_token:
            # Exception text can include the request URL, which holds the token.
            detail = detail.replace(bot_token, "***")
        logger.warning(
            "interim_telegram_send_failed",
            run_id=str(run_id),
            error=detail,
        )
        return False


# ─────────────────────────────────────────────────────────────────────────────
# Channel Hub entry point (Telegram inbound)
# ─────────────────────────────────────────────────────────────────────────────

async def handle_telegram_inbound(
    db: AsyncSession,
    *,
    project_id: str,
    agent_id: str,
    message_id: str,
    user_id: str,
    chat_id: str,
    text: str | None = None,
    is_shadow: bool = True,
) -> dict[str, Any]:
    """
    Single entry point for an inbound Telegram message.

    Steps:
        1. create_run (idempotent; reports whether the event is a duplicate)
        2. mirror_inbound_event, linked to the run
        3. schedule_interim_feedback, skipped for duplicates

    Returns:
        ``{"event_id": str, "run_id": str, "is_duplicate": bool}``
    """
    # Step 1: create the run first so the mirrored event can carry its run_id.
    run_id, is_duplicate = await create_run(
        project_id=project_id,
        agent_id=agent_id,
        trigger_type="channel_event",
        trigger_ref=f"telegram:{message_id}",
        input_payload={"chat_id": chat_id, "user_id": user_id},
        channel_type="telegram",
        provider_event_id=message_id,
    )

    # Step 2: mirror the event.
    # NOTE(review): any message without text (None or "") is labelled
    # "callback_query"; confirm this is intended for other non-text updates.
    event_id = await mirror_inbound_event(
        db,
        project_id=project_id,
        channel_type="telegram",
        provider_event_id=message_id,
        channel_user_id=user_id,
        channel_chat_id=chat_id,
        content_type="text" if text else "callback_query",
        raw_content=text,
        run_id=run_id,
        is_duplicate=is_duplicate,
    )

    # Step 3: start the interim-feedback timer for new events only.
    if not is_duplicate:
        await schedule_interim_feedback(
            project_id=project_id,
            run_id=run_id,
            channel_type="telegram",
            channel_chat_id=chat_id,
            conversation_event_id=event_id,
            is_shadow=is_shadow,
        )

    return {
        "event_id": str(event_id),
        "run_id": str(run_id),
        "is_duplicate": is_duplicate,
    }
# ─────────────────────────────────────────────────────────────────────────────
# Errors
# ─────────────────────────────────────────────────────────────────────────────

class ContractError(Exception):
    """
    Base class for contract lifecycle errors.

    ``error_code`` and ``message`` stay available as attributes; ``str()``
    renders them as ``[code] message``.
    """
    def __init__(self, error_code: str, message: str) -> None:
        self.error_code = error_code
        self.message = message
        super().__init__(f"[{error_code}] {message}")


class ContractSchemaError(ContractError):
    """body_json does not satisfy the schema of its contract family."""
    def __init__(self, family: str, details: str) -> None:
        self.family = family
        self.details = details
        super().__init__("E-CONTRACT-001", f"Contract family={family} schema 驗證失敗: {details}")


class ContractSignatureError(ContractError):
    """The HMAC signature supplied to publish did not verify."""
    def __init__(self) -> None:
        super().__init__("E-CONTRACT-002", "Contract publish 簽章驗證失敗")


class ContractStateError(ContractError):
    """The requested lifecycle transition is not allowed."""
    def __init__(self, from_state: str, to_state: str) -> None:
        self.from_state = from_state
        self.to_state = to_state
        super().__init__(
            "E-CONTRACT-003",
            f"非法狀態轉換 {from_state!r} → {to_state!r}",
        )


class ContractApprovalError(ContractError):
    """Activation was requested without enough approvals."""
    def __init__(self, revision_id: str) -> None:
        self.revision_id = revision_id
        super().__init__(
            "E-CONTRACT-004",
            f"revision {revision_id} 尚未取得足夠的 approval 簽核",
        )


class ContractNotFoundError(ContractError):
    """The revision does not exist or is not accessible to this project."""
    def __init__(self, revision_id: str) -> None:
        self.revision_id = revision_id
        super().__init__(
            "E-CONTRACT-005",
            f"Revision {revision_id!r} 不存在或無權限存取",
        )
# ─────────────────────────────────────────────────────────────────────────────
# Body hash (ADR-112 artifact integrity)
# ─────────────────────────────────────────────────────────────────────────────

def _compute_body_hash(body_json: dict[str, Any]) -> str:
    """
    Return the SHA-256 hex digest of ``body_json``.

    The body is serialised as canonical JSON (sorted keys, no whitespace,
    non-ASCII characters kept verbatim and encoded as UTF-8) so that equal
    bodies always hash to the same value.
    """
    canonical = json.dumps(body_json, sort_keys=True, separators=(",", ":"), ensure_ascii=False)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()


def _verify_publish_signature(
    revision_id: str,
    body_hash: str,
    publisher_id: str,
    signature: str,
) -> bool:
    """
    Verify the HMAC-SHA256 publish signature.

    The signed message is ``"{revision_id}:{body_hash}:{publisher_id}"`` and
    the key is the UTF-8 encoding of ``settings.CONTRACT_HMAC_KEY``. The
    expected signature is the lowercase hex digest.

    Returns:
        True when the signature matches, or when no key is configured (the
        check is then skipped with a warning); False otherwise, including
        for a non-string or non-ASCII ``signature``.
    """
    secret = getattr(settings, "CONTRACT_HMAC_KEY", "")
    if not secret:
        # NOTE(review): this is fail-open. A deployment that forgets to set
        # CONTRACT_HMAC_KEY accepts every publish; consider failing closed
        # outside development environments.
        logger.warning(
            "contract_hmac_key_not_set",
            warning="CONTRACT_HMAC_KEY 未設定,publish 簽章驗證跳過(非 production 行為)",
        )
        return True

    if not isinstance(signature, str):
        return False

    message = f"{revision_id}:{body_hash}:{publisher_id}".encode("utf-8")
    expected = hmac.new(
        secret.encode("utf-8"), message, hashlib.sha256
    ).hexdigest()
    # Compare bytes, not str: hmac.compare_digest() raises TypeError when a
    # str argument contains non-ASCII characters, which would turn a bad
    # signature into an unhandled error instead of a clean rejection.
    return hmac.compare_digest(expected.encode("ascii"), signature.encode("utf-8"))


# ─────────────────────────────────────────────────────────────────────────────
# Multi-sig approval (ADR-112 activation approval)
# ─────────────────────────────────────────────────────────────────────────────

_APPROVAL_KEY_PREFIX = "contract:approval:"
_APPROVAL_REQUIRED = 1  # Phase 3: one approver is enough; Phase 5+ raises this to 2


async def _check_activation_approval(revision_id: str, project_id: str) -> bool:
    """
    Check whether Redis holds enough activation approvals for a revision.

    key   = ``contract:approval:{project_id}:{revision_id}``
    value = JSON list of approver IDs

    Returns False when the key is missing, the value is not a JSON list, or
    any error occurs (the error is logged as a warning).
    """
    try:
        from src.core.redis_client import get_redis
        redis = get_redis()
        key = f"{_APPROVAL_KEY_PREFIX}{project_id}:{revision_id}"
        raw = await redis.get(key)
        if not raw:
            return False
        approvers = json.loads(raw.decode() if isinstance(raw, bytes) else raw)
        if not isinstance(approvers, list):
            # A JSON string or object must not count towards the threshold
            # merely because it has a length.
            return False
        # Count distinct approver IDs so a duplicated entry cannot satisfy a
        # multi-approver requirement on its own.
        distinct = {a for a in approvers if isinstance(a, str)}
        return len(distinct) >= _APPROVAL_REQUIRED
    except Exception as exc:
        logger.warning("contract_approval_check_failed", revision_id=revision_id, error=str(exc))
        return False


async def record_activation_approval(
    revision_id: str,
    project_id: str,
    approver_id: str,
) -> int:
    """
    Record one approver's sign-off for a revision.

    The approver list is stored as a JSON array under
    ``contract:approval:{project_id}:{revision_id}`` with a 24 hour TTL that
    is refreshed whenever a new approver is added.

    Returns:
        The number of approvals recorded so far.
    """
    from src.core.redis_client import get_redis
    redis = get_redis()
    key = f"{_APPROVAL_KEY_PREFIX}{project_id}:{revision_id}"
    raw = await redis.get(key)
    stored = raw.decode() if isinstance(raw, bytes) else raw
    # A missing key or an empty value both mean "no approvals yet".
    approvers: list[str] = json.loads(stored or "[]")
    # NOTE(review): this GET/SET pair is not atomic; two approvers recording
    # at the same moment can overwrite each other. Worth revisiting before
    # _APPROVAL_REQUIRED goes above 1.
    if approver_id not in approvers:
        approvers.append(approver_id)
        await redis.set(key, json.dumps(approvers), ex=86400)  # 24h TTL
    logger.info(
        "contract_approval_recorded",
        revision_id=revision_id,
        approver_id=approver_id,
        total_approvals=len(approvers),
    )
    return len(approvers)


# ─────────────────────────────────────────────────────────────────────────────
# Core lifecycle operations
# ─────────────────────────────────────────────────────────────────────────────

async def draft(
    *,
    project_id: str,
    contract_family: str,
    contract_id: str,
    version_major: int,
    version_minor: int,
    body_json: dict[str, Any],
    body_schema_version: str = "v1.0",
) -> AwoooPContractRevision:
    """
    Step 1: create a draft revision.

    - validates ``body_json`` with ``validate_contract_body``
    - computes ``body_hash`` (SHA-256 of the canonical JSON)
    - stores the draft through ``contract_repository.create_draft``
    - writes an audit record

    Raises:
        ContractSchemaError: if validation raises ``ValidationError`` or
            ``ValueError``.
    """
    # Schema validation
    try:
        validate_contract_body(contract_family, body_json)
    except ValidationError as exc:
        raise ContractSchemaError(contract_family, exc.json(indent=0)) from exc
    except ValueError as exc:
        raise ContractSchemaError(contract_family, str(exc)) from exc

    body_hash = _compute_body_hash(body_json)

    revision = await contract_repository.create_draft(
        project_id=project_id,
        contract_family=contract_family,
        contract_id=contract_id,
        version_major=version_major,
        version_minor=version_minor,
        body_json=body_json,
        body_hash=body_hash,
        body_schema_version=body_schema_version,
    )

    await _write_audit(
        project_id=project_id,
        action="contract.drafted",
        resource_type="contract_revision",
        resource_id=str(revision.revision_id),
        details={
            "contract_family": contract_family,
            "contract_id": contract_id,
            "version": f"{version_major}.{version_minor}",
            "body_hash": body_hash,
        },
    )
    return revision


async def publish(
    *,
    revision_id: UUID,
    project_id: str,
    publisher_id: str,
    signature: str,
) -> AwoooPContractRevision:
    """
    Step 2: draft → published.

    - loads the revision and requires ``lifecycle_status == 'draft'``
    - verifies the HMAC signature over revision_id, body_hash and publisher_id
    - marks the revision published
    - writes an audit record

    Raises:
        ContractNotFoundError: the revision lookup returned nothing.
        ContractStateError: the revision is not in the ``draft`` state.
        ContractSignatureError: the signature does not verify.
    """
    revision = await contract_repository.get_revision(revision_id, project_id)
    if revision is None:
        raise ContractNotFoundError(str(revision_id))
    if revision.lifecycle_status != "draft":
        raise ContractStateError(revision.lifecycle_status, "published")

    if not _verify_publish_signature(
        str(revision_id), revision.body_hash, publisher_id, signature
    ):
        raise ContractSignatureError()

    published_at = datetime.now(timezone.utc)
    revision = await contract_repository.mark_published(
        revision_id=revision_id,
        project_id=project_id,
        publisher_id=publisher_id,
        publish_signature=signature,
        published_at=published_at,
    )

    await _write_audit(
        project_id=project_id,
        action="contract.published",
        resource_type="contract_revision",
        resource_id=str(revision_id),
        details={
            "publisher_id": publisher_id,
            "published_at": published_at.isoformat(),
            "body_hash": revision.body_hash,
        },
    )
    return revision


async def activate(
    *,
    revision_id: UUID,
    project_id: str,
    activator_id: str,
    bypass_approval: bool = False,
) -> AwoooPContractRevision:
    """
    Step 3: published → active.

    - loads the revision and requires ``lifecycle_status == 'published'``
    - requires a recorded Redis approval unless ``bypass_approval`` is True
    - looks up the currently active revision (if any) and hands its id to
      ``contract_repository.mark_active`` together with the new revision
    - writes an audit record

    Raises:
        ContractNotFoundError: the revision lookup returned nothing.
        ContractStateError: the revision is not in the ``published`` state.
        ContractApprovalError: approval is required but missing.
    """
    revision = await contract_repository.get_revision(revision_id, project_id)
    if revision is None:
        raise ContractNotFoundError(str(revision_id))
    if revision.lifecycle_status != "published":
        raise ContractStateError(revision.lifecycle_status, "active")

    if not bypass_approval:
        approved = await _check_activation_approval(str(revision_id), project_id)
        if not approved:
            raise ContractApprovalError(str(revision_id))

    # Find the revision that is currently active, if there is one.
    old_revision = await contract_repository.get_active_revision(
        project_id=project_id,
        contract_family=revision.contract_family,
        contract_id=revision.contract_id,
    )
    old_revision_id = old_revision.revision_id if old_revision else None

    revision = await contract_repository.mark_active(
        revision_id=revision_id,
        project_id=project_id,
        contract_family=revision.contract_family,
        contract_id=revision.contract_id,
        old_revision_id=old_revision_id,
    )

    await _write_audit(
        project_id=project_id,
        action="contract.activated",
        resource_type="contract_revision",
        resource_id=str(revision_id),
        details={
            "activator_id": activator_id,
            "old_revision_id": str(old_revision_id) if old_revision_id else None,
            "contract_family": revision.contract_family,
            "contract_id": revision.contract_id,
        },
    )
    return revision


async def get_active(
    *,
    project_id: str,
    contract_family: str,
    contract_id: str,
    verify_hash: bool = True,
) -> AwoooPContractRevision | None:
    """
    Runtime read path: return the active revision, or None if there is none.

    With ``verify_hash=True`` (the default) the body hash is recomputed from
    the stored ``body_json`` and compared with the stored ``body_hash``.

    Raises:
        ContractError: code ``E-CONTRACT-006`` when the hashes differ.
    """
    revision = await contract_repository.get_active_revision(
        project_id=project_id,
        contract_family=contract_family,
        contract_id=contract_id,
    )
    if revision is None:
        return None

    if verify_hash:
        computed = _compute_body_hash(revision.body_json)
        if computed != revision.body_hash:
            logger.error(
                "contract_hash_mismatch",
                revision_id=str(revision.revision_id),
                expected=revision.body_hash,
                computed=computed,
            )
            raise ContractError(
                "E-CONTRACT-006",
                f"revision {revision.revision_id} body_hash 不符(資料可能被竄改)",
            )

    return revision


async def get_active_body(
    *,
    project_id: str,
    contract_family: str,
    contract_id: str,
) -> dict[str, Any] | None:
    """
    Convenience wrapper: return the active revision's ``body_json``
    (hash-verified), or None when there is no active revision.
    """
    revision = await get_active(
        project_id=project_id,
        contract_family=contract_family,
        contract_id=contract_id,
    )
    return revision.body_json if revision else None


# ─────────────────────────────────────────────────────────────────────────────
# Audit log helper
# ─────────────────────────────────────────────────────────────────────────────

async def _write_audit(
    *,
    project_id: str,
    action: str,
    resource_type: str,
    resource_id: str,
    details: dict[str, Any],
) -> None:
    """
    Insert one row into ``audit_logs``.

    Best-effort: any failure is logged as a warning and swallowed so that an
    audit problem never blocks the lifecycle operation itself.
    """
    try:
        from sqlalchemy import text as sa_text
        from src.db.base import get_db_context
        async with get_db_context(project_id) as db:
            await db.execute(
                # CAST(... AS jsonb) instead of ":details::jsonb": text() does
                # not recognise a name immediately followed by "::" as a bind
                # parameter, so the shorthand cast reaches the driver
                # unsubstituted and the INSERT fails.
                sa_text("""
                    INSERT INTO audit_logs
                        (project_id, action, resource_type, resource_id, details)
                    VALUES
                        (:project_id, :action, :resource_type, :resource_id,
                         CAST(:details AS jsonb))
                """),
                {
                    "project_id": project_id,
                    "action": action,
                    "resource_type": resource_type,
                    "resource_id": resource_id,
                    "details": json.dumps(details),
                },
            )
    except Exception as exc:
        logger.warning(
            "contract_audit_write_failed",
            action=action,
            resource_id=resource_id,
            error=str(exc),
        )
# Shadow mode settings
_SHADOW_ENABLED = True  # fixed to True in Phase 4; Phase 6+ is driven by migration_mode
_DESTRUCTIVE_TOOL_KEYWORDS = frozenset({
    "delete", "drop", "remove", "kill", "terminate", "destroy",
    "rollback", "revert", "patch", "apply", "exec", "execute",
    "restart", "scale", "cordon", "drain",
})


# ─────────────────────────────────────────────────────────────────────────────
# UUID v7 (time-ordered)
# ─────────────────────────────────────────────────────────────────────────────

def _uuid7() -> uuid.UUID:
    """
    Generate a time-ordered UUID version 7.

    Bit layout, most significant first:
        48 bits  Unix timestamp in milliseconds
         4 bits  version (0b0111)
        12 bits  rand_a
         2 bits  variant (0b10)
        62 bits  rand_b

    Returns:
        A ``uuid.UUID`` whose ``version`` is 7 and whose ``variant`` is
        ``uuid.RFC_4122``.
    """
    now_ms = int(datetime.now(timezone.utc).timestamp() * 1000) & 0xFFFFFFFFFFFF
    # Borrow randomness from a uuid4, using only bit ranges that uuid4 leaves
    # random (its own version and variant nibbles sit at bits 79..76 and 63..62).
    entropy = uuid.uuid4().int
    rand_a = (entropy >> 116) & 0xFFF
    rand_b = entropy & 0x3FFFFFFFFFFFFFFF
    # The variant belongs in bits 63..62. Placing a marker at bit 72 (inside
    # rand_a) leaves the variant bits zero, and UUID.version then reports None.
    val = (now_ms << 80) | (0x7 << 76) | (rand_a << 64) | (0b10 << 62) | rand_b
    return uuid.UUID(int=val)
# ─────────────────────────────────────────────────────────────────────────────
# W3C traceparent generation
# ─────────────────────────────────────────────────────────────────────────────

def _new_trace_id() -> str:
    """
    Build a W3C traceparent string: ``00-{trace-id}-{parent-id}-01``.

    trace-id is 32 lowercase hex characters (128 bits) and parent-id is 16
    (64 bits).
    """
    trace_id_hex = uuid.uuid4().hex      # 32 hex chars = 128 bits
    span_id_hex = uuid.uuid4().hex[:16]  # 16 hex chars = 64 bits
    return f"00-{trace_id_hex}-{span_id_hex}-01"


# ─────────────────────────────────────────────────────────────────────────────
# Idempotency
# ─────────────────────────────────────────────────────────────────────────────

_IDEMPOTENCY_REDIS_PREFIX = "awooop:run:idem:"
_IDEMPOTENCY_REDIS_TTL = 86400  # 24h


async def check_idempotency(
    project_id: str,
    channel_type: str,
    provider_event_id: str,
) -> uuid.UUID | None:
    """
    Look up the run already created for
    ``(project_id, channel_type, provider_event_id)``.

    Layer 1: Redis key lookup (fast path).
    Layer 2: the ``AwoooPRunIdempotency`` table in PostgreSQL.

    A failure in either layer is logged as a warning and treated as a miss.

    Returns:
        The existing run_id, or None when the event has not been seen.
    """
    idem_key = f"{_IDEMPOTENCY_REDIS_PREFIX}{project_id}:{channel_type}:{provider_event_id}"

    # Layer 1: Redis
    try:
        from src.core.redis_client import get_redis
        redis = get_redis()
        cached = await redis.get(idem_key)
        if cached:
            run_id_str = cached.decode() if isinstance(cached, bytes) else cached
            logger.info(
                "idempotency_hit_redis",
                project_id=project_id,
                provider_event_id=provider_event_id,
                run_id=run_id_str,
            )
            return uuid.UUID(run_id_str)
    except Exception as exc:
        logger.warning("idempotency_redis_check_failed", error=str(exc))

    # Layer 2: PostgreSQL
    try:
        async with get_db_context(project_id) as db:
            result = await db.execute(
                select(AwoooPRunIdempotency.run_id).where(
                    AwoooPRunIdempotency.project_id == project_id,
                    AwoooPRunIdempotency.channel_type == channel_type,
                    AwoooPRunIdempotency.provider_event_id == provider_event_id,
                )
            )
            row = result.fetchone()
            if row:
                return uuid.UUID(str(row[0]))
    except Exception as exc:
        logger.warning("idempotency_pg_check_failed", error=str(exc))

    return None


async def _register_idempotency(
    project_id: str,
    channel_type: str,
    provider_event_id: str,
    run_id: uuid.UUID,
) -> None:
    """
    Record the event → run mapping in Redis and PostgreSQL.

    Both writes are best-effort: failures are logged as warnings and swallowed.
    """
    idem_key = f"{_IDEMPOTENCY_REDIS_PREFIX}{project_id}:{channel_type}:{provider_event_id}"
    run_id_str = str(run_id)

    # Redis SET NX: if another worker already wrote the key this is a no-op.
    try:
        from src.core.redis_client import get_redis
        redis = get_redis()
        await redis.set(idem_key, run_id_str, ex=_IDEMPOTENCY_REDIS_TTL, nx=True)
    except Exception as exc:
        logger.warning("idempotency_redis_write_failed", error=str(exc))

    # PostgreSQL: insert, ignoring a conflict on the idempotency constraint.
    try:
        async with get_db_context(project_id) as db:
            stmt = pg_insert(AwoooPRunIdempotency).values(
                project_id=project_id,
                channel_type=channel_type,
                provider_event_id=provider_event_id,
                run_id=run_id,
            ).on_conflict_do_nothing(constraint="uix_run_idempotency_key")
            await db.execute(stmt)
    except Exception as exc:
        logger.warning("idempotency_pg_write_failed", error=str(exc))


# ─────────────────────────────────────────────────────────────────────────────
# Shadow destructive tool check
# ─────────────────────────────────────────────────────────────────────────────

def is_destructive_tool(tool_name: str, is_destructive_flag: bool = False) -> bool:
    """
    Decide whether a tool call counts as destructive.

    Rules:
        1. ``is_destructive_flag`` set to True → destructive.
        2. Otherwise, destructive when the lower-cased ``tool_name`` contains
           any keyword in ``_DESTRUCTIVE_TOOL_KEYWORDS``. This is a substring
           match, so it errs towards blocking.
    """
    if is_destructive_flag:
        return True
    tool_lower = tool_name.lower()
    return any(kw in tool_lower for kw in _DESTRUCTIVE_TOOL_KEYWORDS)


# ─────────────────────────────────────────────────────────────────────────────
# Run creation
# ─────────────────────────────────────────────────────────────────────────────

async def create_run(
    *,
    project_id: str,
    agent_id: str,
    trigger_type: str,
    trigger_ref: str | None = None,
    input_payload: dict[str, Any] | None = None,
    channel_type: str | None = None,
    provider_event_id: str | None = None,
    timeout_seconds: int = 600,
) -> tuple[uuid.UUID, bool]:
    """
    Create a new run, or return the existing one for a repeated event.

    The idempotency lookup and registration only happen when both
    ``channel_type`` and ``provider_event_id`` are given. New runs are stored
    in state ``pending`` with ``is_shadow=_SHADOW_ENABLED``.

    Returns:
        ``(run_id, is_duplicate)``; ``is_duplicate`` is True on an
        idempotency hit.
    """
    # Idempotency check
    if channel_type and provider_event_id:
        existing_run_id = await check_idempotency(project_id, channel_type, provider_event_id)
        if existing_run_id:
            logger.info(
                "run_creation_idempotent",
                project_id=project_id,
                channel_type=channel_type,
                provider_event_id=provider_event_id,
                existing_run_id=str(existing_run_id),
            )
            return existing_run_id, True

    run_id = _uuid7()
    trace_id = _new_trace_id()
    timeout_at = datetime.now(timezone.utc) + timedelta(seconds=timeout_seconds)

    # SHA-256 of the canonical JSON input; left as None for a missing or
    # empty payload.
    input_sha256 = None
    if input_payload:
        canonical = json.dumps(input_payload, sort_keys=True, separators=(",", ":"))
        input_sha256 = hashlib.sha256(canonical.encode()).hexdigest()

    async with get_db_context(project_id) as db:
        run = AwoooPRunState(
            run_id=run_id,
            project_id=project_id,
            agent_id=agent_id,
            state="pending",
            trace_id=trace_id,
            trigger_type=trigger_type,
            trigger_ref=trigger_ref,
            is_shadow=_SHADOW_ENABLED,
            input_sha256=input_sha256,
            timeout_at=timeout_at,
        )
        db.add(run)

    # Record the idempotency mapping.
    # NOTE(review): two concurrent deliveries of the same event can both miss
    # the lookup above and both insert a run; only one mapping is kept.
    if channel_type and provider_event_id:
        await _register_idempotency(project_id, channel_type, provider_event_id, run_id)

    logger.info(
        "run_created",
        run_id=str(run_id),
        project_id=project_id,
        agent_id=agent_id,
        is_shadow=_SHADOW_ENABLED,
        trace_id=trace_id,
        trigger_type=trigger_type,
    )
    return run_id, False


# ─────────────────────────────────────────────────────────────────────────────
# Shadow execution (Phase 4 main logic)
# ─────────────────────────────────────────────────────────────────────────────

async def shadow_execute(run: AwoooPRunState) -> None:
    """
    Execute one run in shadow mode.

    - loads the active ``agent`` contract body for ``run.agent_id``
    - writes one step-journal row per tool listed in the contract; no tool is
      actually invoked, and tools judged destructive are marked as blocked
    - transitions the run to ``completed``

    On any exception the error is logged and the run is transitioned to
    ``failed`` with error code ``E-RUN-001``.
    """
    run_id = run.run_id
    project_id = run.project_id
    agent_id = run.agent_id

    logger.info(
        "shadow_execute_start",
        run_id=str(run_id),
        project_id=project_id,
        agent_id=agent_id,
    )

    try:
        # Resolve the agent contract to obtain its tool list.
        from src.services.contract_service import get_active_body
        agent_contract = await get_active_body(
            project_id=project_id,
            contract_family="agent",
            contract_id=agent_id,
        )

        tools = agent_contract.get("tools", []) if agent_contract else []

        # Shadow step journal: one row per tool, recording whether it was blocked.
        step_seq = 0
        async with get_db_context(project_id) as db:
            for tool in tools:
                tool_name = tool.get("tool_name", "unknown")
                blocked = is_destructive_tool(tool_name)
                step = AwoooPRunStepJournal(
                    run_id=run_id,
                    project_id=project_id,
                    step_seq=step_seq,
                    tool_name=tool_name,
                    mcp_gateway_id=tool.get("mcp_gateway_id"),
                    result_status="success" if not blocked else "pending",
                    was_blocked=blocked,
                    block_reason="shadow_mode_destructive_blocked" if blocked else None,
                )
                db.add(step)
                step_seq += 1

        # Finish the run; shadow mode produces no user-facing response.
        await transition(
            run_id=run_id,
            project_id=project_id,
            to_state="completed",
            step_count_delta=step_seq,
        )

        logger.info(
            "shadow_execute_completed",
            run_id=str(run_id),
            steps=step_seq,
        )

    except Exception as exc:
        logger.exception(
            "shadow_execute_failed",
            run_id=str(run_id),
            error=str(exc),
        )
        await transition(
            run_id=run_id,
            project_id=project_id,
            to_state="failed",
            error_code="E-RUN-001",
            error_detail=str(exc)[:500],
        )


async def get_run_status(run_id: uuid.UUID, project_id: str) -> dict[str, Any] | None:
    """
    Return the FSM status of a single run as a plain dict, or None when no
    row matches ``run_id`` and ``project_id``.
    """
    async with get_db_context(project_id) as db:
        result = await db.execute(
            select(AwoooPRunState).where(
                AwoooPRunState.run_id == run_id,
                AwoooPRunState.project_id == project_id,
            )
        )
        run = result.scalar_one_or_none()

        if run is None:
            return None

        return {
            "run_id": str(run.run_id),
            "project_id": run.project_id,
            "agent_id": run.agent_id,
            "state": run.state,
            "is_shadow": run.is_shadow,
            "trace_id": run.trace_id,
            "attempt_count": run.attempt_count,
            # A NULL cost must not turn a status lookup into a TypeError.
            "cost_usd": float(run.cost_usd) if run.cost_usd is not None else 0.0,
            "step_count": run.step_count,
            "error_code": run.error_code,
            "created_at": run.created_at.isoformat() if run.created_at else None,
            "started_at": run.started_at.isoformat() if run.started_at else None,
            "completed_at": run.completed_at.isoformat() if run.completed_at else None,
        }
# ─────────────────────────────────────────────────────────────────────────────
# Platform Envelope
# ─────────────────────────────────────────────────────────────────────────────

@dataclass
class PlatformEnvelope:
    """
    Metadata attached to every request entering the platform.

    Built by ``ProviderProxy.build_envelope``.
    """
    project_id: str
    agent_id: str
    trace_id: str                          # W3C traceparent string
    platform_subject_id: str               # "{project_id}:{channel_type}:{channel_user_id}"
    channel_type: str
    channel_user_id: str
    channel_chat_id: str | None = None
    run_id: UUID | None = None             # filled in later by create_run()
    policy_revision_id: str | None = None  # active policy contract revision
    tags: dict[str, Any] = field(default_factory=dict)

    def as_dict(self) -> dict[str, Any]:
        """Return the envelope as a plain dict (``tags`` is not included)."""
        run_id_text = None
        if self.run_id:
            run_id_text = str(self.run_id)

        payload: dict[str, Any] = {}
        payload["project_id"] = self.project_id
        payload["agent_id"] = self.agent_id
        payload["trace_id"] = self.trace_id
        payload["platform_subject_id"] = self.platform_subject_id
        payload["channel_type"] = self.channel_type
        payload["channel_user_id"] = self.channel_user_id
        payload["channel_chat_id"] = self.channel_chat_id
        payload["run_id"] = run_id_text
        payload["policy_revision_id"] = self.policy_revision_id
        return payload


# ─────────────────────────────────────────────────────────────────────────────
# W3C traceparent generation
# ─────────────────────────────────────────────────────────────────────────────

def _new_trace_id() -> str:
    """Build a W3C traceparent string: ``00-{32 hex}-{16 hex}-01``."""
    trace_source = uuid.uuid4()   # all 32 hex chars become the trace-id
    span_source = uuid.uuid4()    # first 16 hex chars become the parent-id
    return "-".join(("00", trace_source.hex, span_source.hex[:16], "01"))


# ─────────────────────────────────────────────────────────────────────────────
# platform_subject_id format
# ─────────────────────────────────────────────────────────────────────────────

def build_platform_subject_id(project_id: str, channel_type: str, channel_user_id: str) -> str:
    """
    Compose ``{project_id}:{channel_type}:{channel_user_id}``.

    Example: ``ewoooc:telegram:123456789``
    """
    return "{}:{}:{}".format(project_id, channel_type, channel_user_id)


# ─────────────────────────────────────────────────────────────────────────────
# ProviderProxy
# ─────────────────────────────────────────────────────────────────────────────

class ProviderProxy:
    """
    Provider proxy adapter (ADR-115 D3).

    For each request it:
      1. checks that the project exists and is not in legacy mode
      2. upserts the platform_subject row for the channel user
      3. generates a trace_id
      4. returns a ``PlatformEnvelope``
    """

    def __init__(self, project_id: str, db: AsyncSession) -> None:
        self._db = db
        self.project_id = project_id

    async def build_envelope(
        self,
        *,
        agent_id: str,
        channel_type: str,
        channel_user_id: str,
        channel_chat_id: str | None = None,
        display_name: str | None = None,
        extra_tags: dict[str, Any] | None = None,
    ) -> PlatformEnvelope:
        """
        Validate the project, upsert the platform subject and return a new
        ``PlatformEnvelope`` carrying a fresh trace_id.

        Raises:
            ValueError: propagated from ``_validate_project``.
        """
        await self._validate_project()
        await self._upsert_platform_subject(
            channel_type=channel_type,
            channel_user_id=channel_user_id,
            channel_chat_id=channel_chat_id,
            display_name=display_name,
        )

        subject_id = build_platform_subject_id(
            self.project_id, channel_type, channel_user_id
        )
        trace_id = _new_trace_id()

        # Only the first 32 characters of the trace_id are logged.
        logged_trace = trace_id[:32] + "..."
        logger.info(
            "provider_proxy_envelope_built",
            project_id=self.project_id,
            agent_id=agent_id,
            channel_type=channel_type,
            platform_subject_id=subject_id,
            trace_id=logged_trace,
        )

        envelope = PlatformEnvelope(
            project_id=self.project_id,
            agent_id=agent_id,
            trace_id=trace_id,
            platform_subject_id=subject_id,
            channel_type=channel_type,
            channel_user_id=channel_user_id,
            channel_chat_id=channel_chat_id,
            tags=extra_tags or {},
        )
        return envelope

    async def _validate_project(self) -> None:
        """
        Require a project row whose migration_mode is not
        ``legacy_awoooi_default``; raise ValueError otherwise.
        """
        stmt = select(AwoooPProject).where(
            AwoooPProject.project_id == self.project_id,
            AwoooPProject.migration_mode != "legacy_awoooi_default",
        )
        found = (await self._db.execute(stmt)).scalar_one_or_none()
        if found is not None:
            return
        raise ValueError(
            f"project '{self.project_id}' 不存在或 migration_mode=legacy_awoooi_default"
            "(EwoooC 接入需要至少 migration_mode='shadow')"
        )

    async def _upsert_platform_subject(
        self,
        *,
        channel_type: str,
        channel_user_id: str,
        channel_chat_id: str | None,
        display_name: str | None,
    ) -> None:
        """
        Auto-provisioning: insert the platform_subject row the first time a
        channel user is seen; on conflict refresh ``last_seen_at`` and fill in
        ``channel_chat_id`` / ``display_name`` when new values are supplied.
        """
        seen_at = datetime.now(timezone.utc)
        params = {
            "project_id": self.project_id,
            "channel_type": channel_type,
            "channel_user_id": channel_user_id,
            "channel_chat_id": channel_chat_id,
            "platform_subject_id": build_platform_subject_id(
                self.project_id, channel_type, channel_user_id
            ),
            "display_name": display_name,
            "now": seen_at,
        }
        upsert_sql = text("""
                INSERT INTO awooop_platform_subjects (
                    project_id, channel_type, channel_user_id, channel_chat_id,
                    platform_subject_id, display_name, roles,
                    first_seen_at, last_seen_at
                ) VALUES (
                    :project_id, :channel_type, :channel_user_id, :channel_chat_id,
                    :platform_subject_id, :display_name, '["viewer"]'::jsonb,
                    :now, :now
                )
                ON CONFLICT (project_id, channel_type, channel_user_id) DO UPDATE SET
                    last_seen_at = :now,
                    channel_chat_id = COALESCE(EXCLUDED.channel_chat_id, awooop_platform_subjects.channel_chat_id),
                    display_name = COALESCE(EXCLUDED.display_name, awooop_platform_subjects.display_name)
            """)
        await self._db.execute(upsert_sql, params)
# Worker lease TTL (seconds)
LEASE_TTL_SECONDS = 60
HEARTBEAT_INTERVAL_SECONDS = 15
STALE_REAPER_INTERVAL_SECONDS = 60

# Valid FSM transitions.
# key: from_state, value: set of valid to_states
_VALID_TRANSITIONS: dict[str, frozenset[str]] = {
    "pending": frozenset({"running", "cancelled"}),
    "running": frozenset({"waiting_tool", "waiting_approval", "completed", "failed", "cancelled", "timeout"}),
    "waiting_tool": frozenset({"running", "failed", "cancelled"}),
    "waiting_approval": frozenset({"running", "cancelled", "timeout"}),
    "completed": frozenset(),   # terminal
    "failed": frozenset(),      # terminal
    "cancelled": frozenset(),   # terminal
    "timeout": frozenset(),     # terminal
}

TERMINAL_STATES = frozenset({"completed", "failed", "cancelled", "timeout"})

# Default worker identity: "<hostname>:<8 random hex chars>", fixed at import time.
_WORKER_ID = f"{socket.gethostname()}:{uuid.uuid4().hex[:8]}"


# ─────────────────────────────────────────────────────────────────────────────
# FSM validation
# ─────────────────────────────────────────────────────────────────────────────

class InvalidStateTransitionError(Exception):
    """Raised when a requested FSM transition is not in the transition table."""

    def __init__(self, from_state: str, to_state: str) -> None:
        self.from_state = from_state
        self.to_state = to_state
        super().__init__(f"非法 FSM 轉換: {from_state!r} → {to_state!r}")


def validate_transition(from_state: str, to_state: str) -> None:
    """
    Check that ``from_state → to_state`` is allowed.

    Raises:
        InvalidStateTransitionError: the transition is not listed in
            ``_VALID_TRANSITIONS`` (an unknown ``from_state`` allows nothing).
    """
    valid_targets = _VALID_TRANSITIONS.get(from_state, frozenset())
    if to_state not in valid_targets:
        raise InvalidStateTransitionError(from_state, to_state)


# ─────────────────────────────────────────────────────────────────────────────
# Worker lease (SKIP LOCKED)
# ─────────────────────────────────────────────────────────────────────────────

async def acquire_pending_run(
    project_id: str,
    worker_id: str = _WORKER_ID,
) -> AwoooPRunState | None:
    """
    Claim the oldest ``pending`` run of a project and lease it to this worker.

    The candidate row is selected with ``FOR UPDATE SKIP LOCKED`` so rows
    locked by another worker are skipped. The claimed run is moved to
    ``running`` with a lease of ``LEASE_TTL_SECONDS`` and its
    ``attempt_count`` is incremented.

    Returns:
        The claimed run, or None when nothing is pending.
    """
    now = datetime.now(timezone.utc)
    lease_until = now + timedelta(seconds=LEASE_TTL_SECONDS)

    async with get_db_context(project_id) as db:
        # SKIP LOCKED: rows already locked by another worker are skipped.
        result = await db.execute(
            text("""
                SELECT run_id FROM awooop_run_state
                WHERE project_id = :project_id
                  AND state = 'pending'
                  AND (lease_until IS NULL OR lease_until < NOW())
                ORDER BY created_at ASC
                LIMIT 1
                FOR UPDATE SKIP LOCKED
            """),
            {"project_id": project_id},
        )
        row = result.fetchone()
        if row is None:
            return None

        run_id = row[0]

        # Set the lease and move the run to RUNNING.
        await db.execute(
            update(AwoooPRunState)
            .where(
                AwoooPRunState.run_id == run_id,
                AwoooPRunState.project_id == project_id,
            )
            .values(
                state="running",
                lease_until=lease_until,
                heartbeat_at=now,
                worker_id=worker_id,
                started_at=now,
                attempt_count=AwoooPRunState.attempt_count + 1,
            )
        )

        # Re-read the full record after the update.
        result2 = await db.execute(
            select(AwoooPRunState).where(AwoooPRunState.run_id == run_id)
        )
        run = result2.scalar_one()
        attempt_count = run.attempt_count

    logger.info(
        "run_lease_acquired",
        run_id=str(run_id),
        project_id=project_id,
        worker_id=worker_id,
        attempt_count=attempt_count,
    )
    return run


async def heartbeat(run_id: "UUID", project_id: str) -> None:
    """
    Refresh ``heartbeat_at`` and extend the lease of a run that is still
    ``running``. Does nothing when the run is in any other state.
    """
    now = datetime.now(timezone.utc)
    new_lease = now + timedelta(seconds=LEASE_TTL_SECONDS)
    async with get_db_context(project_id) as db:
        await db.execute(
            update(AwoooPRunState)
            .where(
                AwoooPRunState.run_id == run_id,
                # Scope by project explicitly, like every other statement in
                # this module.
                AwoooPRunState.project_id == project_id,
                AwoooPRunState.state == "running",
            )
            .values(
                heartbeat_at=now,
                lease_until=new_lease,
            )
        )


async def transition(
    run_id: "UUID",
    project_id: str,
    to_state: str,
    *,
    error_code: str | None = None,
    error_detail: str | None = None,
    output_sha256: str | None = None,
    cost_usd_delta: float = 0.0,
    step_count_delta: int = 0,
) -> None:
    """
    Perform a validated FSM state transition.

    The current state is read under a row lock, validated against the
    transition table, and then updated. Entering a terminal state also sets
    ``completed_at`` and clears ``lease_until`` and ``worker_id``.

    Raises:
        ValueError: no row matches ``run_id`` and ``project_id``.
        InvalidStateTransitionError: the transition is not allowed.
    """
    async with get_db_context(project_id) as db:
        result = await db.execute(
            select(AwoooPRunState.state)
            .where(
                AwoooPRunState.run_id == run_id,
                AwoooPRunState.project_id == project_id,
            )
            # Lock the row so a concurrent transition cannot validate against
            # the same from_state and overwrite this one.
            .with_for_update()
        )
        row = result.fetchone()
        if row is None:
            raise ValueError(f"run {run_id} 不存在或無 RLS 權限")

        from_state = row[0]
        validate_transition(from_state, to_state)

        values: dict = {"state": to_state}
        if error_code:
            values["error_code"] = error_code
        if error_detail:
            values["error_detail"] = error_detail
        if output_sha256:
            values["output_sha256"] = output_sha256
        if cost_usd_delta:
            values["cost_usd"] = AwoooPRunState.cost_usd + cost_usd_delta
        if step_count_delta:
            values["step_count"] = AwoooPRunState.step_count + step_count_delta
        if to_state in TERMINAL_STATES:
            values["completed_at"] = datetime.now(timezone.utc)
            values["lease_until"] = None
            values["worker_id"] = None

        await db.execute(
            update(AwoooPRunState)
            .where(
                AwoooPRunState.run_id == run_id,
                AwoooPRunState.project_id == project_id,
                # Apply only if the state is still the one that was validated.
                AwoooPRunState.state == from_state,
            )
            .values(**values)
        )

    logger.info(
        "run_state_transition",
        run_id=str(run_id),
        from_state=from_state,
        to_state=to_state,
        error_code=error_code,
    )
───────────────────────────────────────────────────────────────────────────── + +async def reap_stale_runs(project_id: str) -> int: + """ + 掃描 lease_until < NOW() 的 RUNNING run。 + - attempt_count < max_attempts → 重設 PENDING(retry) + - attempt_count >= max_attempts → FAILED(E-RUN-002) + + Returns: 處理的 stale run 數 + """ + now = datetime.now(timezone.utc) + reaped = 0 + + async with get_db_context(project_id) as db: + # 找所有 stale RUNNING runs + result = await db.execute( + select(AwoooPRunState).where( + AwoooPRunState.project_id == project_id, + AwoooPRunState.state == "running", + AwoooPRunState.lease_until < now, + ) + ) + stale_runs = list(result.scalars().all()) + + for run in stale_runs: + if run.attempt_count < run.max_attempts: + # Retry:重設為 PENDING + await db.execute( + update(AwoooPRunState) + .where(AwoooPRunState.run_id == run.run_id) + .values( + state="pending", + lease_until=None, + worker_id=None, + heartbeat_at=None, + ) + ) + logger.warning( + "stale_run_requeued", + run_id=str(run.run_id), + attempt_count=run.attempt_count, + max_attempts=run.max_attempts, + ) + else: + # 超過最大重試次數 → FAILED + await db.execute( + update(AwoooPRunState) + .where(AwoooPRunState.run_id == run.run_id) + .values( + state="failed", + error_code="E-RUN-002", + error_detail=f"max_attempts={run.max_attempts} 超過,stale run 已廢棄", + completed_at=now, + lease_until=None, + worker_id=None, + ) + ) + logger.error( + "stale_run_failed", + run_id=str(run.run_id), + attempt_count=run.attempt_count, + ) + reaped += 1 + + if reaped: + logger.info("stale_run_reaper_done", project_id=project_id, reaped=reaped) + return reaped diff --git a/apps/api/src/services/schema_validator.py b/apps/api/src/services/schema_validator.py new file mode 100644 index 00000000..25b1c5fc --- /dev/null +++ b/apps/api/src/services/schema_validator.py @@ -0,0 +1,262 @@ +""" +LLM Output Schema Validator +============================= +AwoooP Phase 3.3: LLM 輸出 → schema 驗證 → retry 機制(ADR-112) +2026-05-04 ogt + 
Claude Sonnet 4.6 + +設計原則: +- LLM 輸出必須通過 Pydantic schema 驗證才能到達 channel adapter +- 驗證失敗 → 自動 retry(最多 3 次,含 retry prompt) +- 3 次全部失敗 → 拋出 SchemaValidationError(E-SCHEMA-001) +- 支援六合約家族 + 自訂 Pydantic model + +位置:介於 LLM response 和 channel adapter 之間 +呼叫方:任何需要結構化 LLM 輸出的 service(playbook_generator, decision_manager 等) +""" + +from __future__ import annotations + +import json +import re +from typing import Any, TypeVar + +import structlog +from pydantic import BaseModel, ValidationError + +logger = structlog.get_logger(__name__) + +T = TypeVar("T", bound=BaseModel) + +_MAX_RETRIES = 3 +_JSON_EXTRACT_RE = re.compile(r"```(?:json)?\s*(\{[\s\S]*?\})\s*```|(\{[\s\S]*\})", re.DOTALL) + + +# ───────────────────────────────────────────────────────────────────────────── +# 錯誤定義 +# ───────────────────────────────────────────────────────────────────────────── + +class SchemaValidationError(Exception): + """LLM 輸出連續 3 次 schema 驗證失敗""" + + error_code: str = "E-SCHEMA-001" + + def __init__(self, model_name: str, attempts: int, last_error: str) -> None: + self.model_name = model_name + self.attempts = attempts + self.last_error = last_error + super().__init__( + f"[E-SCHEMA-001] LLM 輸出 {attempts} 次驗證失敗 " + f"(model={model_name}): {last_error}" + ) + + +# ───────────────────────────────────────────────────────────────────────────── +# JSON 萃取(容錯解析) +# ───────────────────────────────────────────────────────────────────────────── + +def extract_json_from_llm_output(raw: str) -> dict[str, Any] | None: + """ + 從 LLM 原始輸出中萃取 JSON。 + 策略: + 1. 直接 json.loads(最常見:LLM 直接回傳 JSON) + 2. 從 ```json ... ``` 程式碼區塊萃取 + 3. 找第一個 { ... 
} 區塊嘗試解析 + """ + raw = raw.strip() + + # 策略 1:直接解析 + try: + obj = json.loads(raw) + if isinstance(obj, dict): + return obj + except json.JSONDecodeError: + pass + + # 策略 2 + 3:正則萃取 + for match in _JSON_EXTRACT_RE.finditer(raw): + candidate = match.group(1) or match.group(2) + if candidate: + try: + obj = json.loads(candidate) + if isinstance(obj, dict): + return obj + except json.JSONDecodeError: + continue + + return None + + +# ───────────────────────────────────────────────────────────────────────────── +# Retry prompt builder +# ───────────────────────────────────────────────────────────────────────────── + +def build_retry_prompt( + original_prompt: str, + failed_output: str, + validation_error: str, + model_name: str, + attempt: int, +) -> str: + """ + 建立包含錯誤回饋的 retry prompt。 + 讓 LLM 知道上次輸出哪裡出錯,引導修正。 + """ + return ( + f"{original_prompt}\n\n" + f"---\n" + f"[SCHEMA VALIDATION RETRY {attempt}/{_MAX_RETRIES}]\n" + f"上次回應未通過結構驗證({model_name}),請修正以下問題後重新回應:\n\n" + f"驗證錯誤:\n{validation_error}\n\n" + f"上次回應(供參考):\n{failed_output[:500]}...\n" + f"---\n\n" + f"請只回傳符合格式的 JSON 物件,不要包含任何額外說明。" + ) + + +# ───────────────────────────────────────────────────────────────────────────── +# Core validator +# ───────────────────────────────────────────────────────────────────────────── + +async def validate_llm_output( + *, + raw_output: str, + model_cls: type[T], + llm_caller: Any, # Callable[[str], Awaitable[str]] — 供 retry 使用 + original_prompt: str, + context: dict[str, Any] | None = None, +) -> T: + """ + 驗證 LLM 輸出是否符合 Pydantic model。 + + Args: + raw_output: LLM 第一次回傳的原始字串 + model_cls: 目標 Pydantic model class + llm_caller: async callable(prompt: str) -> str,用於 retry + original_prompt: 原始 prompt(retry 時附加錯誤回饋) + context: 額外 logging context + + Returns: + 驗證成功的 model instance + + Raises: + SchemaValidationError: 連續 3 次失敗後拋出 + """ + model_name = model_cls.__name__ + ctx = context or {} + current_output = raw_output + last_error = "" + + for attempt in range(1, _MAX_RETRIES + 
1): + # 1. 萃取 JSON + parsed = extract_json_from_llm_output(current_output) + if parsed is None: + last_error = "無法從 LLM 輸出中萃取 JSON 物件" + logger.warning( + "schema_validator_no_json", + model_name=model_name, + attempt=attempt, + output_preview=current_output[:200], + **ctx, + ) + else: + # 2. Pydantic 驗證 + try: + instance = model_cls.model_validate(parsed) + logger.info( + "schema_validator_passed", + model_name=model_name, + attempt=attempt, + **ctx, + ) + return instance + except ValidationError as exc: + last_error = exc.json(indent=None) + logger.warning( + "schema_validator_failed", + model_name=model_name, + attempt=attempt, + error=last_error[:500], + **ctx, + ) + + # 3. Retry(如果不是最後一次) + if attempt < _MAX_RETRIES: + retry_prompt = build_retry_prompt( + original_prompt=original_prompt, + failed_output=current_output, + validation_error=last_error, + model_name=model_name, + attempt=attempt, + ) + try: + current_output = await llm_caller(retry_prompt) + except Exception as exc: + logger.warning( + "schema_validator_llm_retry_failed", + model_name=model_name, + attempt=attempt, + error=str(exc), + **ctx, + ) + # LLM 呼叫本身失敗,保留上次 output,繼續嘗試(或直接結束) + break + + # 3 次全失敗 + logger.error( + "schema_validator_exhausted", + model_name=model_name, + total_attempts=_MAX_RETRIES, + last_error=last_error[:500], + **ctx, + ) + raise SchemaValidationError(model_name, _MAX_RETRIES, last_error) + + +# ───────────────────────────────────────────────────────────────────────────── +# 便利方法:從 contract family 名稱驗證(不需知道具體 model class) +# ───────────────────────────────────────────────────────────────────────────── + +async def validate_llm_output_by_family( + *, + raw_output: str, + contract_family: str, + llm_caller: Any, + original_prompt: str, + context: dict[str, Any] | None = None, +) -> BaseModel: + """ + 依 contract_family 自動選擇 model class 並驗證。 + 適合 generic pipeline 呼叫(不知道具體 model)。 + """ + from src.models.awooop_contracts import CONTRACT_FAMILY_MODELS, VALID_CONTRACT_FAMILIES 
+ + model_cls = CONTRACT_FAMILY_MODELS.get(contract_family) + if model_cls is None: + raise ValueError( + f"未知 contract_family: {contract_family!r}," + f"合法值:{sorted(VALID_CONTRACT_FAMILIES)}" + ) + return await validate_llm_output( + raw_output=raw_output, + model_cls=model_cls, + llm_caller=llm_caller, + original_prompt=original_prompt, + context=context, + ) + + +# ───────────────────────────────────────────────────────────────────────────── +# 同步版本(非 LLM retry,只做一次驗證)— 供測試和非 LLM 路徑使用 +# ───────────────────────────────────────────────────────────────────────────── + +def validate_once(raw: str | dict[str, Any], model_cls: type[T]) -> T: + """ + 單次驗證,不做 retry。 + 適合:已知格式正確的內部資料、測試 fixture 驗證。 + """ + if isinstance(raw, str): + parsed = extract_json_from_llm_output(raw) + if parsed is None: + raise SchemaValidationError(model_cls.__name__, 1, "無法萃取 JSON") + return model_cls.model_validate(parsed) + return model_cls.model_validate(raw) diff --git a/apps/api/src/workers/platform_worker.py b/apps/api/src/workers/platform_worker.py new file mode 100644 index 00000000..95e2ba4d --- /dev/null +++ b/apps/api/src/workers/platform_worker.py @@ -0,0 +1,196 @@ +""" +Platform Worker +================ +AwoooP Phase 4: SKIP LOCKED worker + stale run reaper(ADR-114) +2026-05-04 ogt + Claude Sonnet 4.6 + +功能: +1. Worker Loop:以 SKIP LOCKED 從 awooop_run_state 取 PENDING run 並執行 +2. Stale Run Reaper:每 60 秒掃描 lease 過期的 RUNNING run +3. 
Shadow Mode Enforcer:所有 Phase 4 run 強制 is_shadow=True + +Worker 設計: +- 啟動時以 asyncio.create_task 掛入 main.py lifespan +- 多個 worker 安全並行(SKIP LOCKED 保證每 run 只被一個 worker 取得) +- Heartbeat 每 15 秒更新 lease(防 stale reaper 誤殺) +- 優雅停機:收到 stop signal 後完成當前 run 再退出 + +與 legacy 的關係: +- 完全獨立,不碰任何既有 signal_worker.py / aider_event_processor.py +- 只處理 awooop_run_state 表(legacy signal 不寫入此表) +""" + +from __future__ import annotations + +import asyncio +from datetime import datetime, timezone + +import structlog + +from src.services.platform_runtime import shadow_execute +from src.services.run_state_machine import ( + HEARTBEAT_INTERVAL_SECONDS, + STALE_REAPER_INTERVAL_SECONDS, + acquire_pending_run, + heartbeat, + reap_stale_runs, +) + +logger = structlog.get_logger(__name__) + +# Phase 4 固定處理 awoooi 租戶(Phase 6+ 改為多租戶掃描) +_DEFAULT_PROJECT_ID = "awoooi" +_WORKER_POLL_INTERVAL_SECONDS = 5 # 無任務時的等待間隔 +_WORKER_CONCURRENCY = 2 # 同時最多幾個 run 並行 + + +class PlatformWorker: + """ + Platform Worker:SKIP LOCKED + shadow execution + stale reaper。 + + Usage(在 main.py lifespan 中): + worker = PlatformWorker() + asyncio.create_task(worker.run_loop()) + asyncio.create_task(worker.reaper_loop()) + """ + + def __init__(self, project_id: str = _DEFAULT_PROJECT_ID) -> None: + self.project_id = project_id + self._stop_event = asyncio.Event() + self._active_runs: set[str] = set() + + def stop(self) -> None: + """優雅停機信號""" + self._stop_event.set() + + async def run_loop(self) -> None: + """ + 主 worker loop: + 1. 取一筆 PENDING run(SKIP LOCKED) + 2. 執行 shadow_execute(不產生 user response) + 3. Heartbeat(每 15 秒) + 4. 
等待 5 秒後重新掃描 + """ + logger.info("platform_worker_started", project_id=self.project_id) + while not self._stop_event.is_set(): + try: + # 控制並行度 + if len(self._active_runs) >= _WORKER_CONCURRENCY: + await asyncio.sleep(1) + continue + + run = await acquire_pending_run(self.project_id) + if run is None: + await asyncio.sleep(_WORKER_POLL_INTERVAL_SECONDS) + continue + + run_id_str = str(run.run_id) + self._active_runs.add(run_id_str) + + # 每個 run 獨立 task,不阻塞 loop + asyncio.create_task( + self._execute_with_heartbeat(run), + name=f"platform_run_{run_id_str[:8]}", + ) + + except asyncio.CancelledError: + break + except Exception as exc: + logger.exception("platform_worker_loop_error", error=str(exc)) + await asyncio.sleep(_WORKER_POLL_INTERVAL_SECONDS) + + logger.info("platform_worker_stopped", project_id=self.project_id) + + async def _execute_with_heartbeat(self, run: object) -> None: + """ + 在 shadow_execute 執行期間,同步 heartbeat 防 stale reaper 誤殺。 + """ + from src.db.awooop_models import AwoooPRunState + assert isinstance(run, AwoooPRunState) + run_id_str = str(run.run_id) + + # Heartbeat task(每 15 秒更新 lease) + heartbeat_task = asyncio.create_task( + self._heartbeat_loop(run.run_id, self.project_id), + name=f"heartbeat_{run_id_str[:8]}", + ) + + try: + await shadow_execute(run) + except Exception as exc: + logger.exception( + "platform_run_execution_error", + run_id=run_id_str, + error=str(exc), + ) + finally: + heartbeat_task.cancel() + self._active_runs.discard(run_id_str) + + async def _heartbeat_loop(self, run_id: object, project_id: str) -> None: + """每 HEARTBEAT_INTERVAL_SECONDS 秒更新 lease,直到被 cancel""" + import uuid as _uuid + while True: + await asyncio.sleep(HEARTBEAT_INTERVAL_SECONDS) + try: + await heartbeat(run_id, project_id) # type: ignore[arg-type] + except Exception as exc: + logger.warning( + "platform_heartbeat_failed", + run_id=str(run_id), + error=str(exc), + ) + + async def reaper_loop(self) -> None: + """ + Stale run reaper:每 60 秒掃描 lease 過期的 
RUNNING run。 + lease < NOW() + attempt < max → PENDING(retry) + lease < NOW() + attempt >= max → FAILED(E-RUN-002) + """ + logger.info("stale_run_reaper_started", project_id=self.project_id) + while not self._stop_event.is_set(): + try: + await asyncio.sleep(STALE_REAPER_INTERVAL_SECONDS) + reaped = await reap_stale_runs(self.project_id) + if reaped: + logger.info( + "stale_run_reaper_cycle", + project_id=self.project_id, + reaped=reaped, + ts=datetime.now(timezone.utc).isoformat(), + ) + except asyncio.CancelledError: + break + except Exception as exc: + logger.exception("stale_run_reaper_error", error=str(exc)) + + logger.info("stale_run_reaper_stopped", project_id=self.project_id) + + +# ───────────────────────────────────────────────────────────────────────────── +# Singleton(掛入 lifespan 用) +# ───────────────────────────────────────────────────────────────────────────── + +_platform_worker: PlatformWorker | None = None + + +def get_platform_worker() -> PlatformWorker: + global _platform_worker + if _platform_worker is None: + _platform_worker = PlatformWorker() + return _platform_worker + + +async def start_platform_worker() -> None: + """在 main.py lifespan 中呼叫此函數啟動 worker""" + worker = get_platform_worker() + asyncio.create_task(worker.run_loop(), name="platform_worker_run_loop") + asyncio.create_task(worker.reaper_loop(), name="platform_worker_reaper_loop") + logger.info("platform_worker_tasks_started") + + +async def stop_platform_worker() -> None: + """在 main.py lifespan 關閉時呼叫""" + worker = get_platform_worker() + worker.stop() + logger.info("platform_worker_stop_requested") diff --git a/apps/api/tests/test_mcp_credential_isolation.py b/apps/api/tests/test_mcp_credential_isolation.py new file mode 100644 index 00000000..8b61dbb3 --- /dev/null +++ b/apps/api/tests/test_mcp_credential_isolation.py @@ -0,0 +1,191 @@ +""" +MCP Credential Isolation 迴歸測試 +================================== +AwoooP Phase 5.5:防止 2026-04-18 Secret Leak 事故再現 +2026-05-04 ogt + Claude 
Sonnet 4.6 + +覆蓋: + 1. credential_resolver 格式驗證(bad ref 拒絕) + 2. dev fallback 正確返回 (value, masked, sha256) + 3. secret value 不洩漏到 redaction_middleware output + 4. _mcp_audit key 在 redact_mcp_input 中被移除(不送 provider) + 5. AuditedMCPToolProvider.__provider 不可從外部直接存取(name mangling) +""" + +from __future__ import annotations + +import hashlib +import json +import os +import pytest + + +class TestCredentialResolverFormat: + """credential_resolver 格式驗證""" + + def test_bad_ref_raises(self): + """格式錯誤的 ref 應拋出 CredentialResolutionError""" + import asyncio + import sys + sys.path.insert(0, ".") + from src.plugins.mcp.credential_resolver import ( + CredentialResolutionError, + resolve_k8s_secret, + ) + + bad_refs = [ + "no-slash", + "namespace/secret", # 缺 #key + "NAMESPACE/secret#key", # namespace 大寫不符格式 + "ns/secret#", # key 為空 + "", + ] + for ref in bad_refs: + with pytest.raises(CredentialResolutionError): + asyncio.run(resolve_k8s_secret(ref)) + + def test_dev_fallback_resolves(self, monkeypatch): + """AWOOOP_DEV_SECRETS_JSON 設定後應正確解析""" + import asyncio + import sys + sys.path.insert(0, ".") + + dev_secrets = {"awoooi/telegram-bot#TELEGRAM_BOT_TOKEN": "test-token-value-1234"} + monkeypatch.setenv("AWOOOP_DEV_SECRETS_JSON", json.dumps(dev_secrets)) + + from src.plugins.mcp.credential_resolver import resolve_k8s_secret + + value, masked, sha256 = asyncio.run( + resolve_k8s_secret("awoooi/telegram-bot#TELEGRAM_BOT_TOKEN") + ) + assert value == "test-token-value-1234" + assert "test" in masked # 前 4 字元 + assert "***" in masked + assert sha256 == hashlib.sha256("test-token-value-1234".encode()).hexdigest() + + def test_dev_fallback_missing_key_raises(self, monkeypatch): + """dev secrets 中找不到 key 應拋出錯誤""" + import asyncio + import sys + sys.path.insert(0, ".") + + monkeypatch.setenv("AWOOOP_DEV_SECRETS_JSON", json.dumps({})) + + from src.plugins.mcp.credential_resolver import ( + CredentialResolutionError, + resolve_k8s_secret, + ) + with 
pytest.raises(CredentialResolutionError): + asyncio.run(resolve_k8s_secret("awoooi/some-secret#key")) + + +class TestRedactionMiddlewareSecretLeak: + """2026-04-18 Secret Leak 迴歸:secret value 不得進入 output""" + + def test_pg_dsn_redacted_from_output(self): + """PG DSN 在 output redaction 後不可見""" + import sys + sys.path.insert(0, ".") + from src.plugins.mcp.redaction_middleware import redact_mcp_output + + output = { + "connection": "postgresql+asyncpg://admin:supersecret@10.0.1.5/prod", + "status": "connected", + } + cleaned = redact_mcp_output(output) + assert "supersecret" not in json.dumps(cleaned), "PG DSN 密碼不得出現在 output" + assert "[REDACTED:PG_DSN]" in cleaned["connection"] + + def test_telegram_token_redacted_from_output(self): + """Telegram token 在 output redaction 後不可見""" + import sys + sys.path.insert(0, ".") + from src.plugins.mcp.redaction_middleware import redact_mcp_output + + output = "Bot token: 1234567890:ABCDEFGHIJKLMNOPabcdefghijklmno12345678" + cleaned = redact_mcp_output(output) + assert "ABCDEFGHIJKLMNO" not in cleaned, "Telegram token 不得出現在 output" + assert "[REDACTED:TELEGRAM_TOKEN]" in cleaned + + def test_internal_ip_redacted(self): + """GCP 內網 IP 在 output redaction 後不可見""" + import sys + sys.path.insert(0, ".") + from src.plugins.mcp.redaction_middleware import redact_mcp_output + + output = {"host": "10.0.1.5", "port": 5432} + cleaned = redact_mcp_output(output) + assert "10.0.1.5" not in json.dumps(cleaned), "內網 IP 不得出現在 output" + assert "[REDACTED:INTERNAL_IP]" in cleaned["host"] + + def test_mcp_audit_key_removed_from_input(self): + """_mcp_audit key 在 redact_mcp_input 後應被移除""" + import sys + sys.path.insert(0, ".") + from src.plugins.mcp.redaction_middleware import redact_mcp_input + + params = { + "_mcp_audit": {"session_id": "abc123", "run_id": "xyz"}, + "namespace": "default", + } + cleaned = redact_mcp_input(params) + assert "_mcp_audit" not in cleaned, "_mcp_audit 應在送 provider 前移除" + assert cleaned["namespace"] == "default" + + def 
test_k8s_value_credential_isolation(self): + """k8s_value 欄位應被 credential isolation 攔截""" + import sys + sys.path.insert(0, ".") + from src.plugins.mcp.redaction_middleware import redact_mcp_input + + params = { + "k8s_value": "actual-secret-credential", + "tool": "some-tool", + } + cleaned = redact_mcp_input(params) + assert "actual-secret-credential" not in json.dumps(cleaned) + assert cleaned["k8s_value"] == "[REDACTED:CREDENTIAL_ISOLATION]" + + +class TestNameManglingEncapsulation: + """AuditedMCPToolProvider.__provider name mangling 封裝驗證""" + + def test_single_underscore_not_accessible(self): + """_provider(單底線)應不存在於 AuditedMCPToolProvider 實例""" + import sys + sys.path.insert(0, ".") + from src.plugins.mcp.interfaces import MCPToolProvider, MCPTool, MCPToolResult + from src.plugins.mcp.registry import AuditedMCPToolProvider + + class DummyProvider(MCPToolProvider): + @property + def name(self): return "dummy" + async def list_tools(self): return [] + async def execute(self, tool_name, parameters): + return MCPToolResult(success=True, execution_id="t") + + wrapped = AuditedMCPToolProvider(DummyProvider()) + assert not hasattr(wrapped, "_provider"), ( + "_provider 不應可直接存取(name mangling 防止直接存取 inner provider)" + ) + + def test_double_underscore_mangled(self): + """__provider 應被 Python name mangling 重命名""" + import sys + sys.path.insert(0, ".") + from src.plugins.mcp.interfaces import MCPToolProvider, MCPTool, MCPToolResult + from src.plugins.mcp.registry import AuditedMCPToolProvider + + class DummyProvider(MCPToolProvider): + @property + def name(self): return "dummy" + async def list_tools(self): return [] + async def execute(self, tool_name, parameters): + return MCPToolResult(success=True, execution_id="t") + + wrapped = AuditedMCPToolProvider(DummyProvider()) + # Python name mangling: __provider → _AuditedMCPToolProvider__provider + assert hasattr(wrapped, "_AuditedMCPToolProvider__provider"), ( + "name mangling 後的屬性應可被內部存取" + ) + assert wrapped.name == 
"dummy" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 70fb7b72..84525489 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,291 @@ --- +## 2026-05-04 | AwoooP Phase 6-8 完收(EwoooC Onboarding / Channel Hub / Approval Token) + +### Phase 6: EwoooC Tenant Onboarding(ADR-115) + +**migrations/awooop_phase6_ewoooc_onboarding_2026-05-04.sql**: +- `INSERT INTO awooop_projects` — project_id='ewoooc', migration_mode='shadow', budget_limit=50 USD +- 4 個 read-only MCP tools 預置白名單(k8s_get / signoz_query / incident_read / km_read) +- 所有 scope=['read'],environment_tags={env:any}(shadow phase 無環境限制) + +**services/provider_proxy.py**(ProviderProxy + PlatformEnvelope): +- `build_envelope()` → PlatformEnvelope(project_id / agent_id / trace_id / platform_subject_id) +- `_validate_project()`:拒絕 legacy_awoooi_default mode +- `_upsert_platform_subject()`:auto-provisioning(ON CONFLICT DO UPDATE last_seen_at) +- `build_platform_subject_id()` → `"ewoooc:telegram:123456789"` 統一格式 +- `_new_trace_id()` → W3C traceparent(00-{32hex}-{16hex}-01) +- 自驗:platform_subject_id 格式、trace_id 4段格式、PlatformEnvelope.as_dict() 正確 + +### Phase 7: Channel Hub(ADR-106 channel_event family) + +**migrations/awooop_phase7_channel_hub_2026-05-04.sql**: +- `awooop_conversation_event` — 入站事件鏡像(UNIQUE provider_event_id,dedup + run_id + content hash) +- `awooop_outbound_message` — 出站訊息記錄(interim/final/error/approval_request + shadow status) +- Progressive Feedback Policy 查詢 index(waiting_tool + pending) +- 全部 FORCE ROW LEVEL SECURITY + +**services/channel_hub.py**: +- `mirror_inbound_event()` — raw_content 只存 sha256 hash + redacted preview(明文不入庫) +- `record_outbound_message()` — shadow=True 時 status='shadow'(不發送) +- `schedule_interim_feedback()` → asyncio.create_task(30s 計時器) +- `_interim_feedback_task()` — 30s 後查 run state,仍 waiting_tool → 發 interim +- `handle_telegram_inbound()` — 主入口:create_run + mirror + schedule_interim +- 自驗:import 正確,INTERIM_WAIT_SECONDS=30 + +### Phase 8: Approval Token HS256 + 
Suggest Mode(ADR-116 Gate 5) + +**services/awooop_approval_token.py**(獨立模組,不修改 legacy multi_sig_redis.py): +- `issue_approval_token()` — HS256 自製 JWT(3 段 base64url),jti=uuid4.hex +- `verify_approval_token()` — HMAC.compare_digest + exp 驗證,回傳 payload +- `record_approval()` — verify token → Redis NX jti(防 replay)→ SADD approver_id → 回傳簽核數 +- `check_approval_quorum(required_count=1)` — SCARD ≥ required | QuorumNotMetError +- Redis key 前綴:`awooop_appr:jti:*` / `awooop_appr:sigs:*`(與 legacy 不衝突) +- `is_suggest_mode_enabled()` — AWOOOP_SUGGEST_MODE env flag +- `build_suggest_action(rollback/scale/restart)` → SuggestedAction(dry_run=True, approval_required=True) +- 錯誤碼:E-APPR-001/002/003/004 +- 自驗:6 個測試全部通過(issue/verify/tamper/expire/suggest mode/suggest rollback/scale) + +--- + +## 2026-05-04 | AwoooP Phase 5 完收(MCP Gateway 五閘門 + Credential Isolation) + +### Phase 5.1 MCP Gateway DB Migration(awooop_phase5_mcp_gateway_2026-05-04.sql) + +四表 + 全部 RLS + GRANT: +- `awooop_mcp_tool_registry` — Tool 白名單(Gate 3,tool_type 3 值、allowed_scopes JSONB、environment_tags Gate 4 用) +- `awooop_mcp_grants` — Agent × Tool 授權(Grant 2+3,expires_at + is_revoked + granted_scopes + 撤銷一致性 CHECK) +- `awooop_mcp_credential_refs` — k8s Secret 參照(ADR-118,只存路徑 namespace/secret#key,明文絕不入庫) +- `awooop_mcp_gateway_audit` — Gateway call 稽核(gate_result JSONB 五閘結果 + block_gate/block_reason) +- 全部 `FORCE ROW LEVEL SECURITY`;4 個查詢優化 partial index + +ORM:`awooop_models.py` 新增 `AwoooPMcpToolRegistry` / `AwoooPMcpGrant` / `AwoooPMcpCredentialRef` / `AwoooPMcpGatewayAudit` +自驗:4 個 ORM model import 正確(all_ok: True) + +### Phase 5.2 Five-gate Enforcement Service(plugins/mcp/gateway.py) + +`McpGateway.call()` 實作五閘門依序強制檢查: +- Gate 1 — Project:`awooop_projects` 存在且 `migration_mode != legacy_awoooi_default` +- Gate 2 — Agent:`awooop_active_revisions` 有 `family=agent, contract_id=agent_id` 的 active contract +- Gate 3 — Tool+Grant:tool 在白名單 + grant 未到期/未撤銷 + scope 包含 required_scope +- Gate 4 — 
Environment:tool.environment_tags 全部匹配(shadow mode 直接放行) +- Gate 5 — Approval:write/admin scope 時查 Redis multi_sig approval key(shadow + read 直接放行) +- 任一失敗:寫 blocked audit log + raise McpGatewayError(error_code E-MCP-GATE-001~009) +- 通過後呼叫底層 provider,結果寫 success audit + +自驗:所有 import 正確,GateCheckResult.all_passed / as_dict() 功能正常 + +### Phase 5.3 MCP Redaction Middleware(plugins/mcp/redaction_middleware.py) + +雙層 redaction: +- Layer 1(audit_sink)— 寫 audit log 前(已於 Phase 4.4 完成) +- Layer 2(本層)— MCP tool call input/output 在進入 LLM context 前: + - `redact_mcp_input()`: 移除 _mcp_audit injection + credential isolation(k8s_value 等)+ 欄位黑名單 + pattern redaction + - `redact_mcp_output()`: 完整 pattern redaction + 大小限制(16,000 char,防 prompt stuffing) + - `compute_safe_hash()`: sha256(redacted_data),供 gateway audit 使用 +- 自驗:4 個測試案例全部通過(all_ok: True) + +### Phase 5.4 Provider 封裝強化(plugins/mcp/registry.py) + +`AuditedMCPToolProvider._provider` → `__provider`(Python name mangling): +- 防止 caller 透過 `wrapper._provider` 直接存取 inner provider(ADR-116 封裝要求) +- Python 自動重命名為 `_AuditedMCPToolProvider__provider`,外部不可直接 access +- 4 個 `self._provider.xxx` 引用全部更新為 `self.__provider.xxx` + +### Phase 5.5 Credential Isolation(plugins/mcp/credential_resolver.py + 迴歸測試) + +`credential_resolver.py`: +- `resolve_k8s_secret(ref)` → `(actual_value, masked_value, sha256)` 三元組 +- ref 格式:`"namespace/secret-name#key"`,正則強驗 +- prod:kubernetes_asyncio in-cluster API +- dev fallback:`AWOOOP_DEV_SECRETS_JSON` 環境變數(JSON dict) +- actual_value 只在記憶體短暫存在,不寫任何持久化;masked_value(前4+***+後4)供 log + +`tests/test_mcp_credential_isolation.py`(10 個測試全部通過 ✅): +- bad ref 格式拒絕(5 個 case) +- dev fallback 正確解析 / 找不到 key 拋錯 +- PG DSN / Telegram token / 內網 IP 在 output 被 redact(secret leak 迴歸測試) +- _mcp_audit 在 input 被移除 / k8s_value credential isolation +- name mangling:_provider 不可存取,_AuditedMCPToolProvider__provider 正確存在 + +--- + +## 2026-05-04 | AwoooP Phase 2 完收(P1-16/P1-17/2.3/2.4/2.6) + +### P1-16 nl_gateway.py hermes Redis key 加 
project 前綴 +- `_check_rate_limit`: `hermes:rl:{chat_id}` → `{project_id}:hermes:rl:{chat_id}` +- `_load_session_context`: 讀新 key,Phase A fallback 到舊 key +- `_save_session_turn`: 寫新 key + Phase A dual-write 舊 key +- `process_nl_message`: 加 `project_id: str = "awoooi"` 並透傳 + +### P1-17 anomaly_counter.py per-project 改造 +- `__init__` 加 `project_id="awoooi"`,新增 `_pkey()` + `_redis_get_with_fallback()` 輔助方法 +- 所有寫路徑改用 `_pkey()`(timeline / repair_count / history / disposition / permanent_fix / metadata) +- 所有讀路徑 Phase A fallback:先讀 `{project_id}:anomaly:*`,不存在才讀 `anomaly:*` +- `get_all_disposition_stats` SCAN 先掃新前綴,無資料才 fallback 舊前綴 +- `get_anomaly_counter()` singleton 傳入 `project_id="awoooi"` + +### Phase 2.3 Repository project_id filter +- `db/base.py`: `get_db_context(project_id="awoooi")` 預設帶入 `SELECT set_config('app.project_id', :pid, TRUE)` → 所有現有呼叫端自動設置正確 tenant,RLS 生效 +- `db/models.py`: 4 個 ORM model(AuditLog / IncidentRecord / KnowledgeEntryRecord / PlaybookRecord)加 `project_id: Mapped[str]` +- `incident_repository.py`: `_incident_to_record_data()` 加 `"project_id": getattr(incident, "project_id", "awoooi")` +- `playbook_repository.py`: `get_session_factory()` 全部換成 `get_db_context()`;`_pg_upsert` 寫入 `project_id` +- `db/base.py init_db()`: 防禦性 ALTER TABLE 四表加 `project_id VARCHAR(64) DEFAULT 'awoooi'` + index + +### Phase 2.4 31 background loop project_id 標記(INV-8) +- `core/context.py` 新建:`PROJECT_ID: ContextVar[str]`(default="awoooi")+ `get_current_project_id()` +- `main.py lifespan()`: 冒頭 `PROJECT_ID.set("awoooi")`;asyncio.create_task 自動繼承父任務 ContextVar → 31 個 loop 全部標記 +- `get_db_context()`: 讀 contextvar 作 fallback(明確參數 > contextvar > "awoooi") + +### Phase 2.6 Token Budget Hard Kill(ADR-120) +- `migrations/awooop_phase2_budget_ledger_2026-05-04.sql`: `budget_ledger` 表 + RLS + GRANT +- `db/models.py`: `BudgetLedgerRecord` ORM(UUID / NUMERIC(10,4) / project_id / run_id) +- `services/budget_service.py`: 三層防線完整實作 + - `check_budget_before_llm_call()`: Layer 3 
Emergency Kill → Layer 2 Tenant → Layer 1 Platform + - `record_token_usage()`: POST-call accounting(async INSERT budget_ledger + Redis INCRBYFLOAT) + - `activate_emergency_kill()` / `deactivate_emergency_kill()`:Admin 管理工具 + - Ollama 本地模型(deepseek/qwen3)自動 bypass(零費用) +- `db/base.py init_db()`: 防禦性 CREATE TABLE IF NOT EXISTS budget_ledger + +### 下一步(Phase 3) +- Phase 3: Contract packages & validators(JSON Schema、Pydantic v2 contract models、contract lifecycle service) + +--- + +## 2026-05-04 | AwoooP Phase 4 完收(Platform Shell in Shadow Mode) + +### Phase 4.1 DB Migration(awooop_phase4_run_state_2026-05-04.sql) + +三表 + 全部 RLS + GRANT: +- `awooop_run_state` — Run FSM 主表(state enum 8 值、lease_until/heartbeat_at/worker_id SKIP LOCKED 欄位、is_shadow bool) +- `awooop_run_step_journal` — SAGA step journal(step_seq unique per run、compensation_json JSONB、was_blocked 攔截記錄) +- `awooop_run_idempotency` — 去重冪等(`uix_run_idempotency_key` = project_id + channel_type + provider_event_id) +- 全部 `FORCE ROW LEVEL SECURITY`(ADR-118) + +ORM:`awooop_models.py` 新增 `AwoooPRunState` / `AwoooPRunStepJournal` / `AwoooPRunIdempotency`(含 CheckConstraint + 4 個 partial index) + +### Phase 4.2 Run State Machine(run_state_machine.py) + +- `validate_transition(from, to)` — 8 狀態 × 合法轉換表,非法拋 `InvalidStateTransitionError` +- `acquire_pending_run()` — `SELECT ... 
FOR UPDATE SKIP LOCKED`(多 worker 並行安全) +- `heartbeat(run_id)` — 延長 lease TTL(每 15 秒,防 stale reaper 誤殺) +- `transition(run_id, to_state, ...)` — 先讀 current state 驗合法性,再 UPDATE;terminal state 清 lease + 寫 completed_at +- `reap_stale_runs()` — 掃 lease < NOW() 的 RUNNING:attempt < max → PENDING retry;attempt >= max → FAILED(E-RUN-002) + +### Phase 4.3 Platform Runtime(platform_runtime.py) + +- `_uuid7()` — 時間有序 UUID v7(適合 DB PK) +- `_new_trace_id()` — W3C traceparent-compatible trace_id(128-bit trace + 64-bit span) +- `check_idempotency()` — Redis NX 先攔(快)+ PG constraint 最後防(準確) +- `create_run()` — 冪等建立 run,is_shadow=True,計算 input_sha256 +- `shadow_execute()` — 解析 agent contract → 記錄每個 tool → 攔截 is_destructive → COMPLETED(無 user response) +- `is_destructive_tool()` — contract flag + keyword 雙層判斷(delete/drop/kill/exec 等 16 個關鍵字) + +### Phase 4.4 Audit Sink(audit_sink.py) + +PII/secret redaction pipeline(9 個 pattern): +- Telegram token(8-12 位數字:32-64 位英數)✅ +- PostgreSQL DSN / password field +- Bearer token / JWT(eyJ… 三段) +- GCP/內網 IP(10.x, 172.16-31.x, 192.168.x) +- SSH private key / API key +- Hex secret ≥ 64 位 +- field 名稱黑名單(password/token/secret 等直接替換) +- LLM raw 欄位(prompt/completion 只保留 sha256 hash 前 16 位) +- 自驗測試:9 個 case 全部通過(all_ok: True) + +### Phase 4.5 Platform API(api/v1/platform/runs.py) + +- `POST /v1/platform/runs` — 建立 shadow run,202 Accepted,返回 `{run_id, is_duplicate, is_shadow, message}` +- `GET /v1/platform/runs/{run_id}` — 查詢 run FSM 狀態 +- Idempotency 內建:provider_event_id + channel_type → 冪等命中返回既有 run_id +- 所有 Phase 4 run 強制 is_shadow=True + +### Phase 4.6 Platform Worker(workers/platform_worker.py) + +- `PlatformWorker.run_loop()` — SKIP LOCKED 取 PENDING run,控制並行度(max 2),每 run 獨立 task +- `PlatformWorker._execute_with_heartbeat()` — shadow_execute + 並行 heartbeat task 防 stale 誤殺 +- `PlatformWorker.reaper_loop()` — 每 60 秒 reap_stale_runs() +- `start_platform_worker()` / `stop_platform_worker()` — lifespan hook + +### main.py 掛載 + +- Import:`from src.api.v1 
import platform as platform_v1` +- Router:`app.include_router(platform_v1.router, prefix="/api/v1/platform")` +- Lifespan startup:`await start_platform_worker()` +- Lifespan shutdown:`await stop_platform_worker()` + +### 語法驗證 + +8 個新建/修改 Python 檔案全部通過 `ast.parse()` ✅ + +--- + +## 2026-05-04 | AwoooP Phase 3 完收(Contract Packages & Validators) + +### Phase 3.1 packages/awooop-contracts/(六合約 JSON Schema + golden fixtures) + +新建 `packages/awooop-contracts/` 目錄,完整包含: + +**六合約 JSON Schema**(`schemas/`): +- `project_tenant.json` — 租戶/專案能力邊界(migration_mode enum、budget_limit_usd ge:0、allowed_channels uniqueItems) +- `agent.json` — Agent 模型/工具/治理(sha256 pattern ^[0-9a-f]{64}$、temperature [0,2]、approval_timeout_seconds) +- `mcp_gateway.json` — MCP Gateway(transport enum + if/then endpoint required、schema_sha256 完整性) +- `policy_routing.json` — LLM 路由規則(routing_rules minItems:1、priority [0,9999]、retry_policy) +- `runtime_run_state.json` — Run FSM(UUID pattern、state enum、input/output sha256、cost_usd ge:0) +- `channel_event.json` — Channel Event 冪等(event_id UUID、payload minProperties:1、attachment sha256) + +**Golden fixtures**(`fixtures/valid/` + `fixtures/invalid/`): +- valid × 6 — 所有 Pydantic 驗證全通過 ✅ +- invalid × 6 — 各自包含 required 缺失/enum 不合法/format 錯誤,全數被拒絕 ✅ +- 自驗測試:`python3 -c validate_contract_body(...)` 通過(valid all pass: True, invalid all rejected: True) + +### Phase 3.2 Contract lifecycle service + +- **`src/models/awooop_contracts.py`**(新建):六合約 Pydantic v2 model + - `ProjectTenantContract` / `AgentContract` / `MCPGatewayContract` / `PolicyRoutingContract` / `RuntimeRunStateContract` / `ChannelEventContract` + - `ArtifactRef`(sha256 hex64 validator)、`ToolRef`、`ToolExposed`、`RoutingRule`、`AttachmentRef` 等共用子 model + - `validate_contract_body(family, body)` — dispatcher,依 family 名稱驗證 + - `CONTRACT_FAMILY_MODELS` dict — 六合約映射表 + +- **`src/repositories/contract_repository.py`**(新建):append-only contract CRUD + - `get_revision()` / `get_active_revision()` / `list_revisions()` 
— 讀取(RLS 透過 get_db_context 自動套用) + - `create_draft()` — 建立 lifecycle_status='draft' revision + - `mark_published()` — draft → published(HMAC 簽章後才能呼叫) + - `mark_active()` — published → active(UPSERT active_pointer + 寫 outbox + revoke 舊版本,同一 transaction) + +- **`src/services/contract_service.py`**(新建):完整 lifecycle orchestration + - `draft()` — schema 驗證 + body_hash 計算(sha256 canonical JSON)+ DB 寫入 + audit log + - `publish()` — HMAC 簽章驗證(settings.CONTRACT_HMAC_KEY)→ mark_published + - `activate()` — Redis multi_sig approval 確認 → mark_active(bypass_approval 開關) + - `get_active()` — runtime 唯一讀取路徑(active only + body_hash 完整性驗證) + - `get_active_body()` — 便利方法,直接返回 body_json + - `record_activation_approval()` — 記錄 approver 簽核(Redis TTL 24h) + - 5 個自訂 Exception:ContractSchemaError / ContractSignatureError / ContractStateError / ContractApprovalError / ContractNotFoundError + +### Phase 3.3 Output schema validator middleware + +- **`src/services/schema_validator.py`**(新建):LLM 輸出驗證鏈 + - `extract_json_from_llm_output()` — 三策略容錯萃取(直接 parse / ```json``` block / regex {…}) + - `validate_llm_output()` — 主驗證器:驗證失敗 → retry prompt → 再試(上限 3 次)→ SchemaValidationError(E-SCHEMA-001) + - `validate_llm_output_by_family()` — 依 contract_family 自動選 model + - `validate_once()` — 單次驗證(測試 / 內部資料用) + - `build_retry_prompt()` — 附錯誤回饋的 retry prompt builder + - `SchemaValidationError` — error_code="E-SCHEMA-001" + +### Phase 3 DoD 驗收 +- [x] schema 不符的 LLM 輸出無法到達 channel adapter(SchemaValidationError 阻擋) +- [x] valid × 6 全部通過 Pydantic 驗證 +- [x] invalid × 6 全數被拒絕(涵蓋 required/enum/format/pattern 四類錯誤) +- [x] prompt/schema ref 必含 sha256(ArtifactRef + ToolExposed.schema_sha256 + AttachmentRef.sha256) +- [x] body_hash = sha256(canonical JSON),runtime get_active() 讀取時重算驗證 + +### 語法驗證 +- 4 個新建 Python 檔案全部通過 `ast.parse()` ✅ + +--- + ## 2026-05-04 | AwoooP Phase 2 初批 P0 修正 + Phase 1.7 Tests(commit 14bf86a4) ### 修正 diff --git a/docs/awooop/DETAILED-IMPLEMENTATION-PLAN.md 
b/docs/awooop/DETAILED-IMPLEMENTATION-PLAN.md new file mode 100644 index 00000000..e7f2a855 --- /dev/null +++ b/docs/awooop/DETAILED-IMPLEMENTATION-PLAN.md @@ -0,0 +1,1271 @@ +# AwoooP 完整詳細實施計畫 + +**版本**:v1.0(12-Agent 全景審查後整合版) +**日期**:2026-05-03(台北時區) +**建立者**:12-Agent 聯合審查 × Codex 整合 +**基礎文件**:MASTER-WORKPLAN.md、ADR-106、ADR-107 +**⚠️ ADR 編號修正**:ADR-108/109/110 已被其他 ADR 占用 → AwoooP 專用 ADR 從 ADR-111 開始 + +> 本文件是 MASTER-WORKPLAN.md 的完整展開版。 +> MASTER-WORKPLAN 是主索引,本文是執行細節。 +> 任何矛盾以本文為準(本文更新日期更晚)。 + +--- + +## 0. 全景背景 + +### 0.1 基礎架構現況(截至 2026-05-03) + +| 組件 | 現況 | 備註 | +|------|------|------| +| Ollama Primary | GCP-A `34.143.170.20:11434`(SSD)| ADR-110,取代 ADR-105 | +| Ollama Secondary | GCP-B `34.21.145.224:11434`(SSD)| 新增,2026-05-03 上線 | +| Ollama Fallback | Local `192.168.0.111:11434`(HDD)| 最後防線,非 Primary | +| PostgreSQL | `192.168.0.188`(私網)| AwoooP 控制面唯一 source of truth | +| Redis | `192.168.0.188`(私網)| cache/watch/counter only(ADR-107 D4)| +| K3s 叢集 | `awoooi-prod` namespace | AWOOOI first tenant | +| Gitea CI/CD | `192.168.0.110`(或 Gitea Cloud)| ADR-039,所有 build 從 Gitea | + +### 0.2 12-Agent 審查發現彙整 + +原始 MASTER-WORKPLAN 有 24 項共識問題。12 位 Agent 並行深度審查後新增: + +| Agent | 新增 P0/P1 問題數 | 新增 ADR 需求 | 新增 Inventory | +|-------|-----------------|--------------|----------------| +| critic | 10 | 1(ADR-116 Migration Discipline)| INV-5、INV-6、INV-7 | +| vuln-verifier | 8(含 PoC 確認 3 個)| 2(ADR-116/117 安全系列)| — | +| debugger | 12(故障情境)| — | 8 份 Runbook | +| db-expert | 8(表設計缺陷)+ RLS 完全空白 | 1(ADR-118 RLS 策略)| — | +| planner | 7 粒度過粗 + 10 acceptance 不閉環 | — | — | +| fullstack-engineer | 7 API endpoint 缺失 + 9 error code | — | — | +| frontend-designer | 8 UI 模組完全缺失 | ADR-UI-01~04 | — | +| refactor-specialist | 8 重構地雷 + 11 PR 方案 | — | — | +| migration-engineer | 7 相容性風險 | — | version matrix | +| onboarder | 31 background loop(vs 估計 ~10)+ 13 模組衝突 | — | INV-8 | +| tool-expert | 8 工具容量不足 + 8 工具缺失 | — | — | +| web-researcher | 業界 5 大對齊缺口(SAGA/Token Kill/MCP OAuth 2.1/OTel/OWASP)| 
5(ADR-119~123)| — | +| **合計新增** | **~70 個問題** | **~12 份 ADR** | **~4 份 Inventory** | + +**結論:不先補完 Pre-flight Audit,Phase 1 必爆。** + +--- + +## 1. 完整問題清單(P0 優先順序) + +### P0 — 直接爆炸(必須在 Phase 1 之前修補) + +| # | 問題 | 來源 | 影響範圍 | +|---|------|------|---------| +| P0-01 | Redis key 直接改名無雙寫期(費用計數歸零、Telegram 409、silence 失效、Ollama failover 三層拓撲雙寫不到)| critic | 費用、告警、Ollama | +| P0-02 | Migration SQL 表名錯(`incident_records` / `mcp_audit_snapshots`)、無 rollback、ORM 1.x vs 2.x | critic | Phase 1 migration | +| P0-03 | `project_id` / `tenant_id` 在 codebase 0 命中,30+ 業務表無此欄 | onboarder | 全系統 | +| P0-04 | `requires_approval` 欄位由 LLM output 決定(security_interceptor.py:451-490)| vuln-verifier(PoC 確認)| approval 鏈 | +| P0-05 | callback nonce 偽造:server nonce 邏輯可不知 secret 構造通過驗證(security_interceptor.py:451-490)| vuln-verifier(PoC 確認)| Telegram approval | +| P0-06 | Webhook HMAC replay 無 timestamp/nonce(webhooks.py:679-728)| vuln-verifier(PoC 確認)| 所有 webhook | +| P0-07 | 31 個 background loop 全無 project_id(main.py)| onboarder(實測)| 多租戶全崩 | +| P0-08 | `telemetry.py:71` 硬碼 `if "192.168.0.188" not in endpoint: raise`,EwoooC 啟動必失敗 | onboarder | EwoooC Phase 6 | +| P0-09 | `project_migration_state` 表缺失,Strangler Fig 無資料載體 | db-expert | Phase 1 | +| P0-10 | Task 9 順序倒置(agent prompt 載入點在 ConfigMap 前)→ 全回 None | critic | Phase 1 任何 agent | +| P0-11 | `ollama:current_primary` 在 `ollama_auto_recovery.py:230` 有第二定義,三層拓撲遷移必裂 | onboarder | GCP Ollama 拓撲 | +| P0-12 | `consensus_engine.py` 中 `CONSENSUS_PREFIX="consensus:"` 無 project 前綴,multi-tenant 時跨 tenant 共用 | onboarder | 多租戶一致性 | +| P0-13 | `mcp_bridge.py:592-681` kubectl 呼叫硬碼 `namespace="awoooi-prod"` | onboarder | EwoooC K8s tool | + +### P1 — 嚴重缺陷(Phase 2-4 之前必修) + +| # | 問題 | 來源 | 影響範圍 | +|---|------|------|---------| +| P1-01 | AWOOOI Bootstrap Paradox:cron/job/healthcheck 全無 project_id | critic | 多租戶啟動 | +| P1-02 | EwoooC 接入零技術路徑(非只改 `OLLAMA_API_BASE`)| critic | Phase 6 | +| P1-03 | Strangler Fig shadow→canary→active 無量化 gate 條件 | planner | 切換決策 | +| 
P1-04 | Layer 3 redaction 零實作(helper 有但無 enforcement)| critic | 資訊安全 | +| P1-05 | `_provider` 屬性 public,可繞過 audit(mcp/registry.py:24-71)| critic | MCP 安全 | +| P1-06 | `WAITING_APPROVAL` resume 不驗 caller identity,無 approval_token 簽章 | critic | approval 安全 | +| P1-07 | Redis approval state 單點,無 PG sync | critic | approval 可靠性 | +| P1-08 | Audit log 本身會洩密(redaction 必須做在 audit sink 前)| critic | 資訊安全 | +| P1-09 | `sanitization_service.py` helper 無 enforcement point(MCP Gateway / AgentToolExecutor 都沒用)| critic | tool 安全 | +| P1-10 | Active revision 切換無 transactional outbox,worker 可能吃舊 policy | db-expert | policy 一致性 | +| P1-11 | Run/Channel idempotency 缺 key derivation 規則與 unique index | db-expert | 重複執行 | +| P1-12 | Async worker 缺 lease / heartbeat / stale reaper | db-expert | worker 可靠性 | +| P1-13 | 高流量表 partition + retention 需 Phase 1 就決定(不能後補)| db-expert | 長期可擴展 | +| P1-14 | Observability metrics label cardinality(run_id/trace_id/session_id 禁進 metrics)| fullstack | Prometheus | +| P1-15 | `multi_sig_redis.py:178-205` approval flow 零 trace_id | debugger | 故障排查 | +| P1-16 | `hermes/nl_gateway.py:7,146,163` Redis key 無 project 前綴 | onboarder | Hermes 隔離 | +| P1-17 | `anomaly_counter.py:790` AnomalyCounter 全域單例,6 個 prefix 無 tenant 隔離 | onboarder | 多租戶計數 | +| P1-18 | `incident_service.py:603-615` `SCAN incident:*` 無 project_id | onboarder | Redis 資料隔離 | +| P1-19 | Contract publish 權限與簽章未定義 | critic | contract 治理 | +| P1-20 | 13 個全域單例跨 tenant 共用(TrustEngine/ProviderRegistry/TelegramGateway/等)| onboarder | 多租戶隔離 | +| P1-21 | Token Budget 無 Hard Kill($47k agent loop 事故教訓)| web-researcher | 費用控管 | +| P1-22 | RLS(Row Level Security)完全空白 | db-expert | DB 多租戶 | +| P1-23 | GCP Ollama 三層拓撲 Redis key 雙寫遷移未規劃(`ollama:current_primary` 舊 key 只知道 1 個 host)| critic | Ollama failover | +| P1-24 | `decision_manager.py:240` 硬碼 `telegram_silence:{target}` 未 import gateway 常數(跨兩處定義)| debugger | silence 功能 | + +### P2 — 設計缺口(Phase 5-8 之前必補) + +| # | 問題 | 來源 | 影響範圍 | 
+|---|------|------|---------| +| P2-01 | Telegram/LINE/Slack/API/Internal 缺 canonical principal mapping | critic | 身份統一 | +| P2-02 | Run FSM 零實作(只有表設計,無狀態機程式碼)| fullstack | Phase 4 | +| P2-03 | EwoooC Provider Proxy 不能只改 URL,需要完整 envelope+audit 入口 | critic | Phase 6 | +| P2-04 | 業界 Durable Execution / SAGA 補償交易機制缺失 | web-researcher | 長時 agent tool chain | +| P2-05 | MCP OAuth 2.1(RFC 9728 + RFC 7591)Confused Deputy 無防護 | web-researcher | MCP 安全 | +| P2-06 | OTel GenAI Semantic Conventions(span 命名 / attribute 規範)未對齊 | web-researcher | 可觀測性 | +| P2-07 | OWASP Agentic AI Top 10 對齊缺口(prompt injection、tool misuse 等 7 項)| web-researcher | AI 安全 | +| P2-08 | ISO 42001 AI 管理體系對齊文件缺失 | web-researcher | 合規 | +| P2-09 | 7 個 API endpoint 缺失(見 §6 fullstack 清單)| fullstack | API 完整性 | +| P2-10 | 9 個 error code 缺失(見 §7 error code 字典)| fullstack | 客戶端解析 | +| P2-11 | Progressive feedback policy(async run 無進度通知 ≤30s)| fullstack | UX | +| P2-12 | 8 個 Operator Console UI 模組完全缺失(見 §8 frontend)| frontend-designer | 運營可見性 | +| P2-13 | `awooop-ctl` CLI 工具缺失(現有 kubectl + curl 手動操作)| tool-expert | 運維體驗 | +| P2-14 | OPA/Cedar policy engine 缺失(現在 contract 授權邏輯散落程式碼)| tool-expert | 授權集中化 | +| P2-15 | chaostoolkit / LitmusChaos 缺失(Strangler Fig 切換無混沌驗證)| tool-expert | 容災驗證 | +| P2-16 | PgBouncer 缺失(AwoooP 多 worker 下 PG connection pool 會爆)| tool-expert | DB 可擴展性 | + +--- + +## 2. 
Pre-flight Audit — Phase 0 完整清單 + +> Phase 0 全部 docs-only。無任何 runtime code 變動。 +> 完成後才開新 Codex 對話進 Phase 1 code。 + +### 2.1 AwoooP 核心 ADR(ADR-111~115) + +**注意:ADR-108/109/110 已被 incident fingerprint / telegram dedup / GCP Ollama 拓撲占用,AwoooP 從 ADR-111 起。** + +| ADR | 主題 | 解決問題 | 主要內容 | +|-----|------|---------|---------| +| **ADR-111** | AwoooP Bootstrap Order & Identity Paradox | P0-07、P0-01、P1-01 | `platform_internal` / `requires_project_id` / `legacy_awoooi_default` 三種標記;31 個 background loop 分類;AWOOOI cron/job 過渡豁免時程;Ollama GCP 三層 failover 的 platform_resource 聲明 | +| **ADR-112** | Contract Governance & Publishing Workflow | P1-19 | 誰可 publish / activate;CODEOWNERS;HMAC 簽章;approval workflow;activation audit;draft 與 published 隔離 | +| **ADR-113** | Active Revision Invalidation & Outbox | P1-10 | `awooop_contract_outbox` 表設計;Redis pub/sub 通知;worker revision-aware cache;split-brain 防禦;GCP Ollama 拓撲切換事件 | +| **ADR-114** | Idempotency, Worker Lease & Run Recovery | P1-11、P1-12 | channel event dedupe;`(project_id, channel_type, provider_event_id)` unique;worker `lease_until` / `heartbeat_at` / `attempt_count`;stale run reaper;SKIP LOCKED | +| **ADR-115** | Canonical Principal Mapping & Tenant Onboarding | P2-01、P0-08 | Telegram/LINE/Slack/API/Internal → `platform_subject` 統一映射;EwoooC Proxy Adapter;Tsenyang/Bitan 模板;`telemetry.py:71` IP assert 修正方案 | + +### 2.2 安全強化 ADR + +| ADR | 主題 | 解決問題 | 主要內容 | +|-----|------|---------|---------| +| **ADR-116** | AwoooP Security Hardening | P0-04、P0-05、P0-06 | callback nonce 重設計(server_secret 必參與 HMAC);webhook 加 timestamp/nonce 防 replay;`requires_approval` 改為 policy-derived(禁止 LLM 決定);approval_token signing 規格(HS256,15min TTL,`jti` 唯一性)| +| **ADR-117** | MCP OAuth 2.1 & Confused Deputy Prevention | P2-05 | RFC 9728 Resource Indicators;RFC 7591 Dynamic Client Registration;per-tenant token scope;Confused Deputy 防護設計;MCP Server binding PKCE flow | + +### 2.3 資料庫強化 ADR + +| ADR | 主題 | 解決問題 | 主要內容 | +|-----|------|---------|---------| +| 
**ADR-118** | Row-Level Security & Tenant DB Isolation | P1-22 | 所有 AwoooP 表啟用 RLS;`current_setting('app.project_id')` 注入;RLS bypass role 設計;migration 驗收標準 | +| **ADR-119** | Durable Execution & SAGA Compensation | P2-04 | multi-step agent tool chain 的 step-level journal;補償交易觸發條件;checkpoint/resume 設計;與 Phase 4 run state machine 整合 | + +### 2.4 可觀測性 & AI 安全 ADR + +| ADR | 主題 | 解決問題 | 主要內容 | +|-----|------|---------|---------| +| **ADR-120** | Token Budget Hard Kill | P1-21 | 每 run / 每 project / 每 tenant 三層 budget limit;hard kill(不只 alert);$47k agent loop 事故 RCA;`budget_ledger` 表設計;Redis hot counter + PG 事務 hard stop | +| **ADR-121** | OTel GenAI Semantic Conventions Alignment | P2-06 | span 命名規範(`gen_ai.request.*`);token 計數 attribute;LLM provider attribute;與現有 SignOz(188:24318)整合;metrics label cardinality 規則 | +| **ADR-122** | OWASP Agentic AI Top 10 & ISO 42001 Alignment | P2-07、P2-08 | Top 10 逐項對應到 AwoooP 控制面;ISO 42001 AI 管理體系必要文件清單;每 Phase 對齊驗收項 | + +### 2.5 Migration Discipline ADR + +| ADR | 主題 | 解決問題 | 主要內容 | +|-----|------|---------|---------| +| **ADR-123** | Background Loop project_id Migration Strategy | P0-07、P1-01 | 31 個 background loop 分三類(platform_internal / legacy_awoooi_default / requires_project_id);每類遷移策略;regression test 設計;完成標準(main.py 0 個無標記 loop)| +| **ADR-124** | Global Singleton Decomposition for Multi-tenancy | P1-20 | 13 個全域單例清單;分解策略(per-project registry / factory pattern);AWOOOI 1.0 → AwoooP 1.0 遷移路徑;不能同時拆的依賴序 | + +### 2.6 前端 Operator Console ADR(新增) + +| ADR | 主題 | 解決問題 | 主要內容 | +|-----|------|---------|---------| +| **ADR-UI-01** | AwoooP Operator Console 架構 | P2-12 | 8 個 UI 模組規格;與現有 `apps/web/` 整合方式;多租戶視角設計;i18n(next-intl)規範 | +| **ADR-UI-02** | Contract Lifecycle UI | P2-12 | draft → publish → activate 操作流程;revision diff 視覺化;contract family 篩選 | +| **ADR-UI-03** | Run State & Shadow Monitoring UI | P2-12 | shadow/canary/active 切換 dashboard;run FSM 視覺化;Strangler Fig gate 量化指標展示 | +| **ADR-UI-04** | Tenant Budget & Audit UI | P2-12 | 
per-project token budget;hard kill 觸發歷史;audit log 查詢(含 redaction 遮蔽)| + +### 2.7 ADR-106 補充章節 + +ADR-106 需新增: +- **Strangler Fig Quantified Gates**(量化切換條件) +- **GCP Ollama 拓撲影響**(三層 failover 如何成為 `platform_resource`,不屬於任何 tenant) +- **Bootstrap Order** 參照 ADR-111 + +### 2.8 Inventory 清單(9 份) + +| Inventory | 位置 | 範圍 | 解決問題 | +|-----------|------|------|---------| +| **INV-1** | `docs/awooop/inventory/INV-1-redis-keys.md` | 全 codebase grep `redis_client.*\(["']` 等,列出 43+ 個 key、命名空間、TTL、用途、寫入/讀取點、是否硬碼 | P0-01、P1-18 | +| **INV-2** | `docs/awooop/inventory/INV-2-repository-project-id-retrofit.md` | 30+ 業務表 × 目前有無 `project_id` × 所有 repository 方法 × 需加 filter 的查詢 × 需 backfill 的歷史資料 | P0-03 | +| **INV-3** | `docs/awooop/inventory/INV-3-entrypoints.md` | 所有 cron job / scheduler / webhook / CLI / healthcheck / internal service call,標記三種類型 | P0-07、P1-01 | +| **INV-4** | `docs/awooop/inventory/INV-4-hardcoded-namespace-ip.md` | 硬碼 K8s namespace(`awoooi-prod`)、SSH 主機 IP、白名單(**含新 GCP IP:34.143.170.20、34.21.145.224**)| P0-08、P0-13 | +| **INV-5** | `docs/awooop/inventory/INV-5-migration-compatibility-matrix.md` | 版本相容矩陣:SQLAlchemy 1.x→2.x / Alembic / Pydantic v1→v2 / FastAPI 0.x / Python 3.10→3.12;每個 breaking change + 影響範圍 | critic | +| **INV-6** | `docs/awooop/inventory/INV-6-rollback-playbook-register.md` | 6 個 rollback playbook:Phase 1 schema rollback、Phase 2 Redis key rollback、Phase 5 MCP Gateway rollback、Phase 6 EwoooC rollback、Ollama GCP→Local fallback rollback、approval flow rollback | migration | +| **INV-7** | `docs/awooop/inventory/INV-7-pr-cutting-plan.md` | 11 個 PR 切割方案(refactor-specialist 設計):每 PR 的範圍、前置依賴、review 者、合併順序 | refactor | +| **INV-8** | `docs/awooop/inventory/INV-8-background-loop-catalog.md` | 31 個 background loop 逐一列出:名稱、位置(main.py 行號)、類別標記、遷移策略、預計完成 Phase | onboarder | +| **INV-9** | `docs/awooop/inventory/INV-9-global-singleton-catalog.md` | 13 個全域單例逐一列出:名稱、位置、依賴方、分解策略、遷移風險 | onboarder | + +### 2.9 Phase 0 驗收標準 + +- [ ] ADR-111~115(5 份 AwoooP 核心 ADR)全部 
Accepted +- [ ] ADR-116~124(9 份強化 ADR)全部 Accepted +- [ ] ADR-UI-01~04(4 份 UI ADR)全部 Accepted(或 Proposed + 統帥批准開工) +- [ ] ADR-106 補入 Strangler Fig Quantified Gates + GCP Ollama 章節 +- [ ] INV-1~INV-9(9 份 Inventory)完成初稿 +- [ ] 無任何 runtime code 變動 +- [ ] `git diff --check` 通過 + +--- + +## 3. 8-Phase 詳細工作項 + +> 每項含:目標、範圍(精確路徑)、輸入(前置依賴)、輸出(交付物)、驗收標準、邊界(禁止碰什麼) + +### Phase 1 — Control Plane Schema Foundation + +**目標**:建立 PostgreSQL contract control plane 最小可用骨架,修正舊 SQL migration 三大 blocker,決定高流量表 partition 策略。 + +**前置依賴**:Phase 0 全部完成(所有 ADR + Inventory) + +**範圍(精確檔案)**: +- `apps/api/migrations/` — 新增 migration files +- `apps/api/src/models/` — 新增 AwoooP SQLAlchemy models +- `apps/api/src/repositories/` — 新增 AwoooP repositories +- `docs/runbooks/` — 新增 partition + retention runbook + +**禁止碰**: +- 任何既有 repository 方法(留給 Phase 2) +- provider 行為(`ai_router.py` / `ollama_*.py`) +- Telegram/LINE webhook 路徑 +- `apps/web/` +- 任何 K8s manifest + +**工作項(順序執行)**: + +``` +1.1 表名核對 + - grep 確認 `incidents`(非 incident_records) + - grep 確認 `mcp_audit_log`(非 mcp_audit_snapshots) + - 修正 ORM: SQLAlchemy 2.x mapped_column、補齊 Numeric/UniqueConstraint/func import + - 每個 migration 強制有 down migration(rollback SQL) + +1.2 Task 9 順序修正(必須 Phase 1.1 之前完成) + - Dockerfile: agent_loader default path 指向 ConfigMap mount + - ConfigMap 預載: 確認 agent prompt 路徑在 ConfigMap 已存在 + - 驗收:dry-run 一個 agent loader,輸出非 None + +1.3 AwoooP 控制面表(新增 migration) + - awooop_projects(tenant 主表,project_id VARCHAR PK,budget,ACL) + - awooop_contract_revisions(六合約共用 revision 表,append-only,見 §4.1 完整欄位) + - awooop_active_revisions(active pointer,指向特定 revision_id) + - awooop_artifact_refs(prompt/schema/eval ref + sha256 + type) + - awooop_project_migration_state(Strangler Fig 階段追蹤,per project × per capability) + - awooop_contract_outbox(ADR-113,active revision 切換事件,for worker invalidation) + - awooop_channel_event_dedupe(ADR-114,idempotency,唯一鍵) + - awooop_platform_subjects(ADR-115,canonical principal mapping) + - 
awooop_budget_ledger(ADR-120,token budget,per project × per period) + +1.4 高流量表 partition 策略(表本身留待 Phase 4/7 建立,partition 規則於本 Phase 先定案並寫下) + - 須在本 Phase migration 中加 partition template comment(不執行,留 Phase 4) + - awooop_run_state → range partition by created_at(月) + - awooop_channel_event → range partition by created_at(月) + - awooop_mcp_gateway_audit → range partition by created_at(月) + - awooop_agent_audit_log → range partition by created_at(月) + - retention: 90 天 hot + 1 年 warm(pg_partman / cron job) + - 寫進 docs/runbooks/awooop-partition-retention.md + +1.5 AWOOOI Bootstrap(seed data) + - INSERT INTO awooop_projects(project_id='awoooi', display_name='AWOOOI', migration_mode='legacy_awoooi_default') + - 驗收:AWOOOI 0 行為改動 + +1.6 RLS 骨架(ADR-118) + - 所有 awooop_* 表啟用 RLS + - policy: USING (project_id = current_setting('app.project_id', TRUE)) + - bypass role: awooop_platform(只給 platform worker 用) + - 注意:RLS 需要 migration + 測試,不只是 ALTER TABLE ENABLE ROW LEVEL SECURITY + +1.7 Immutability 測試 + - published contract revision 嘗試 UPDATE → 必失敗(trigger 或 check constraint) + - draft 與 active 隔離:runtime 讀取 view 不含 draft + - 自動化:pytest + db-expert review +``` + +**RACI**: +- R(執行):fullstack-engineer +- A(負責):db-expert review,統帥批准 +- C(諮詢):refactor-specialist(migration PR 切割)、critic(最終 review) +- I(通知):migration-engineer(版本相容驗證) + +**DoD**: +- 所有 migration up/down dry-run 通過 +- AWOOOI 可表示為 `project_id=awoooi`,0 行為改動 +- RLS 測試:cross-project SELECT 被拒絕 +- partition runbook 已建立 + +--- + +### Phase 2 — Tenant Isolation & Namespace Hardening + +**目標**:在開放任何下游 tenant 之前,把 AWOOOI 自己變成乾淨的 tenant。 +**前置**:Phase 1 完成 + +**範圍**: +- `apps/api/src/services/` — Redis key 遷移(依 INV-1) +- `apps/api/src/repositories/` — 加 project_id filter(依 INV-2) +- `apps/api/src/services/security_interceptor.py` — nonce 修補(P0-05,ADR-116) +- `apps/api/src/api/v1/webhooks.py` — replay 防護(P0-06,ADR-116) +- `apps/api/src/core/telemetry.py:71` — 移除硬碼 IP assert(P0-08) +- `apps/api/src/services/decision_manager.py:240` — silence key 
常數化(P1-24) +- `apps/api/src/services/ollama_auto_recovery.py:230` — 移除第二定義(P0-11) +- `apps/api/src/plugins/mcp/mcp_bridge.py:592-681` — namespace 動態化(P0-13) +- `apps/api/src/services/consensus_engine.py` — CONSENSUS_PREFIX 加 project 前綴(P0-12) +- `apps/api/src/hermes/nl_gateway.py` — Redis key 加 project 前綴(P1-16) +- `apps/api/src/services/anomaly_counter.py:790` — per-project 改造(P1-17) +- `apps/api/src/services/incident_service.py:603` — SCAN 加 prefix(P1-18) + +**禁止碰**: +- `awooop_contract_revisions` 以外的 AwoooP Phase 1 新表結構 +- EwoooC / Tsenyang 任何接入(留 Phase 6) +- 任何 provider routing 改動(Ollama GCP 拓撲已由 ADR-110 定案,不在此 Phase 改) + +**工作項**: + +``` +2.1 Redis 三階段雙寫遷移計畫執行(依 INV-1,分三批) + 批次 A(Critical Path,影響 Ollama GCP 拓撲): + - ollama:current_primary(舊)→ {project_id}:ollama:primary(新) + 注意:要同時支援三層 GCP-A/GCP-B/Local,INV-1 需確認所有寫入點 + - ollama_auto_recovery.py:230 第二定義刪除,統一常數 + 批次 B(費用 + 告警關鍵): + - ai_rate:total_cost:gemini → {project_id}:ai_rate:total_cost:gemini + - telegram:polling:leader → platform:telegram:polling:leader(platform_resource) + - telegram_silence:{target} → {project_id}:telegram_silence:{target} + 同步更新 decision_manager.py:240 import gateway 常數 + 批次 C(working memory): + - consensus: → {project_id}:consensus:(consensus_engine.py) + - hermes Redis keys(nl_gateway.py) + - anomaly_counter 6 個 prefix + - incident:* SCAN(incident_service.py:603) + + 每批次:Phase A(雙寫 30 天)→ Phase B(雙讀 14 天)→ Phase C(移除舊 key) + +2.2 Security hardening(ADR-116) + - telemetry.py:71:移除 "192.168.0.188" 硬碼 assert,改為 config-driven allowed endpoints + - security_interceptor.py:451-490:nonce 重設計,server_secret 必參與 HMAC + - webhooks.py:679-728:加 timestamp(±5min window)+ nonce(Redis dedup) + - requires_approval:改為從 policy contract 讀取,禁止 LLM output 決定 + - approval_token:HS256,15min TTL,jti 唯一性(Redis NX) + +2.3 Repository project_id 改造(依 INV-2) + - 所有 30+ repository 方法加 project_id filter + - K8s namespace 白名單 → tenant-aware(mcp_bridge.py:592-681 動態化) + - SSH 主機白名單 → tenant-aware(依 INV-4) + +2.4 
Background loop 標記(依 ADR-123,INV-3/INV-8) + - 31 個 loop 標記為 platform_internal / legacy_awoooi_default / requires_project_id + - platform_internal 帶 project_id=__platform__ + - legacy_awoooi_default fallback 到 project_id=awoooi,寫退場時程 + +2.5 Global singleton 分解第一步(依 ADR-124,INV-9) + - 只做:AnomalyCounter(P1-17 已修)per-project 改造 + - 其餘 13 個全域單例列出退場時程(不在此 Phase 全拆,防爆炸半徑) + +2.6 Token Budget Hard Kill 基礎(ADR-120) + - budget_ledger 表 migration(Phase 1 已建,此 Phase 寫入邏輯) + - 每 LLM call 前:check budget → hard kill if exceeded(不只 log) + - Redis hot counter + PG 事務 hard stop +``` + +**RACI**: +- R:fullstack-engineer + refactor-specialist(大量 repository 改動) +- A:db-expert(repository 改動 review)、vuln-verifier(security hardening PoC 驗證) +- C:critic(整體 diff review)、migration-engineer(相容性確認) +- I:tool-expert(K8s namespace 改動相關) + +**DoD**: +- INV-1 所有 P0 key 完成三階段遷移(Phase A 完成,Phase B/C 在觀察期) +- cross-project 存取測試全數被拒絕(pytest 覆蓋) +- `grep -r "awoooi-prod" apps/api/src/` 結果為 0 +- `grep -r "192.168.0.188" apps/api/src/` 確認 telemetry.py 的硬碼 IP assert 已消失 +- vuln-verifier PoC 重跑:P0-05 nonce 偽造失敗、P0-06 webhook replay 失敗 +- Budget hard kill 測試:超額後 LLM call 被拒絕 + +--- + +### Phase 3 — Contract Packages & Validators + +**目標**:六合約從散文升級為可驗證程式。 +**前置**:Phase 1 完成(contract_revisions 表存在) + +**範圍**: +- `packages/awooop-contracts/`(此時才建立!) 
+- `apps/api/src/services/contract_service.py`(新建) +- `apps/api/src/repositories/contract_repository.py`(新建) + +**禁止碰**: +- 任何既有 provider / router / telegram 路徑 +- `apps/web/`(UI 留 Phase 8 之後) + +**工作項**: + +``` +3.1 建立 packages/awooop-contracts/(此時才有真實內容) + - 六合約 JSON Schema(Project/Tenant、Agent、MCP Gateway、Policy/Routing、Run State、Channel Event) + - Pydantic v2 models 對應六合約 + - envelope schema:platform invocation、MCP tool call、run state transition、channel event + - golden fixtures(valid × 6 + invalid × 6) + +3.2 Contract lifecycle service + - draft():建立 draft revision,不可被 runtime 讀 + - publish():產生 immutable published revision(body_hash = sha256(body_json)) + - activate():更新 active pointer,寫入 contract_outbox(ADR-113) + - get_active():runtime 讀取路徑,只返回 published + active + - 全部操作記錄 audit log + +3.3 Output schema validator middleware + - LLM 回傳 → 過 schema validator → 失敗 → retry(上限 3 次)→ 失敗 → error code(E-SCHEMA-001) + - 任何 schema 不符的 LLM 輸出無法到達 channel adapter + +3.4 Contract governance(ADR-112) + - CODEOWNERS 指定 packages/awooop-contracts/ + - publish API:HMAC 簽章驗證 + - activate API:approval workflow(multi_sig_redis 路徑) + +3.5 SHA-256 artifact 驗證 + - 所有 artifact ref 含 sha256 + - runtime 讀取時驗 hash(與 DB 記錄比對) +``` + +**DoD**: +- schema 不符的 LLM 輸出無法到達 channel adapter(整合測試) +- AWOOOI 第一份 Agent contract 可 publish + activate(E2E) +- prompt/schema ref 必含 sha256 + +--- + +### Phase 4 — Platform Shell in Shadow Mode + +**目標**:建立第一個 runtime shell,只跑 shadow,不改 legacy 行為。 +**前置**:Phase 3 完成 + +**範圍**: +- `apps/api/src/api/v1/platform/` — 新增 platform runs API +- `apps/api/src/services/platform_runtime.py` — 新建 +- `apps/api/src/services/run_state_machine.py` — Run FSM 實作(P2-02) +- `apps/api/src/workers/platform_worker.py` — 新建 +- `apps/api/src/services/audit_sink.py` — 加 redaction(P1-08) + +**禁止碰**: +- 任何既有 `/v1/incidents/`、`/v1/webhooks/` 路徑 +- Telegram bot handler(legacy 維持) +- EwoooC 接入(留 Phase 6) + +**工作項**: + +``` +4.1 Run API shell(shadow only) + - POST /v1/platform/runs + - 
生成 run_id(UUID v7)、trace_id(W3C traceparent compatible) + - 解析 project + agent contract active revision + - 解析 EffectivePolicy(6 層合併,不改 provider 行為) + +4.2 Run State Machine(ADR-114 + ADR-119) + - States: PENDING → RUNNING → WAITING_TOOL → WAITING_APPROVAL → COMPLETED / FAILED / CANCELLED + - lease_until、heartbeat_at、attempt_count 欄位 + - SKIP LOCKED 取單(防 double-pickup) + - stale run reaper(每分鐘掃 expired lease,回到 PENDING 或 FAILED) + - SAGA step journal(ADR-119):每個 tool call 寫入 step_id、補償指令 + +4.3 Idempotency(ADR-114) + - (project_id, channel_type, provider_event_id) 複合 unique + - 重複事件 return 既有 run_id(不產生新 run) + - Redis NX + PG constraint 雙層保護 + +4.4 Audit log redaction(ADR-116) + - audit_sink 寫入前過 sanitization_service pipeline + - PII / secret pattern 硬攔(含 GCP IP、PG password、Telegram token 等) + - audit log 不記錄 raw LLM input/output,只記 hash + schema validation result + +4.5 Observability(ADR-121) + - OTel GenAI span 命名(gen_ai.request.*) + - token 計數 attribute(gen_ai.usage.prompt_tokens 等) + - metrics label:只 project_id / agent_id / status / provider(禁止 run_id/trace_id/session_id 進 metrics) + - run_id / trace_id 只進 logs/traces(不進 metrics) + +4.6 Shadow mode wiring + - 選定 3 個 AWOOOI 事件 mirror 到 shadow(不發 user response) + - shadow run 0 destructive tool call(MCP write/execute 全 block) + +4.7 Token Budget Hard Kill(ADR-120) + - per-run token budget(from EffectivePolicy) + - 超額 → hard kill → FAILED state → error code E-BUDGET-001 + - 每 run 完成後寫入 budget_ledger(實際消耗) +``` + +**RACI**: +- R:fullstack-engineer(API + service)、db-expert(run state schema review) +- A:critic(shadow mode 設計 review)、vuln-verifier(redaction PoC) +- C:debugger(trace_id 貫穿設計)、tool-expert(OTel 整合) +- I:migration-engineer(worker lease 相容性) + +**DoD**: +- shadow run 0 user-visible response、0 destructive tool call(vuln-verifier 驗證) +- legacy AWOOOI 行為 0 改變(回歸測試通過) +- worker crash 後 stale run 1 分鐘內被回收(自動化測試) +- duplicate event 不產生重複 run(idempotency 測試) +- audit log 0 secret 命中(vuln-verifier 抽樣 100 筆) +- 
token budget 超額觸發 hard kill(整合測試) + +--- + +### Phase 5 — MCP Gateway First Slice + +**目標**:tool 授權搬到 Gateway,read-only 工具先進,解決 sanitization enforcement。 +**前置**:Phase 4 完成 + +**範圍**: +- `apps/api/src/plugins/mcp/gateway.py` — 新建 MCP Gateway +- `apps/api/src/plugins/mcp/registry.py:24-71` — `_provider` → `__provider`(P1-05) +- `apps/api/src/plugins/mcp/mcp_bridge.py` — 接入 Gateway +- `apps/api/src/services/sanitization_service.py` — enforcement point(P1-09) + +**禁止碰**: +- MCP write/execute tools(寫/執行工具留 Phase 8) +- Telegram approval flow(改動在 Phase 8) + +**工作項**: + +``` +5.1 MCP Gateway 表 + - awooop_mcp_tool_registry(tool_id, project_id, agent_id, tool_type, allowed_scopes) + - awooop_mcp_grants(grant_id, project_id, agent_id, tool_id, granted_by, expires_at) + - awooop_mcp_credential_refs(ref_id, tool_id, k8s_secret_ref, sha256) + - awooop_mcp_gateway_audit(call_id, trace_id, run_id, tool_id, credential_ref, latency_ms, result_status) + +5.2 Five-gate enforcement + - Check: Project AND Agent AND Tool AND Environment AND Approval + - 任一不符 → 拒絕 + 記錄 audit + error code E-MCP-GATE-XXX + +5.3 Result sanitization enforcement(P1-04、P1-09) + - 所有 MCP tool result 必經 sanitization_service pipeline + - MCP Gateway 加 sanitization middleware(不允許 raw result 直接進 LLM context) + - 進 LLM 前一層 + 進 audit sink 一層(雙層 redaction) + - sast 掃描 agent 程式碼路徑:0 raw credential 接觸 + +5.4 _provider 修正(P1-05) + - registry.py: _provider → __provider(雙底線 Python name mangling) + - 加 unit test:外部 reflect 取用 → AttributeError + +5.5 Credential isolation + - agent 程式碼不直接存取 K8s Secret + - Gateway 解析 credential_ref → 回傳 masked result(token 替換) + - 2026-04-18 secret leak 重演測試:kubectl describe 輸出不出現在 LLM context + +5.6 MCP OAuth 2.1(ADR-117) + - 實作 per-tenant dynamic client registration(RFC 7591) + - Resource Indicators(RFC 9728)防 Confused Deputy + - PKCE flow for MCP Server binding +``` + +**RACI**: +- R:fullstack-engineer(Gateway service) +- A:vuln-verifier(credential isolation 驗證)、critic(架構 review) +- 
C:tool-expert(MCP spec 確認)、db-expert(Gateway 表設計 review) +- I:migration-engineer(MCP registry 相容性) + +**DoD**: +- 2026-04-18 secret leak 重演測試通過(kubectl describe 輸出不出現在 LLM context 或 audit row) +- sast 掃描:agent 程式碼路徑 0 raw credential 接觸 +- `__provider` 雙底線 unit test 通過 +- Five-gate 全部 integration test 覆蓋 + +--- + +### Phase 6 — EwoooC Read-Only Tenant Onboarding + +**目標**:以真實下游 tenant 驗證 AwoooP,全 read-only。 +**前置**:Phase 5 完成、telemetry.py:71 hardcoded IP assert 已移除(Phase 2 完成) + +**範圍**: +- `apps/api/src/` — EwoooC project provisioning +- `packages/awooop-contracts/` — EwoooC agent contract +- `apps/api/src/services/provider_proxy.py` — 新建 Provider Proxy Adapter(P1-02) + +**禁止碰**: +- AWOOOI 任何既有業務邏輯 +- MCP write/execute tools + +**工作項**: + +``` +6.1 EwoooC project provisioning + - INSERT INTO awooop_projects(project_id='ewoooc', ...) + - 不可讀 AWOOOI data(RLS 驗證) + +6.2 openclaw-biz agent contract + - 針對市場情報 domain 設計 I/O schema + - 安全 ceiling:read-only only,禁止 infra tool + +6.3 Provider Proxy Adapter(P1-02,ADR-115) + - 不只是改 OLLAMA_API_BASE + - Proxy 入口強制注入 envelope:project_id / agent_id / trace_id / run_id + - 過 EffectivePolicy + budget guard + audit + - GCP Ollama 三層拓撲:EwoooC 走相同 primary/secondary/fallback 路由 + - read-only / model-call 入口優先啟用 + +6.4 Market intelligence MCP tools 註冊 + - 4 個 read-only tools:market_data_fetch、product_catalog_query、competitor_analysis、trend_report + - 全部在 MCP Gateway 五重 gate 管控 + +6.5 Shadow → Canary 升級計畫 + - 先 14 天 shadow(Strangler Fig gate 量化) + - 符合條件後升 canary(selected responses) + - canary 通過再升 read_only +``` + +**RACI**: +- R:fullstack-engineer +- A:critic(EwoooC 資料隔離 review)、vuln-verifier(cross-tenant isolation PoC) +- C:db-expert(RLS 驗證)、migration-engineer(EwoooC rollback playbook,INV-6) +- I:tool-expert(GCP Ollama 拓撲 EwoooC 路由設定) + +**DoD**: +- EwoooC SELECT 無法讀到 AWOOOI data(RLS + cross-tenant pytest) +- Provider Proxy Adapter E2E 測試:envelope 正確注入 +- budget / audit 完全 project-scoped +- EwoooC 啟動時 telemetry.py 不再因 IP assert 失敗 + 
+--- + +### Phase 7 — Communication Hub Increment + +**目標**:標準化 channel 但不切斷既有 bot。 +**前置**:Phase 6 完成 + +**範圍**: +- `apps/api/src/services/channel_hub.py` — 新建 +- `apps/api/src/services/telegram_gateway.py` — mirror inbound events +- `apps/api/src/api/v1/platform/channel.py` — 新建 + +**禁止碰**: +- 既有 telegram bot handler(維持 legacy 權威,直到 canary 量化 gate 通過) +- LINE / Slack 接入(留 v2) + +**工作項**: + +``` +7.1 awooop_conversation_event + awooop_outbound_message 表 + - partition by created_at(月,Phase 1 已定策略) + - retention policy 配置 + +7.2 Telegram inbound mirror + - 現有 telegram_gateway.py 事件複製到 awooop_conversation_event + - canonical principal mapping(ADR-115):所有 sender 寫入 awooop_platform_subjects + +7.3 Progressive Feedback Policy(P2-11) + - WAITING_TOOL / RUNNING / WAITING_APPROVAL → 必發 Telegram 暫態訊息 + - 用 edit_message 更新(非新訊息,不觸發通知) + - 首則進度訊息 ≤ 30s + +7.4 Idempotency 驗證(已由 Phase 4 完成) + - duplicate channel retry 不產生 duplicate run(整合測試) + +7.5 Adapter-level 安全 + - 所有 channel adapter:escaping + redaction + idempotency + delivery audit + - channel adapter 0 LLM 呼叫、0 MCP 呼叫(pytest 覆蓋) + +7.6 量化 gate 監控儀表板(配合 ADR-UI-03) + - Strangler Fig gate 指標:decision divergence / p95 latency / error rate + - 供 Phase 8 升級決策用 +``` + +**RACI**: +- R:fullstack-engineer(API + channel hub) +- A:critic(channel 設計 review)、debugger(trace_id 貫穿驗證) +- C:frontend-designer(進度訊息 UX)、tool-expert(Telegram API 規格確認) +- I:migration-engineer(channel 相容性) + +**DoD**: +- channel adapter 0 LLM 呼叫、0 MCP 呼叫 +- async run 首則進度訊息 ≤ 30s +- duplicate retry 不產生 duplicate run + +--- + +### Phase 8 — Suggest & Controlled Write Paths + +**目標**:從 read-only 升級到 propose,再到 controlled execute。 +**前置**:Phase 7 完成 + Strangler Fig shadow→canary gate 全通過 + +**範圍**: +- `apps/api/src/services/multi_sig_redis.py` — approval token 簽章(P1-06) +- `apps/api/src/services/approval_timeout_resolver.py` — 加 trace_id(P1-15) +- `apps/api/src/api/v1/platform/suggest.py` — suggest mode endpoint +- Feature flags for write/execute paths + 
+**禁止碰**: +- 任何 write/execute tool 的預設啟用 +- Strangler Fig 量化 gate 通過前不做 auto_remediate + +**工作項**: + +``` +8.1 Approval Token 安全強化(P1-06,ADR-116) + - WAITING_APPROVAL resume API:強制驗 approval_token(HS256,15min TTL,jti Redis NX) + - approval state:PG 為 source of truth,Redis 為 cache + - 過期 / 已決 / 重放 → 全部拒絕 + error code E-APPROVAL-XXX + +8.2 multi_sig_redis.py + approval_timeout_resolver.py trace_id 補入 + - 所有 approval 操作加 trace_id(P1-15) + - 完整鏈路可追蹤(debugger 驗證) + +8.3 Suggest mode for AWOOOI SRE flows + - 選定低風險 3 個 SRE flow(e.g., 告警靜音建議、playbook 推薦) + - suggest 模式:AI 輸出建議,人工決定執行 + - 量化 gate(ADR-106 補章): + * shadow → canary:≥14 天 + divergence <5% + p95 <10% + 0 P1 incident + * canary → read_only:≥7 天 + error rate <0.5% + cost diff <50% + * read_only → suggest:≥14 天 + accept rate ≥50% + 0 hallucination escalation + * suggest → auto_remediate:≥30 天 + rollback evidence ≥3 次 + approval token live + dry-run ≥99% + +8.4 Dry-run 與 rollback evidence gate + - 每個 write/execute tool 必須有 dry-run mode + - rollback playbook 寫入 INV-6(Phase 0 已完成,此時執行驗證) + - 記錄每次 rollback 結果作為 Phase 8 gate evidence + +8.5 Feature Flag Registry(見 §10) + - suggest mode:feature flag AWOOOP_SUGGEST_MODE(default OFF) + - controlled write:feature flag AWOOOP_WRITE_MODE(default OFF) + - 需顯式 flip 才啟用,不能環境變數意外帶入 + +8.6 vuln-verifier PoC 驗收 + - WAITING_APPROVAL 無 token resume 必失敗 + - Redis 宕機時 approval 仍可從 PG 恢復 +``` + +**RACI**: +- R:fullstack-engineer +- A:vuln-verifier(approval security PoC)、critic(write path review) +- C:debugger(trace_id 驗證)、db-expert(approval state PG review) +- I:migration-engineer(feature flag rollback) + +**DoD**: +- WAITING_APPROVAL 無 token resume 被拒絕(vuln-verifier PoC 通過) +- Redis 宕機後 approval 從 PG 恢復(整合測試) +- write/execute 預設 OFF,feature flag 手動 flip 才啟用 +- 所有 Strangler Fig gate 量化驗收通過(critic + db-expert + vuln-verifier 三方簽核) + +--- + +## 4. 
資料庫詳細 Schema + +### 4.1 awooop_contract_revisions(六合約共用 revision 表) + +```sql +CREATE TABLE awooop_contract_revisions ( + revision_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + project_id VARCHAR(64) NOT NULL REFERENCES awooop_projects(project_id), + contract_family VARCHAR(32) NOT NULL, -- project_tenant/agent/mcp_gateway/policy_routing/run_state/channel_event + contract_id VARCHAR(128) NOT NULL, + version VARCHAR(32) NOT NULL, + lifecycle_status VARCHAR(16) NOT NULL DEFAULT 'draft', -- draft/published/superseded/revoked + body_json JSONB NOT NULL, + body_schema_version VARCHAR(32) NOT NULL, + body_hash CHAR(64) NOT NULL, -- SHA-256 hex + created_by VARCHAR(128) NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + published_at TIMESTAMPTZ, + supersedes_revision_id UUID REFERENCES awooop_contract_revisions(revision_id), + -- Immutability constraint + CONSTRAINT published_body_immutable CHECK ( + lifecycle_status = 'draft' OR body_json IS NOT NULL + ) +); + +-- Runtime reads view(只看 published/active,不看 draft) +CREATE VIEW awooop_published_revisions AS + SELECT * FROM awooop_contract_revisions + WHERE lifecycle_status IN ('published', 'superseded'); + +-- Append-only trigger +CREATE OR REPLACE FUNCTION prevent_revision_update() +RETURNS TRIGGER AS $$ +BEGIN + IF OLD.lifecycle_status != 'draft' THEN + RAISE EXCEPTION 'Published contract revision is immutable'; + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +CREATE TRIGGER enforce_revision_immutability + BEFORE UPDATE ON awooop_contract_revisions + FOR EACH ROW EXECUTE FUNCTION prevent_revision_update(); + +-- RLS +ALTER TABLE awooop_contract_revisions ENABLE ROW LEVEL SECURITY; +CREATE POLICY tenant_isolation ON awooop_contract_revisions + USING (project_id = current_setting('app.project_id', TRUE) + OR current_user = 'awooop_platform'); +``` + +### 4.2 awooop_run_state(含 lease + SAGA journal) + +```sql +CREATE TABLE awooop_run_state ( + run_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + project_id 
VARCHAR(64) NOT NULL, + agent_id VARCHAR(128) NOT NULL, + trace_id CHAR(32), -- W3C trace_id hex + parent_run_id UUID, + status VARCHAR(32) NOT NULL DEFAULT 'PENDING', + migration_mode VARCHAR(32) NOT NULL DEFAULT 'shadow', -- shadow/canary/read_only/suggest/auto_remediate + -- Worker lease + lease_until TIMESTAMPTZ, + heartbeat_at TIMESTAMPTZ, + attempt_count INT NOT NULL DEFAULT 0, + worker_id VARCHAR(128), + -- Token budget + budget_limit_tokens BIGINT, + tokens_used BIGINT NOT NULL DEFAULT 0, + -- Timestamps + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), + completed_at TIMESTAMPTZ, + -- SAGA journal(step-level) + saga_steps JSONB DEFAULT '[]', -- [{step_id, tool, status, compensation_cmd, completed_at}] + -- Metadata + input_hash CHAR(64), -- SHA-256 of input envelope(for audit) + effective_policy_revision_id UUID +) PARTITION BY RANGE (created_at); + +-- Per-project RLS +ALTER TABLE awooop_run_state ENABLE ROW LEVEL SECURITY; +CREATE POLICY tenant_isolation ON awooop_run_state + USING (project_id = current_setting('app.project_id', TRUE) + OR current_user = 'awooop_platform'); +``` + +### 4.3 awooop_budget_ledger(Token Budget Hard Kill) + +```sql +CREATE TABLE awooop_budget_ledger ( + ledger_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + project_id VARCHAR(64) NOT NULL, + period DATE NOT NULL, -- YYYY-MM-DD(月份第一天) + provider VARCHAR(32) NOT NULL, + tokens_input BIGINT NOT NULL DEFAULT 0, + tokens_output BIGINT NOT NULL DEFAULT 0, + cost_usd NUMERIC(12, 6) NOT NULL DEFAULT 0, + hard_kill_at NUMERIC(12, 6), -- NULL = no limit + hard_killed BOOLEAN NOT NULL DEFAULT FALSE, + last_run_id UUID, + updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), + UNIQUE(project_id, period, provider) +); +``` + +### 4.4 8 群新增/擴充表清單(db-expert 發現) + +| 表名 | 缺失欄位 / 缺失 Index | Phase | +|------|----------------------|-------| +| `incidents` | 加 `project_id`、`trace_id`、`awooop_run_id` | Phase 2 | +| `playbooks` | 加 `project_id`、`agent_id` 
| Phase 2 | +| `km_entries` | 加 `project_id`、`namespace` | Phase 2 | +| `mcp_audit_log` | 加 `trace_id`、`run_id`、`project_id`;加 index on (run_id) | Phase 2 | +| `ai_decisions` | 加 `project_id`、`run_id`、加 index on (run_id) | Phase 2 | +| `approval_records` | 加 `trace_id`、`approval_token_jti`、加 index on (jti) | Phase 2/8 | +| `telegram_events` | 加 `project_id`、`platform_subject_id` | Phase 7 | +| `ollama_health_checks` | 加 `host_tier`(gcp_a/gcp_b/local)、`project_id=__platform__` | Phase 2 | + +--- + +## 5. 安全修補計畫(vuln-verifier 驗收) + +### 5.1 PoC 確認的三個漏洞 + +| 漏洞 | 位置 | PoC 狀態 | 修補方案 | Phase | +|------|------|---------|---------|-------| +| Nonce 偽造(server nonce 不依賴 server_secret)| security_interceptor.py:451-490 | **PoC 確認可通過驗證** | HMAC(server_secret + nonce),server_secret 從 K8s Secret 注入 | Phase 2 | +| Webhook replay(無 timestamp/nonce)| webhooks.py:679-728 | **PoC 確認可 replay** | 加 timestamp(±5min)+ nonce Redis NX | Phase 2 | +| requires_approval 由 LLM output 決定 | decision_manager.py(approval 鏈)| **PoC 確認可繞過** | policy contract 決定,禁止 LLM output 影響 | Phase 2 | + +### 5.2 approval_token 規格 + +``` +簽章算法:HS256 +Payload: + - jti: UUID(唯一性,Redis NX 15min TTL) + - iss: "awooop-platform" + - sub: "{project_id}:{run_id}" + - aud: "awooop-approval" + - exp: now + 15min + - approval_type: "human" | "system" + - decision_scope: [tool_id, ...] + +驗證: + 1. 簽章驗證 + 2. exp 未過期 + 3. Redis NX 確認 jti 未使用(防 replay) + 4. sub 與 resume 的 run_id 吻合 + 5. decision_scope 與 run 的 tool 吻合 +``` + +### 5.3 vuln-verifier 每 Phase 驗收清單 + +- Phase 2:nonce 偽造失敗、webhook replay 失敗、requires_approval 無法由 LLM 決定 +- Phase 4:audit log 0 secret 命中(抽樣 100 筆) +- Phase 5:agent 程式碼路徑 0 raw credential(sast) +- Phase 6:cross-tenant isolation PoC(EwoooC 無法讀 AWOOOI) +- Phase 8:approval token 無 token resume 被拒、Redis 宕機後從 PG 恢復 + +--- + +## 6. 
API Endpoint 完整清單(fullstack 補充) + +### 6.1 現有(不動) +- `POST /v1/webhooks/telegram` +- `POST /v1/webhooks/alertmanager` +- `GET /v1/incidents/` +- `POST /v1/decisions/` + +### 6.2 Phase 4 新增(Platform Shell) +- `POST /v1/platform/runs` — 建立 run(async) +- `GET /v1/platform/runs/{run_id}` — 查詢 run state +- `GET /v1/platform/runs/{run_id}/steps` — 查詢 SAGA steps +- `POST /v1/platform/runs/{run_id}/cancel` — 取消 run + +### 6.3 Phase 4-5 新增(Approval) +- `POST /v1/platform/runs/{run_id}/approve` — 帶 approval_token 的 resume +- `POST /v1/platform/runs/{run_id}/reject` — 拒絕(帶理由) + +### 6.4 Phase 6 新增(Tenant) +- `POST /v1/platform/projects` — 建立 project(admin only) +- `GET /v1/platform/projects/{project_id}/migration_state` — 查詢 Strangler Fig 狀態 +- `POST /v1/platform/projects/{project_id}/contracts` — 建立 contract draft +- `POST /v1/platform/projects/{project_id}/contracts/{contract_id}/publish` — publish +- `POST /v1/platform/projects/{project_id}/contracts/{contract_id}/activate` — activate + +### 6.5 Phase 7 新增(Channel Hub) +- `GET /v1/platform/channel_events` — 查詢 conversation events(with pagination) +- `POST /v1/platform/outbound` — 發送 outbound message(admin/test) + +--- + +## 7. 
錯誤碼字典(必補 12 個) + +| Error Code | HTTP Status | 描述 | 場景 | +|------------|-------------|------|------| +| `E-SCHEMA-001` | 422 | LLM output schema validation failed | Phase 3 contract validator | +| `E-BUDGET-001` | 429 | Token budget hard kill triggered | Phase 4 budget guard | +| `E-APPROVAL-001` | 401 | approval_token missing or invalid | Phase 8 approval resume | +| `E-APPROVAL-002` | 401 | approval_token expired | Phase 8 | +| `E-APPROVAL-003` | 409 | approval_token already used (replay) | Phase 8 | +| `E-MCP-GATE-001` | 403 | MCP tool not authorized for this project | Phase 5 | +| `E-MCP-GATE-002` | 403 | MCP tool not authorized for this agent | Phase 5 | +| `E-MCP-GATE-003` | 403 | MCP write/execute tool blocked (not in auto_remediate mode) | Phase 5/8 | +| `E-TENANT-001` | 403 | Cross-tenant data access blocked | Phase 2+ | +| `E-IDEMPOTENT-001` | 200 | Duplicate event, returning existing run_id | Phase 4 | +| `E-RATE-001` | 429 | Project rate limit exceeded | Phase 2+ | +| `E-SAGA-001` | 500 | SAGA compensation failed, manual intervention required | Phase 4/ADR-119 | + +--- + +## 8. 
前端 Operator Console(frontend-designer,8 個模組) + +> 實作在 Phase 8 之後(或 Phase 6 可 prototype Operator Console) +> ADR-UI-01~04 定架構,此處為工作項清單 + +| 模組 | 描述 | 優先順序 | +|------|------|---------| +| **Tenant Management** | project 列表、建立、migration_state 視覺化、budget 設定 | P1(Phase 6 prototype)| +| **Contract Lifecycle** | draft/publish/activate 操作、revision diff、六合約 family 篩選 | P1(Phase 6 prototype)| +| **Run Monitor** | run FSM 視覺化、shadow/canary/active 標記、trace_id drill-down | P1(Phase 4 後)| +| **Strangler Fig Dashboard** | shadow→canary gate 量化指標(divergence / latency / error rate)即時儀表板 | P1(Phase 7 後)| +| **Budget & Cost** | per-project token budget、hard kill 觸發歷史、成本趨勢(GCP Ollama vs paid provider)| P2 | +| **Audit Log Viewer** | audit log 查詢(redaction 後)、secret 命中警告、trace_id 關聯 | P2 | +| **MCP Gateway Admin** | tool registry、grants 管理、credential refs(masked)、audit | P2 | +| **Principal Directory** | platform_subject 查詢、Telegram/LINE/API user mapping | P3 | + +**與現有設計系統整合**: +- 必須使用 next-intl(禁止 hardcode 中文/英文) +- 禁止 emoji,使用 Lucide/SVG icon +- 遵循 `feedback_design_system_consistency.md` 全站設計規範 +- 禁止直接存取內網 IP(`feedback_frontend_internal_ip_ban.md`) + +--- + +## 9. 
重構切割計畫(11 PR,refactor-specialist) + +> 每 PR 必須獨立可合併、有 rollback 能力、不依賴後 PR + +| PR# | 標題 | 前置 PR | 影響範圍 | 風險 | +|-----|------|---------|---------|------| +| PR-01 | `telemetry.py:71` 硬碼 IP assert 移除 | 無 | 1 行 | 低 | +| PR-02 | `decision_manager.py:240` silence key 常數化 | 無 | 2 行 | 低 | +| PR-03 | `ollama_auto_recovery.py:230` 第二定義移除 | 無 | ~5 行 | 低 | +| PR-04 | `_provider` → `__provider`(registry.py)| 無 | ~20 行 | 低 | +| PR-05 | `mcp_bridge.py` namespace 動態化 | 無 | ~30 行 | 中 | +| PR-06 | `consensus_engine.py` CONSENSUS_PREFIX 加 project 前綴 | Phase 2 Redis 雙寫 Phase A | ~15 行 | 中 | +| PR-07 | nonce 重設計 + webhook timestamp/nonce(ADR-116)| 無 | ~100 行 | 高(安全修補)| +| PR-08 | Repository project_id filter 批次 1(incidents/playbooks/km)| Phase 1 schema | ~200 行 | 中 | +| PR-09 | Repository project_id filter 批次 2(mcp/ai_decisions/approval)| PR-08 | ~200 行 | 中 | +| PR-10 | Background loop 標記(31 個 loop,main.py)| ADR-123 | ~150 行 | 中 | +| PR-11 | AnomalyCounter per-project 改造 | PR-10 | ~80 行 | 中 | + +> PR-01~05 可並行(無依賴),先做先進。 +> PR-06 需要 Redis 雙寫 Phase A 先完成;PR-07 無前置 PR,屬高風險安全修補,應優先排入。 +> PR-08~09 需要 Phase 1 schema 先完成。 + +--- + +## 10. Feature Flag / Kill-Switch Registry + +| Flag 名稱 | 預設值 | 說明 | 開啟條件 | +|-----------|--------|------|---------| +| `AWOOOP_SHADOW_MODE` | OFF | 啟用 shadow run(鏡像但不回應)| Phase 4 完成後手動 flip | +| `AWOOOP_CANARY_MODE` | OFF | 啟用 canary(部分 user-visible 回應)| shadow gate 14天量化通過 | +| `AWOOOP_READ_ONLY_MODE` | OFF | read-only 查詢搬到 AwoooP | canary gate 7天量化通過 | +| `AWOOOP_SUGGEST_MODE` | OFF | AI 建議但人工決定 | read_only gate 14天通過 | +| `AWOOOP_WRITE_MODE` | OFF | 受控 write/execute tool 啟用 | suggest gate 30天通過 + rollback evidence ≥3 | +| `AWOOOP_BUDGET_HARD_KILL` | ON | token budget 超額直接終止(非只告警)| **預設 ON**(ADR-120)| +| `AWOOOP_MCP_OAUTH21` | OFF | MCP OAuth 2.1 flow(ADR-117)| Phase 5 完成後 | +| `AWOOOP_RLS_STRICT` | OFF | 嚴格 RLS 模式(禁止 awooop_platform bypass)| Phase 2 完成 + 30天 soak | +| `AWOOOP_EWOOOC_LIVE` | OFF | EwoooC tenant 切為 live(非 shadow)| Phase 6 canary 7天通過 | + +--- + +## 11. 
Runbook 清單(8 份,debugger 需求) + +| Runbook | 位置 | 觸發情境 | 主要步驟 | +|---------|------|---------|---------| +| **RB-01**: AwoooP Contract Publish Failure | `docs/runbooks/awooop-contract-publish-failure.md` | schema 驗證失敗、CODEOWNERS reject | 1.查 body_hash 2.查 draft 狀態 3.rollback to previous active | +| **RB-02**: Run State Stuck / Stale Lease | `docs/runbooks/awooop-run-stuck.md` | run 停在 RUNNING > 10min | 1.查 lease_until 2.手動 reaper 3.查 saga_steps 決定補償或放棄 | +| **RB-03**: Budget Hard Kill Triggered | `docs/runbooks/awooop-budget-hard-kill.md` | E-BUDGET-001 大量出現 | 1.查 budget_ledger 2.確認 hard_kill_at 閾值 3.是否 incident 爆發 4.臨時上調 or 等下月 reset | +| **RB-04**: Phase Rollback(Strangler Fig)| `docs/runbooks/awooop-phase-rollback.md` | canary 錯誤率 > threshold | 1.切回 project_migration_state 到上一個 mode 2.清 Redis canary cache 3.通知 EwoooC(如果影響到)| +| **RB-05**: Approval Token Replay 告警 | `docs/runbooks/awooop-approval-replay.md` | E-APPROVAL-003 出現 | 1.查 jti Redis key 2.確認 IP / user 3.吊銷 token 4.通知安全 | +| **RB-06**: Cross-Tenant Data Leak 告警 | `docs/runbooks/awooop-cross-tenant-leak.md` | E-TENANT-001 大量出現 | 1.立即停 canary/active mode 2.查 audit log 3.RLS 設定確認 4.PITR restore 評估 | +| **RB-07**: GCP Ollama Failover 異常 | `docs/runbooks/awooop-gcp-ollama-failover.md` | GCP-A/B 同時掛、Local fallback 也掛 | 1.確認 `platform:ollama:primary` Redis key 2.手動設定 fallback 3.確認 paid provider 緊急路由 | +| **RB-08**: SAGA Compensation 失敗 | `docs/runbooks/awooop-saga-compensation-fail.md` | E-SAGA-001 出現 | 1.查 saga_steps JSON 2.找失敗 step 3.手動執行補償指令 4.更新 run 狀態 | + +--- + +## 12. 
工具補強計畫(tool-expert) + +| 工具 | 用途 | 安裝位置 | Phase | +|------|------|---------|-------| +| **PgBouncer** | AwoooP 多 worker 下 PG connection pool 防爆 | K8s sidecar 或獨立 Pod | Phase 4 之前 | +| **Sealed Secrets** | 替代 K8s Secret 明文,CI/CD 安全 | K3s cluster | Phase 2(security hardening 時)| +| **OPA / Cedar** | policy engine,授權邏輯集中化(取代散落程式碼)| 作為 sidecar 或 admission webhook | Phase 5 之前 | +| **chaostoolkit / LitmusChaos** | Strangler Fig 切換的混沌驗證(worker 崩潰、Redis 宕機、PG timeout)| CI pipeline | Phase 4 完成後 | +| **awooop-ctl** | AwoooP CLI(contract CRUD / run 查詢 / migration state 管理)| 本地 CLI + CI | Phase 6 之前 | +| **pg_partman** | PostgreSQL partition 自動管理 | K8s Pod / cron | Phase 4(run_state 上線前)| +| **pgvector(已有)** | KM 向量搜索 | 已存在,需 per-project namespace | Phase 2 | +| **OpenTelemetry Collector** | OTel pipeline(ADR-121),現在直送 SignOz 188:24318,未來需 sampling | K8s DaemonSet | Phase 4 之前 | + +--- + +## 13. 業界對齊(web-researcher 發現) + +### 13.1 $47k Agent Loop 事故教訓(Token Budget Hard Kill) + +問題:alert ≠ enforcement。僅發 Prometheus alert 但 agent 仍繼續執行,一個 loop 燒了 $47k。 + +AwoooP 解法(ADR-120): +- 三層 budget limit:per-run / per-project / per-tenant +- **Hard Kill**:超額 → 直接終止 run(not just log/alert) +- Redis hot counter(每次 call 減少)+ PG budget_ledger 事務(final decision) +- `AWOOOP_BUDGET_HARD_KILL` feature flag 預設 ON(唯一預設開啟的 flag) + +### 13.2 Durable Execution / SAGA 補償交易(ADR-119) + +業界標準(Temporal / Conductor / Azure Durable Functions):multi-step tool chain 必須有 step-level journal + 補償機制。 + +AwoooP 解法: +- `saga_steps` JSONB 欄位在 `awooop_run_state` +- 每個 tool call 記錄:step_id / tool / status / compensation_cmd / completed_at +- 失敗時執行補償指令(反向操作) +- 補償失敗 → E-SAGA-001 + Runbook RB-08 + +### 13.3 MCP OAuth 2.1 Confused Deputy(ADR-117) + +MCP spec 2025-06-18 要求: +- per-tenant dynamic client registration(RFC 7591) +- Resource Indicators(RFC 8707):防止 token 被跨 resource server 使用 +- PKCE(RFC 7636):防止 authorization code interception + +AwoooP 解法(ADR-117): +- 每個 tenant 動態 client registration,不共用 client_id +- 
Resource Indicator 必須匹配 tool registry 的 target URI +- `E-MCP-GATE-001/002/003` error codes 覆蓋 Confused Deputy 情境 + +### 13.4 OTel GenAI Semantic Conventions(ADR-121) + +官方規範(opentelemetry-specification/semantic_conventions/gen-ai): +- span 命名:`gen_ai.{system}.{operation}`(e.g., `gen_ai.anthropic.chat`) +- token attribute:`gen_ai.usage.input_tokens` / `gen_ai.usage.output_tokens` +- model attribute:`gen_ai.request.model` / `gen_ai.response.model` + +AwoooP 解法:全部 LLM call 必須 emit 以上 attribute,進 SignOz(188:24318)。 + +### 13.5 OWASP Agentic AI Top 10 對齊(ADR-122) + +| OWASP 項目 | AwoooP 對應控制 | +|-----------|---------------| +| OAI-01 Prompt Injection | MCP Gateway result sanitization + schema validator | +| OAI-02 Insecure Tool Use | Five-gate MCP enforcement + audit | +| OAI-03 Excessive Agency | requires_approval from policy(禁 LLM 決定)+ write/execute feature flag | +| OAI-04 Supply Chain | contract publish HMAC + artifact SHA-256 | +| OAI-05 Data Leakage | audit log redaction + credential isolation | +| OAI-06 Insufficient Observability | OTel GenAI + audit sink + run trace_id | +| OAI-07 Unsafe Orchestration | SAGA journal + compensation + hard kill | +| OAI-08 Memory Vulnerabilities | contract revision immutability + RLS | +| OAI-09 Access Control Bypass | approval_token HS256 + jti replay prevention | +| OAI-10 Resource Exhaustion | Token Budget Hard Kill(ADR-120)| + +--- + +## 14. 
GCP Ollama 拓撲對 AwoooP 的影響(ADR-110 整合) + +### 14.1 新拓撲(ADR-110,2026-05-03 生效) + +``` +Primary : GCP-A http://34.143.170.20:11434 (SSD,9x 載速) +Secondary: GCP-B http://34.21.145.224:11434 (SSD,備援) +Fallback : Local http://192.168.0.111:11434 (HDD,最後防線) +Emergency: Gemini → Nemotron → Claude (全 Ollama 掛時) +``` + +### 14.2 AwoooP 必須處理的影響項目 + +| 影響項 | 位置 | 處理方式 | Phase | +|--------|------|---------|-------| +| `ollama:current_primary` Redis key 雙寫(只支援 1 個 URL,新需要 3 層)| INV-1 | 改為 `platform:ollama:topology`(JSON:primary/secondary/fallback)| Phase 2 | +| `ollama_auto_recovery.py:230` 第二定義(P0-11)| ollama_auto_recovery.py | 移除,統一從 config 讀 | Phase 2 PR-03 | +| GCP IP 進 INV-4(34.143.170.20, 34.21.145.224)| INV-4 | 加入 allowed IP 清單,確認 K8s NetworkPolicy egress 已設定 | Phase 0 INV-4 | +| EwoooC Provider Proxy 走 GCP Ollama 路由 | Phase 6 | EwoooC 共用 platform Ollama topology(platform_resource)| Phase 6 | +| `telemetry.py:71` IP assert(P0-08)| telemetry.py:71 | 移除後,GCP IP 不再觸發 assert;改為 config-driven | Phase 2 PR-01 | +| budget_ledger 記錄 Ollama usage(免費 GCP 仍需 token 計數)| Phase 4 | Ollama call 也必須記錄 token 消耗(budget_ledger)| Phase 4 | +| Runbook RB-07(GCP Ollama failover 異常)| docs/runbooks/ | Phase 0 寫 Runbook,Phase 4 後實際 E2E 測試 | Phase 0 | + +### 14.3 Ollama GCP 為 platform_resource(ADR-111) + +GCP Ollama(34.143.170.20, 34.21.145.224)與 Local Ollama(192.168.0.111)一律聲明為 `platform_resource`: +- 不屬於任何 tenant +- 所有 tenant(AWOOOI / EwoooC / Tsenyang / Bitan)共用,但 audit 記錄各自 project_id +- `platform:ollama:topology` Redis key 前綴為 `platform:`(非 `{project_id}:`) + +--- + +## 15. 
工作排序總表(含並行群組 + Critical Path) + +### Critical Path(序列執行,不可跳) + +``` +Phase 0 全部 ADR/INV + → Phase 1 Schema(PR-01/02/03/04/05 可並行先做) + → Phase 2 Security Hardening + Redis 遷移(PR-06~11) + → Phase 3 Contract Packages + → Phase 4 Platform Shell(PgBouncer + OPA/pg_partman 同步準備) + → Phase 5 MCP Gateway + → Phase 6 EwoooC(14天 shadow gate) + → Phase 7 Channel Hub(7天 canary gate) + → Phase 8 Suggest + Write(30天 suggest gate) +``` + +### 可並行工作群組 + +| 群組 | 工作 | 可與哪個並行 | +|------|------|-----------| +| G-A(Phase 0 並行)| ADR-111~115 各自獨立 | 全部並行(5 份 ADR 各分配一位)| +| G-B(Phase 0 並行)| ADR-116~124 | 與 G-A 並行 | +| G-C(Phase 0 並行)| INV-1~INV-9(部分依賴 codebase 探索)| 與 G-A/G-B 並行 | +| G-D(Phase 2 並行)| PR-01/02/03/04/05(獨立小修補)| 全部並行 | +| G-E(Phase 2 並行)| Redis 雙寫 + repository 改造 + security hardening | 各自獨立,但 security hardening 優先 | +| G-F(Phase 4 並行)| PgBouncer 安裝 + pg_partman 安裝 + OPA 安裝 | 與 Phase 3 Contract Packages 並行 | +| G-G(Phase 5-6 並行)| Operator Console prototype(ADR-UI-01~04)| 與 Phase 6 EwoooC shadow 並行 | + +### 完整排序表 + +| 順序 | 工作 | docs-only | 並行群組 | 阻擋誰 | +|------|------|-----------|---------|-------| +| 1-A | ADR-111 Bootstrap Order | ✅ | G-A | Phase 2 | +| 1-B | ADR-112 Contract Governance | ✅ | G-A | Phase 3 | +| 1-C | ADR-113 Active Revision Outbox | ✅ | G-A | Phase 1 | +| 1-D | ADR-114 Idempotency & Worker Lease | ✅ | G-A | Phase 4 | +| 1-E | ADR-115 Principal Mapping | ✅ | G-A | Phase 6、7 | +| 2-A | ADR-116 Security Hardening | ✅ | G-B | Phase 2 | +| 2-B | ADR-117 MCP OAuth 2.1 | ✅ | G-B | Phase 5 | +| 2-C | ADR-118 RLS Strategy | ✅ | G-B | Phase 1 | +| 2-D | ADR-119 Durable Execution SAGA | ✅ | G-B | Phase 4 | +| 2-E | ADR-120 Token Budget Hard Kill | ✅ | G-B | Phase 4 | +| 2-F | ADR-121 OTel GenAI | ✅ | G-B | Phase 4 | +| 2-G | ADR-122 OWASP Agentic AI | ✅ | G-B | 全 Phase | +| 2-H | ADR-123 Background Loop Migration | ✅ | G-B | Phase 2 | +| 2-I | ADR-124 Global Singleton Decomposition | ✅ | G-B | Phase 2 | +| 2-J | ADR-UI-01~04 Operator Console ADR | ✅ | G-B | Phase 6+ | +| 
2-K | ADR-106 補 Quantified Gates | ✅ | G-B | Phase 8 | +| 3-A | INV-1 Redis Keys | ✅ | G-C | Phase 2 | +| 3-B | INV-2 Repository Retrofit Map | ✅ | G-C | Phase 2 | +| 3-C | INV-3 Entrypoints | ✅ | G-C | Phase 2 | +| 3-D | INV-4 Hardcoded Namespace/IP(含 GCP IP)| ✅ | G-C | Phase 2 | +| 3-E | INV-5 Migration Compatibility Matrix | ✅ | G-C | Phase 1 | +| 3-F | INV-6 Rollback Playbook Register | ✅ | G-C | Phase 4 | +| 3-G | INV-7 PR Cutting Plan | ✅ | G-C | Phase 2 | +| 3-H | INV-8 Background Loop Catalog(31 個)| ✅ | G-C | Phase 2 | +| 3-I | INV-9 Global Singleton Catalog(13 個)| ✅ | G-C | Phase 2 | +| 4 | Task 9 順序修正(Dockerfile/ConfigMap)| ❌ | — | Phase 1 | +| 5 | **Phase 1 Schema Migration**(重寫版)| ❌ | — | Phase 2~8 | +| 6-A | PR-01/02/03/04/05(並行小修補)| ❌ | G-D | Phase 2 | +| 6-B | **Phase 2 Security Hardening**(PR-07 優先)| ❌ | G-E | Phase 4 | +| 6-C | Phase 2 Redis 雙寫 + Repository(PR-06/08/09/10/11)| ❌ | G-E | Phase 4 | +| 7 | **Phase 3 Contract Packages**(packages/awooop-contracts/)| ❌ | — | Phase 4 | +| 8-A | PgBouncer + pg_partman + OPA 安裝 | ❌ | G-F | Phase 4 | +| 8-B | **Phase 4 Platform Shell + Shadow**(含 SAGA + Budget Kill)| ❌ | — | Phase 5 | +| 9 | **Phase 5 MCP Gateway**(含 OAuth 2.1)| ❌ | — | Phase 6 | +| 10-A | **Phase 6 EwoooC Shadow Onboarding**(14 天 gate)| ❌ | G-G | Phase 7 | +| 10-B | Operator Console prototype(G-G)| ❌ | G-G | Phase 7+ | +| 11 | **Phase 7 Channel Hub**(7 天 canary gate)| ❌ | — | Phase 8 | +| 12 | **Phase 8 Suggest + Controlled Write**(30 天 gate)| ❌ | — | AwoooP v1 GA | + +**1-A 到 3-I 全部 docs-only,可在當前對話視窗連續完成,完成後才開新 Codex 對話進 Phase 1 code。** + +--- + +## 16. 
量化驗收門檻(完整版) + +### Strangler Fig Gates + +| 切換 | 量化條件 | 簽核 | +|------|---------|------| +| pre → shadow | tenant 已建 + agent contract published + audit/trace 寫入正常 | critic 確認 | +| shadow → canary | ≥14 天 + decision divergence < 5% + p95 退化 < 10% + 0 P0/P1 incident + audit 0 secret | critic + db-expert + vuln-verifier | +| canary → read_only | ≥7 天 + user-visible error rate < 0.5% + cost diff < 50% 預算 | critic + vuln-verifier | +| read_only → suggest | ≥14 天 + suggest accept rate ≥ 50% + 0 hallucination escalation | critic | +| suggest → auto_remediate | ≥30 天 + rollback evidence ≥ 3 成功 + approval token live + dry-run pass ≥ 99% | critic + db-expert + vuln-verifier | + +### Phase 驗收門檻(量化補強) + +| Phase | 必要量化指標 | +|-------|-----------| +| Phase 1 | migration up/down dry-run 通過;RLS cross-project 拒絕率 100%;AWOOOI 0 行為改動(regression pass rate 100%)| +| Phase 2 | INV-1 P0 key 遷移完成率 100%;vuln-verifier PoC 通過率 3/3;hardcode grep 結果 0 | +| Phase 3 | contract schema 覆蓋率 100%(6 個 family);invalid fixture 拒絕率 100% | +| Phase 4 | shadow run 0 user-visible response;duplicate event 唯一 run rate 100%;stale reaper 1min 內回收率 100% | +| Phase 5 | credential leak test 通過率 100%;Five-gate integration test 覆蓋率 100% | +| Phase 6 | cross-tenant data access 拒絕率 100%;EwoooC shadow 14天 gate 通過 | +| Phase 7 | 首則進度訊息 ≤ 30s 達成率 ≥ 99%;duplicate retry 0 重複 run | +| Phase 8 | approval replay 拒絕率 100%;write/execute 預設 OFF 驗證通過 | + +--- + +## 17. 
關聯文件索引 + +- [ADR-106: AwoooP 架構](../adr/ADR-106-agent-platform-architecture.md) +- [ADR-107: 控制面儲存策略](../adr/ADR-107-awooop-control-plane-storage.md) +- [ADR-110: GCP Ollama 三層容災拓撲](../adr/ADR-110-gcp-ollama-topology.md) +- [MASTER-WORKPLAN.md](MASTER-WORKPLAN.md)(本文展開的主索引) +- [IMPLEMENTATION-ROADMAP.md](IMPLEMENTATION-ROADMAP.md)(歷史文件,舊版草稿) +- 待建:`docs/awooop/inventory/` INV-1~INV-9 +- 待建:ADR-111~ADR-124(AwoooP 專用 ADR 系列) +- 待建:ADR-UI-01~ADR-UI-04(Operator Console ADR) +- 待建:`docs/runbooks/` RB-01~RB-08 + +--- + +*最後更新:2026-05-03(台北時區)* +*建立:12-Agent 聯合審查 × Codex 整合* +*下一步:Phase 0 docs-only 工作(ADR-111 起),完成後開新 Codex 對話進 Phase 1 code* diff --git a/docs/awooop/IMPLEMENTATION-ROADMAP.md b/docs/awooop/IMPLEMENTATION-ROADMAP.md new file mode 100644 index 00000000..ab9b4ef2 --- /dev/null +++ b/docs/awooop/IMPLEMENTATION-ROADMAP.md @@ -0,0 +1,323 @@ +# AwoooP Implementation Roadmap + +**Status**: Planning baseline +**Date**: 2026-05-01 +**Owner**: AWOOOI / AwoooP platform workstream +**Primary ADRs**: ADR-106, ADR-107 + +## Purpose + +This document is the implementation handoff for AwoooP, the AWOOOI Agent +Platform. + +ADRs record architectural decisions. This roadmap translates those decisions +into execution phases, work items, acceptance gates, and Codex workflow +guidance. + +## Executive Summary + +AwoooP is the multi-tenant Agent Platform extracted from the AI capabilities +that first emerged inside AWOOOI. + +AWOOOI is the first tenant and first runtime host. It is not the platform +boundary. 
+ +AwoooP must let current and future products share AI agents and communication +capabilities without copying project-specific code: + +- AWOOOI: AIOps, SRE automation, K8s/MCP/Telegram workflows +- EwoooC / MOMO PRO: ecommerce and business analysis, market intelligence, LINE + and Telegram workflows +- Tsenyang: existing Telegram webhook surface +- Bitan and future products: repeatable onboarding to shared AI capabilities + +The approved direction is not a pure HTTP hub, not only an SDK, and not only a +shared config table. The approved direction is: + +```text +AwoooP = Control Plane + Agent Runtime + MCP Gateway + Context Firewall + + Communication Hub + Observability / Audit +``` + +## Core Decisions + +### Product and Layout + +- Product name: `AwoooP` +- Repository/package slug: `awooop` +- Existing AWOOOI tenant id remains `awoooi` +- Do not create empty runtime or package directories before implementation owns + real code. +- `docs/awooop/` is allowed now because it contains concrete implementation + planning artifacts. + +Future code layout should be created only when each path owns real artifacts: + +| Path | Create when | Owns | +|---|---|---| +| `packages/awooop-contracts/` | Contract schemas are ready to validate | JSON Schema / Pydantic / TS types | +| `packages/awooop-client/` | First downstream client integration begins | Python/TS SDK | +| `apps/awooop-runtime/` | Runtime shell separates from `apps/api` | Platform API and run orchestration | +| `apps/awooop-worker/` | Async workers are needed | Run state execution workers | +| `docs/awooop/` | Planning and implementation docs exist | Roadmaps, schema notes, runbooks | + +### Storage + +AwoooP v1 is PostgreSQL-first. + +- PostgreSQL is source of truth for contracts, active revisions, tenant records, + policy, MCP grants, budget, ACL, run state, approval, channel event, and audit. +- Redis is cache/watch/counter/coordination only. 
+- Prompt, JSON Schema, eval, and replay artifacts are stored by ref + SHA-256 + hash. +- Kubernetes CRDs are future runtime projection only, not the v1 control-plane + source of truth. + +### Six Contracts + +| Contract | Purpose | +|---|---| +| Project / Tenant | Tenant identity, data boundary, budget, ACL, channels, approval gates | +| Agent | Versioned capability module, I/O schema, context domain, safety ceiling | +| MCP Gateway | Tool authorization, credential resolution, approval, result sanitization | +| Policy / Routing | Effective model/provider route, fallback, privacy and budget gates | +| Runtime / Run State | Durable async run lifecycle, shadow/canary/active, checkpoint/resume | +| Communication / Channel Event | Telegram/LINE/Slack/Email/API receive, verify, normalize, send | + +## Migration Strategy + +AwoooP must migrate by strangler fig, not big-bang replacement: + +1. `shadow`: mirror events, write audit/trace only, no user response, no side + effects. +2. `canary`: selected low-risk user-visible responses, no side effects by + default. +3. `read_only`: read-only queries and business chat move first. +4. `suggest`: analysis and recommendations move next; approval still external. +5. `auto_remediate`: write/execute tools move only after Gateway, approval, + replay, audit, and rollback evidence are green. + +## Implementation Phases + +### Phase 0 - Documentation Freeze + +Goal: make future implementation unambiguous. + +Status: in progress / mostly complete. + +Work items: + +- ADR-106: AwoooP architecture and migration strategy. +- ADR-107: AwoooP control-plane storage strategy. +- This roadmap. +- LOGBOOK entries for ADR-106/107 and roadmap. +- Task-routing lookup in `docs/12-agent-game-rules.md`. + +Acceptance: + +- `git diff --check` passes. +- No runtime code changes. +- No empty code directories. + +### Phase 1 - Control Plane Schema Foundation + +Goal: create the minimum PostgreSQL foundation for contract materialization. 
+ +Suggested work items: + +- Add `awooop_contract_revisions`. +- Add active revision pointer tables. +- Add `awooop_projects` / tenant records. +- Add `awooop_artifact_refs`. +- Add `awooop_project_migration_state`. +- Add `project_id` strategy for existing AWOOOI records. +- Add tests for immutable published revisions and draft isolation. + +Acceptance: + +- Published contract revision is append-only. +- Runtime-facing reads cannot see mutable drafts. +- Every active revision has `revision_id` and `body_hash`. +- AWOOOI can be represented as `project_id=awoooi` without behavior changes. + +### Phase 2 - Isolation and Namespace Hardening + +Goal: make AWOOOI tenant-safe before sharing anything downstream. + +Suggested work items: + +- Prefix Redis sessions, budget, rate, and Telegram/Hermes keys with + `project_id`. +- Add `project_id` columns where required for logs, audit, sessions, and + approval records. +- Define `platform_resource` exceptions separately from tenant resources. +- Convert global resources such as Ollama failover state into explicit platform + resources. +- Add regression tests for cross-project read/write rejection. + +Acceptance: + +- Tenant-scoped data cannot be read without matching `project_id`. +- Platform resources are explicitly allowed and audited. +- No prompt text is trusted as the data isolation boundary. + +### Phase 3 - Contract Packages and Validators + +Goal: make the six contracts executable rather than prose-only. + +Suggested work items: + +- Create `packages/awooop-contracts/` only when validators are implemented. +- Define JSON Schema/Pydantic models for the six contracts. +- Define envelope schemas for platform invocation, MCP tool call, run state, and + channel events. +- Add output schema validator middleware design. +- Add golden fixtures for valid and invalid contracts. + +Acceptance: + +- Contract validation fails closed. +- LLM output that fails schema validation cannot reach channel adapters. 
+- Prompt/schema refs require SHA-256 hashes. + +### Phase 4 - Platform Shell in Shadow Mode + +Goal: build the first runtime shell without replacing legacy behavior. + +Suggested work items: + +- Add `/v1/platform/runs` shell around existing AWOOOI logic. +- Generate `run_id` and `trace_id`. +- Resolve project and agent contract revisions. +- Resolve `EffectivePolicy` without changing provider behavior. +- Write run state transitions and audit. +- Mirror selected AWOOOI events into AwoooP shadow mode. + +Acceptance: + +- Shadow runs produce audit and trace. +- Shadow runs never send user-visible responses. +- Shadow runs never call write/execute/destructive tools. +- Legacy AWOOOI behavior remains unchanged. + +### Phase 5 - MCP Gateway First Slice + +Goal: move tool authorization behind a real Gateway. + +Suggested work items: + +- Define MCP Gateway tables for tool registry, grants, credentials refs, and + audit. +- Wrap current read-only MCP tools behind Gateway checks. +- Enforce `Project AND Agent AND Tool AND Environment AND Approval`. +- Sanitize tool results before model context. +- Record tool call `trace_id`, `run_id`, credential ref, latency, result status. + +Acceptance: + +- Agents never see raw credentials. +- Read-only tools are auditable by `trace_id`. +- Write/execute tools stay blocked until later phases. + +### Phase 6 - EwoooC Read-Only Tenant Onboarding + +Goal: validate AwoooP with a real downstream tenant without high-risk actions. + +Suggested work items: + +- Create `project_id=ewoooc`. +- Register `openclaw-biz` as a specialized agent contract. +- Register market-intelligence tools as read-only MCP Gateway tools. +- Add a read-only platform client path. +- Mirror EwoooC bot/business-analysis events into shadow/canary. + +Acceptance: + +- EwoooC can run read-only AwoooP shadow/canary without touching AWOOOI data. +- Business-agent context cannot access infra-only AWOOOI context. +- Budget and audit are project-scoped. 
+ +### Phase 7 - Communication Hub Increment + +Goal: standardize channels without breaking existing bots. + +Suggested work items: + +- Define `ConversationEvent` and `OutboundMessage` tables. +- Mirror Telegram inbound events first. +- Add progressive status feedback policy for async runs. +- Keep existing bot handlers authoritative until canary passes. +- Add adapter-level escaping, redaction, idempotency, and delivery audit. + +Acceptance: + +- Channel adapters do not call LLM or MCP. +- Async runs can produce progress updates. +- Duplicate channel retries do not create duplicate runs. + +### Phase 8 - Suggest and Controlled Write Paths + +Goal: graduate from read-only to proposal and then controlled execution. + +Suggested work items: + +- Enable `suggest` mode for selected AWOOOI SRE flows. +- Add approval resume for `WAITING_APPROVAL`. +- Add dry-run and rollback evidence gates. +- Move low-risk write paths only after Gateway and audit evidence are stable. + +Acceptance: + +- `WAITING_APPROVAL` resumes without replaying the whole task. +- Rejected or expired approvals fail with structured failure codes. +- Write/execute paths are blocked by default and feature-flagged. + +## Codex Workflow Recommendation + +Use a new Codex conversation for implementation, but keep the working directory +at the monorepo root: + +```text +/Users/ogt/awoooi +``` + +Do not start from a new project directory until `apps/awooop-runtime` or +`packages/awooop-*` actually exists and owns code. + +Recommended implementation kickoff prompt: + +```text +Read AGENTS.md, docs/12-agent-game-rules.md, docs/LOGBOOK.md newest entry, +docs/adr/ADR-106-agent-platform-architecture.md, +docs/adr/ADR-107-awooop-control-plane-storage.md, and +docs/awooop/IMPLEMENTATION-ROADMAP.md. + +Start Phase 1 only: design and implement the minimum PostgreSQL control-plane +schema foundation for AwoooP contract revisions.
Do not create runtime APIs, +do not change provider behavior, do not move Telegram/LINE webhooks, and do not +create empty AwoooP code directories. +``` + +If the current worktree is dirty, prefer a clean branch or clean worktree before +runtime implementation. Documentation-only planning can remain in this thread. + +## First Implementation Slice + +The first code slice should be deliberately boring: + +1. DB migration for contract revisions and active pointers. +2. Pydantic/SQLAlchemy models for contract revisions. +3. Repository/service methods for draft, publish, activate, and read active. +4. Tests for immutability and draft isolation. +5. LOGBOOK update. + +Do not start with MCP Gateway, Telegram Hub, or model routing. Those are more +visible, but they depend on the contract source of truth. + +## References + +- `docs/adr/ADR-106-agent-platform-architecture.md` +- `docs/adr/ADR-107-awooop-control-plane-storage.md` +- `docs/adr/ADR-105-mcp-agent-loop-governance.md` +- `docs/12-agent-game-rules.md` +- `docs/LOGBOOK.md` diff --git a/docs/awooop/MASTER-WORKPLAN.md b/docs/awooop/MASTER-WORKPLAN.md new file mode 100644 index 00000000..1e3032cb --- /dev/null +++ b/docs/awooop/MASTER-WORKPLAN.md @@ -0,0 +1,411 @@ +# AwoooP Master Workplan(P0 防爆版) + +**狀態**:規劃凍結基準 +**日期**:2026-05-03 +**主要 ADR**:ADR-106(架構)、ADR-107(控制面儲存) +**取代**:本檔取代 `IMPLEMENTATION-ROADMAP.md` 作為 AwoooP 主索引;舊 roadmap 仍保留為一階草稿,僅供歷史對照 + +--- + +## 0. 為什麼有這份文件 + +12 位 Agent(critic / vuln-verifier / debugger / db-expert / planner / fullstack-engineer / refactor-specialist / migration-engineer / onboarder / tool-expert / web-researcher / frontend-designer)對舊版 Plan 1 與 ADR-106 做完獨立審查後,發現至少 12 個 P0 問題;後續再補了 12 個會在實作後咬人的設計缺口。 + +結論:**直接進 Phase 1 SQL migration 會立刻爆。** 必須先補足 5 份 ADR、4 份 Inventory,把 Strangler Fig 的「資料載體、雙寫遷移、邊界硬攔截、可重放、可審計」全部寫死,再下 code。 + +--- + +## 1. 
共識:實作前必須先完成的修補 + +| # | 問題 | 風險等級 | 必補在 | +|---|------|---------|--------| +| 1 | Redis key 直接改名無雙寫期 → 費用計數歸零、Telegram 409、silence 失效、Ollama failover 雙寫不到 | 🔴🔴🔴 | Phase 2 之前 | +| 2 | Migration SQL 表名錯(`incident_records` / `mcp_audit_snapshots`)、無 rollback、ORM 1.x vs 2.x | 🔴🔴🔴 | Phase 1 重寫 | +| 3 | `project_id` / `tenant_id` 在 codebase 0 命中,30+ 業務表無此欄 | 🔴🔴🔴 | Phase 1 + Phase 2 | +| 4 | `project_migration_state` 表缺失,Strangler Fig 無資料載體 | 🔴🔴 | Phase 1 | +| 5 | AWOOOI 雙重身份 Bootstrap Paradox(cron/job/healthcheck 全無 `project_id`) | 🔴🔴 | Phase 0 補 ADR-111 | +| 6 | EwoooC 接入零技術路徑,需要 Provider Proxy Adapter 設計 | 🔴🔴 | Phase 0 補 ADR-115 | +| 7 | Strangler shadow→canary→active 無量化 gate 條件 | 🔴🔴 | Phase 0 寫進 ADR-106 補章 | +| 8 | Layer 3 redaction 零實作(helper 有但無 enforcement point) | 🔴🔴🔴 | Phase 5 | +| 9 | `_provider` 屬性是 public,可繞過 audit | 🔴🔴 | Phase 5 | +| 10 | `WAITING_APPROVAL` resume 不驗 caller identity,無 approval_token 簽章 | 🔴🔴 | Phase 4 + Phase 8 | +| 11 | Redis approval state 單點,無 PG sync | 🔴 | Phase 2 + Phase 8 | +| 12 | Task 9(K8s ConfigMap)順序倒置,agent prompt 全回 None | 🔴🔴🔴 | Phase 1 之前先順序修正 | +| 13 | Audit log 本身會洩密,redaction 必須做在 audit sink 前 | 🔴🔴 | Phase 5 | +| 14 | `sanitization_service.py` 已存在 helper,但 MCP Gateway / AgentToolExecutor 沒強制使用 | 🔴🔴 | Phase 5 | +| 15 | Redis working memory(`SCAN incident:*`)需要 project 邊界 | 🔴🔴 | Phase 2 | +| 16 | Contract publish 權限與簽章未定義(誰可 publish/activate) | 🔴 | Phase 0 補 ADR-112 | +| 17 | Active revision 切換無 transactional outbox,worker 可能吃舊 policy | 🔴 | Phase 0 補 ADR-113 | +| 18 | Run/Channel idempotency 缺 key derivation 規則與 unique index | 🔴 | Phase 0 補 ADR-114 | +| 19 | Async worker 缺 lease / heartbeat / stale reaper | 🔴 | Phase 4 | +| 20 | 高流量表(`run_state` / `channel_event` / `mcp_audit` / `agent_audit`)partition 與 retention 需 Phase 1 就決定 | 🟠 | Phase 1 | +| 21 | Observability metrics label cardinality 規則:`run_id`/`trace_id`/`session_id` 禁止進 metrics | 🟠 | Phase 4 | +| 22 | Telegram/LINE/Slack/API/Internal 缺 canonical principal mapping | 🟠 
| Phase 0 補 ADR-115 | +| 23 | EwoooC Provider Proxy 不能只改 `OLLAMA_API_BASE`,必須補 envelope + audit 入口 | 🔴 | Phase 0 補 ADR-115 | +| 24 | 所有 entrypoint(cron / job / webhook / CLI / healthcheck)需 inventory 並標 `requires_project_id` | 🔴 | Phase 0 Inventory | + +--- + +## 2. Pre-flight Audit(Phase 0 擴張) + +舊版 Phase 0 只凍 ADR-106/107。新版 Phase 0 還需要 **5 份 ADR + 4 份 Inventory**,全部 docs-only。 + +### 2.1 5 份必補 ADR(⚠️ ADR-108/109/110 已被其他 ADR 占用,AwoooP 從 ADR-111 開始) + +| ADR | 主題 | 解決 | +|-----|------|------| +| **ADR-111** | AwoooP Bootstrap Order & Identity Paradox | #5、#24 — 定義 hard reject 啟用順序、`platform_internal` / `requires_project_id` / `legacy_awoooi_default` 標記、AWOOOI cron/job 過渡期豁免規則 | +| **ADR-112** | Contract Governance & Publishing Workflow | #16 — 誰可 publish、誰可 activate、CODEOWNERS、簽章/HMAC、approval workflow、activation audit | +| **ADR-113** | Active Revision Invalidation & Outbox | #17 — `awooop_contract_outbox` 表設計、Redis pub/sub 通知、worker revision-aware cache、split-brain 防禦 | +| **ADR-114** | Idempotency, Worker Lease & Run Recovery | #18、#19 — channel event dedupe、`(project_id, channel_type, provider_event_id)` unique、worker `lease_until` / `heartbeat_at` / `attempt_count`、stale run reaper、SKIP LOCKED | +| **ADR-115** | Canonical Principal Mapping & Tenant Onboarding Patterns | #6、#22、#23 — Telegram/LINE/Slack/API/Internal user → `platform_subject` 統一映射、EwoooC Provider Proxy Adapter 設計、Tsenyang/Bitan 接入模式範本 | + +ADR-106 也需要補一節:**Strangler Fig Quantified Gates**,把 shadow → canary → active 的量化條件寫死(≥14 天、決策差異率 <5%、p95 退化 <10%、無 P1 incident、cost diff < 預算上限 50%)。 + +### 2.2 4 份必做 Inventory + +| Inventory | 範圍 | 解決 | +|-----------|------|------| +| **INV-1:Redis Key Inventory** | 全 codebase grep `redis_client.*\(["']` + `r\.set/get/scan` → 列出 43+ 個 key、命名空間、TTL、用途、寫入點、讀取點、是否硬碼跨檔 | #1、#15 | +| **INV-2:Repository Project-id Retrofit Map** | 列出全部 30+ 張業務表、目前有無 `project_id` 欄位、所有 repository 方法、需加 filter 的查詢、需 backfill 的歷史資料 | #3 | +| **INV-3:Entrypoint Inventory** | 列出所有 cron 
job / scheduler / webhook / CLI script / healthcheck / internal service call,標記 `requires_project_id` / `platform_internal` / `legacy_awoooi_default` | #5、#24 | +| **INV-4:Hardcoded Namespace & IP Inventory** | 列出所有硬碼 K8s namespace(`awoooi-prod`)、SSH 主機 IP、白名單,標記 tenant-scope 改造方案 | 配合 #3 完成多租戶啟用 | + +### Phase 0 驗收 + +- ADR-111~115 全部 Accepted 並進 LOGBOOK +- ADR-106 補 Quantified Gates 章節 +- 4 份 Inventory 寫入 `docs/awooop/inventory/` +- 沒有任何 runtime code 變動 +- `git diff --check` 通過 + +--- + +## 3. 修訂版 8 階段實施計畫 + +> 階段順序與舊 roadmap 相同,但每階段範圍依 §1 共識重寫。 + +### Phase 1 — Control Plane Schema Foundation(重寫) + +**目標**:建立 PostgreSQL contract control plane 最小可用骨架,並修正舊 SQL migration 三大 blocker。 + +工作項: + +1. **核對真實表名**:在寫 SQL 前 grep 確認 `incidents`(非 `incident_records`)、`mcp_audit_log`(非 `mcp_audit_snapshots`),全部錯名修正 +2. **ORM 同步現況**:使用 SQLAlchemy 2.x `mapped_column`、補齊 `Numeric`/`UniqueConstraint`/`func` import +3. **每個 migration 都有 rollback SQL**(down migration 強制) +4. **新增 contract control 表**: + - `awooop_projects`(tenant 主表,`project_id` PK) + - `awooop_contract_revisions`(六合約共用 revision 表,append-only) + - `awooop_active_revisions`(active pointer 表) + - `awooop_artifact_refs`(prompt/schema/eval 的 ref + sha256) + - `awooop_project_migration_state`(Strangler 階段追蹤) + - `awooop_contract_outbox`(ADR-113,active revision 切換事件) + - `awooop_channel_event_dedupe`(ADR-114,idempotency 唯一鍵) + - `awooop_platform_subjects`(ADR-115,canonical principal) +5. **高流量表 partition 策略決定(不延後)**: + - `awooop_run_state`、`awooop_channel_event`、`awooop_mcp_gateway_audit`、`awooop_agent_audit_log` 一律按月 partition + - 每 tenant retention policy(預設 90 天 hot + 1 年 warm) +6. **`project_id` 對既有表的策略**:暫不在現有 30+ 業務表加欄位(留給 Phase 2),先在 AwoooP 自己的表強制 `project_id NOT NULL` +7. **immutability 測試**:published revision 嘗試 UPDATE 必失敗、draft 與 active 隔離 +8. 
**Task 9 順序修正前置**:Dockerfile / ConfigMap / agent_loader 預設路徑改動先於任何 agent prompt 載入點變更 + +驗收: +- AWOOOI 可被表示為 `project_id=awoooi` 且 0 行為改動 +- 每個 active revision 都有 `revision_id` 與 `body_hash` +- runtime 讀取路徑看不到 mutable draft +- migration up/down 都通過 dry-run +- partition + retention 寫入 runbook + +### Phase 2 — Tenant Isolation & Namespace Hardening(重寫) + +**目標**:在開放任何下游 tenant 之前,把 AWOOOI 自己變成乾淨的 tenant。 + +工作項: + +1. **Redis 三階段雙寫遷移**(依 INV-1): + - **階段 A**:dual-write 新舊 key(30 天觀察) + - **階段 B**:dual-read,新 key 為主、舊 key 為 fallback(14 天) + - **階段 C**:移除舊 key 寫入,留 audit log + - **必含**:`ai_rate:total_cost:gemini`(費用上限)、`telegram:polling:leader`(Pod 鎖)、`telegram_silence:{target}`(含 `decision_manager.py:240` 硬碼)、`ollama:current_primary`(含 `ollama_auto_recovery.py:230` 第二定義) +2. **Repository project_id 改造(依 INV-2)**: + - 所有 30+ repository 方法加 `project_id` filter + - K8s namespace 白名單 → tenant-aware 設定(依 INV-4) + - SSH 主機白名單 → tenant-aware +3. **Redis working memory project 邊界**(#15): + - `incident_service.py:603` 的 `SCAN incident:*` → `SCAN {project_id}:incident:*` + - 所有 `SCAN`/`KEYS` 必須帶 prefix +4. **`platform_resource` 例外名單**:Ollama failover state、global rate limit、leader election lock 等明確標記 +5. **回歸測試**:cross-project read/write 必拒絕;platform_resource 必允許但寫 audit +6. **AWOOOI Bootstrap Paradox 修補**(依 ADR-111、INV-3): + - 標記為 `platform_internal` 的 entrypoint 帶 `project_id=__platform__`,hard reject 例外但寫 audit + - 標記為 `legacy_awoooi_default` 的舊 cron 暫時 fallback 到 `project_id=awoooi`,列退場時程 + +驗收: +- INV-1 列出的所有 P0 key 完成三階段遷移 +- 30+ repository 全部加 `project_id` filter,cross-project 存取在回歸測試中全數被拒絕(測試全部通過) +- 無任何 hardcode tenant 字串殘留(grep `awoooi-prod` / `192.168` 必為 0) + +### Phase 3 — Contract Packages & Validators + +**目標**:六合約從散文升級為可驗證程式。 + +工作項: +1. 建立 `packages/awooop-contracts/`(此時才建立) +2. 六合約 JSON Schema + Pydantic models +3. envelope schema:platform invocation、MCP tool call、run state、channel events +4.
**Output schema validator middleware**:LLM 回傳必先過 schema、失敗 retry 上限硬碼、失敗不外漏到 channel +5. golden fixtures(valid + invalid) +6. **Contract publish governance**(依 ADR-112): + - CODEOWNERS 對 `packages/awooop-contracts/` + - publish API 簽章驗證 + - activate 動作要 approval workflow + +驗收: +- 任何 schema 不符的 LLM 輸出無法到達 channel adapter +- prompt/schema ref 必含 sha256 hash +- 無權限不能 publish 或 activate + +### Phase 4 — Platform Shell in Shadow Mode(補 lease/idempotency/audit redaction) + +**目標**:建立第一個 runtime shell,但只跑 shadow,不改 legacy 行為。 + +工作項: + +1. `/v1/platform/runs` API(async) +2. `run_id` / `trace_id` 生成(W3C tracecontext-compatible) +3. project + agent contract revision 解析 +4. EffectivePolicy 解析(不改 provider 行為) +5. **Run state machine**(依 ADR-114): + - `lease_until`、`heartbeat_at`、`attempt_count` 欄位 + - SKIP LOCKED 取單 + - stale run reaper(每分鐘掃 expired lease) +6. **Idempotency**(依 ADR-114): + - `(project_id, channel_type, provider_event_id)` unique + - duplicate event return 既有 run +7. **Audit log redaction**(#13): + - audit sink 寫入前過 `sanitization_service` + - PII / secret pattern 硬攔 +8. **Observability label rules**(#21): + - metrics label 限定 `project_id` / `agent_id` / `status` / `provider` + - `run_id` / `trace_id` / `session_id` 只進 logs/traces +9. mirror 選定 AWOOOI 事件到 shadow + +驗收: +- shadow run 永遠 0 user-visible response、0 destructive tool call +- legacy AWOOOI 行為 0 改變 +- worker crash 後 stale run 1 分鐘內被回收 +- duplicate retry 不產生重複 run +- audit log 0 secret 命中(vuln-verifier 抽樣 100 筆) + +### Phase 5 — MCP Gateway First Slice(補 sanitization enforcement、_provider 修正、audit redaction) + +**目標**:把 tool 授權搬到 Gateway,read-only 工具先進。 + +工作項: + +1. Gateway 表:tool registry、grants、credential refs、audit +2. wrap 既有 read-only MCP tool +3. 強制:`Project AND Agent AND Tool AND Environment AND Approval` 五重交集 +4. **Result sanitization enforcement point**(#8、#14): + - 所有 MCP result 必經 `sanitization_service` pipeline + - 不允許 raw result 直接進 LLM context +5. 
**`_provider` → `__provider`**(#9): + - 雙底線真 private + - 加 unit test:外部 reflect 取用 must fail +6. **Audit log 雙層 redaction**(#13): + - 進 LLM 前一層 + - 進 audit sink 一層 +7. tool call 記錄 `trace_id` / `run_id` / credential ref / latency / status + +驗收: +- agent 程式碼路徑 0 raw credential 接觸(sast 掃過) +- raw result 不可能繞過 sanitization(單元測試 + 整合測試覆蓋) +- 2026-04-18 secret leak 重演測試:kubectl describe configmap 輸出不會出現在任何 LLM context 或 audit row + +### Phase 6 — EwoooC Read-Only Tenant Onboarding(依 ADR-115) + +**目標**:以實際下游 tenant 驗證 AwoooP,全 read-only。 + +工作項: + +1. 建立 `project_id=ewoooc` +2. 註冊 `openclaw-biz` agent contract +3. **Provider Proxy Adapter**(#23): + - 不只是改 `OLLAMA_API_BASE` + - Proxy 入口補 envelope(`project_id` / `agent_id` / `trace_id` / `run_id`) + - 經過 EffectivePolicy + budget guard + audit + - read-only / model-call 入口優先 +4. EwoooC 市場情報 tools 註冊為 read-only MCP Gateway tool +5. EwoooC bot/business-analysis 事件先 mirror 到 shadow,14 天後升 canary + +驗收: +- EwoooC 可跑 read-only AwoooP shadow/canary,0 接觸 AWOOOI 資料 +- business-agent context 不可讀 infra-only AWOOOI context +- budget / audit 完全 project-scoped + +### Phase 7 — Communication Hub Increment(補 progressive feedback) + +**目標**:標準化 channel 但不切斷既有 bot。 + +工作項: + +1. `awooop_conversation_event` + `awooop_outbound_message` 表(partition + retention 已在 Phase 1) +2. Telegram inbound mirror 先進 +3. **Progressive Feedback Policy**(async UX gap,#補充): + - WAITING_TOOL / RUNNING / WAITING_APPROVAL 必發暫態訊息 + - 用 Telegram message edit 更新(非新訊息) +4. 既有 bot handler 維持權威,直到 canary 通過量化 gate +5. adapter-level escaping、redaction、idempotency、delivery audit +6. **Canonical principal mapping**(依 ADR-115):所有 channel sender 寫入 `awooop_platform_subjects` + +驗收: +- channel adapter 0 LLM 呼叫、0 MCP 呼叫 +- async run 有進度更新(≤30s 必有第一則) +- duplicate retry 不產生 duplicate run(INV-1 + Phase 4 idempotency 已就位) + +### Phase 8 — Suggest & Controlled Write Paths(補 approval token signing) + +**目標**:從 read-only 升級到 propose、再到 controlled execute。 + +工作項: + +1. 
AWOOOI SRE 部分流程啟用 `suggest` +2. **Approval resume 安全強化**(#10、#11): + - resume API 強制驗 `approval_token`(HMAC-signed) + - approval state PG 為 source of truth、Redis 為 cache + - approval 過期 / 已決 / 重放都拒絕 +3. dry-run 與 rollback evidence gate +4. write/execute 預設 disabled,feature flag 控制 +5. **量化 gate(依 ADR-106 補章)**: + - shadow → canary:≥14 天 + decision divergence < 5% + p95 退化 < 10% + - canary → active:≥7 天 + 0 P1 incident + cost diff < 預算 50% + +驗收: +- WAITING_APPROVAL resume 不能在沒 token 的情況下成功(vuln-verifier 寫 PoC) +- Redis 宕機時 approval 仍可從 PG 恢復 +- write/execute 預設 OFF,需顯式 feature flag + +--- + +## 4. 跨階段橫向工作項(Cross-Cutting) + +| 工作項 | 跨哪些階段 | 負責 | +|--------|-----------|------| +| **Bootstrap order discipline**(ADR-111) | Phase 0、2、4、Forever | platform-runtime | +| **Audit log redaction(雙層)** | Phase 4、5 | mcp-gateway + runtime | +| **High-traffic table partition + retention** | Phase 1、4、7 | db-expert + sre | +| **Observability label cardinality 規則** | Phase 4 起永久 | observability | +| **Contract outbox / active revision invalidation** | Phase 1 表、Phase 4 worker 用 | platform-runtime | +| **Canonical principal mapping** | Phase 0 ADR、Phase 7 落地 | identity | +| **Approval token signing** | Phase 4 token 簽發、Phase 8 verify | security + runtime | +| **EwoooC Provider Proxy Adapter** | Phase 0 設計、Phase 6 落地 | tenant-onboarding | + +--- + +## 5. 
工作排序總表(建議施作順序) + +| 順序 | 工作 | 是否 docs-only | 阻擋誰 | +|------|------|--------------|-------| +| 1 | ADR-111 Bootstrap Order | ✅ | Phase 2 | +| 2 | ADR-112 Contract Governance | ✅ | Phase 3 | +| 3 | ADR-113 Active Revision Outbox | ✅ | Phase 1 | +| 4 | ADR-114 Idempotency & Worker Lease | ✅ | Phase 4 | +| 5 | ADR-115 Principal Mapping & EwoooC Proxy | ✅ | Phase 6、7 | +| 6 | ADR-106 補 Quantified Gates 章節 | ✅ | Phase 8 | +| 7 | INV-1 Redis Key Inventory | ✅ | Phase 2 | +| 8 | INV-2 Repository Retrofit Map | ✅ | Phase 2 | +| 9 | INV-3 Entrypoint Inventory | ✅ | Phase 2 | +| 10 | INV-4 Namespace/IP Inventory | ✅ | Phase 2 | +| 11 | Phase 1 schema migration(重寫版) | ❌ runtime | Phase 2-8 | +| 12 | Task 9 順序修正(Dockerfile/ConfigMap 先) | ❌ runtime | Phase 1 prompt 任何更動 | +| 13 | Phase 2 三階段 Redis 雙寫 + repository project_id | ❌ runtime | Phase 4 起所有 tenant 行為 | +| 14 | Phase 3 contract packages | ❌ runtime | Phase 4 起 | +| 15 | Phase 4 platform shell + shadow + idempotency + audit redaction | ❌ runtime | Phase 5-8 | +| 16 | Phase 5 MCP Gateway + sanitization enforcement + `__provider` | ❌ runtime | Phase 6 read-only tool | +| 17 | Phase 6 EwoooC onboarding via Provider Proxy | ❌ runtime | Phase 7 | +| 18 | Phase 7 Communication Hub + progressive feedback | ❌ runtime | Phase 8 | +| 19 | Phase 8 suggest + approval signing + controlled write | ❌ runtime | 平台 v1 GA | + +**1~10 是 docs-only,可以在當前對話視窗連續做完,全部完成才開新對話進 Phase 1 code。** + +--- + +## 6. 
量化驗收門檻(Strangler Fig Gates) + +每個 tenant × 每個 capability 切換階段都要過: + +| 切換 | 必要條件 | +|------|---------| +| pre → shadow | tenant 已建、agent contract published、audit/trace 寫入正常 | +| shadow → canary | ≥14 天 shadow 觀察 + decision divergence < 5% + p95 latency 退化 < 10% + 0 P0/P1 incident + audit 0 secret 命中 | +| canary → read_only | ≥7 天 canary + user-visible response 錯誤率 < 0.5% + cost diff < 預算 50% | +| read_only → suggest | ≥14 天 read_only + agent suggestion accept rate ≥50% + 0 hallucination escalation | +| suggest → auto_remediate | ≥30 天 suggest + rollback evidence ≥3 次成功 + approval token signing live + dry-run pass rate ≥99% | + +每個 gate 由 12-Agent critic + db-expert + vuln-verifier 三方簽核,寫進 LOGBOOK。 + +--- + +## 7. 授權需求清單(已獲統帥完整授權) + +| 類別 | 動作 | 風險 | +|------|------|------| +| docs-only | 寫 ADR-111~115、4 份 Inventory、ADR-106 補章 | 低 | +| schema | 新增 8 張 AwoooP 控制面表(Phase 1) | 中(DB migration) | +| schema | 30+ 業務表加 `project_id` 欄位 + backfill(Phase 2) | 高(DB migration、要 db-expert review) | +| Redis | 全 codebase 43+ key 三階段雙寫遷移(Phase 2) | 高(影響費用、Telegram、silence、ollama failover) | +| code | 30+ repository 加 `project_id` filter(Phase 2) | 中(regression risk) | +| code | wrap MCP provider 為 Gateway(Phase 5) | 中 | +| infra | partition + retention runbook(Phase 1) | 低 | +| infra | K8s ConfigMap 預載 agent prompt(Task 9 提前) | 中 | +| security | `__provider` 雙底線、approval token 簽章(Phase 5、8) | 中 | +| feature flag | suggest mode、controlled write(Phase 8) | 高(要 vuln-verifier PoC + dry-run evidence) | + +不在本授權範圍: +- 提高 paid provider 配額或啟用新雲端 provider(仍須 HARD_RULES feedback_cost_change_approval 流程) +- 任何 destructive MCP tool 上線(要 Phase 8 evidence 才開) +- Telegram/LINE/Slack webhook 直接切走(必須先 shadow → canary → 量化 gate) + +--- + +## 8. 
Codex 工作模式建議 + +| 階段 | 對話 | cwd | +|------|------|-----| +| 排序 1~10(docs-only) | 當前對話可直接完成 | `/Users/ogt/awoooi` | +| 排序 11 起(runtime code) | 新開 Codex 對話 + 乾淨 worktree | `/Users/ogt/awoooi` | + +實作對話 kickoff prompt 模板: + +``` +讀完: +- AGENTS.md +- docs/12-agent-game-rules.md +- docs/LOGBOOK.md 最新一篇 +- docs/adr/ADR-106 / 107 / 111 / 112 / 113 / 114 / 115 +- docs/awooop/MASTER-WORKPLAN.md +- docs/awooop/inventory/ INV-1 ~ INV-4 + +只做 Phase {N}。不擴張範圍、不改 provider 行為、不切 channel webhook、不建空目錄。 +完成後輸出 [P7-COMPLETION]。 +``` + +--- + +## 9. 參考 + +- `docs/adr/ADR-106-agent-platform-architecture.md` +- `docs/adr/ADR-107-awooop-control-plane-storage.md` +- `docs/adr/ADR-105-revert-a2-ollama-primary.md` +- `docs/adr/ADR-111-awooop-bootstrap-order.md`(待寫) +- `docs/adr/ADR-112-awooop-contract-governance.md`(待寫) +- `docs/adr/ADR-113-awooop-active-revision-outbox.md`(待寫) +- `docs/adr/ADR-114-awooop-idempotency-worker-lease.md`(待寫) +- `docs/adr/ADR-115-awooop-principal-mapping-tenant-onboarding.md`(待寫) +- `docs/awooop/inventory/INV-1-redis-keys.md`(待寫) +- `docs/awooop/inventory/INV-2-repository-project-id-retrofit.md`(待寫) +- `docs/awooop/inventory/INV-3-entrypoints.md`(待寫) +- `docs/awooop/inventory/INV-4-hardcoded-namespace-ip.md`(待寫) +- `docs/12-agent-game-rules.md` +- `docs/LOGBOOK.md` diff --git a/docs/awooop/inventory/INV-1-redis-keys.md b/docs/awooop/inventory/INV-1-redis-keys.md new file mode 100644 index 00000000..a14bc119 --- /dev/null +++ b/docs/awooop/inventory/INV-1-redis-keys.md @@ -0,0 +1,104 @@ +# INV-1: Redis Key Inventory + +**版本**:v1.0 初稿 +**日期**:2026-05-03(台北) +**範圍**:`apps/api/src/` 全 codebase grep +**用途**:Phase 2 三階段雙寫遷移的完整 Redis key 清單 + +--- + +## 使用方式 + +本 INV 列出所有 Redis key pattern、TTL、用途、寫入點、讀取點、是否硬碼跨兩處。 +Phase 2 遷移按批次執行:P0(Critical)→ P1(高風險)→ P2(業務資料)→ P3(低影響) + +--- + +## 1.
P0 — Critical(必須優先遷移) + +| Key Pattern | 用途 | TTL | 寫入點 | 讀取點 | 問題 | 新 Pattern(Phase 2 後)| +|-------------|------|-----|--------|--------|------|------------------------| +| `ollama:current_primary` | Ollama primary URL 選擇 | 無 | `ollama_auto_recovery.py` (兩處!) | `ollama_failover_manager.py` | 🔴 P0-11:第二定義 at ollama_auto_recovery.py:230;三層 GCP 拓撲無法塞入單一 URL | `platform:ollama:topology`(JSON:primary/secondary/fallback)| +| `ollama:gemini_daily_count:{date}` | Gemini 每日使用計數 | 24h | `ollama_auto_recovery.py` | same | 🔴 無 project 前綴,多 tenant 共用 | `platform:ollama:gemini_daily_count:{date}` | +| `telegram:polling:leader` | Telegram polling pod 鎖 | ~30s | `telegram_gateway.py:55` | same | 🔴 platform_resource,應明確標記 | `platform:telegram:polling:leader` | +| `telegram_silence:{target}` | 告警靜默控制 | 設定值 | `telegram_gateway.py:4079`(用 SILENCE_KEY_PREFIX)| `decision_manager.py:240`(🔴 硬碼 `telegram_silence:{target}`,未 import 常數)| P1-24:兩處定義不一致 | `{project_id}:telegram:silence:{target}` | +| `ai_rate:total_cost:{provider}` | 費用上限控制 | 無 | `ai_rate_limiter.py` | `ai_router.py` | 🔴 無 project 前綴,AWOOOI 費用計數會與 EwoooC 混算 | `{project_id}:ai_rate:total_cost:{provider}` | + +## 2. 
P1 — 高風險(Phase 2 第二批) + +| Key Pattern | 用途 | TTL | 寫入點 | 讀取點 | 問題 | 新 Pattern | +|-------------|------|-----|--------|--------|------|------------| +| `approval:{run_id}:*` | approval 審核狀態 | 15min | `multi_sig_redis.py` | same | 🔴 P1-15:無 trace_id;approval_token 未簽章 | `{project_id}:approval:{run_id}:*`(Phase 8 後加 jti)| +| `incident:{incident_id}` | incident working memory | 24h | `incident_service.py` | `incident_service.py:603`(SCAN `incident:*`)| 🔴 P1-18:SCAN 無 prefix 篩選 | `{project_id}:incident:{incident_id}` | +| `consensus:{*}` | consensus engine 狀態 | varies | `consensus_engine.py`(CONSENSUS_PREFIX="consensus:")| same | 🔴 P0-12:無 project 前綴,multi-tenant 共用 | `{project_id}:consensus:{*}` | +| `playbook:{playbook_id}` | playbook cache | 1h | `playbook_repository.py` | same | 🟠 無 project,但目前 AWOOOI 只有一個 tenant | `{project_id}:playbook:{playbook_id}` | +| `playbook:embedding:index` | embedding index | 無 | `playbook_repository.py` | same | 🟠 無 project | `{project_id}:playbook:embedding:index` | +| `playbook:index:alert:{alert_name}` | alert → playbook 映射 | 1h | `playbook_repository.py` | same | 🟠 無 project | `{project_id}:playbook:index:alert:{alert_name}` | +| `playbook:embedding:{playbook_id}` | embedding vector | 無 | `playbook_repository.py` | same | 🟠 無 project | `{project_id}:playbook:embedding:{playbook_id}` | +| `km:backfill:dlq` | KM backfill dead letter queue | 無 | `km_service.py` | same | 🟠 無 project | `{project_id}:km:backfill:dlq` | + +## 3. 
P2 — 業務資料(Phase 2 第三批) + +| Key Pattern | 用途 | TTL | 寫入點 | 讀取點 | 問題 | 新 Pattern | +|-------------|------|-----|--------|--------|------|------------| +| `anomaly:disposition:{alert_id}` | 告警處置決定 | 7d | `anomaly_counter.py:790` | same | 🔴 P1-17:AnomalyCounter 全域單例,6 個 prefix 無 tenant | `{project_id}:anomaly:disposition:{alert_id}` | +| `anomaly:metadata:{alert_id}` | 告警 metadata | 7d | `anomaly_counter.py` | same | 同上 | `{project_id}:anomaly:metadata:{alert_id}` | +| `anomaly:permanent_fix:{alert_id}` | 永久修復紀錄 | 30d | `anomaly_counter.py` | same | 同上 | `{project_id}:anomaly:permanent_fix:{alert_id}` | +| `anomaly:repair_count:{alert_id}` | 修復計數 | 7d | `anomaly_counter.py` | same | 同上 | `{project_id}:anomaly:repair_count:{alert_id}` | +| `anomaly:repair_history:{alert_id}` | 修復歷史 | 30d | `anomaly_counter.py` | same | 同上 | `{project_id}:anomaly:repair_history:{alert_id}` | +| `anomaly:timeline:{alert_id}` | 告警時間線 | 7d | `anomaly_counter.py` | same | 同上 | `{project_id}:anomaly:timeline:{alert_id}` | +| `hermes:session:{chat_id}:{user_id}` | Hermes 對話狀態 | 30min | `hermes/nl_gateway.py:146` | same | 🔴 P1-16:無 project 前綴 | `{project_id}:hermes:session:{chat_id}:{user_id}` | +| `hermes:rl:{chat_id}` | Hermes rate limit | 1min | `hermes/nl_gateway.py:163` | same | 🔴 P1-16 | `{project_id}:hermes:rl:{chat_id}` | +| `hermes:approvers` | Hermes 審核者清單 | 無 | `hermes/nl_gateway.py:7` | same | 🔴 P1-16 | `{project_id}:hermes:approvers` | +| `session:{session_id}` | agent session | varies | `agent_sessions service` | same | 🟠 無 project | `{project_id}:session:{session_id}` | +| `tg_sent:{fingerprint}` | Telegram 去重 dedup | 10min | `telegram_gateway.py` | same | 🟢 ADR-109 已定(dedup 相容性);注意 EwoooC Telegram 需要不同 fingerprint scope | `{project_id}:tg_sent:{fingerprint}` | +| `telegram:{user_id}:{*}` | Telegram session / callback | varies | `telegram_gateway.py` | same | 🟠 無 project | `{project_id}:telegram:{user_id}:{*}` | + +## 4. 
P3 — 低影響(Phase 2 最後) + +| Key Pattern | 用途 | TTL | 問題 | 新 Pattern | +|-------------|------|-----|------|------------| +| `playbook:*`(SCAN)| playbook 全量掃描 | — | 🟠 SCAN 無 prefix 篩選 | 禁止裸 SCAN,改 `SCAN {project_id}:playbook:*` | +| `approval:123:lock`(測試用)| 測試 key | — | 🔵 hardcode 測試字串,清理 | 移除 | + +--- + +## 5. Platform Resource(不按 project 隔離,明確標記) + +| Key Pattern | 理由 | 備註 | +|-------------|------|------| +| `platform:telegram:polling:leader` | 全域 polling leader 鎖,不屬於任何 tenant | ADR-111 platform_resource | +| `platform:ollama:topology` | GCP-A/GCP-B/Local 三層路由狀態,所有 tenant 共用 | ADR-111 + ADR-110 | +| `platform:ollama:gemini_daily_count:{date}` | 全 platform Gemini 緊急路由計數 | ADR-111 | + +--- + +## 6. 遷移執行順序(三階段) + +``` +Phase A(雙寫,30 天觀察): + 新 key 寫入 + 舊 key 繼續寫入 + 讀取仍以舊 key 為主 + 監控:舊 key 讀取次數 / 新 key 命中次數 + +Phase B(雙讀,14 天): + 新 key 為主,舊 key 為 fallback + 讀不到新 key → 讀舊 key,並回填新 key + 監控:舊 key fallback 率必須 < 1% + +Phase C(移除舊 key 寫入): + 停止寫入舊 key + 舊 key 自然過期 + 監控:確認舊 key 0 讀取後才進行 +``` + +--- + +## 7. 驗收標準 + +- [ ] 所有 P0 key 完成 Phase A 雙寫 +- [ ] `decision_manager.py:240` 改為 import `telegram_gateway.SILENCE_KEY_PREFIX` +- [ ] `ollama:current_primary` 改為 `platform:ollama:topology`(JSON 格式) +- [ ] 無任何裸 `SCAN *` 或 `SCAN prefix:*`(改為 `SCAN {project_id}:prefix:*`) + +*最後更新:2026-05-03(台北)* diff --git a/docs/awooop/inventory/INV-2-repository-project-id-retrofit.md b/docs/awooop/inventory/INV-2-repository-project-id-retrofit.md new file mode 100644 index 00000000..029e572a --- /dev/null +++ b/docs/awooop/inventory/INV-2-repository-project-id-retrofit.md @@ -0,0 +1,137 @@ +# INV-2: Repository project_id Retrofit Map + +**版本**:v1.0 初稿 +**日期**:2026-05-03(台北) +**範圍**:`apps/api/src/db/models.py` + `apps/api/migrations/` 全部 +**用途**:Phase 2 在 30+ 業務表加 project_id 欄位、backfill 歷史資料、repository filter 改造 + +--- + +## 1. 
現有資料庫表清單(從 db/models.py 實際讀取) + +| 表名 | SQLAlchemy 類名 | 有無 project_id | 是否需要加 | Phase | +|------|----------------|----------------|------------|-------| +| `approval_records` | ApprovalRecord | ❌ | ✅ 是(approval 需 tenant 隔離)| Phase 2 | +| `timeline_events` | TimelineEvent | ❌ | ✅ 是(關聯 incident)| Phase 2 | +| `audit_logs` | AuditLog | ❌ | ✅ 是(審計必須 project-scoped)| Phase 2 | +| `mcp_audit_log` | McpAuditLog | ❌ | ✅ 是(MCP audit 必 project-scoped)+ 加 trace_id / run_id | Phase 2 | +| `mcp_daily_stats` | McpDailyStats | ❌ | ✅ 是(統計分 project)| Phase 2 | +| `k8s_state_snapshots` | K8sStateSnapshot | ❌ | ✅ 是(K8s 狀態屬於 tenant)| Phase 2 | +| `prometheus_snapshots` | PrometheusSnapshot | ❌ | ✅ 是(metrics 屬於 tenant)| Phase 2 | +| `auto_repair_executions` | AutoRepairExecution | ❌ | ✅ 是(修復操作屬於 tenant)| Phase 2 | +| `alert_operation_log` | AlertOperationLog | ❌ | ✅ 是(告警操作屬於 tenant)| Phase 2 | +| `incidents` | Incident | ❌ | ✅ 是(P0-03 核心問題)+ 加 trace_id / awooop_run_id | Phase 2 | +| `knowledge_entries` | KnowledgeEntry | ❌ | ✅ 是(KM 需要 per-project namespace)| Phase 2 | +| `incident_evidence` | IncidentEvidence | ❌ | ✅ 是(關聯 incident)| Phase 2 | +| `playbooks` | Playbook | ❌ | ✅ 是(playbook 屬於 tenant)| Phase 2 | +| `dynamic_baselines` | DynamicBaseline | ❌ | ✅ 是(基線屬於 tenant)| Phase 2 | +| `log_clusters` | LogCluster | ❌ | ✅ 是(日誌分析屬於 tenant)| Phase 2 | +| `agent_sessions` | AgentSession | ❌ | ✅ 是(session 必 tenant-scoped)| Phase 2 | +| `ai_governance_events` | AiGovernanceEvent | ❌ | ✅ 是(治理事件屬於 tenant)| Phase 2 | +| `governance_remediation_dispatch` | GovernanceRemediationDispatch | ❌ | ✅ 是 | Phase 2 | +| `trust_records` | TrustRecord | ❌ | ✅ 是(trust 屬於 tenant)| Phase 2 | +| `ai_provider_version_history` | AiProviderVersionHistory | ❌ | `platform_internal`(追蹤全平台 provider 版本,project_id = __platform__)| Phase 2 | + +> 注:migrations 目錄還有其他表(adr*.sql 建立的),需逐一確認。 + +--- + +## 2. 
Migration SQL 表清單(從 migrations/*.sql 掃描) + +| Migration 檔案 | 建立的表 | 是否需要 project_id | +|----------------|---------|-------------------| +| `adr071_notification_lifecycle.sql` | notification 相關表 | ✅ 需確認 | +| `adr088_trust_score_persistence.sql` | trust 相關表 | ✅ 需確認 | +| `adr090_asset_inventory_foundation.sql` | asset inventory 表 | ✅ 是(asset 屬於 tenant)| +| `adr091_aider_events_schema.sql` | aider_events | ✅ 是 | +| `adr092_p1_learning_chain_fix.sql` | learning chain 相關 | ✅ 是 | +| `adr093_notification_routing.sql` | notification routing | ✅ 需確認 | +| `adr094_hermes_dispatch_log.sql` | hermes_dispatch_log | ✅ 是 | +| `adr104_playbook_versioning.sql` | playbook 版本相關 | ✅ 是 | +| `adr105_mcp_audit_snapshots.sql` | ⚠️ 注意:表名是 `mcp_audit_snapshots` 但 models.py 用的是 `mcp_audit_log` → 需確認是否同一張表或已廢棄 | 確認後決定 | + +--- + +## 3. project_id 加入策略 + +### 3.1 Migration 範本 + +```sql +-- Phase 2:在既有表加 project_id(以 incidents 為例) +ALTER TABLE incidents + ADD COLUMN IF NOT EXISTS project_id VARCHAR(64) + NOT NULL DEFAULT 'awoooi'; -- backfill default + +-- backfill 驗證後移除 DEFAULT(不能永遠有 DEFAULT) +-- 此步驟在 backfill 確認完成後再執行: +-- ALTER TABLE incidents ALTER COLUMN project_id DROP DEFAULT; + +-- Index +CREATE INDEX CONCURRENTLY IF NOT EXISTS + idx_incidents_project_id ON incidents(project_id); + +-- Foreign key(可選,建議使用) +ALTER TABLE incidents + ADD CONSTRAINT fk_incidents_project + FOREIGN KEY (project_id) REFERENCES awooop_projects(project_id); +``` + +### 3.2 Backfill 策略 + +所有歷史資料: +- `project_id = 'awoooi'`(AWOOOI first tenant,Phase 1 已建立) +- 不需要轉換或映射 +- 在 migration SQL 中以 `DEFAULT 'awoooi'` 完成 backfill + +### 3.3 Repository 方法改造(批次計畫) + +Phase 2 PR-08(批次 1): +- `IncidentRepository.get_by_id(incident_id)` → `get_by_id(incident_id, project_id)` +- `IncidentRepository.list_active()` → `list_active(project_id)` +- `PlaybookRepository.find_by_alert_type(alert_type)` → `find_by_alert_type(alert_type, project_id)` +- `KnowledgeRepository.search(query)` → `search(query, project_id)` + +Phase 2 PR-09(批次 2): 
+- `McpAuditRepository.*` 全部加 project_id + trace_id +- `AiDecisionRepository.*` 全部加 project_id + run_id +- `ApprovalRepository.*` 全部加 project_id + trace_id + +--- + +## 4. RLS 設計(ADR-118,補充) + +```sql +-- 每張表的 RLS policy(以 incidents 為例) +ALTER TABLE incidents ENABLE ROW LEVEL SECURITY; + +CREATE POLICY incidents_tenant_isolation ON incidents + USING ( + project_id = current_setting('app.project_id', TRUE) + OR current_user = 'awooop_platform' + ); + +-- 應用程式在每個 transaction 開始時設定(SET LOCAL 僅在 transaction 內有效,不能只在 connection 建立時設一次): +-- SET LOCAL app.project_id = '{project_id}'; +``` + +--- + +## 5. 特別注意項目 + +| 問題 | 影響 | +|------|------| +| `adr105_mcp_audit_snapshots.sql` 建的表名 `mcp_audit_snapshots` vs models.py 的 `mcp_audit_log` | P0-02 的一部分:表名不一致,migration 可能廢棄或重建 | +| `knowledge_entries` 需要 pgvector namespace 隔離 | KM 向量搜索的 namespace 需同步加 project_id scope | +| `agent_sessions` 已有 session_id,改造時要確保 session 不跨 tenant | 改造前先確認 session lookup 路徑 | + +--- + +## 6. 驗收標準 + +- [ ] 所有 20 張表(db/models.py)完成 project_id migration +- [ ] backfill:所有舊資料 project_id = 'awoooi' +- [ ] 所有 repository 方法加 project_id filter +- [ ] cross-project SELECT 全部被 RLS 過濾(回傳 0 筆,不會拋錯;pytest 覆蓋) +- [ ] `mcp_audit_snapshots` vs `mcp_audit_log` 衝突已解決 + +*最後更新:2026-05-03(台北)* diff --git a/docs/awooop/inventory/INV-3-entrypoints.md b/docs/awooop/inventory/INV-3-entrypoints.md new file mode 100644 index 00000000..e1ecf41e --- /dev/null +++ b/docs/awooop/inventory/INV-3-entrypoints.md @@ -0,0 +1,110 @@ +# INV-3: Entrypoint Inventory + +**版本**:v1.0 初稿 +**日期**:2026-05-03(台北) +**範圍**:`apps/api/src/main.py` + 所有 webhook / CLI / cron 入口點 +**用途**:Phase 2 Bootstrap Paradox 修補的 31 個 background loop 分類 + +--- + +## 1. 標記定義(ADR-111) + +| 標記 | 意義 | +|------|------| +| `platform_internal` | 平台本身的維護工作,不屬於任何 tenant,帶 `project_id=__platform__` | +| `legacy_awoooi_default` | 過渡期:暫時 fallback 到 `project_id=awoooi`,有退場時程 | +| `requires_project_id` | 必須帶 project_id 才能運行,Phase 2 改造重點 | +| `multi_tenant_ready` | 已支援 project_id(Phase 2 後才能設定)| + +--- + +## 2. 
API Webhook 入口(同步接收) + +| 入口 | 檔案 / 路由 | 標記 | project_id 來源 | +|------|------------|------|----------------| +| `POST /v1/webhooks/telegram` | `api/v1/webhooks.py:679` | `legacy_awoooi_default` | 固定 awoooi,Phase 7 後改為 principal mapping | +| `POST /v1/webhooks/alertmanager` | `api/v1/webhooks.py` | `legacy_awoooi_default` | 固定 awoooi | +| `POST /v1/webhooks/gitea` | `api/v1/webhooks.py` | `legacy_awoooi_default` | 固定 awoooi | +| `GET /v1/health` | `api/v1/health.py` | `platform_internal` | `__platform__` | +| `GET /v1/metrics` | `api/v1/metrics.py` | `platform_internal` | `__platform__` | +| `POST /v1/decisions/` | `api/v1/decisions.py` | `legacy_awoooi_default` | 固定 awoooi,Phase 4 後改為 contract lookup | +| `POST /v1/incidents/` | `api/v1/incidents.py` | `legacy_awoooi_default` | 固定 awoooi | + +--- + +## 3. Background Loops(main.py asyncio.create_task,共 31 個) + +> 來源:`apps/api/src/main.py` grep `asyncio.create_task`,行號已驗證 + +| # | 函數名 | main.py 行號 | 類別 | 標記 | 備註 | +|---|--------|-------------|------|------|------| +| 1 | `seed_playbooks_from_rules()` | 331 | 啟動一次性 | `legacy_awoooi_default` | playbook seed,只跑一次 | +| 2 | `get_playbook_repository().backfill_redis_to_pg()` | 341 | 啟動一次性 | `legacy_awoooi_default` | Redis → PG backfill,只跑一次 | +| 3 | `ensure_playbook_embeddings_indexed()` | 348 | 啟動一次性 | `legacy_awoooi_default` | embedding index,只跑一次 | +| 4 | `get_decision_manager().resend_stale_ready_tokens()` | 362 | 持續 loop | `legacy_awoooi_default` | stale approval token 重送 | +| 5 | `run_incident_analysis_sweeper()` | 373 | 持續 loop | `legacy_awoooi_default` | incident 週期分析 | +| 6 | `run_asset_scanner_loop()` | 383 | 持續 loop | `legacy_awoooi_default` | K8s asset 掃描(ADR-090)| +| 7 | `run_rule_catalog_sync_loop()` | 393 | 持續 loop | `legacy_awoooi_default` | 告警規則 catalog 同步 | +| 8 | `run_capacity_scanner_loop()` | 403 | 持續 loop | `legacy_awoooi_default` | 容量掃描 | +| 9 | `run_compliance_scanner_loop()` | 413 | 持續 loop | `legacy_awoooi_default` | 合規掃描 | +| 10 | 
`run_aider_event_processor_loop()` | 423 | 持續 loop | `legacy_awoooi_default` | aider 事件處理 | +| 11 | `run_coverage_evaluator_loop()` | 432 | 持續 loop | `legacy_awoooi_default` | 覆蓋率評估 | +| 12 | `run_rule_stats_updater_loop()` | 442 | 持續 loop | `legacy_awoooi_default` | 規則統計更新 | +| 13 | `run_asset_change_tracker_loop()` | 452 | 持續 loop | `legacy_awoooi_default` | 資產變更追蹤 | +| 14 | `run_hermes_rule_quality_loop()` | 462 | 持續 loop | `legacy_awoooi_default` | Hermes 規則品質 | +| 15 | `run_capacity_forecaster_loop()` | 472 | 持續 loop | `legacy_awoooi_default` | 容量預測 | +| 16 | `run_daily_report_loop()` | 481 | 持續 loop | `legacy_awoooi_default` | 每日報告 | +| 17 | `run_approval_timeout_resolver()` | 490 | 持續 loop | `legacy_awoooi_default` | 🔴 P1-15:無 trace_id | +| 18 | `run_evolver_loop()` | 499 | 持續 loop | `legacy_awoooi_default` | playbook 進化 | +| 19 | `run_playbook_generation_governance_loop()` | 507 | 持續 loop | `legacy_awoooi_default` | playbook 生成治理 | +| 20 | `run_knowledge_decay_loop()` | 519 | 持續 loop | `legacy_awoooi_default` | KM 知識衰退 | +| 21 | `run_km_backfill_reconciler_loop()` | 529 | 持續 loop | `legacy_awoooi_default` | KM backfill 核對 | +| 22 | `run_aol_writeback_loop()` | 540 | 持續 loop | `legacy_awoooi_default` | AOL writeback(飛輪)| +| 23 | `_run_kb_rot_cleaner_loop()` | 585 | 持續 loop | `legacy_awoooi_default` | KB 腐敗清理 | +| 24 | `run_finetune_export_loop()` | 594 | 持續 loop | `legacy_awoooi_default` | finetune 資料匯出 | +| 25 | `run_proactive_inspector_loop()` | 605 | 持續 loop | `legacy_awoooi_default` | 主動巡檢 | +| 26 | `run_offline_replay_loop()` | 614 | 持續 loop | `legacy_awoooi_default` | 離線重放 | +| 27 | `run_ai_slo_watchdog_loop()` | 623 | 持續 loop | `platform_internal` | SLO watchdog — 監控本平台健康,project_id=__platform__ | +| 28 | `run_governance_loop()` | 632 | 持續 loop | `legacy_awoooi_default` | AI 治理主循環 | +| 29 | `run_governance_dispatcher_loop()` | 640 | 持續 loop | `legacy_awoooi_default` | 治理事件派送 | +| 30 | `_run_model_version_tracker_loop()` | 701 | 持續 loop | 
`platform_internal` | AI model 版本追蹤 — platform_resource | +| 31 | (需確認 main.py 701 後是否還有)| 701+ | TBD | TBD | grep 計數 = 31,確認後補 | + +--- + +## 4. 遷移策略(依 ADR-123) + +### platform_internal(2 個,#27、#30) +- 帶 `project_id=__platform__` +- 不受 project RLS 限制 +- 但必須寫 audit log(標記 `platform_resource=true`) + +### legacy_awoooi_default(29 個,其餘所有) +- 過渡期:帶 `project_id=awoooi` +- 退場時程:Phase 4 完成後 90 天內逐一改造為 `requires_project_id` +- 每個 loop 改造後從 `legacy_awoooi_default` → `multi_tenant_ready` + +### 特別注意 +- `run_approval_timeout_resolver()`(#17):改造時必須同步補入 trace_id(P1-15) +- `run_aol_writeback_loop()`(#22):改造時確認 KM 雙路徑寫入(feedback_km_dual_path_design.md) + +--- + +## 5. CLI / Script 入口 + +| 入口 | 類別 | 標記 | +|------|------|------| +| `python -m apps.api` / `uvicorn apps.api.src.main:app` | 主程序啟動 | `platform_internal` | +| `python -m alembic upgrade head` | DB migration | `platform_internal` | +| `python scripts/seed_*.py`(若有)| 資料 seed | `platform_internal` | + +--- + +## 6. 驗收標準 + +- [ ] 31 個 background loop 全部有標記 +- [ ] `platform_internal` loop 帶 `project_id=__platform__`(可在 logging context 確認) +- [ ] `legacy_awoooi_default` loop 帶 `project_id=awoooi`(fallback,不是最終形態) +- [ ] 退場時程寫入 ADR-123 + +*最後更新:2026-05-03(台北)* diff --git a/docs/awooop/inventory/INV-4-hardcoded-namespace-ip.md b/docs/awooop/inventory/INV-4-hardcoded-namespace-ip.md new file mode 100644 index 00000000..f15a9161 --- /dev/null +++ b/docs/awooop/inventory/INV-4-hardcoded-namespace-ip.md @@ -0,0 +1,108 @@ +# INV-4: Hardcoded Namespace & IP Inventory + +**版本**:v1.0 初稿 +**日期**:2026-05-03(台北) +**範圍**:`apps/api/src/` 全 codebase + `k8s/` +**用途**:Phase 2 多租戶改造 + EwoooC onboarding 前必須清理的硬碼 + +--- + +## 1. 
Hardcoded K8s Namespace + +| 位置 | 行號 | 內容 | 影響 | 修補方式 | +|------|------|------|------|---------| +| `apps/api/src/plugins/mcp/providers/k8s_provider.py` | 40-41 | `ALLOWED_NAMESPACES = {"awoooi-prod"}` / `DEFAULT_NAMESPACE = "awoooi-prod"` | 🔴 P0-13:EwoooC K8s tool 無法操作自己的 namespace | 改為 config-driven:從 EffectivePolicy 或 project contract 讀 allowed namespaces | +| `apps/api/src/plugins/mcp/mcp_bridge.py` | 592, 602, 631, 647, 681 | `namespace = parameters.get("namespace", "awoooi-prod")` | 🔴 P0-13:5 處預設值都寫死 | 改為 `namespace = parameters.get("namespace", get_project_default_namespace(project_id))` | +| `apps/api/src/plugins/mcp/providers/signoz_provider.py` | 169 | `namespace = parameters.get("namespace", "awoooi-prod")` | 🟠 SignOz query 預設 namespace 錯誤 | 同上 | +| `apps/api/src/main.py` | 175 | `sentry_sdk.set_tag("host", "k8s-awoooi-prod")` | 🔵 Sentry tag 寫死,EwoooC 看到錯誤 host tag | 改為 `settings.SENTRY_HOST_TAG` | +| `apps/api/src/core/prompts.py` | 120, 178, 192 | 系統 prompt 中出現 `awoooi-prod` | 🔵 LLM 可能在其他 tenant 錯誤建議 awoooi-prod namespace | 改為 `{tenant_namespace}` template variable | + +--- + +## 2. Hardcoded IP Addresses + +### 內網 IP(已知用途) + +| IP | 用途 | 位置 | 改法 | +|----|------|------|------| +| `192.168.0.110` | Gitea / Prometheus / Loki / MinIO | `config.py:226,413,460,813` | 已在 config 以 default 存在,OK(K8s 覆蓋)| +| `192.168.0.111` | Ollama Local Fallback | `config.py` + `feedback_ollama_111_only.md`(已更新) | ADR-110 已改為第三層 fallback,config 需更新 | +| `192.168.0.112` | ArgoCD | `config.py:396` | OK(config default)| +| `192.168.0.120` | K3s API Server | `config.py:531` | OK(config default)| +| `192.168.0.121` | K3s ingress? 
| `config.py:837` | 確認用途 | +| `192.168.0.188` | PostgreSQL / Redis / SigNoz / Grafana / ClickHouse | `config.py` 多處 | OK(config default,K8s 以 env 覆蓋)| + +### 🔴 問題:telemetry.py IP Assertion + +| 位置 | 行號 | 內容 | 問題 | +|------|------|------|------| +| `apps/api/src/core/telemetry.py` | 71 | `if "192.168.0.188" not in endpoint: raise` | 🔴 P0-08:EwoooC 啟動必失敗(EwoooC SigNoz 可能是不同 endpoint)| +| 修補方式 | | | 移除硬碼 assert,改為 `if endpoint not in settings.ALLOWED_TELEMETRY_ENDPOINTS:` | + +### GCP IP(新增,ADR-110,2026-05-03 生效) + +| IP | 用途 | 位置 | +|----|------|------| +| `34.143.170.20` | Ollama GCP-A Primary(SSD)| `config.py`(ADR-110 已加入 `_ALLOWED_PUBLIC_IPS`)| +| `34.21.145.224` | Ollama GCP-B Secondary(SSD)| `config.py`(ADR-110 已加入 `_ALLOWED_PUBLIC_IPS`)| + +**注意**: +- K8s NetworkPolicy egress:已新增 GCP-A/GCP-B /32 出口規則(ADR-110) +- INV-4 確認:GCP IP 已在 `config.py._ALLOWED_PUBLIC_IPS` 白名單,非新增需求 +- telemetry.py:71 assert:GCP IP 不影響(assert 是針對 OTEL endpoint,非 Ollama endpoint) + +--- + +## 3. SSH Host Hardcodes + +| 位置 | 內容 | 問題 | 修補 | +|------|------|------|------| +| `reference_four_hosts.md` | 110/120/121/188 四主機清單 | 文件,不是程式碼,OK | 無 | +| `apps/api/src/plugins/mcp/providers/ssh_provider.py`(若有)| SSH 目標主機 | 需 grep 確認是否硬碼 | 改為 config-driven 白名單 | + +--- + +## 4. 其他硬碼字串 + +| 位置 | 內容 | 問題 | 修補 | +|------|------|------|------| +| `apps/api/src/core/config.py:625` | `default="192.168.0.188=ollama"` | Ollama-to-host mapping,ADR-110 後需更新 | 改為 `192.168.0.188=ollama_old`(僅 fallback 相關)| +| `apps/api/src/core/config.py:828` | `default="192.168.0.188,192.168.0.110,192.168.0.111"` | 主機清單 | 確認用途,若是 monitoring target 需加 GCP IP | + +--- + +## 5. 
改造策略(Phase 2) + +### K8s Namespace(優先) +```python +# 方案:project contract 中定義允許的 K8s namespaces +# awooop_projects.k8s_namespaces: ["awoooi-prod"](AWOOOI)/ ["ewoooc-prod"](EwoooC) +# k8s_provider.py 從 project contract 讀,而非硬碼 + +def get_allowed_namespaces(project_id: str) -> set[str]: + contract = get_active_project_contract(project_id) + return set(contract.allowed_k8s_namespaces) +``` + +### Telemetry Endpoint Assert(P0-08,PR-01 優先) +```python +# 修改前(telemetry.py:71) +if "192.168.0.188" not in endpoint: + raise ValueError(f"Forbidden OTEL endpoint: {endpoint}") + +# 修改後 +allowed_endpoints = settings.ALLOWED_TELEMETRY_ENDPOINTS.split(",") +if not any(allowed in endpoint for allowed in allowed_endpoints): + raise ValueError(f"Forbidden OTEL endpoint: {endpoint}") +``` + +--- + +## 6. 驗收標準 + +- [ ] `grep -r "192.168.0.188" apps/api/src/` 中 `telemetry.py` 的 assert 行消失 +- [ ] `grep -r '"awoooi-prod"' apps/api/src/` 中的程式碼路徑(非 prompt 文字、非 comment)結果為 0 +- [ ] k8s_provider.py `ALLOWED_NAMESPACES` 改為 config-driven +- [ ] INV-4 中標記 GCP IP 已確認加入 NetworkPolicy(ADR-110 完成) + +*最後更新:2026-05-03(台北)* diff --git a/docs/awooop/inventory/INV-5-migration-compatibility-matrix.md b/docs/awooop/inventory/INV-5-migration-compatibility-matrix.md new file mode 100644 index 00000000..d42dce77 --- /dev/null +++ b/docs/awooop/inventory/INV-5-migration-compatibility-matrix.md @@ -0,0 +1,116 @@ +# INV-5: Migration Compatibility Matrix + +**版本**:v1.0 初稿 +**日期**:2026-05-03(台北) +**範圍**:`apps/api/pyproject.toml` 實際版本 + AwoooP Phase 1 migration 需求 +**用途**:Phase 1 schema migration 前確認版本相容性,避免 breaking change 踩雷 + +--- + +## 1. 
當前依賴版本(從 pyproject.toml 實際讀取) + +| 套件 | 當前約束 | 對 AwoooP Phase 1 的影響 | +|------|---------|------------------------| +| Python | >=3.11 | ✅ 足夠;`asyncio.TaskGroup`(3.11)可用 | +| FastAPI | >=0.115.0 | ✅ 0.115 已支援 lifespan context manager | +| SQLAlchemy | **>=2.0.0** ✅ 已是 2.x | ✅ `mapped_column`、`DeclarativeBase`、async session 均可用 | +| Pydantic | **>=2.5.0** ✅ 已是 v2 | ✅ `model_validator`、`field_validator` v2 語法可直接用 | +| pydantic-settings | >=2.1.0 | ✅ | +| asyncpg | >=0.29.0 | ✅ 支援 PostgreSQL 15/16 | +| redis | >=5.0.0 | ✅ redis-py 5.x 支援 cluster mode + NX | +| Alembic | **未列出**(SQLAlchemy 不會間接安裝 Alembic,需確認環境是否實際安裝)| ⚠️ 需確認版本;建議鎖定 >=1.13.0 | +| Langfuse | >=2.0.0,<3.0.0(鎖定)| ✅ 無 AwoooP 影響,保持不變 | +| opentelemetry | >=1.20.0 | ⚠️ 需升級:GenAI semantic conventions 需要 >=1.25.0(ADR-121)| +| claude-agent-sdk | >=0.1.50 | ✅ 已使用 | + +--- + +## 2. 重要發現 + +**✅ SQLAlchemy 已是 2.x**:MASTER-WORKPLAN 中「ORM 1.x vs 2.x」問題**已不存在**。 +- `mapped_column`、`relationship`、async session 均可正常用 +- 但仍需確認:AwoooP 新 model 使用正確的 2.x 寫法(`DeclarativeBase` 而非舊 `declarative_base()`) + +**✅ Pydantic 已是 v2**:六合約 Pydantic models 可直接用 v2 語法。 + +**⚠️ Alembic 版本未鎖定**: +- `apps/api/migrations/` 有大量 `.sql` 手寫 migration(非 Alembic auto-generate) +- AwoooP Phase 1 要決定:繼續手寫 SQL migration,還是引入 Alembic autogenerate? +- **建議**:保持手寫 SQL(現有慣例),每個 migration 強制有 down migration + +**⚠️ OTel 版本需升級(ADR-121)**: +- GenAI Semantic Conventions 需要 `opentelemetry-semantic-conventions>=0.46b0`(含 `gen_ai.*` namespace) +- 當前 `>=1.20.0` 可能不夠,需要 `>=1.25.0` +- 升級風險:低(向後相容);需要加入 `opentelemetry-semantic-conventions` 依賴 + +--- + +## 3. AwoooP Phase 1 新依賴需求 + +| 套件 | 版本 | 用途 | 是否需要新增 | +|------|------|------|------------| +| `pg_partman`(PostgreSQL extension)| >=4.7.0 | 高流量表 partition 自動管理 | ⚠️ 需在 DB 層安裝,非 Python 套件 | +| `opentelemetry-semantic-conventions` | >=0.46b0 | OTel GenAI attribute(ADR-121)| ✅ 需新增到 pyproject.toml | +| `PgBouncer` | >=1.21.0 | Connection pool(tool-expert 需求)| ⚠️ Infrastructure,非 Python 套件 | + +--- + +## 4. 
Migration SQL 版本相容性 + +| Migration 問題 | 當前狀態 | 修補方式 | +|---------------|---------|---------| +| `adr105_mcp_audit_snapshots.sql` 建表 `mcp_audit_snapshots`,但 models.py 用 `mcp_audit_log` | ⚠️ 衝突 | 確認是否已 rollback;若 `mcp_audit_log` 是現行表,則 `mcp_audit_snapshots` 已廢棄 | +| 部分 migration 無 rollback SQL | ⚠️ MASTER-WORKPLAN P0-02 | Phase 1 新 migration 強制加 down migration | +| `ARRAY` 型別在部分 migration 用 `TEXT[]`,在其他用 `JSONB` | 🟠 不一致 | AwoooP 新表一律用 `JSONB`(`fix_playbooks_array_to_jsonb.sql` 已示範)| + +--- + +## 5. SQLAlchemy 2.x 正確寫法確認清單(Phase 1 新 model 必須遵守) + +```python +# ✅ 正確(SQLAlchemy 2.x) +from sqlalchemy.orm import DeclarativeBase, mapped_column, Mapped +from sqlalchemy import String, DateTime, func +from datetime import datetime + +class Base(DeclarativeBase): + pass + +class AwooopProject(Base): + __tablename__ = "awooop_projects" + project_id: Mapped[str] = mapped_column(String(64), primary_key=True) + display_name: Mapped[str] = mapped_column(String(128), nullable=False) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now() + ) + migration_mode: Mapped[str] = mapped_column( + String(32), nullable=False, default="legacy_awoooi_default" + ) + +# ❌ 舊寫法(SQLAlchemy 1.x,不要用) +# from sqlalchemy.ext.declarative import declarative_base +# Base = declarative_base() +# project_id = Column(String(64), primary_key=True) +``` + +--- + +## 6. 版本兼容風險清單 + +| 風險 | 嚴重度 | 緩解方式 | +|------|--------|---------| +| pg_partman 未安裝,partition 手動建立出錯 | 中 | Phase 1 先不建 partition,只規劃;Phase 4 前安裝 pg_partman | +| OTel semantic conventions 版本不足 | 低 | Phase 4 前升級(非 Phase 1 阻擋項)| +| PgBouncer 未安裝,AwoooP worker 壓力測試可能爆連接 | 中 | Phase 4(platform shell)前安裝 | +| Alembic 未鎖定,CI 環境版本飄移 | 低 | 鎖定 `alembic>=1.13.0` 加入 pyproject.toml | + +--- + +## 7. 
驗收標準 + +- [ ] Phase 1 所有新 model 使用 SQLAlchemy 2.x `mapped_column` 語法 +- [ ] 所有新 migration 有 down migration SQL +- [ ] `mcp_audit_snapshots` vs `mcp_audit_log` 衝突已解決(確認哪個是現行表) +- [ ] `pg_partman` 安裝計畫已寫入 runbook + +*最後更新:2026-05-03(台北)* diff --git a/docs/awooop/inventory/INV-6-rollback-playbook-register.md b/docs/awooop/inventory/INV-6-rollback-playbook-register.md new file mode 100644 index 00000000..b4f73a1b --- /dev/null +++ b/docs/awooop/inventory/INV-6-rollback-playbook-register.md @@ -0,0 +1,213 @@ +# INV-6: Rollback Playbook Register + +**版本**:v1.0 初稿 +**日期**:2026-05-03(台北) +**範圍**:AwoooP 全 8 Phase 的 rollback 場景 +**用途**:Phase 4 起的 gate 驗收「rollback evidence ≥ 3 次」需要這份登記冊 + +--- + +## 說明 + +每個 rollback playbook 需要: +1. 觸發條件(什麼時候需要 rollback) +2. rollback 步驟(精確指令) +3. 驗證方式(rollback 是否成功) +4. 影響範圍(什麼會受影響) + +--- + +## RBP-01: Phase 1 Schema Migration Rollback + +**觸發**:Phase 1 migration up 後發現業務功能異常、CI 回歸測試失敗 + +**rollback 步驟**: +```bash +# 1. 確認當前 migration head +alembic current # 或直接查 DB schema + +# 2. 執行 down migration(每個 AwoooP migration 都有 down SQL) +# 手動執行 down migration SQL(若採用 Alembic 則 alembic downgrade -1,否則直接 psql) +psql $DATABASE_URL -f apps/api/migrations/awooop_phase1_down.sql + +# 3. 驗證 AwoooP 表已移除 +psql $DATABASE_URL -c "\dt awooop_*" # 應該 0 結果 + +# 4. 
確認 AWOOOI 原有功能正常 +curl -f http://localhost:8000/v1/health +``` + +**影響**:AwoooP 控制面表全部刪除,AWOOOI 業務功能不受影響(P0:AWOOOI 0 行為改動) +**pre-condition**:AWOOOI 業務表無任何 AwoooP 外鍵依賴(Phase 1 設計邊界) + +--- + +## RBP-02: Phase 2 Redis Key Rollback(三階段雙寫任一 Phase) + +**觸發**:Redis 雙寫後發現舊服務讀取異常(Phase A 或 Phase B 期間) + +**rollback 步驟**(Phase A 回滾): +```bash +# 停止寫入新 key(code revert 或 feature flag off) +# 舊 key 本來就還在,直接恢復正常讀取 +# 確認舊 key 仍有效(TTL 未過期) +redis-cli TTL "telegram_silence:{target}" # 確認大於 0 + +# 清理已寫入的新 key(避免混淆) +redis-cli DEL "{project_id}:telegram:silence:{target}" +``` + +**rollback 步驟**(Phase B 回滾 → 回到 Phase A): +```bash +# 將讀取邏輯改回「舊 key 為主」 +# 保持雙寫(不移除新 key 寫入) +# 這是 Phase A 狀態,不需要清理資料 +``` + +**影響**:Redis 讀取回到舊 key,功能降級但不中斷 +**monitoring**:舊 key 命中率應在 24h 內回升到 100% + +--- + +## RBP-03: Strangler Fig Phase Rollback(shadow/canary/read_only 回退) + +**觸發**:量化 gate 失敗(error rate > threshold、p95 退化 > 10%、出現 P0/P1 incident) + +**rollback 步驟**: +```sql +-- 回退 project_migration_state +UPDATE awooop_project_migration_state +SET current_mode = 'shadow', -- 回到上一個 mode + rolled_back_at = NOW(), + rollback_reason = '量化 gate 失敗:error_rate > 0.5%' +WHERE project_id = 'awoooi' + AND capability = 'sre_flows'; + +-- 讓 worker 重新讀 migration_state(invalidate cache) +-- Redis: DEL platform:awooop:migration_state:awoooi +``` + +```bash +# feature flag 切回 +# AWOOOP_CANARY_MODE=false(K8s ConfigMap 或 env) +kubectl patch configmap awoooi-config -n awoooi-prod \ + --type=merge -p '{"data":{"AWOOOP_CANARY_MODE":"false"}}' +kubectl rollout restart deployment/awoooi-api -n awoooi-prod +``` + +**影響**:AwoooP 回到上一個 mode,legacy AWOOOI handler 重新接管 +**monitoring**:Strangler Fig dashboard 顯示回退記錄(ADR-UI-03) + +--- + +## RBP-04: Phase 5 MCP Gateway Rollback + +**觸發**:MCP Gateway 導致工具呼叫失敗率 > 5% + +**rollback 步驟**: +```bash +# 1. 關閉 MCP Gateway 攔截(feature flag) +kubectl patch configmap awoooi-config -n awoooi-prod \ + --type=merge -p '{"data":{"AWOOOP_MCP_GATEWAY_ENABLED":"false"}}' + +# 2. 
重啟 pod 使設定生效 +kubectl rollout restart deployment/awoooi-api -n awoooi-prod + +# 3. 確認工具呼叫直接通過(繞過 Gateway) +# 監控:mcp_tool_call_success_rate 應在 5min 內回升 +``` + +**影響**:audit 仍記錄,但 Five-gate 不攔截;credential isolation 暫時失效 +**post-rollback action**:找 MCP Gateway 失敗原因後再重新啟用 + +--- + +## RBP-05: Phase 6 EwoooC Tenant Rollback + +**觸發**:EwoooC shadow 期間發現跨 tenant 資料洩漏(E-TENANT-001 告警) + +**rollback 步驟**: +```sql +-- 1. 立即停止 EwoooC 所有 run +UPDATE awooop_run_state +SET state = 'CANCELLED', + failure_reason = 'Emergency rollback: cross-tenant leak detected' +WHERE project_id = 'ewoooc' + AND state NOT IN ('COMPLETED', 'FAILED', 'CANCELLED'); + +-- 2. 停用 EwoooC project +UPDATE awooop_projects +SET is_active = FALSE, + suspension_reason = 'Emergency rollback 2026-05-xx' +WHERE project_id = 'ewoooc'; +``` + +```bash +# 3. feature flag 關閉 +kubectl patch configmap awoooi-config -n awoooi-prod \ + --type=merge -p '{"data":{"AWOOOP_EWOOOC_LIVE":"false"}}' + +# 4. 通知 EwoooC 團隊 +``` + +**影響**:EwoooC 服務中斷;AWOOOI 不受影響 +**RCA 必要**:觸發 Runbook RB-06(cross-tenant leak);需確認 RLS 是否失效 + +--- + +## RBP-06: Phase 8 Approval Flow Rollback + +**觸發**:approval_token 簽章機制導致正常 approval 無法完成 + +**rollback 步驟**: +```bash +# 1. 臨時切回舊 approval 路徑(feature flag) +kubectl patch configmap awoooi-config -n awoooi-prod \ + --type=merge -p '{"data":{"AWOOOP_APPROVAL_TOKEN_REQUIRED":"false"}}' + +# 2. 重啟 +kubectl rollout restart deployment/awoooi-api -n awoooi-prod + +# 3. 手動處理 stuck WAITING_APPROVAL run(Runbook RB-02) +``` + +**影響**:approval token 安全性降級,回到舊 approval 驗證邏輯 +**post-rollback action**:診斷 token 簽章失敗原因(clock skew?Secret rotation?) + +--- + +## RBP-07: GCP Ollama Primary 失聯緊急 Rollback + +**觸發**:GCP-A 和 GCP-B 同時不可達,Local 111 也異常(見 Runbook RB-07) + +**rollback 步驟**: +```bash +# 1. 強制 Redis 設定 fallback +redis-cli SET "platform:ollama:topology" \ + '{"primary":"http://192.168.0.111:11434","secondary":null,"fallback":"gemini"}' + +# 2. 確認 paid provider 路由已啟用(config.py GEMINI_API_KEY 有效) +# 3. 
監控:LLM call 應切換到 Gemini +``` + +**影響**:推理效能降低(Local HDD)或費用增加(Gemini) +**post-rollback action**:確認 GCP-A/B 健康狀態後,Redis 恢復三層拓撲 + +--- + +## 登記表 + +| Rollback ID | Phase | 已驗證次數 | 最後驗證日期 | 備註 | +|------------|-------|-----------|------------|------| +| RBP-01 | Phase 1 | 0 | — | 待 Phase 1 完成後驗證 | +| RBP-02 | Phase 2 | 0 | — | 待 Redis 遷移開始後驗證 | +| RBP-03 | Phase 4+ | 0 | — | Strangler Fig gate | +| RBP-04 | Phase 5 | 0 | — | MCP Gateway | +| RBP-05 | Phase 6 | 0 | — | EwoooC | +| RBP-06 | Phase 8 | 0 | — | Approval token | +| RBP-07 | 任何時間 | 0 | — | GCP Ollama 緊急 | + +**Phase 8 gate 要求**:進入 `suggest → auto_remediate` 前,上表「已驗證次數」必須 ≥ 3 次(每個 RBP)。 +由 critic + db-expert + vuln-verifier 三方在 LOGBOOK 簽核。 + +*最後更新:2026-05-03(台北)* diff --git a/docs/awooop/inventory/INV-7-pr-cutting-plan.md b/docs/awooop/inventory/INV-7-pr-cutting-plan.md new file mode 100644 index 00000000..9411bf03 --- /dev/null +++ b/docs/awooop/inventory/INV-7-pr-cutting-plan.md @@ -0,0 +1,277 @@ +# INV-7: PR Cutting Plan + +**版本**:v1.0 初稿 +**日期**:2026-05-03(台北) +**範圍**:Phase 2 全部重構 PR 的切割方案(refactor-specialist 設計) +**用途**:避免大 PR 難以 review;每個 PR 獨立可合併、有 rollback 能力 + +--- + +## 原則 + +1. 每個 PR 必須可獨立合併(不依賴後 PR 才能運行) +2. 每個 PR 有 rollback 方式(revert commit 或 feature flag) +3. 
PR-01~05 無依賴,可並行;PR-06+ 有順序依賴 + +--- + +## PR 清單 + +### PR-01: telemetry.py hardcoded IP assert 移除 + +**範圍**:`apps/api/src/core/telemetry.py:71`(約 1-3 行) +**前置**:無 +**解決**:P0-08 EwoooC 啟動失敗 + +```python +# 修改前 +if "192.168.0.188" not in endpoint: + raise ValueError(f"Forbidden OTEL endpoint: {endpoint}") + +# 修改後 +allowed = [e.strip() for e in settings.ALLOWED_TELEMETRY_ENDPOINTS.split(",")] +if not any(a in endpoint for a in allowed): + raise ValueError(f"Forbidden OTEL endpoint: {endpoint}") +# 同步在 config.py 加入 ALLOWED_TELEMETRY_ENDPOINTS = "192.168.0.188"(default) +``` + +**review**:vuln-verifier +**風險**:低(僅移除一個 IP assert) + +--- + +### PR-02: decision_manager.py silence key 常數化 + +**範圍**:`apps/api/src/services/decision_manager.py:240`(約 2 行) +**前置**:無 +**解決**:P1-24 兩處定義不一致 + +```python +# 修改前(decision_manager.py:240) +silence_key = f"telegram_silence:{target}" + +# 修改後 +from apps.api.src.services.telegram_gateway import SILENCE_KEY_PREFIX +silence_key = f"{SILENCE_KEY_PREFIX}{target}" +``` + +**review**:debugger(確認 SILENCE_KEY_PREFIX = "telegram_silence:",功能等效) +**風險**:低 + +--- + +### PR-03: ollama_auto_recovery.py 第二定義移除 + +**範圍**:`apps/api/src/services/ollama_auto_recovery.py:230`(約 5 行) +**前置**:需確認 INV-1 中 `ollama:current_primary` 的所有寫入點 +**解決**:P0-11 GCP 三層拓撲遷移必裂 + +```python +# 移除 ollama_auto_recovery.py 中第二個 ollama:current_primary 寫入點 +# 統一從 config.py 或 ollama_failover_manager.py 讀取,不在 auto_recovery 自行決定 primary +``` + +**review**:tool-expert(確認 GCP 拓撲三層 failover 邏輯) +**風險**:中(影響 Ollama failover 路徑) + +--- + +### PR-04: registry.py `_provider` → `__provider` + +**範圍**:`apps/api/src/plugins/mcp/registry.py:24-71`(約 20 行) +**前置**:無 +**解決**:P1-05 `_provider` public 可繞過 audit + +```python +# 修改所有 self._provider → self.__provider +# 加 @property provider 讀取(需要時) +# 加 unit test: +# with pytest.raises(AttributeError): +# registry.__provider # double underscore = name mangling +``` + +**review**:vuln-verifier +**風險**:低(Python name mangling,只影響直接反射存取) + +--- + +### 
PR-05: mcp_bridge.py namespace 動態化 + +**範圍**:`apps/api/src/plugins/mcp/mcp_bridge.py:592,602,631,647,681`(約 30 行) +**前置**:無(Phase 1 schema 未完成前,先改為 from settings fallback) +**解決**:P0-13 EwoooC K8s tool 無法使用 + +```python +# Phase 1 前的過渡做法(PR-05a): +DEFAULT_NAMESPACE = os.getenv("K8S_DEFAULT_NAMESPACE", "awoooi-prod") +namespace = parameters.get("namespace", DEFAULT_NAMESPACE) + +# Phase 2 後的完整做法(PR-05b,需 Phase 1 schema 先完成): +namespace = parameters.get("namespace", + get_project_contract(project_id).allowed_k8s_namespaces[0]) +``` + +**review**:tool-expert +**風險**:低(有 env fallback,功能等效) + +--- + +### PR-06: consensus_engine.py CONSENSUS_PREFIX 加 project 前綴 + +**範圍**:`apps/api/src/services/consensus_engine.py`(約 15 行) +**前置**:**Redis 雙寫 Phase A 已完成**(舊 `consensus:` key 已雙寫新 key) +**解決**:P0-12 consensus engine 多租戶跨 tenant 共用 + +```python +# 修改前 +CONSENSUS_PREFIX = "consensus:" + +# 修改後(consensus engine 接受 project_id 參數) +def get_consensus_prefix(project_id: str) -> str: + return f"{project_id}:consensus:" +``` + +**review**:db-expert(確認 consensus 語義正確)、debugger(regression) +**風險**:中(需 Redis 雙寫先完成) + +--- + +### PR-07: security hardening(nonce + webhook replay + approval policy) + +**範圍**: +- `apps/api/src/services/security_interceptor.py:451-490`(nonce 重設計) +- `apps/api/src/api/v1/webhooks.py:679-728`(webhook timestamp/nonce) +- `apps/api/src/services/decision_manager.py`(requires_approval policy-derived) +**前置**:ADR-116 已 Accepted(規格凍結) +**解決**:P0-04、P0-05、P0-06(3 個 PoC 確認漏洞) + +```python +# security_interceptor.py: nonce 必須 HMAC(server_secret + nonce) +# webhooks.py: 加 X-Timestamp header 驗證 ±5min window + Redis NX nonce +# decision_manager.py: requires_approval 從 policy contract 讀,禁止 LLM output 決定 +``` + +**review**:vuln-verifier(PoC 重跑驗證修補有效) +**風險**:高(安全修補,必須 PoC 驗證) + +--- + +### PR-08: Repository project_id filter 批次 1 + +**範圍**: +- `IncidentRepository`(`incidents` 表) +- `PlaybookRepository`(`playbooks` 表) +- `KnowledgeRepository`(`knowledge_entries` 表) 
+**前置**:**Phase 1 schema migration 完成**(`project_id` 欄位已存在) +**解決**:P0-03 30+ 業務表無 project_id filter + +```python +# 每個 repository 方法加 project_id 參數 +# 例: +async def get_incident(self, incident_id: str, project_id: str) -> Incident: + return await self.session.execute( + select(Incident) + .where(Incident.incident_id == incident_id) + .where(Incident.project_id == project_id) # 新增 + ) +``` + +**review**:db-expert(query plan 確認) +**風險**:中(約 200 行;需 regression test) + +--- + +### PR-09: Repository project_id filter 批次 2 + +**範圍**: +- `McpAuditRepository`(加 trace_id + project_id) +- `AiDecisionRepository`(加 run_id + project_id) +- `ApprovalRepository`(加 trace_id + project_id) +**前置**:PR-08 合併後 +**解決**:P0-03 繼續 + +**review**:db-expert + debugger(trace_id 貫穿驗證) +**風險**:中 + +--- + +### PR-10: Background loop 標記(ADR-123) + +**範圍**:`apps/api/src/main.py`(31 個 loop,約 150 行) +**前置**:ADR-123 已 Accepted +**解決**:P0-07 31 個 loop 無 project_id + +```python +# 每個 loop 的 context 加上標記 +# platform_internal: asyncio.create_task 前設定 contextvars +# legacy_awoooi_default: fallback project_id = 'awoooi' + +async def _run_with_project_context(coro, project_id: str): + token = project_id_var.set(project_id) + try: + await coro + finally: + project_id_var.reset(token) +``` + +**review**:critic(確認標記正確) +**風險**:中(影響所有 background loop 的 logging context) + +--- + +### PR-11: AnomalyCounter per-project 改造 + +**範圍**:`apps/api/src/services/anomaly_counter.py:790`(約 80 行) +**前置**:PR-10 完成(loop 有 project_id context 後才能傳入) +**解決**:P1-17 AnomalyCounter 全域單例 + +```python +# 修改前:全域單例,6 個 prefix 無 tenant +_anomaly_counter = AnomalyCounter(get_redis()) + +# 修改後:per-project 工廠 +def get_anomaly_counter(project_id: str) -> AnomalyCounter: + return AnomalyCounter(get_redis(), project_prefix=f"{project_id}:anomaly") +``` + +**review**:debugger(確認 6 個 prefix 全數改造) +**風險**:中 + +--- + +## 執行順序圖 + +``` +並行群組 G-D(無依賴,可同時開): + PR-01, PR-02, PR-03, PR-04, PR-05 + +↓ Phase 1 Schema 完成後 + +並行群組 G-E(部分並行): + 
PR-07(安全修補,優先) + PR-06(需 Redis 雙寫 Phase A) + PR-08(需 Phase 1 schema) + +↓ PR-08 合併後 + + PR-09(依賴 PR-08) + +↓ ADR-123 Accepted 後 + + PR-10(background loop 標記) + +↓ PR-10 合併後 + + PR-11(AnomalyCounter) +``` + +--- + +## 驗收標準 + +- [ ] PR-01~05 全部合併(無依賴,Phase 0 完成後即可) +- [ ] PR-07 通過 vuln-verifier PoC 重跑驗證 +- [ ] PR-08~09 通過 db-expert review(query plan 確認) +- [ ] PR-10~11 通過 regression test(31 個 loop logging context 正確) + +*最後更新:2026-05-03(台北)* diff --git a/docs/awooop/inventory/INV-8-background-loop-catalog.md b/docs/awooop/inventory/INV-8-background-loop-catalog.md new file mode 100644 index 00000000..ac9953e7 --- /dev/null +++ b/docs/awooop/inventory/INV-8-background-loop-catalog.md @@ -0,0 +1,110 @@ +# INV-8: Background Loop Catalog + +**版本**:v1.0 初稿 +**日期**:2026-05-03(台北) +**來源**:`apps/api/src/main.py` grep asyncio.create_task(實測 31 個) +**用途**:ADR-123 Background Loop Migration Strategy 的執行基礎 + +--- + +## 概覽 + +| 統計 | 數量 | +|------|------| +| 總 background loop | 31 | +| platform_internal | 2(#27、#30)| +| legacy_awoooi_default(過渡)| 29 | +| requires_project_id(Phase 2+ 改造後)| 0(初始)| + +--- + +## 完整清單 + +> 行號來自 `apps/api/src/main.py` +> 「頻率」為估計值,需 code review 確認 + +| # | 函數名 | 行號 | 頻率 | 標記 | 改造優先度 | 備註 | +|---|--------|------|------|------|-----------|------| +| 1 | `seed_playbooks_from_rules()` | 331 | 啟動一次 | `legacy_awoooi_default` | P3(低)| 只在啟動時跑一次,風險低 | +| 2 | `backfill_redis_to_pg()` | 341 | 啟動一次 | `legacy_awoooi_default` | P3 | 啟動一次性 backfill | +| 3 | `ensure_playbook_embeddings_indexed()` | 348 | 啟動一次 | `legacy_awoooi_default` | P3 | 啟動一次性 index 確認 | +| 4 | `resend_stale_ready_tokens()` | 362 | 每 30s | `legacy_awoooi_default` | P1 | 涉及 approval token,Phase 8 前必改 | +| 5 | `run_incident_analysis_sweeper()` | 373 | 每 5min | `legacy_awoooi_default` | P2 | 掃描 incident,多 tenant 後需分 project | +| 6 | `run_asset_scanner_loop()` | 383 | 每 10min | `legacy_awoooi_default` | P2 | K8s asset 掃描 | +| 7 | `run_rule_catalog_sync_loop()` | 393 | 每 5min | `legacy_awoooi_default` | 
P2 | 規則 catalog 同步 | +| 8 | `run_capacity_scanner_loop()` | 403 | 每 15min | `legacy_awoooi_default` | P2 | 容量掃描 | +| 9 | `run_compliance_scanner_loop()` | 413 | 每 30min | `legacy_awoooi_default` | P2 | 合規掃描 | +| 10 | `run_aider_event_processor_loop()` | 423 | 每 30s | `legacy_awoooi_default` | P2 | aider 事件處理 | +| 11 | `run_coverage_evaluator_loop()` | 432 | 每 1h | `legacy_awoooi_default` | P3 | 覆蓋率評估 | +| 12 | `run_rule_stats_updater_loop()` | 442 | 每 15min | `legacy_awoooi_default` | P3 | 規則統計 | +| 13 | `run_asset_change_tracker_loop()` | 452 | 每 5min | `legacy_awoooi_default` | P2 | 資產變更追蹤 | +| 14 | `run_hermes_rule_quality_loop()` | 462 | 每 1h | `legacy_awoooi_default` | P2 | Hermes 規則品質 | +| 15 | `run_capacity_forecaster_loop()` | 472 | 每 6h | `legacy_awoooi_default` | P3 | 容量預測 | +| 16 | `run_daily_report_loop()` | 481 | 每日 | `legacy_awoooi_default` | P3 | 每日報告 | +| 17 | `run_approval_timeout_resolver()` | 490 | 每 30s | `legacy_awoooi_default` | **P0** | 🔴 P1-15:無 trace_id;Phase 8 前必改 | +| 18 | `run_evolver_loop()` | 499 | 每 1h | `legacy_awoooi_default` | P2 | playbook 進化(飛輪關鍵)| +| 19 | `run_playbook_generation_governance_loop()` | 507 | 每 30min | `legacy_awoooi_default` | P2 | playbook 生成治理 | +| 20 | `run_knowledge_decay_loop()` | 519 | 每 6h | `legacy_awoooi_default` | P2 | KM 知識衰退 | +| 21 | `run_km_backfill_reconciler_loop()` | 529 | 每 1h | `legacy_awoooi_default` | P2 | KM backfill 核對 | +| 22 | `run_aol_writeback_loop()` | 540 | 每 30s | `legacy_awoooi_default` | P1 | AI 飛輪 AOL writeback,KM 雙路徑 | +| 23 | `_run_kb_rot_cleaner_loop()` | 585 | 每 6h | `legacy_awoooi_default` | P3 | KB 腐敗清理 | +| 24 | `run_finetune_export_loop()` | 594 | 每日 | `legacy_awoooi_default` | P3 | finetune 資料匯出 | +| 25 | `run_proactive_inspector_loop()` | 605 | 每 5min | `legacy_awoooi_default` | P2 | 主動巡檢 | +| 26 | `run_offline_replay_loop()` | 614 | 每 15min | `legacy_awoooi_default` | P3 | 離線重放 | +| 27 | `run_ai_slo_watchdog_loop()` | 623 | 每 5min | **`platform_internal`** | N/A(已完成)| 
SLO watchdog,project_id=__platform__ | +| 28 | `run_governance_loop()` | 632 | 每 5min | `legacy_awoooi_default` | P2 | AI 治理主循環 | +| 29 | `run_governance_dispatcher_loop()` | 640 | 每 30s | `legacy_awoooi_default` | P2 | 治理事件派送 | +| 30 | `_run_model_version_tracker_loop()` | 701 | 每 1h | **`platform_internal`** | N/A(已完成)| model 版本追蹤,platform_resource | +| 31 | (需確認)| 701+ | TBD | TBD | TBD | grep 計數 = 31,需確認第 31 個 | + +--- + +## 改造策略(ADR-123) + +### platform_internal(#27、#30) +```python +# main.py 改造示意 +async def _run_with_context(coro, project_id: str): + ctx_token = project_id_ctx_var.set(project_id) + try: + await coro + finally: + project_id_ctx_var.reset(ctx_token) + +# platform_internal loop: +asyncio.create_task(_run_with_context( + run_ai_slo_watchdog_loop(), "__platform__" +)) +``` + +### legacy_awoooi_default(其餘 29 個) +```python +# Phase 2 過渡:fallback 到 awoooi +asyncio.create_task(_run_with_context( + run_incident_analysis_sweeper(), "awoooi" # 過渡期 +)) +# 退場時程:Phase 4 完成後 90 天內逐一改為真正 multi-tenant +``` + +--- + +## 改造優先度說明 + +| 優先度 | 說明 | 對應 loop | +|--------|------|-----------| +| P0(立即)| 直接影響安全性或正確性 | #17(approval_timeout_resolver)| +| P1(Phase 4 前)| 影響 AwoooP run state 正確性 | #4、#22 | +| P2(Phase 6 前)| 影響多 tenant 資料隔離 | #5~10、#13~14、#18~21、#25、#28~29 | +| P3(Phase 8 前)| 低頻率、低風險 | #1~3、#11~12、#15~16、#23~24、#26 | + +--- + +## 驗收標準 + +- [ ] 第 31 個 loop 確認並列入 +- [ ] 全部 31 個 loop 在 main.py 有 logging context(project_id 可見) +- [ ] `platform_internal` loop 帶 `project_id=__platform__` +- [ ] `legacy_awoooi_default` loop 帶 `project_id=awoooi`(PR-10) +- [ ] loop #17(approval_timeout_resolver)已補入 trace_id(PR-10 合併時) + +*最後更新:2026-05-03(台北)* diff --git a/docs/awooop/inventory/INV-9-global-singleton-catalog.md b/docs/awooop/inventory/INV-9-global-singleton-catalog.md new file mode 100644 index 00000000..adad32a5 --- /dev/null +++ b/docs/awooop/inventory/INV-9-global-singleton-catalog.md @@ -0,0 +1,235 @@ +# INV-9: Global Singleton Catalog + +**版本**:v1.0 初稿 
+**日期**:2026-05-03(台北) +**來源**:`apps/api/src/` grep `_instance = None` / `get_*()` factory functions +**用途**:ADR-124 Global Singleton Decomposition 的執行基礎 + +--- + +## 概覽 + +| 統計 | 數量 | +|------|------| +| 確認全域單例 | 13 | +| 需要分解(per-project)| 9 | +| 可保持 platform-level | 4 | + +--- + +## 完整清單 + +### 1. TrustEngine + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/core/trust_engine.py:189` | +| Factory | `get_trust_engine()` at line 393 | +| 單例模式 | module-level `_trust_engine` variable | +| 目前狀態 | 全域,所有 tenant 共用 | +| 影響 | trust_records(無 project_id),trust score 跨 tenant 混算 | +| 分解策略 | per-project trust engine instance,從 project contract 讀 trust_policy | +| Phase | Phase 2(repository 加 project_id filter 後)| +| 風險 | 高(TrustEngine 是飛輪核心,分解需仔細 regression test)| + +--- + +### 2. ProviderRegistry(MCP) + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/plugins/mcp/registry.py:74` | +| Factory | module-level `get_registry()` at line 80 / `_registry` at line 209 | +| 目前狀態 | 全域,所有 tenant 共用 MCP provider 清單 | +| 影響 | Phase 5 MCP Gateway:不同 tenant 應有不同可用 tool 清單 | +| 分解策略 | platform-level registry(tool 清單)+ per-project grants(誰可用什麼);Registry 本身可保持 platform-level | +| Phase | Phase 5(MCP Gateway 建立時)| +| 風險 | 中(Registry 改為 read-only platform resource,grants 移到 DB)| + +--- + +### 3. AnomalyCounter + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/services/anomaly_counter.py:85` | +| Factory | `_anomaly_counter = AnomalyCounter(get_redis())` at line 800 | +| 目前狀態 | 全域,6 個 Redis prefix 無 tenant 隔離 | +| 影響 | P1-17:多 tenant 計數混算 | +| 分解策略 | per-project 工廠:`get_anomaly_counter(project_id)` 注入 `project_prefix` | +| Phase | Phase 2(PR-11)| +| 風險 | 中(6 個 prefix 全部要改)| + +--- + +### 4. 
AIRouter + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/services/ai_router.py:194` | +| Factory | `_router = AIRouter()` at line 1324 | +| 依賴 | AIProviderRegistry(923)、AIRouterExecutor(975)| +| 目前狀態 | 全域,所有 tenant 共用 provider routing 決策 | +| 影響 | EwoooC 可能使用 AWOOOI 的 model routing policy | +| 分解策略 | AIRouter 接受 EffectivePolicy 參數(per-project policy 已在 Phase 4 設計);Router 邏輯保持 platform-level,policy 來源改為 per-project | +| Phase | Phase 4(EffectivePolicy 設計完成後)| +| 風險 | 中(ADR-052 AIRouter 已有 protocol 設計,改 policy 注入點)| + +--- + +### 5. IntentClassifier + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/services/intent_classifier.py:357` | +| Factory | `_classifier = IntentClassifier()` at line 744 | +| 目前狀態 | 全域 | +| 影響 | 不同 tenant 可能需要不同 intent classification domain | +| 分解策略 | 短期保持 platform-level(intent schema 共用);長期 per-project intent domain config | +| Phase | Phase 6(EwoooC 需要不同 intent domain 時才分解)| +| 風險 | 低(intent classification 不帶 tenant 資料)| + +--- + +### 6. TelegramGateway + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/services/telegram_gateway.py:1324` | +| Factory | `_gateway = TelegramGateway()` at line 6375 | +| 目前狀態 | 全域,持有 polling leader lock | +| 影響 | `telegram:polling:leader` 是 platform_resource,但 gateway 本身可以 per-project | +| 分解策略 | polling leader 保持 platform_resource;gateway 處理邏輯改為 per-project(不同 tenant 有不同 Telegram bot)| +| Phase | Phase 7(Communication Hub 建立時)| +| 風險 | 高(TelegramGateway 是最大的 service,約 6000+ 行)| + +--- + +### 7. DecisionManager + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/services/decision_manager.py:1402` | +| Factory | `_decision_manager = DecisionManager()` at line 3529 | +| 目前狀態 | 全域,硬碼 silence key(P1-24 已在 PR-02 修補)| +| 影響 | 決策邏輯跨 tenant 共用(不同 tenant 決策規則不同)| +| 分解策略 | DecisionManager 接受 project_id 參數;規則從 project policy contract 讀 | +| Phase | Phase 4(EffectivePolicy 設計完成後)| +| 風險 | 極高(DecisionManager 是 Tier 3 核心,修改需 ADR 和架構審查)| + +--- + +### 8. 
ConsensusEngine + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/services/consensus_engine.py:344` | +| Factory | `_consensus_engine = ConsensusEngine()` at line 716 | +| 目前狀態 | 全域,`CONSENSUS_PREFIX="consensus:"` 無 project | +| 影響 | P0-12:多 tenant consensus 結果混合 | +| 分解策略 | ConsensusEngine 接受 project_id;CONSENSUS_PREFIX 改為 `{project_id}:consensus:` | +| Phase | Phase 2(PR-06)| +| 風險 | 中(需 Redis 雙寫 Phase A 先完成)| + +--- + +### 9. DecisionFusionAdapter + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/services/decision_fusion_adapter.py:538` | +| 單例模式 | `_adapter_instance = None` | +| 目前狀態 | 全域 | +| 影響 | 決策融合結果跨 tenant | +| 分解策略 | 同 DecisionManager(依賴它),Phase 4 一起改 | +| Phase | Phase 4 | +| 風險 | 中 | + +--- + +### 10. FailoverAlerter + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/services/failover_alerter.py:395` | +| 單例模式 | `_alerter_instance = None` | +| 目前狀態 | 全域 | +| 影響 | failover 告警沒有 tenant 隔離(GCP Ollama failover 是 platform_resource)| +| 分解策略 | failover 告警本身是 `platform_resource`(保持全域),但告警發送目的地改為 per-project channel | +| Phase | Phase 7(Communication Hub 建立後)| +| 風險 | 低 | + +--- + +### 11. HostRepairAgent + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/services/host_repair_agent.py:204` | +| 單例模式 | `cls._instance = None` | +| 目前狀態 | 全域 | +| 影響 | 主機修復操作無 tenant scope(主機屬於 platform)| +| 分解策略 | 保持 `platform_internal`(主機修復是 platform 操作,不是 tenant 操作)| +| Phase | N/A(不需分解)| +| 風險 | 低 | + +--- + +### 12. AIProviderRegistry + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/services/ai_router.py:923` | +| 目前狀態 | AIRouter 的內部 registry,全域 | +| 影響 | provider 清單是 platform resource(GCP Ollama/Gemini/etc 所有 tenant 共用)| +| 分解策略 | 保持 platform-level registry;per-project policy 控制各 tenant 的可用 provider | +| Phase | Phase 4(EffectivePolicy 設計)| +| 風險 | 低 | + +--- + +### 13. 
AIRouterExecutor + +| 屬性 | 值 | +|------|----| +| 位置 | `apps/api/src/services/ai_router.py:975` | +| 目前狀態 | AIRouter 的執行器,全域 | +| 影響 | 執行邏輯共用,但 policy 會 per-project | +| 分解策略 | 同 AIRouter(Phase 4)| +| Phase | Phase 4 | +| 風險 | 中 | + +--- + +## 分解優先序(ADR-124 執行計畫) + +| Phase | 要分解的單例 | 策略 | +|-------|-----------|------| +| Phase 2 | AnomalyCounter(PR-11)、ConsensusEngine(PR-06)| 注入 project_id 參數 | +| Phase 4 | AIRouter、AIProviderRegistry、AIRouterExecutor、DecisionFusionAdapter | EffectivePolicy 注入 | +| Phase 5 | ProviderRegistry | per-project grants 移到 DB,registry 保持 platform-level | +| Phase 6 | IntentClassifier | EwoooC 需要不同 intent domain 時 | +| Phase 7 | TelegramGateway、FailoverAlerter | Communication Hub 建立 | +| **特別謹慎** | **TrustEngine**、**DecisionManager** | Tier 3,需額外 ADR 和架構審查 | + +> **TrustEngine 和 DecisionManager 分解需要統帥批准(Tier 3 核心,RED_ZONES.md)** + +--- + +## 不需分解的單例(platform_resource) + +| 單例 | 理由 | +|------|------| +| HostRepairAgent | 主機修復 = platform 操作,不屬於任何 tenant | +| AIProviderRegistry | provider 清單 = platform resource | +| TelegramGateway 的 polling leader | platform_resource(鎖,全域唯一)| + +--- + +## 驗收標準 + +- [ ] 13 個單例全部分類完成(per-project / platform-level / platform_resource) +- [ ] TrustEngine 和 DecisionManager 的分解計畫需獨立 ADR(Tier 3 審查) +- [ ] Phase 2 前:AnomalyCounter + ConsensusEngine 分解完成 + +*最後更新:2026-05-03(台北)* diff --git a/k8s/awoooi-prod/02-network-policy.yaml b/k8s/awoooi-prod/02-network-policy.yaml index a66064cf..64989df0 100644 --- a/k8s/awoooi-prod/02-network-policy.yaml +++ b/k8s/awoooi-prod/02-network-policy.yaml @@ -165,6 +165,13 @@ spec: # Gitea — CI/CD 主倉 probe + monitoring - protocol: TCP port: 3001 + # 2026-05-04 ogt: GCP Ollama nginx proxy + # K8s → GCP-A/B:11434 外網路由不通(NetworkPolicy 外網 egress 只開 443) + # 在 110 架設 nginx 反向代理,K8s 走內網 110:11435(GCP-A) / 110:11436(GCP-B) + - protocol: TCP + port: 11435 + - protocol: TCP + port: 11436 # 允許訪問 192.168.0.112 安全掃描服務 - to: diff --git a/k8s/awoooi-prod/04-configmap.yaml 
b/k8s/awoooi-prod/04-configmap.yaml index c2a3f352..807faa76 100644 --- a/k8s/awoooi-prod/04-configmap.yaml +++ b/k8s/awoooi-prod/04-configmap.yaml @@ -23,8 +23,8 @@ data: # 修法:111 升為 primary;GCP-A/B 保留為 secondary/tertiary,待 nginx proxy 架設後恢復可用 # 長期目標:在 110 架設 nginx proxy 轉發 GCP-A/B,ConfigMap 改指向 110:11435 / 110:11436 OLLAMA_URL: "http://192.168.0.111:11434" - OLLAMA_SECONDARY_URL: "http://34.143.170.20:11434" - OLLAMA_FALLBACK_URL: "http://34.21.145.224:11434" + OLLAMA_SECONDARY_URL: "http://192.168.0.110:11435" + OLLAMA_FALLBACK_URL: "http://192.168.0.110:11436" OPENCLAW_URL: "http://192.168.0.188:8088" KALI_SCANNER_URL: "http://192.168.0.112:8080" SIGNOZ_URL: "http://192.168.0.188:3301" diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml index f2a0832e..fc74fbfb 100644 --- a/k8s/awoooi-prod/06-deployment-api.yaml +++ b/k8s/awoooi-prod/06-deployment-api.yaml @@ -68,11 +68,11 @@ spec: - name: TELEGRAM_ENABLE_POLLING value: "true" - name: OLLAMA_URL - value: "http://34.143.170.20:11434" # 2026-05-03 ogt: GCP-A Primary(ADR-110) + value: "http://192.168.0.111:11434" # 2026-05-04 ogt: 111 primary(K8s 內網直連,GPU RTX) - name: OLLAMA_SECONDARY_URL - value: "http://34.21.145.224:11434" # 2026-05-03 ogt: GCP-B Secondary + value: "http://192.168.0.110:11435" # 2026-05-04 ogt: GCP-A via 110 nginx proxy(11435 → 34.143.170.20:11434) - name: OLLAMA_FALLBACK_URL - value: "http://192.168.0.111:11434" # 2026-05-03 ogt: Local HDD Fallback + value: "http://192.168.0.110:11436" # 2026-05-04 ogt: GCP-B via 110 nginx proxy(11436 → 34.21.145.224:11434) - name: OPENCLAW_DEFAULT_MODEL value: "qwen2.5:7b-instruct" - name: OPENCLAW_TIMEOUT diff --git a/packages/awooop-contracts/__init__.py b/packages/awooop-contracts/__init__.py new file mode 100644 index 00000000..c0f1dbb2 --- /dev/null +++ b/packages/awooop-contracts/__init__.py @@ -0,0 +1,3 @@ +# AwoooP Contracts Package +# Phase 3: Six contract families with JSON Schema + Pydantic v2 models +# 
ADR-107/ADR-112, 2026-05-04 ogt + Claude Sonnet 4.6 diff --git a/packages/awooop-contracts/fixtures/invalid/agent.json b/packages/awooop-contracts/fixtures/invalid/agent.json new file mode 100644 index 00000000..38c59261 --- /dev/null +++ b/packages/awooop-contracts/fixtures/invalid/agent.json @@ -0,0 +1,12 @@ +{ + "_comment": "缺少 required 欄位 provider;temperature 超出 [0,2] 範圍;system_prompt_ref 的 sha256 長度不對", + "agent_id": "bad-agent", + "agent_name": "Bad Agent", + "model": "claude-sonnet-4-6", + "temperature": 5.0, + "system_prompt_ref": { + "artifact_id": "prompts/bad.txt", + "sha256": "tooshort" + }, + "max_parallel_runs": 0 +} diff --git a/packages/awooop-contracts/fixtures/invalid/channel_event.json b/packages/awooop-contracts/fixtures/invalid/channel_event.json new file mode 100644 index 00000000..4fcf3073 --- /dev/null +++ b/packages/awooop-contracts/fixtures/invalid/channel_event.json @@ -0,0 +1,8 @@ +{ + "_comment": "缺少 required 欄位 payload 和 received_at;channel_type 不合法;event_id 格式錯誤", + "event_id": "not-a-uuid", + "project_id": "awoooi", + "channel_type": "discord", + "event_type": "message_received", + "provider_event_id": "some:event:id" +} diff --git a/packages/awooop-contracts/fixtures/invalid/mcp_gateway.json b/packages/awooop-contracts/fixtures/invalid/mcp_gateway.json new file mode 100644 index 00000000..33556119 --- /dev/null +++ b/packages/awooop-contracts/fixtures/invalid/mcp_gateway.json @@ -0,0 +1,15 @@ +{ + "_comment": "transport=http 但缺少 endpoint;schema_sha256 格式錯誤;rate_limit_rpm 為 0(違反 minimum: 1)", + "gateway_id": "bad-gateway", + "gateway_name": "Bad Gateway", + "transport": "http", + "auth_scheme": "bearer", + "tools_exposed": [ + { + "tool_name": "some_tool", + "schema_sha256": "not-valid-hex" + } + ], + "rate_limit_rpm": 0, + "timeout_seconds": 9999 +} diff --git a/packages/awooop-contracts/fixtures/invalid/policy_routing.json b/packages/awooop-contracts/fixtures/invalid/policy_routing.json new file mode 100644 index 00000000..f16472e6 
--- /dev/null +++ b/packages/awooop-contracts/fixtures/invalid/policy_routing.json @@ -0,0 +1,8 @@ +{ + "_comment": "routing_rules 為空陣列(minItems: 1);fallback_provider 不合法;max_cost_per_run_usd 為負數", + "policy_id": "bad-policy", + "policy_name": "Bad Policy", + "routing_rules": [], + "fallback_provider": "INVALID_PROVIDER", + "max_cost_per_run_usd": -5.0 +} diff --git a/packages/awooop-contracts/fixtures/invalid/project_tenant.json b/packages/awooop-contracts/fixtures/invalid/project_tenant.json new file mode 100644 index 00000000..67c449c9 --- /dev/null +++ b/packages/awooop-contracts/fixtures/invalid/project_tenant.json @@ -0,0 +1,7 @@ +{ + "_comment": "缺少 required 欄位 display_name;migration_mode 不合法;budget_limit_usd 為負數", + "project_id": "awoooi", + "migration_mode": "INVALID_MODE", + "budget_limit_usd": -10.0, + "allowed_channels": ["invalid_channel_type"] +} diff --git a/packages/awooop-contracts/fixtures/invalid/runtime_run_state.json b/packages/awooop-contracts/fixtures/invalid/runtime_run_state.json new file mode 100644 index 00000000..834134d5 --- /dev/null +++ b/packages/awooop-contracts/fixtures/invalid/runtime_run_state.json @@ -0,0 +1,7 @@ +{ + "_comment": "缺少 required 欄位 agent_id;state 不合法;run_id 不是 UUID 格式", + "run_id": "not-a-uuid", + "project_id": "awoooi", + "state": "RUNNING_INVALID", + "trace_id": "some-trace" +} diff --git a/packages/awooop-contracts/fixtures/valid/agent.json b/packages/awooop-contracts/fixtures/valid/agent.json new file mode 100644 index 00000000..91d4bf69 --- /dev/null +++ b/packages/awooop-contracts/fixtures/valid/agent.json @@ -0,0 +1,24 @@ +{ + "agent_id": "openclaw-decision-agent", + "agent_name": "OpenClaw Decision Agent", + "model": "claude-sonnet-4-6", + "provider": "anthropic", + "max_tokens": 8192, + "temperature": 0.1, + "system_prompt_ref": { + "artifact_id": "prompts/openclaw-system-v1.0.txt", + "sha256": "a3f5b2c1d4e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b1c2d3e4f5a6b7c8d9e0f1a2" + }, + "tools": [ + { + "tool_name": "kubectl_apply", + 
"mcp_gateway_id": "k8s-gateway", + "sha256": "b4a6c3d2e5f1a7b8c9d0e1f2a3b4c5d6e7f8a9b0c1d2e3f4a5b6c7d8e9f0a1b2" + } + ], + "budget_limit_usd_per_run": 2.00, + "require_approval": true, + "approval_timeout_seconds": 300, + "max_parallel_runs": 3, + "tags": ["production", "kubernetes", "remediation"] +} diff --git a/packages/awooop-contracts/fixtures/valid/channel_event.json b/packages/awooop-contracts/fixtures/valid/channel_event.json new file mode 100644 index 00000000..19de54e2 --- /dev/null +++ b/packages/awooop-contracts/fixtures/valid/channel_event.json @@ -0,0 +1,21 @@ +{ + "event_id": "01960000-0000-7000-8000-000000000003", + "project_id": "awoooi", + "channel_type": "telegram", + "event_type": "message_received", + "provider_event_id": "telegram:bot123:msg456789", + "user_id": "platform:awoooi:telegram:123456789", + "chat_id": "123456789", + "payload": { + "message_id": 456789, + "from": {"id": 123456789, "username": "ogt"}, + "chat": {"id": 123456789, "type": "private"}, + "date": 1746345600, + "text": "/status" + }, + "text": "/status", + "attachments": [], + "run_id": "01960000-0000-7000-8000-000000000001", + "is_duplicate": false, + "received_at": "2026-05-04T08:00:00Z" +} diff --git a/packages/awooop-contracts/fixtures/valid/mcp_gateway.json b/packages/awooop-contracts/fixtures/valid/mcp_gateway.json new file mode 100644 index 00000000..594ea66e --- /dev/null +++ b/packages/awooop-contracts/fixtures/valid/mcp_gateway.json @@ -0,0 +1,25 @@ +{ + "gateway_id": "k8s-gateway", + "gateway_name": "Kubernetes Control Gateway", + "transport": "http", + "endpoint": "http://mcp-k8s-gateway.awoooi.svc.cluster.local:8080", + "auth_scheme": "hmac", + "hmac_secret_ref": "mcp-gateway-k8s-hmac-secret", + "tools_exposed": [ + { + "tool_name": "kubectl_apply", + "description": "Apply Kubernetes manifest", + "schema_sha256": "b4a6c3d2e5f1a7b8c9d0e1f2a3b4c5d6e7f8a9b0c1d2e3f4a5b6c7d8e9f0a1b2", + "is_destructive": false + }, + { + "tool_name": "kubectl_delete", + 
"description": "Delete Kubernetes resource", + "schema_sha256": "c5b7d4e3f6a2b9c0d1e2f3a4b5c6d7e8f9a0b1c2d3e4f5a6b7c8d9e0f1a2b3c4", + "is_destructive": true + } + ], + "rate_limit_rpm": 60, + "timeout_seconds": 30, + "is_enabled": true +} diff --git a/packages/awooop-contracts/fixtures/valid/policy_routing.json b/packages/awooop-contracts/fixtures/valid/policy_routing.json new file mode 100644 index 00000000..d18427cd --- /dev/null +++ b/packages/awooop-contracts/fixtures/valid/policy_routing.json @@ -0,0 +1,34 @@ +{ + "policy_id": "default-routing-policy", + "policy_name": "AWOOOI Default Routing Policy", + "routing_rules": [ + { + "rule_id": "rule-01-local-ollama", + "priority": 0, + "provider": "ollama", + "model": "qwen3:8b", + "condition": { + "task_types": ["classification", "triage", "quick_check"], + "max_prompt_tokens": 4000 + }, + "weight": 100 + }, + { + "rule_id": "rule-02-sonnet", + "priority": 10, + "provider": "anthropic", + "model": "claude-sonnet-4-6", + "weight": 100 + } + ], + "fallback_provider": "anthropic", + "fallback_model": "claude-sonnet-4-6", + "max_cost_per_run_usd": 5.00, + "retry_policy": { + "max_retries": 3, + "backoff_base_seconds": 1.0, + "retry_on_provider_errors": true + }, + "effective_from": "2026-05-04T00:00:00Z", + "effective_to": null +} diff --git a/packages/awooop-contracts/fixtures/valid/project_tenant.json b/packages/awooop-contracts/fixtures/valid/project_tenant.json new file mode 100644 index 00000000..23517290 --- /dev/null +++ b/packages/awooop-contracts/fixtures/valid/project_tenant.json @@ -0,0 +1,12 @@ +{ + "project_id": "awoooi", + "display_name": "AWOOOI Production", + "migration_mode": "legacy_awoooi_default", + "budget_limit_usd": 50.00, + "allowed_channels": ["telegram", "api"], + "is_active": true, + "metadata": { + "owner": "ogt", + "environment": "production" + } +} diff --git a/packages/awooop-contracts/fixtures/valid/runtime_run_state.json b/packages/awooop-contracts/fixtures/valid/runtime_run_state.json 
new file mode 100644 index 00000000..c2c18925 --- /dev/null +++ b/packages/awooop-contracts/fixtures/valid/runtime_run_state.json @@ -0,0 +1,21 @@ +{ + "run_id": "01960000-0000-7000-8000-000000000001", + "project_id": "awoooi", + "agent_id": "openclaw-decision-agent", + "state": "running", + "trace_id": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01", + "parent_run_id": null, + "trigger": { + "trigger_type": "channel_event", + "channel_event_id": "01960000-0000-7000-8000-000000000002", + "triggered_by": "telegram:123456789" + }, + "input_sha256": "d1e2f3a4b5c6d7e8f9a0b1c2d3e4f5a6b7c8d9e0f1a2b3c4d5e6f7a8b9c0d1e2", + "output_sha256": null, + "started_at": "2026-05-04T08:00:00Z", + "completed_at": null, + "timeout_at": "2026-05-04T08:10:00Z", + "error_code": null, + "cost_usd": 0.015, + "step_count": 2 +} diff --git a/packages/awooop-contracts/schemas/agent.json b/packages/awooop-contracts/schemas/agent.json new file mode 100644 index 00000000..d01a0749 --- /dev/null +++ b/packages/awooop-contracts/schemas/agent.json @@ -0,0 +1,105 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://awooop.platform/contracts/v1/agent", + "title": "Agent Contract", + "description": "AwoooP Agent 合約 — 定義一個 agent 的模型、工具、預算與治理規則", + "type": "object", + "required": ["agent_id", "agent_name", "model", "provider"], + "additionalProperties": false, + "properties": { + "agent_id": { + "type": "string", + "pattern": "^[a-z0-9][a-z0-9_-]{1,127}$", + "description": "Agent 識別符(project 內唯一)" + }, + "agent_name": { + "type": "string", + "minLength": 1, + "maxLength": 256, + "description": "人類可讀 agent 名稱" + }, + "model": { + "type": "string", + "minLength": 1, + "maxLength": 128, + "description": "LLM 模型識別符(如 claude-sonnet-4-6, qwen3:8b)" + }, + "provider": { + "type": "string", + "enum": ["anthropic", "openai", "ollama", "gemini", "nvidia", "openrouter"], + "description": "LLM Provider" + }, + "max_tokens": { + "type": "integer", + "minimum": 1, + 
"maximum": 200000, + "description": "單次 completion 最大 token 數" + }, + "temperature": { + "type": "number", + "minimum": 0.0, + "maximum": 2.0, + "description": "Sampling temperature" + }, + "system_prompt_ref": { + "type": "object", + "required": ["artifact_id", "sha256"], + "additionalProperties": false, + "properties": { + "artifact_id": {"type": "string"}, + "sha256": { + "type": "string", + "pattern": "^[0-9a-f]{64}$", + "description": "System prompt 檔案的 SHA-256 hex digest(ADR-112 artifact integrity)" + } + }, + "description": "System prompt artifact reference(必含 sha256)" + }, + "tools": { + "type": "array", + "items": { + "type": "object", + "required": ["tool_name"], + "additionalProperties": true, + "properties": { + "tool_name": {"type": "string"}, + "mcp_gateway_id": {"type": "string"}, + "sha256": { + "type": "string", + "pattern": "^[0-9a-f]{64}$" + } + } + }, + "description": "Agent 可用工具清單" + }, + "budget_limit_usd_per_run": { + "type": ["number", "null"], + "minimum": 0, + "description": "單次 run 費用上限(USD);null = 繼承 tenant 預算" + }, + "require_approval": { + "type": "boolean", + "default": false, + "description": "執行前是否需要人工審核" + }, + "approval_timeout_seconds": { + "type": "integer", + "minimum": 60, + "maximum": 86400, + "description": "審核超時秒數(require_approval=true 時有效)" + }, + "max_parallel_runs": { + "type": "integer", + "minimum": 1, + "maximum": 100, + "default": 1, + "description": "最大並行 run 數" + }, + "tags": { + "type": "array", + "items": {"type": "string"}, + "uniqueItems": true, + "description": "Agent 分類標籤" + } + } +} diff --git a/packages/awooop-contracts/schemas/channel_event.json b/packages/awooop-contracts/schemas/channel_event.json new file mode 100644 index 00000000..3840b884 --- /dev/null +++ b/packages/awooop-contracts/schemas/channel_event.json @@ -0,0 +1,94 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://awooop.platform/contracts/v1/channel_event", + "title": "Channel Event Contract", + 
"description": "AwoooP Channel Event 合約 — 定義來自外部 channel 的事件結構(冪等性 + 去重)", + "type": "object", + "required": ["event_id", "channel_type", "event_type", "project_id", "payload", "received_at"], + "additionalProperties": false, + "properties": { + "event_id": { + "type": "string", + "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", + "description": "Platform 生成的事件 UUID" + }, + "project_id": { + "type": "string", + "pattern": "^[a-z0-9][a-z0-9_-]{1,63}$", + "description": "所屬租戶" + }, + "channel_type": { + "type": "string", + "enum": ["telegram", "slack", "webhook", "api"], + "description": "Channel 類型" + }, + "event_type": { + "type": "string", + "enum": [ + "message_received", + "callback_query", + "command_invoked", + "webhook_post", + "api_request", + "approval_response" + ], + "description": "事件類型" + }, + "provider_event_id": { + "type": "string", + "maxLength": 256, + "description": "原始 provider 的事件 ID(去重用)" + }, + "user_id": { + "type": "string", + "description": "發送事件的使用者識別符(platform_subject_id)" + }, + "chat_id": { + "type": ["string", "null"], + "description": "Telegram chat_id 或 Slack channel_id" + }, + "payload": { + "type": "object", + "description": "原始 channel payload(channel-specific 結構)", + "minProperties": 1 + }, + "text": { + "type": ["string", "null"], + "maxLength": 4096, + "description": "訊息文字(方便直接讀取,不需 parse payload)" + }, + "attachments": { + "type": "array", + "items": { + "type": "object", + "required": ["attachment_type", "file_id"], + "additionalProperties": false, + "properties": { + "attachment_type": {"type": "string", "enum": ["photo", "document", "audio", "video"]}, + "file_id": {"type": "string"}, + "sha256": { + "type": ["string", "null"], + "pattern": "^[0-9a-f]{64}$", + "description": "下載後的附件 SHA-256" + } + } + }, + "description": "附件清單" + }, + "run_id": { + "type": ["string", "null"], + "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", + "description": "觸發的 run UUID(事件處理後填入)" + 
}, + "is_duplicate": { + "type": "boolean", + "default": false, + "description": "此事件是否為重複(去重後設為 true)" + }, + "received_at": { + "type": "string", + "format": "date-time", + "description": "Platform 收到事件的時間(ISO 8601)" + } + } +} diff --git a/packages/awooop-contracts/schemas/mcp_gateway.json b/packages/awooop-contracts/schemas/mcp_gateway.json new file mode 100644 index 00000000..87727d55 --- /dev/null +++ b/packages/awooop-contracts/schemas/mcp_gateway.json @@ -0,0 +1,89 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://awooop.platform/contracts/v1/mcp_gateway", + "title": "MCP Gateway Contract", + "description": "AwoooP MCP Gateway 合約 — 定義工具閘道的傳輸層、認證與速率控制", + "type": "object", + "required": ["gateway_id", "gateway_name", "transport"], + "additionalProperties": false, + "properties": { + "gateway_id": { + "type": "string", + "pattern": "^[a-z0-9][a-z0-9_-]{1,127}$", + "description": "Gateway 識別符" + }, + "gateway_name": { + "type": "string", + "minLength": 1, + "maxLength": 256, + "description": "人類可讀名稱" + }, + "transport": { + "type": "string", + "enum": ["stdio", "http", "sse"], + "description": "MCP 傳輸協議(ADR-113)" + }, + "endpoint": { + "type": "string", + "format": "uri", + "description": "Gateway 端點 URL(transport=http/sse 時必填)" + }, + "auth_scheme": { + "type": "string", + "enum": ["none", "bearer", "hmac"], + "default": "none", + "description": "認證方式" + }, + "hmac_secret_ref": { + "type": "string", + "description": "HMAC secret 的 K8s Secret key ref(auth_scheme=hmac 時使用)" + }, + "tools_exposed": { + "type": "array", + "items": { + "type": "object", + "required": ["tool_name", "schema_sha256"], + "additionalProperties": false, + "properties": { + "tool_name": {"type": "string"}, + "description": {"type": "string"}, + "schema_sha256": { + "type": "string", + "pattern": "^[0-9a-f]{64}$", + "description": "工具 input schema 的 SHA-256(確保合約不可竄改)" + }, + "is_destructive": { + "type": "boolean", + "default": false, + "description": 
"工具是否具破壞性(影響 require_approval 判斷)" + } + } + }, + "description": "此 gateway 暴露的工具清單" + }, + "rate_limit_rpm": { + "type": ["integer", "null"], + "minimum": 1, + "description": "每分鐘最大 tool call 次數;null = 無限制" + }, + "timeout_seconds": { + "type": "integer", + "minimum": 1, + "maximum": 300, + "default": 30, + "description": "單次 tool call 超時秒數" + }, + "is_enabled": { + "type": "boolean", + "default": true, + "description": "Gateway 是否啟用" + } + }, + "if": { + "properties": {"transport": {"const": "http"}}, + "required": ["transport"] + }, + "then": { + "required": ["endpoint"] + } +} diff --git a/packages/awooop-contracts/schemas/policy_routing.json b/packages/awooop-contracts/schemas/policy_routing.json new file mode 100644 index 00000000..00b1a0f7 --- /dev/null +++ b/packages/awooop-contracts/schemas/policy_routing.json @@ -0,0 +1,117 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://awooop.platform/contracts/v1/policy_routing", + "title": "Policy Routing Contract", + "description": "AwoooP 路由/政策合約 — 定義 LLM 路由規則、fallback 順序與費用保護", + "type": "object", + "required": ["policy_id", "policy_name", "routing_rules"], + "additionalProperties": false, + "properties": { + "policy_id": { + "type": "string", + "pattern": "^[a-z0-9][a-z0-9_-]{1,127}$", + "description": "Policy 識別符" + }, + "policy_name": { + "type": "string", + "minLength": 1, + "maxLength": 256, + "description": "人類可讀政策名稱" + }, + "routing_rules": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "required": ["rule_id", "priority", "provider", "model"], + "additionalProperties": false, + "properties": { + "rule_id": {"type": "string"}, + "priority": { + "type": "integer", + "minimum": 0, + "maximum": 9999, + "description": "數字越小優先級越高" + }, + "provider": { + "type": "string", + "enum": ["anthropic", "openai", "ollama", "gemini", "nvidia", "openrouter"] + }, + "model": {"type": "string"}, + "condition": { + "type": "object", + "description": "路由條件(task_type, 
token_budget, time_range 等)", + "properties": { + "task_types": { + "type": "array", + "items": {"type": "string"} + }, + "max_prompt_tokens": {"type": "integer", "minimum": 1}, + "time_range": { + "type": "object", + "properties": { + "start_utc": {"type": "string", "pattern": "^([01][0-9]|2[0-3]):[0-5][0-9]$"}, + "end_utc": {"type": "string", "pattern": "^([01][0-9]|2[0-3]):[0-5][0-9]$"} + } + } + } + }, + "weight": { + "type": "integer", + "minimum": 1, + "maximum": 100, + "default": 100, + "description": "同 priority 時的加權比例(用於 A/B 流量分割)" + } + } + }, + "description": "路由規則清單(按 priority 升序評估)" + }, + "fallback_provider": { + "type": "string", + "enum": ["anthropic", "openai", "ollama", "gemini", "nvidia", "openrouter"], + "description": "所有規則 miss 時的最終 fallback" + }, + "fallback_model": { + "type": "string", + "description": "fallback_provider 使用的模型" + }, + "max_cost_per_run_usd": { + "type": ["number", "null"], + "minimum": 0, + "description": "單次 run 費用上限;null = 無限制" + }, + "retry_policy": { + "type": "object", + "additionalProperties": false, + "properties": { + "max_retries": { + "type": "integer", + "minimum": 0, + "maximum": 10, + "default": 3 + }, + "backoff_base_seconds": { + "type": "number", + "minimum": 0.1, + "maximum": 60, + "default": 1.0 + }, + "retry_on_provider_errors": { + "type": "boolean", + "default": true + } + } + }, + "effective_from": { + "type": "string", + "format": "date-time", + "description": "政策生效起始時間(ISO 8601)" + }, + "effective_to": { + "type": ["string", "null"], + "format": "date-time", + "description": "政策生效結束時間;null = 永久有效" + } + } +} diff --git a/packages/awooop-contracts/schemas/project_tenant.json b/packages/awooop-contracts/schemas/project_tenant.json new file mode 100644 index 00000000..fb515cb9 --- /dev/null +++ b/packages/awooop-contracts/schemas/project_tenant.json @@ -0,0 +1,50 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://awooop.platform/contracts/v1/project_tenant", + "title": "ProjectTenant 
Contract", + "description": "AwoooP 租戶/專案合約 — 定義一個 project 的能力邊界與預算配額", + "type": "object", + "required": ["project_id", "display_name", "migration_mode"], + "additionalProperties": false, + "properties": { + "project_id": { + "type": "string", + "pattern": "^[a-z0-9][a-z0-9_-]{1,63}$", + "description": "全局唯一租戶識別符,只允許小寫字母/數字/底線/連字號" + }, + "display_name": { + "type": "string", + "minLength": 1, + "maxLength": 256, + "description": "人類可讀名稱" + }, + "migration_mode": { + "type": "string", + "enum": ["legacy_awoooi_default", "shadow", "canary", "active"], + "description": "Strangler Fig 遷移階段(ADR-106)" + }, + "budget_limit_usd": { + "type": ["number", "null"], + "minimum": 0, + "description": "每日 LLM 費用上限(USD);null = 無上限" + }, + "allowed_channels": { + "type": "array", + "items": { + "type": "string", + "enum": ["telegram", "slack", "webhook", "api"] + }, + "uniqueItems": true, + "description": "此租戶允許使用的 channel 類型清單" + }, + "is_active": { + "type": "boolean", + "default": true, + "description": "租戶是否啟用" + }, + "metadata": { + "type": "object", + "description": "擴充欄位(自由 JSON)" + } + } +} diff --git a/packages/awooop-contracts/schemas/runtime_run_state.json b/packages/awooop-contracts/schemas/runtime_run_state.json new file mode 100644 index 00000000..75b89d01 --- /dev/null +++ b/packages/awooop-contracts/schemas/runtime_run_state.json @@ -0,0 +1,103 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://awooop.platform/contracts/v1/runtime_run_state", + "title": "Runtime Run State Contract", + "description": "AwoooP Run 狀態機合約 — 定義 run lifecycle 的有效狀態與轉換規則", + "type": "object", + "required": ["run_id", "project_id", "agent_id", "state"], + "additionalProperties": false, + "properties": { + "run_id": { + "type": "string", + "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", + "description": "Run UUID(UUID v7 格式)" + }, + "project_id": { + "type": "string", + "pattern": "^[a-z0-9][a-z0-9_-]{1,63}$", + "description": 
"所屬租戶" + }, + "agent_id": { + "type": "string", + "description": "執行此 run 的 agent" + }, + "state": { + "type": "string", + "enum": [ + "pending", + "running", + "waiting_approval", + "waiting_tool", + "completed", + "failed", + "cancelled", + "timeout" + ], + "description": "Run 當前狀態(FSM)" + }, + "trace_id": { + "type": "string", + "description": "W3C traceparent-compatible trace ID" + }, + "parent_run_id": { + "type": ["string", "null"], + "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", + "description": "父 run ID(sub-agent 場景)" + }, + "trigger": { + "type": "object", + "required": ["trigger_type"], + "additionalProperties": false, + "properties": { + "trigger_type": { + "type": "string", + "enum": ["channel_event", "schedule", "api", "sub_agent", "retry"] + }, + "channel_event_id": {"type": "string"}, + "schedule_id": {"type": "string"}, + "triggered_by": {"type": "string"} + }, + "description": "觸發此 run 的事件來源" + }, + "input_sha256": { + "type": "string", + "pattern": "^[0-9a-f]{64}$", + "description": "Run 輸入 payload 的 SHA-256(artifact integrity)" + }, + "output_sha256": { + "type": ["string", "null"], + "pattern": "^[0-9a-f]{64}$", + "description": "Run 輸出結果的 SHA-256(完成後填入)" + }, + "started_at": { + "type": ["string", "null"], + "format": "date-time", + "description": "run 開始執行時間" + }, + "completed_at": { + "type": ["string", "null"], + "format": "date-time", + "description": "run 完成/失敗時間" + }, + "timeout_at": { + "type": ["string", "null"], + "format": "date-time", + "description": "run 預計超時時間" + }, + "error_code": { + "type": ["string", "null"], + "description": "失敗錯誤碼(state=failed 時填入)" + }, + "cost_usd": { + "type": ["number", "null"], + "minimum": 0, + "description": "此 run 累計 LLM 費用(USD)" + }, + "step_count": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "已執行 LLM step 數" + } + } +} diff --git a/scripts/verify/verify_telegram_dedup_b3a0f0d7.sh b/scripts/verify/verify_telegram_dedup_b3a0f0d7.sh new file 
mode 100755
index 00000000..99b03a40
--- /dev/null
+++ b/scripts/verify/verify_telegram_dedup_b3a0f0d7.sh
@@ -0,0 +1,118 @@
+#!/usr/bin/env bash
+# Telegram dedup fix verification — commit b3a0f0d7 (fingerprint dedup + 24h TTL)
+# Deployed: 2026-05-02 16:25 Asia/Taipei
+# Usage: ssh wooo@192.168.0.121 'bash -s' < verify_telegram_dedup_b3a0f0d7.sh
+#    or: scp the script to the host, then sudo bash verify_telegram_dedup_b3a0f0d7.sh
+# Read-only: runs SELECT queries and Redis SCANs; writes no prod data
+
+# Stop on the first failing command, unset variable, or failed pipeline stage.
+set -euo pipefail
+
+# Name of the first pod labelled app=awoooi-api in awoooi-prod; every check below runs inside it.
+POD=$(sudo kubectl get pods -n awoooi-prod -l app=awoooi-api -o jsonpath='{.items[0].metadata.name}')
+echo "=== Pod: $POD ==="
+echo "=== Image SHA (應含 b3a0f0d7) ==="
+sudo kubectl get pod -n awoooi-prod "$POD" -o jsonpath='{.spec.containers[0].image}'
+echo
+echo
+
+# A: top 10 (incident title, first affected service) pairs by Telegram send count in the last hour.
+echo "=== A. 過去 1h Telegram 發送 top(部署後)==="
+sudo kubectl exec -n awoooi-prod "$POD" -- python -c "
+import asyncio, os, asyncpg
+async def q():
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'])
+    rows = await conn.fetch(\"\"\"
+        SELECT
+            COALESCE(i.title, 'unknown') AS alertname,
+            COALESCE(i.affected_services[1], 'unknown') AS target,
+            COUNT(t.id) AS msg_count,
+            MIN(t.created_at) AS first_sent,
+            MAX(t.created_at) AS last_sent
+        FROM notification_outcomes t
+        JOIN approval_records a ON t.approval_id = a.id
+        JOIN incidents i ON a.incident_id = i.id
+        WHERE t.channel='telegram' AND t.created_at > now() - interval '1 hour'
+        GROUP BY 1,2 ORDER BY 3 DESC LIMIT 10
+    \"\"\")
+    for r in rows:
+        print(f\" {r['msg_count']:>3} | {r['alertname'][:40]:<40} | {r['target'][:30]:<30} | first={r['first_sent']:%H:%M} last={r['last_sent']:%H:%M}\")
+    await conn.close()
+asyncio.run(q())
+"
+
+echo
+# B: the same ranking over the last 24 hours, so pre-deploy volume is included for comparison.
+echo "=== B. 過去 24h(含部署前對照)==="
+sudo kubectl exec -n awoooi-prod "$POD" -- python -c "
+import asyncio, os, asyncpg
+async def q():
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'])
+    rows = await conn.fetch(\"\"\"
+        SELECT
+            COALESCE(i.title, 'unknown') AS alertname,
+            COALESCE(i.affected_services[1], 'unknown') AS target,
+            COUNT(t.id) AS msg_count
+        FROM notification_outcomes t
+        JOIN approval_records a ON t.approval_id = a.id
+        JOIN incidents i ON a.incident_id = i.id
+        WHERE t.channel='telegram' AND t.created_at > now() - interval '24 hours'
+        GROUP BY 1,2 ORDER BY 3 DESC LIMIT 10
+    \"\"\")
+    for r in rows:
+        print(f\" {r['msg_count']:>3} | {r['alertname'][:40]:<40} | {r['target'][:30]:<30}\")
+    await conn.close()
+asyncio.run(q())
+"
+
+echo
+# C: 24h total vs. sends after the 2026-05-02 16:25 Asia/Taipei cutoff for the two listed incidents.
+# title is COALESCEd (as in A and B) because the f-string slices it; a NULL title would raise TypeError.
+echo "=== C. 截圖兩 INC 最後發送時刻 ==="
+sudo kubectl exec -n awoooi-prod "$POD" -- python -c "
+import asyncio, os, asyncpg
+async def q():
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'])
+    rows = await conn.fetch(\"\"\"
+        SELECT i.id, COALESCE(i.title, 'unknown') AS title, COUNT(t.id) AS total_24h,
+               MAX(t.created_at) AS last_sent,
+               COUNT(t.id) FILTER (WHERE t.created_at > '2026-05-02 16:25 Asia/Taipei'::timestamptz) AS post_deploy
+        FROM notification_outcomes t
+        JOIN approval_records a ON t.approval_id = a.id
+        JOIN incidents i ON a.incident_id = i.id
+        WHERE i.id IN ('INC-20260501-6FE3BD','INC-20260502-FD6E21')
+          AND t.channel='telegram' AND t.created_at > now() - interval '24 hours'
+        GROUP BY 1,2 ORDER BY 1
+    \"\"\")
+    for r in rows:
+        print(f\" {r['id']} | {r['title'][:40]:<40} | 24h={r['total_24h']} 部署後={r['post_deploy']} last={r['last_sent']:%H:%M}\")
+    await conn.close()
+asyncio.run(q())
+"
+
+echo
+# D: count dedup keys per prefix. Uses cursor-based SCAN (scan_iter) rather than KEYS, which walks the
+# whole keyspace in one blocking call; SCAN may repeat a key, so results are de-duplicated via a set.
+echo "=== D. Redis dedup key 結構(fingerprint 應已建立)==="
+sudo kubectl exec -n awoooi-prod "$POD" -- python -c "
+import asyncio, os
+from redis.asyncio import Redis
+async def q():
+    r = Redis.from_url(os.environ['REDIS_URL'])
+    fp_keys = sorted({k async for k in r.scan_iter(match='telegram_sent:fp:*', count=1000)})
+    inc_keys = sorted({k async for k in r.scan_iter(match='telegram_sent:INC-*', count=1000)})
+    print(f' telegram_sent:fp:* (新格式) = {len(fp_keys)} (應 > 0)')
+    print(f' telegram_sent:INC-* (舊格式) = {len(inc_keys)} (應 = 0 或減少中)')
+    if fp_keys:
+        print(f' 範例 fp key: {fp_keys[0].decode() if isinstance(fp_keys[0], bytes) else fp_keys[0]}')
+    sweeper_keys = sorted({k async for k in r.scan_iter(match='sweeper_done:*', count=1000)})
+    print(f' sweeper_done:* = {len(sweeper_keys)} (24h TTL,整個 INVESTIGATING 集合)')
+asyncio.run(q())
+"
+
+echo
+echo "=== 驗收標準 ==="
+echo "✅ A 段任何 fingerprint msg_count ≤ 2 → 修復生效"
+echo "✅ C 段兩 INC 部署後 ≤ 1 → 鐵證生效"
+echo "✅ D 段 telegram_sent:fp:* 已建立 → 新 dedup 邏輯有跑"
+echo "❌ 任何 fingerprint 部署後仍 ≥ 5 → 未生效,回報 Claude"