## Phase 1-3: Control Plane + Contract System - awooop_phase1_control_plane_2026-05-04.sql: 12 張核心表 + RLS - awooop_phase1_batch1_rls_2026-05-04.sql: 全部 FORCE RLS + GRANT - packages/awooop-contracts/: 六合約 JSON Schema + golden fixtures - src/models/awooop_contracts.py: Pydantic v2 contract models(extra=forbid) - src/repositories/contract_repository.py: contract lifecycle(draft→published→active) - src/services/contract_service.py: HMAC publish sig + Redis multi-sig activate - src/services/schema_validator.py: LLM output validator(retry×3, E-SCHEMA-001) ## Phase 2: Tenant Isolation - awooop_phase2_budget_ledger_2026-05-04.sql: budget_ledger + RLS - src/services/budget_service.py: Token Budget Hard Kill 三層防線 - src/core/context.py: PROJECT_ID ContextVar(31 background loop 自動繼承) - src/db/base.py + models.py: project_id 欄位 + RLS set_config 注入 - src/hermes/nl_gateway.py: project_id Redis key 前綴(Phase A 雙寫) - src/services/anomaly_counter.py: per-project 改造(Phase A fallback) ## Phase 4: Platform Shell in Shadow Mode - awooop_phase4_run_state_2026-05-04.sql: run_state + step_journal + idempotency - src/services/run_state_machine.py: 8-state FSM + SKIP LOCKED + stale reaper - src/services/platform_runtime.py: UUID v7 + W3C trace_id + shadow_execute - src/services/audit_sink.py: PII/secret redaction 9 patterns - src/api/v1/platform/runs.py: POST/GET /v1/platform/runs(Router→Service 架構) - src/workers/platform_worker.py: SKIP LOCKED worker + heartbeat + reaper loop - src/main.py: platform router + lifespan worker start/stop ## Phase 5: MCP Gateway 五閘門 - awooop_phase5_mcp_gateway_2026-05-04.sql: 4 表 + RLS - src/plugins/mcp/gateway.py: McpGateway(Gate 1~5, E-MCP-GATE-001~009) - src/plugins/mcp/redaction_middleware.py: 雙層 redaction + 16K 截斷 - src/plugins/mcp/registry.py: __provider name mangling(ADR-116) - src/plugins/mcp/credential_resolver.py: k8s secret ref 解析 - tests/test_mcp_credential_isolation.py: 10 個迴歸測試(secret leak 防再現) ## Phase 6-8: EwoooC + Channel Hub + Approval Token - 
awooop_phase6_ewoooc_onboarding_2026-05-04.sql: ewoooc tenant + 4 read-only MCP tools - awooop_phase7_channel_hub_2026-05-04.sql: conversation_event + outbound_message - src/services/provider_proxy.py: ProviderProxy + PlatformEnvelope(ADR-115) - src/services/channel_hub.py: Telegram inbound mirror + Progressive Feedback(30s) - src/services/awooop_approval_token.py: HS256 + jti NX replay 防護 + suggest mode Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
160 lines
5.6 KiB
Python
160 lines
5.6 KiB
Python
"""
|
||
MCP Redaction Middleware — 雙層 PII/Secret Redaction
|
||
=====================================================
|
||
AwoooP Phase 5.3: ADR-116 P1-04 + P1-09
|
||
2026-05-04 ogt + Claude Sonnet 4.6
|
||
|
||
MCP tool call 的 input/output 必須經過雙層 redaction:
|
||
Layer 1(audit_sink)— 寫入 audit log 前的 sanitization(欄位黑名單 + pattern 攔截)
|
||
Layer 2(本層) — MCP tool call input/output 專用:
|
||
- 移除已知 secret 欄位(_mcp_audit 注入的 context)
|
||
- 對 output 套用 audit_sink 的完整 redaction patterns
|
||
- 限制 output 大小(防 prompt stuffing)
|
||
|
||
設計原則(ADR-118 credential isolation 延伸):
|
||
- MCP tool 的 output 可能含 k8s secret 值 → 必須在 output 進入 LLM context 前 redact
|
||
- 只有「安全的」output 才能被 platform_runtime.shadow_execute 使用
|
||
- input credential 欄位(如 k8s_value)在送入 provider 前清除(credential isolation)
|
||
|
||
雙層保障的必要性:
|
||
- audit_sink 保護的是 audit log DB
|
||
- 本 middleware 保護的是 LLM context + gateway audit hash
|
||
- 兩者防護對象不同,不可互相替代
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import hashlib
|
||
import json
|
||
import re
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
from src.services.audit_sink import _BLOCKED_FIELD_NAMES, _REDACTION_PATTERNS, _redact_string
|
||
|
||
# Module-level structlog logger, named after this module via __name__.
# NOTE(review): not referenced by any function in the visible part of this file.
logger = structlog.get_logger(__name__)
|
||
|
||
# Maximum number of characters of MCP output kept for the LLM context
# (guards against prompt stuffing).
_MCP_OUTPUT_MAX_CHARS = 16_000

# Key under which the MCP gateway injects its audit context; the entry is
# dropped from tool-call parameters during input redaction.
_MCP_AUDIT_KEY = "_mcp_audit"

# Parameter names treated as raw credentials (Gate 5 credential isolation).
# Input redaction compares the lower-cased key against this set and replaces
# the value with a placeholder.
_MCP_CREDENTIAL_FIELDS = frozenset(
    (
        "k8s_value",
        "secret_value",
        "credential",
        "credential_value",
        "token_value",
        "api_key_value",
        "private_key_value",
    )
)
|
||
|
||
|
||
def redact_mcp_input(parameters: dict[str, Any]) -> dict[str, Any]:
    """
    Layer 2 input redaction: clean the input parameters of an MCP tool call.

    For every key/value pair in ``parameters`` (and, recursively, in any
    nested dict):

    1. The ``_MCP_AUDIT_KEY`` entry is dropped entirely (audit context that
       should not be forwarded with the call).
    2. A key whose lower-cased name is in ``_MCP_CREDENTIAL_FIELDS`` keeps
       its key but gets the value ``"[REDACTED:CREDENTIAL_ISOLATION]"``.
    3. A key whose lower-cased name is in ``_BLOCKED_FIELD_NAMES`` (shared
       with audit_sink) gets the value ``"[REDACTED:BLOCKED_FIELD]"``.
    4. Every remaining string value is passed through ``_redact_string``;
       dicts and lists are walked recursively; other values are kept as-is.

    Lists are walked at any nesting level. The previous implementation only
    inspected the direct items of a list, so a string or dict placed inside
    a nested list (``[[{"k8s_value": ...}]]``) was copied through untouched.

    Args:
        parameters: Tool-call parameters. Not mutated.

    Returns:
        A new dict holding the redacted parameters.
    """
    cleaned: dict[str, Any] = {}
    for key, value in parameters.items():
        # Drop the injected audit context.
        if key == _MCP_AUDIT_KEY:
            continue

        lowered = key.lower()

        # Credential isolation: never let a plaintext credential flow onward.
        if lowered in _MCP_CREDENTIAL_FIELDS:
            cleaned[key] = "[REDACTED:CREDENTIAL_ISOLATION]"
            continue

        # Field-name blocklist (kept aligned with audit_sink).
        if lowered in _BLOCKED_FIELD_NAMES:
            cleaned[key] = "[REDACTED:BLOCKED_FIELD]"
            continue

        cleaned[key] = _redact_mcp_input_value(value)

    return cleaned


def _redact_mcp_input_value(value: Any) -> Any:
    """Redact one parameter value, descending into dicts and (nested) lists."""
    if isinstance(value, str):
        return _redact_string(value)
    if isinstance(value, dict):
        return redact_mcp_input(value)
    if isinstance(value, list):
        # Recurse per item so that lists nested inside lists are scrubbed too.
        return [_redact_mcp_input_value(item) for item in value]
    return value
|
||
|
||
|
||
def redact_mcp_output(output: Any) -> Any:
    """
    Layer 2 output redaction: clean the output of an MCP tool call.

    Behaviour by type of ``output``:

    * ``str``  — passed through ``_redact_string``; if the redacted text is
      longer than ``_MCP_OUTPUT_MAX_CHARS`` it is cut to that length and a
      ``"\\n[TRUNCATED:<n> chars]"`` marker is appended, where ``<n>`` is the
      length of the original (unredacted) string.
    * ``dict`` — delegated to ``_redact_output_dict``; no size cap is applied
      by this function on that path.
    * ``list`` — each item is redacted recursively and its JSON-serialized
      length is added to a running total. The total is checked *before* each
      item, so the item that crosses the limit is still kept; on the next
      iteration a ``"[TRUNCATED:<n> items total]"`` marker is appended and
      the loop stops.
    * anything else (including ``None``) — returned unchanged.

    Returns:
        The redacted output, intended for use in the LLM context.
    """
    if isinstance(output, str):
        scrubbed = _redact_string(output)
        if len(scrubbed) <= _MCP_OUTPUT_MAX_CHARS:
            return scrubbed
        return scrubbed[:_MCP_OUTPUT_MAX_CHARS] + f"\n[TRUNCATED:{len(output)} chars]"

    if isinstance(output, dict):
        return _redact_output_dict(output)

    if not isinstance(output, list):
        # None and every other non-container value pass through untouched.
        return output

    kept: list[Any] = []
    chars_so_far = 0
    for element in output:
        if chars_so_far > _MCP_OUTPUT_MAX_CHARS:
            # Budget already exceeded by earlier items: flag and stop.
            kept.append(f"[TRUNCATED:{len(output)} items total]")
            break
        safe_element = redact_mcp_output(element)
        # Size is measured on the redacted item's JSON form.
        chars_so_far += len(json.dumps(safe_element, ensure_ascii=False, default=str))
        kept.append(safe_element)
    return kept
|
||
|
||
|
||
# Deepest nesting level that output redaction descends into.
_OUTPUT_MAX_DEPTH = 8


def _redact_output_dict(d: dict[str, Any], depth: int = 0) -> dict[str, Any]:
    """
    Recursively redact an output dict.

    * A key whose lower-cased name is in ``_BLOCKED_FIELD_NAMES`` gets the
      value ``"[REDACTED:BLOCKED_FIELD]"``.
    * String values are passed through ``_redact_string``.
    * Nested dicts and lists are walked recursively; other values are kept.
    * Once ``depth`` exceeds 8 the dict is replaced by
      ``{"[MAX_DEPTH]": True}`` instead of being walked.

    Lists are walked at any nesting level. The previous implementation only
    looked at the direct items of a list, so strings or dicts inside a
    nested list were copied through unredacted.

    Args:
        d: The dict to redact. Not mutated.
        depth: Current nesting depth (0 for the top-level call).

    Returns:
        A new dict holding the redacted content.
    """
    if depth > _OUTPUT_MAX_DEPTH:
        return {"[MAX_DEPTH]": True}

    result: dict[str, Any] = {}
    for key, value in d.items():
        # Field-name blocklist.
        if key.lower() in _BLOCKED_FIELD_NAMES:
            result[key] = "[REDACTED:BLOCKED_FIELD]"
        elif isinstance(value, str):
            result[key] = _redact_string(value)
        elif isinstance(value, dict):
            result[key] = _redact_output_dict(value, depth + 1)
        elif isinstance(value, list):
            result[key] = _redact_output_list(value, depth + 1)
        else:
            result[key] = value

    return result


def _redact_output_list(items: list[Any], depth: int) -> list[Any]:
    """Redact the items of a list; ``depth`` is the level assigned to child dicts."""
    redacted: list[Any] = []
    for item in items:
        if isinstance(item, str):
            redacted.append(_redact_string(item))
        elif isinstance(item, dict):
            redacted.append(_redact_output_dict(item, depth))
        elif isinstance(item, list):
            # Nested lists consume one depth level each so recursion stays
            # bounded; past the limit the content is dropped, not passed raw.
            if depth > _OUTPUT_MAX_DEPTH:
                redacted.append(["[MAX_DEPTH]"])
            else:
                redacted.append(_redact_output_list(item, depth + 1))
        else:
            redacted.append(item)
    return redacted
|
||
|
||
|
||
def compute_safe_hash(data: Any) -> str:
    """
    Return the hex SHA-256 digest of ``data`` serialized as key-sorted JSON.

    ``data`` is dumped with ``sort_keys=True`` and ``ensure_ascii=False``;
    values JSON cannot encode natively are converted with ``str``. The text
    is then encoded with ``str.encode()``'s default (UTF-8) and hashed.

    This function performs no redaction itself; it is meant to be called on
    already-redacted data (used for gateway audit).
    """
    json_text = json.dumps(
        data,
        default=str,
        ensure_ascii=False,
        sort_keys=True,
    )
    hasher = hashlib.sha256()
    hasher.update(json_text.encode())
    return hasher.hexdigest()
|