awoooi/apps/api/src/plugins/mcp/gateway.py

"""
MCP Gateway — 五閘門 Enforcement Service
=========================================
AwoooP Phase 5.2: ADR-116 五閘門強制執行
2026-05-04 ogt + Claude Sonnet 4.6

五閘門定義（依序，任一失敗即阻斷）：
  Gate 1 — Project：project_id 在 awooop_projects 且 migration_mode != 'legacy_awoooi_default'
  Gate 2 — Agent：agent_id 在 awooop_agents 且 status = 'active'
  Gate 3 — Tool：tool_id 在 awooop_mcp_tool_registry 且 grant 存在且未到期
  Gate 4 — Environment：tool.environment_tags 與 run context 匹配（shadow mode 強制放行）
  Gate 5 — Approval：工具 scope 需要 approval 時，檢查 multi_sig 是否已核准

錯誤碼（E-MCP-GATE-XXX）：
  E-MCP-GATE-001  Gate 1 project 不存在或 migration_mode 不符
  E-MCP-GATE-002  Gate 2 agent 不存在或未啟用
  E-MCP-GATE-003  Gate 3 tool 不在白名單或 grant 不存在/已到期/已撤銷
  E-MCP-GATE-004  Gate 4 environment 標籤不匹配（非 shadow mode）
  E-MCP-GATE-005  Gate 5 approval 尚未取得
  E-MCP-GATE-009  credential 解析失敗（k8s secret 取不到）

使用方式：
    from src.plugins.mcp.gateway import McpGateway, GatewayContext

    ctx = GatewayContext(
        project_id="awoooi",
        agent_id="my-agent",
        tool_name="kubectl_get",
        run_id=run_id,
        trace_id=trace_id,
        is_shadow=True,
    )
    result = await McpGateway(db).call(ctx, parameters={"namespace": "default"})
"""

from __future__ import annotations

import hashlib
import json
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime
from typing import Any
from uuid import UUID

import structlog
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from src.core.redis_client import get_redis
from src.db.awooop_models import (
    AwoooPActiveRevision,
    AwoooPMcpGatewayAudit,
    AwoooPMcpGrant,
    AwoooPMcpToolRegistry,
    AwoooPProject,
)
from src.plugins.mcp.interfaces import MCPToolResult
from src.plugins.mcp.registry import get_provider_registry

logger = structlog.get_logger(__name__)


# ─────────────────────────────────────────────────────────────────────────────
# 錯誤定義
# ─────────────────────────────────────────────────────────────────────────────

class McpGatewayError(Exception):
    """所有 Gateway 攔截錯誤的基礎類別"""

    def __init__(self, error_code: str, message: str, gate: int) -> None:
        super().__init__(message)
        self.error_code = error_code
        self.gate = gate


class GateProjectError(McpGatewayError):
    def __init__(self, msg: str = "project 不存在或 migration_mode 不符") -> None:
        super().__init__("E-MCP-GATE-001", msg, gate=1)


class GateAgentError(McpGatewayError):
    def __init__(self, msg: str = "agent 不存在或未啟用") -> None:
        super().__init__("E-MCP-GATE-002", msg, gate=2)


class GateToolError(McpGatewayError):
    def __init__(self, msg: str = "tool 不在白名單或 grant 失效") -> None:
        super().__init__("E-MCP-GATE-003", msg, gate=3)


class GateEnvironmentError(McpGatewayError):
    def __init__(self, msg: str = "environment 標籤不匹配") -> None:
        super().__init__("E-MCP-GATE-004", msg, gate=4)


class GateApprovalError(McpGatewayError):
    def __init__(self, msg: str = "approval 尚未取得") -> None:
        super().__init__("E-MCP-GATE-005", msg, gate=5)


class CredentialResolutionError(McpGatewayError):
    def __init__(self, msg: str = "credential 解析失敗") -> None:
        super().__init__("E-MCP-GATE-009", msg, gate=0)


# ─────────────────────────────────────────────────────────────────────────────
# Gateway Context（每次 call 一個）
# ─────────────────────────────────────────────────────────────────────────────

@dataclass
class GatewayContext:
    project_id: str
    agent_id: str
    tool_name: str
    run_id: UUID | None = None
    trace_id: str | None = None
    is_shadow: bool = True           # shadow mode：Gate 4/5 放行，不執行 destructive
    environment: dict[str, str] = field(default_factory=dict)  # e.g. {"env": "prod"}
    required_scope: str = "read"     # "read" | "write" | "admin"


@dataclass
class GateCheckResult:
    gate1_project: bool = False
    gate2_agent: bool = False
    gate3_tool: bool = False
    gate4_env: bool = False
    gate5_approval: bool = False

    def as_dict(self) -> dict[str, bool]:
        return {
            "gate1_project": self.gate1_project,
            "gate2_agent": self.gate2_agent,
            "gate3_tool": self.gate3_tool,
            "gate4_env": self.gate4_env,
            "gate5_approval": self.gate5_approval,
        }

    @property
    def all_passed(self) -> bool:
        return all([
            self.gate1_project,
            self.gate2_agent,
            self.gate3_tool,
            self.gate4_env,
            self.gate5_approval,
        ])


# ─────────────────────────────────────────────────────────────────────────────
# McpGateway
# ─────────────────────────────────────────────────────────────────────────────

class McpGateway:
    """
    MCP Gateway：五閘門 enforcement + audit log + credential isolation。

    每個 gateway call 都寫一筆 awooop_mcp_gateway_audit。
    """

    def __init__(self, db: AsyncSession) -> None:
        self._db = db

    async def call(
        self,
        ctx: GatewayContext,
        parameters: dict[str, Any],
    ) -> MCPToolResult:
        """
        執行五閘門檢查後呼叫底層 MCP provider。
        任一閘門失敗 → raise McpGatewayError + 寫 blocked audit。
        """
        started = time.monotonic()
        gate_result = GateCheckResult()
        tool_row: AwoooPMcpToolRegistry | None = None
        grant_row: AwoooPMcpGrant | None = None

        try:
            # Gate 1 — Project
            tool_row, grant_row = await self._gate1_project(ctx, gate_result)

            # Gate 2 — Agent
            await self._gate2_agent(ctx, gate_result)

            # Gate 3 — Tool + Grant
            tool_row, grant_row = await self._gate3_tool(ctx, gate_result)

            # Gate 4 — Environment（shadow mode 直接放行）
            await self._gate4_environment(ctx, tool_row, gate_result)

            # Gate 5 — Approval（shadow mode + scope=read 直接放行）
            await self._gate5_approval(ctx, grant_row, gate_result)

        except McpGatewayError as exc:
            latency = int((time.monotonic() - started) * 1000)
            await self._write_audit(
                ctx=ctx,
                tool_row=tool_row,
                parameters=parameters,
                result=None,
                gate_result=gate_result,
                result_status="blocked",
                block_gate=exc.gate,
                block_reason=f"{exc.error_code}: {exc}",
                latency_ms=latency,
            )
            raise

        # 五閘通過 → 執行 tool
        result: MCPToolResult | None = None
        result_status = "failed"
        try:
            result = await self._execute_tool(ctx, tool_row, parameters)
            result_status = "success" if result.success else "failed"
            return result
        except Exception as exc:
            logger.exception(
                "mcp_gateway_execution_error",
                project_id=ctx.project_id,
                tool_name=ctx.tool_name,
                error=str(exc),
            )
            raise
        finally:
            latency = int((time.monotonic() - started) * 1000)
            await self._write_audit(
                ctx=ctx,
                tool_row=tool_row,
                parameters=parameters,
                result=result,
                gate_result=gate_result,
                result_status=result_status,
                block_gate=None,
                block_reason=None,
                latency_ms=latency,
            )

    # ── 五閘門實作 ────────────────────────────────────────────────────────────

    async def _gate1_project(
        self, ctx: GatewayContext, gate_result: GateCheckResult
    ) -> tuple[AwoooPMcpToolRegistry | None, AwoooPMcpGrant | None]:
        """Gate 1：project 必須存在且 migration_mode != 'legacy_awoooi_default'"""
        result = await self._db.execute(
            select(AwoooPProject).where(
                AwoooPProject.project_id == ctx.project_id,
                AwoooPProject.migration_mode != "legacy_awoooi_default",
            )
        )
        project = result.scalar_one_or_none()
        if project is None:
            raise GateProjectError(
                f"project '{ctx.project_id}' 不存在或 migration_mode=legacy_awoooi_default"
            )
        gate_result.gate1_project = True
        return None, None

    async def _gate2_agent(
        self, ctx: GatewayContext, gate_result: GateCheckResult
    ) -> None:
        """Gate 2：agent 必須在 awooop_active_revisions 中有 active contract（family='agent'）"""
        result = await self._db.execute(
            select(AwoooPActiveRevision).where(
                AwoooPActiveRevision.project_id == ctx.project_id,
                AwoooPActiveRevision.contract_family == "agent",
                AwoooPActiveRevision.contract_id == ctx.agent_id,
            )
        )
        active = result.scalar_one_or_none()
        if active is None:
            raise GateAgentError(
                f"agent '{ctx.agent_id}' 在 '{ctx.project_id}' 無 active contract"
            )
        gate_result.gate2_agent = True

    async def _gate3_tool(
        self, ctx: GatewayContext, gate_result: GateCheckResult
    ) -> tuple[AwoooPMcpToolRegistry, AwoooPMcpGrant]:
        """Gate 3：tool 在白名單 + grant 有效（未到期、未撤銷）"""
        now = datetime.now(UTC)

        # 查 tool registry
        tool_result = await self._db.execute(
            select(AwoooPMcpToolRegistry).where(
                AwoooPMcpToolRegistry.project_id == ctx.project_id,
                AwoooPMcpToolRegistry.tool_name == ctx.tool_name,
                AwoooPMcpToolRegistry.is_active.is_(True),
            )
        )
        tool_row = tool_result.scalar_one_or_none()
        if tool_row is None:
            raise GateToolError(f"tool '{ctx.tool_name}' 不在白名單")

        # 查 grant（scope 必須包含 required_scope）
        grant_result = await self._db.execute(
            select(AwoooPMcpGrant).where(
                AwoooPMcpGrant.project_id == ctx.project_id,
                AwoooPMcpGrant.agent_id == ctx.agent_id,
                AwoooPMcpGrant.tool_id == tool_row.tool_id,
                AwoooPMcpGrant.is_revoked.is_(False),
            )
        )
        grant_row = grant_result.scalar_one_or_none()
        if grant_row is None:
            raise GateToolError(
                f"agent '{ctx.agent_id}' 對 tool '{ctx.tool_name}' 無有效 grant"
            )
        if grant_row.expires_at is not None and grant_row.expires_at < now:
            raise GateToolError(
                f"agent '{ctx.agent_id}' 對 tool '{ctx.tool_name}' 的 grant 已到期"
            )
        # scope 檢查：required_scope 必須在 granted_scopes 中
        granted_scopes: list[str] = grant_row.granted_scopes or []
        if ctx.required_scope not in granted_scopes:
            raise GateToolError(
                f"grant 未包含所需 scope '{ctx.required_scope}'（有：{granted_scopes}）"
            )

        gate_result.gate3_tool = True
        return tool_row, grant_row

    async def _gate4_environment(
        self,
        ctx: GatewayContext,
        tool_row: AwoooPMcpToolRegistry | None,
        gate_result: GateCheckResult,
    ) -> None:
        """Gate 4：environment 標籤匹配（shadow mode 強制放行）"""
        if ctx.is_shadow:
            gate_result.gate4_env = True
            return

        if tool_row is None:
            gate_result.gate4_env = True
            return

        required_tags: dict[str, str] = tool_row.environment_tags or {}
        for k, v in required_tags.items():
            if ctx.environment.get(k) != v:
                raise GateEnvironmentError(
                    f"environment tag '{k}' 期望 '{v}'，實際 '{ctx.environment.get(k)}'"
                )
        gate_result.gate4_env = True

    async def _gate5_approval(
        self,
        ctx: GatewayContext,
        grant_row: AwoooPMcpGrant | None,
        gate_result: GateCheckResult,
    ) -> None:
        """Gate 5：需要 approval 時，檢查 Redis multi_sig（shadow + read scope 直接放行）"""
        # shadow mode 或 read scope 不需 approval
        if ctx.is_shadow or ctx.required_scope == "read":
            gate_result.gate5_approval = True
            return

        # write/admin scope 需要檢查 approval
        if ctx.run_id is None:
            raise GateApprovalError("write/admin 操作需要 run_id（approval 追蹤用）")

        try:
            redis = get_redis()
            approval_key = f"mcp_approval:{ctx.project_id}:{ctx.agent_id}:{ctx.tool_name}:{ctx.run_id}"
            approved = await redis.get(approval_key)
        except Exception as exc:
            logger.warning(
                "mcp_gate5_redis_error",
                project_id=ctx.project_id,
                tool_name=ctx.tool_name,
                error=str(exc),
            )
            # Redis 失敗時 fail-closed（不放行）
            raise GateApprovalError(f"approval Redis 查詢失敗: {exc}") from exc

        if not approved:
            raise GateApprovalError(
                f"tool '{ctx.tool_name}' 需要 approval（key={approval_key}）"
            )
        gate_result.gate5_approval = True

    # ── 執行層 ───────────────────────────────────────────────────────────────

    async def _execute_tool(
        self,
        ctx: GatewayContext,
        tool_row: AwoooPMcpToolRegistry | None,
        parameters: dict[str, Any],
    ) -> MCPToolResult:
        """呼叫底層 MCP provider 執行工具"""
        provider = await self._resolve_provider(ctx, tool_row)

        # 找不到 provider → 回傳 shadow no-op
        if provider is None:
            logger.warning(
                "mcp_gateway_no_provider",
                tool_name=ctx.tool_name,
                is_shadow=ctx.is_shadow,
            )
            return MCPToolResult(
                success=True,
                execution_id=f"shadow-noop-{ctx.tool_name}",
                output={"shadow": True, "message": "no provider registered, shadow no-op"},
            )

        audit_params = dict(parameters)
        existing_audit = (
            parameters.get("_mcp_audit")
            if isinstance(parameters, dict) and isinstance(parameters.get("_mcp_audit"), dict)
            else {}
        )
        audit_params["_mcp_audit"] = {
            "project_id": ctx.project_id,
            "agent_id": ctx.agent_id,
            "run_id": str(ctx.run_id) if ctx.run_id else None,
            "trace_id": ctx.trace_id,
            "incident_id": existing_audit.get("incident_id") or ctx.trace_id,
            "session_id": existing_audit.get("session_id"),
            "flywheel_node": existing_audit.get("flywheel_node"),
            "agent_role": existing_audit.get("agent_role") or ctx.agent_id,
            "gateway_path": "awooop_mcp_gateway",
        }
        return await provider.execute(ctx.tool_name, audit_params)

    async def _resolve_provider(
        self,
        ctx: GatewayContext,
        tool_row: AwoooPMcpToolRegistry | None,
    ):
        """Find the provider that owns ctx.tool_name.

        ProviderRegistry is keyed by provider name (`kubernetes`, `ssh_host`, ...),
        while GatewayContext intentionally uses the governed tool name
        (`kubectl_get`, `ssh_diagnose`, ...).  Scan provider tool manifests as the
        compatibility bridge until registry exposes a first-class tool index.
        """
        registry = get_provider_registry()
        direct = registry.get(ctx.tool_name)
        if direct is not None:
            return direct

        lookup_name = tool_row.tool_name if tool_row else ctx.tool_name
        for provider in registry.all():
            try:
                tools = await provider.list_tools()
            except Exception as exc:
                logger.debug(
                    "mcp_gateway_provider_manifest_skipped",
                    provider=getattr(provider, "name", None),
                    tool_name=lookup_name,
                    error=str(exc),
                )
                continue
            if any(tool.name == lookup_name for tool in tools):
                return provider
        return None

    # ── Audit log ─────────────────────────────────────────────────────────────

    async def _write_audit(
        self,
        *,
        ctx: GatewayContext,
        tool_row: AwoooPMcpToolRegistry | None,
        parameters: dict[str, Any],
        result: MCPToolResult | None,
        gate_result: GateCheckResult,
        result_status: str,
        block_gate: int | None,
        block_reason: str | None,
        latency_ms: int,
    ) -> None:
        """寫 awooop_mcp_gateway_audit — 只寫 hash，不寫明文 input/output"""
        try:
            input_hash = hashlib.sha256(
                json.dumps(parameters, sort_keys=True, default=str).encode()
            ).hexdigest()

            output_hash: str | None = None
            if result is not None:
                output_hash = hashlib.sha256(
                    json.dumps(result.output, sort_keys=True, default=str).encode()
                ).hexdigest()

            gate_payload = {
                **gate_result.as_dict(),
                "schema_version": "awooop_mcp_gateway_audit_v1",
                "gateway_path": "awooop_mcp_gateway",
                "policy_enforced": True,
                "is_shadow": ctx.is_shadow,
                "required_scope": ctx.required_scope,
            }

            audit = AwoooPMcpGatewayAudit(
                project_id=ctx.project_id,
                run_id=ctx.run_id,
                trace_id=ctx.trace_id,
                agent_id=ctx.agent_id,
                tool_id=tool_row.tool_id if tool_row else None,  # type: ignore[arg-type]
                tool_name=ctx.tool_name,
                input_hash=input_hash,
                output_hash=output_hash,
                gate_result=gate_payload,
                result_status=result_status,
                block_gate=block_gate,
                block_reason=block_reason,
                latency_ms=latency_ms,
            )

            self._db.add(audit)
            await self._db.flush()
        except Exception as exc:
            logger.warning(
                "mcp_gateway_audit_write_failed",
                project_id=ctx.project_id,
                tool_name=ctx.tool_name,
                error=str(exc),
            )


# ─────────────────────────────────────────────────────────────────────────────
# 便捷函數
# ─────────────────────────────────────────────────────────────────────────────

async def gateway_call(
    db: AsyncSession,
    *,
    project_id: str,
    agent_id: str,
    tool_name: str,
    parameters: dict[str, Any],
    run_id: UUID | None = None,
    trace_id: str | None = None,
    is_shadow: bool = True,
    required_scope: str = "read",
    environment: dict[str, str] | None = None,
) -> MCPToolResult:
    """
    Stateless 便捷函數：建立 GatewayContext + 執行 McpGateway.call()。
    """
    ctx = GatewayContext(
        project_id=project_id,
        agent_id=agent_id,
        tool_name=tool_name,
        run_id=run_id,
        trace_id=trace_id,
        is_shadow=is_shadow,
        required_scope=required_scope,
        environment=environment or {},
    )
    return await McpGateway(db).call(ctx, parameters)