From 2faa167ed230a1dd090a89aae58fd7a5f00de230 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 1 Jun 2026 14:35:06 +0800 Subject: [PATCH] fix(api): route auto repair docker restart through mcp --- ...air_executor_docker_restart_2026-06-01.sql | 159 +++++++++++++++++ ...xecutor_docker_restart_2026-06-01_down.sql | 37 ++++ apps/api/src/services/auto_repair_service.py | 165 +++++++++++++++++- apps/api/tests/test_auto_repair_service.py | 103 +++++++++++ docs/LOGBOOK.md | 45 +++++ 5 files changed, 503 insertions(+), 6 deletions(-) create mode 100644 apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01.sql create mode 100644 apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01_down.sql diff --git a/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01.sql b/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01.sql new file mode 100644 index 00000000..7fc887e9 --- /dev/null +++ b/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01.sql @@ -0,0 +1,159 @@ +-- T24: auto-repair executor Docker restart MCP Gateway grant +-- 目的:讓已由 PlayBook 標記為 requires_approval=false 的安全容器重啟, +-- 透過 AwoooP MCP Gateway + Gate 5 policy projection 執行與稽核。 +-- 邊界:僅授權 ssh_docker_restart/write;複雜 shell、systemctl、prune 仍不得自動執行。 + +SELECT set_config('app.project_id', 'awoooi', FALSE); + +WITH agent_body AS ( + SELECT jsonb_build_object( + 'schema_version', 'awooop_agent_contract_v1', + 'agent_id', 'auto_repair_executor', + 'display_name', 'Auto Repair Executor', + 'project_id', 'awoooi', + 'purpose', 'Auto repair diagnostics and safe Docker container restart through AwoooP MCP Gateway', + 'allowed_scopes', jsonb_build_array('read', 'write'), + 'requires_gate5_for_scopes', jsonb_build_array('write'), + 'write_scope_constraints', jsonb_build_object( + 'allowed_tools', jsonb_build_array('ssh_docker_restart'), + 'required_playbook_requires_approval', false, + 'required_trust_score_min', 0.8, + 'forbidden_shell_patterns', jsonb_build_array('command_substitution', 'pipe', 'fallback_shell', 'systemd', 'prune') + ), + 'stage', 't24_auto_repair_docker_restart_gateway' + ) AS body_json +), +inserted_revision AS ( + INSERT INTO awooop_contract_revisions ( + project_id, + contract_family, + contract_id, + version_major, + version_minor, + lifecycle_status, + body_json, + body_hash, + body_schema_version, + publisher_id, + published_at + ) + SELECT + 'awoooi', + 'agent', + 'auto_repair_executor', + 1, + 1, + 'active', + body_json, + encode(digest(body_json::text, 'sha256'), 'hex'), + 'v1.1', + 'migration:t24_auto_repair_docker_restart_gateway', + NOW() + FROM agent_body + ON CONFLICT (project_id, contract_family, contract_id, version_major, version_minor) + DO NOTHING + RETURNING revision_id, project_id, contract_family, contract_id +), +chosen_revision AS ( + SELECT revision_id, project_id, contract_family, contract_id + FROM inserted_revision + UNION ALL + SELECT revision_id, project_id, contract_family, contract_id + FROM awooop_contract_revisions + WHERE project_id = 'awoooi' + AND contract_family = 'agent' + AND contract_id = 'auto_repair_executor' + AND version_major = 1 + AND version_minor = 1 + AND lifecycle_status = 'active' +), +upsert_pointer AS ( + INSERT INTO awooop_active_revisions ( + project_id, + contract_family, + contract_id, + active_revision_id, + updated_at + ) + SELECT DISTINCT ON (project_id, contract_family, contract_id) + project_id, + contract_family, + contract_id, + revision_id, + NOW() + FROM chosen_revision + ORDER BY project_id, contract_family, contract_id, revision_id + ON CONFLICT (project_id, contract_family, contract_id) + DO UPDATE SET + active_revision_id = EXCLUDED.active_revision_id, + updated_at = NOW() + RETURNING contract_id +), +upsert_tool AS ( + INSERT INTO awooop_mcp_tool_registry ( + project_id, + tool_name, + tool_type, + description, + allowed_scopes, + environment_tags, + is_active, + updated_at + ) + VALUES ( + 'awoooi', + 'ssh_docker_restart', + 'mcp_server', + 'Policy-approved Docker container restart over SSH for auto-repair', + '["write"]'::jsonb, + '{"env": "prod"}'::jsonb, + TRUE, + NOW() + ) + ON CONFLICT (project_id, tool_name) + DO UPDATE SET + description = EXCLUDED.description, + allowed_scopes = EXCLUDED.allowed_scopes, + environment_tags = EXCLUDED.environment_tags, + is_active = TRUE, + updated_at = NOW() + RETURNING tool_id, allowed_scopes +), +upsert_grant AS ( + INSERT INTO awooop_mcp_grants ( + project_id, + agent_id, + tool_id, + granted_by, + granted_scopes, + expires_at, + is_revoked, + revoked_at, + revoked_by + ) + SELECT + 'awoooi', + 'auto_repair_executor', + tool_id, + 'migration:t24_auto_repair_docker_restart_gateway', + allowed_scopes, + NULL, + FALSE, + NULL, + NULL + FROM upsert_tool + ON CONFLICT (project_id, agent_id, tool_id) + DO UPDATE SET + granted_by = EXCLUDED.granted_by, + granted_scopes = EXCLUDED.granted_scopes, + expires_at = NULL, + is_revoked = FALSE, + revoked_at = NULL, + revoked_by = NULL + RETURNING grant_id +) +SELECT + 'auto_repair_executor_docker_restart_gateway', + (SELECT count(*) FROM upsert_pointer) AS active_contract_rows, + (SELECT count(*) FROM upsert_tool) AS tool_rows, + (SELECT count(*) FROM upsert_grant) AS grant_rows; diff --git a/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01_down.sql b/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01_down.sql new file mode 100644 index 00000000..dc430b92 --- /dev/null +++ b/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01_down.sql @@ -0,0 +1,37 @@ +-- Rollback T24: revoke auto_repair_executor Docker restart write grant. + +SELECT set_config('app.project_id', 'awoooi', FALSE); + +UPDATE awooop_mcp_grants +SET is_revoked = TRUE, + revoked_at = NOW(), + revoked_by = 'rollback:t24_auto_repair_docker_restart_gateway' +WHERE project_id = 'awoooi' + AND agent_id = 'auto_repair_executor' + AND granted_by = 'migration:t24_auto_repair_docker_restart_gateway'; + +WITH previous_revision AS ( + SELECT revision_id, project_id, contract_family, contract_id + FROM awooop_contract_revisions + WHERE project_id = 'awoooi' + AND contract_family = 'agent' + AND contract_id = 'auto_repair_executor' + AND version_major = 1 + AND version_minor = 0 + AND lifecycle_status = 'active' + ORDER BY revision_id DESC + LIMIT 1 +) +INSERT INTO awooop_active_revisions ( + project_id, + contract_family, + contract_id, + active_revision_id, + updated_at +) +SELECT project_id, contract_family, contract_id, revision_id, NOW() +FROM previous_revision +ON CONFLICT (project_id, contract_family, contract_id) +DO UPDATE SET + active_revision_id = EXCLUDED.active_revision_id, + updated_at = NOW(); diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py index c5dfcb5f..974e397c 100644 --- a/apps/api/src/services/auto_repair_service.py +++ b/apps/api/src/services/auto_repair_service.py @@ -26,6 +26,7 @@ from collections.abc import Callable from dataclasses import dataclass import re from typing import Any, Protocol +from uuid import NAMESPACE_URL, UUID, uuid5 import structlog @@ -88,6 +89,7 @@ class _SshMcpRoute: tool_name: str params: dict[str, Any] + required_scope: str = "read" _SHORT_HOST_MAP: dict[str, str] = { @@ -130,6 +132,12 @@ _SSH_WRITE_KEYWORDS = ( "bash ", ) +_AUTO_REPAIR_GATEWAY_AGENT_ID = "auto_repair_executor" +_AUTO_REPAIR_GATEWAY_PROJECT_ID = "awoooi" +_AUTO_REPAIR_GATEWAY_APPROVAL_TTL_SECONDS = 600 +_SAFE_DOCKER_CONTAINER_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_.-]{0,127}$") +_UNSAFE_LEGACY_WRITE_PATTERN = re.compile(r"[;|<>`\n]|(\$\{|\$\()") + # ============================================================================= # Auto Repair Service Interface @@ -1003,6 +1011,52 @@ class AutoRepairService: return _SshMcpRoute(tool_name="ssh_diagnose", params=params) + def _route_legacy_ssh_write_command_to_mcp( + self, + incident: Incident, + command: str, + ) -> _SshMcpRoute | None: + """Map safe legacy Docker restart steps to the write-scoped MCP tool. + + This intentionally supports only the historical pattern used by + DockerContainerUnhealthy playbooks: + ``ssh {host} 'docker inspect {container} ... && docker restart {container}'``. + Broader shell, command substitution, exporter discovery, and systemd + fallbacks stay blocked so they can move through approval/manual review. + """ + + raw_command = (command or "").strip() + lowered = raw_command.lower() + if not lowered.startswith("ssh ") or "docker restart" not in lowered: + return None + + if _UNSAFE_LEGACY_WRITE_PATTERN.search(raw_command): + return None + + host = self._resolve_ssh_host_for_incident(incident, raw_command) + if not host: + return None + + container_name = ( + self._resolve_container_name_for_incident(incident, raw_command) + or self._extract_docker_restart_container(raw_command) + ) + if not self._is_safe_docker_container_name(container_name): + return None + + if not self._has_simple_docker_restart_tail(raw_command, container_name): + return None + + return _SshMcpRoute( + tool_name="ssh_docker_restart", + params={ + "host": host, + "container_name": container_name, + "trust_score": 0.85, + }, + required_scope="write", + ) + def preview_read_only_ssh_mcp_route( self, incident: Incident, @@ -1077,11 +1131,35 @@ class AutoRepairService: return value match = re.search( - r"docker\s+(?:stats\s+--no-stream|inspect|logs|top|ps\s+-a\s+--filter\s+name=)\s+([a-zA-Z0-9._-]+)", + r"docker\s+(?:stats\s+--no-stream|inspect|logs|top|restart|ps\s+-a\s+--filter\s+name=)\s+([a-zA-Z0-9._-]+)", command, ) return match.group(1) if match else "" + @staticmethod + def _extract_docker_restart_container(command: str) -> str: + match = re.search( + r"docker\s+restart\s+([A-Za-z0-9][A-Za-z0-9_.-]{0,127})(?:\s*['\"])?\s*$", + command, + flags=re.IGNORECASE, + ) + return match.group(1) if match else "" + + @staticmethod + def _is_safe_docker_container_name(container_name: str) -> bool: + return bool(container_name and _SAFE_DOCKER_CONTAINER_RE.fullmatch(container_name)) + + @staticmethod + def _has_simple_docker_restart_tail(command: str, container_name: str) -> bool: + target = rf"(?:{re.escape(container_name)}|\{{container\}}|\{{target\}})" + return bool( + re.search( + rf"docker\s+restart\s+{target}(?:\s*['\"])?\s*$", + command, + flags=re.IGNORECASE, + ) + ) + @staticmethod def _incident_labels(incident: Incident) -> dict[str, Any]: for signal in incident.signals or []: @@ -1094,6 +1172,8 @@ class AutoRepairService: self, incident: Incident, route: _SshMcpRoute, + *, + approved: bool = True, ) -> str: """Execute a routed SSH diagnostic through AwoooP MCP Gateway.""" @@ -1103,22 +1183,38 @@ class AutoRepairService: from src.services.mcp_audit_context import with_mcp_audit_context incident_id = incident.incident_id + run_id: UUID | None = None + if route.required_scope != "read": + if not approved: + return ( + f"FAILED: mcp:{route.tool_name} approval required for " + f"{route.required_scope} scope" + ) + run_id = self._mcp_auto_repair_run_id(incident_id, route) + approval_projected = await self._project_auto_repair_mcp_approval( + route=route, + run_id=run_id, + ) + if not approval_projected: + return f"FAILED: mcp:{route.tool_name} approval projection failed" + params = with_mcp_audit_context( route.params, session_id=f"incident:{incident_id}:auto_repair_execute", incident_id=incident_id, flywheel_node="execute", - agent_role="auto_repair_executor", + agent_role=_AUTO_REPAIR_GATEWAY_AGENT_ID, ) async with get_db_context("awoooi") as db: ctx = GatewayContext( - project_id="awoooi", - agent_id="auto_repair_executor", + project_id=_AUTO_REPAIR_GATEWAY_PROJECT_ID, + agent_id=_AUTO_REPAIR_GATEWAY_AGENT_ID, tool_name=route.tool_name, + run_id=run_id, trace_id=incident_id, is_shadow=False, environment={"env": "prod"}, - required_scope="read", + required_scope=route.required_scope, ) result = await McpGateway(db).call(ctx, params) except McpGatewayError as exc: @@ -1137,6 +1233,55 @@ class AutoRepairService: return f"SUCCESS: mcp:{route.tool_name} {preview}".strip() return f"FAILED: mcp:{route.tool_name} {result.error or 'execution failed'}" + @staticmethod + def _mcp_auto_repair_run_id(incident_id: str, route: _SshMcpRoute) -> UUID: + stable_payload = ( + f"{_AUTO_REPAIR_GATEWAY_PROJECT_ID}:" + f"{_AUTO_REPAIR_GATEWAY_AGENT_ID}:" + f"{route.tool_name}:" + f"{incident_id}:" + f"{route.params.get('host', '')}:" + f"{route.params.get('container_name', '')}" + ) + return uuid5(NAMESPACE_URL, stable_payload) + + async def _project_auto_repair_mcp_approval( + self, + route: _SshMcpRoute, + run_id: UUID, + ) -> bool: + """Project policy-approved auto repair into Gateway's Gate 5 key.""" + + try: + from src.core.redis_client import get_redis + + redis = get_redis() + approval_key = ( + f"mcp_approval:{_AUTO_REPAIR_GATEWAY_PROJECT_ID}:" + f"{_AUTO_REPAIR_GATEWAY_AGENT_ID}:{route.tool_name}:{run_id}" + ) + await redis.set( + approval_key, + "approved:auto_repair_policy", + ex=_AUTO_REPAIR_GATEWAY_APPROVAL_TTL_SECONDS, + ) + logger.info( + "auto_repair_mcp_approval_projected", + tool=route.tool_name, + required_scope=route.required_scope, + run_id=str(run_id), + ) + return True + except Exception as exc: + logger.warning( + "auto_repair_mcp_approval_projection_failed", + tool=route.tool_name, + required_scope=route.required_scope, + run_id=str(run_id), + error=str(exc), + ) + return False + async def _execute_step(self, incident: Incident, step) -> str: """ 執行單一修復步驟 @@ -1172,9 +1317,17 @@ class AutoRepairService: if route is not None: return await self._execute_ssh_mcp_route(incident, route) + approved = not getattr(step, "requires_approval", False) + route = self._route_legacy_ssh_write_command_to_mcp(incident, step.command) + if route is not None: + return await self._execute_ssh_mcp_route( + incident, + route, + approved=approved, + ) + from src.services.host_repair_agent import HostRepairAgent agent = HostRepairAgent() - approved = not getattr(step, "requires_approval", False) result = await agent.repair_by_uri(step.command, approved=approved) if result.success: return f"SUCCESS: {result.output}" diff --git a/apps/api/tests/test_auto_repair_service.py b/apps/api/tests/test_auto_repair_service.py index 0edeaab2..08b651f0 100644 --- a/apps/api/tests/test_auto_repair_service.py +++ b/apps/api/tests/test_auto_repair_service.py @@ -459,6 +459,44 @@ class TestAutoRepairService: assert route is None + def test_legacy_ssh_docker_restart_routes_to_write_mcp_gateway(self, service): + """Safe legacy Docker restart steps use the governed write MCP tool.""" + incident = create_test_incident( + severity=Severity.P2, + alert_category="infrastructure", + alert_name="DockerContainerUnhealthy", + ) + incident.signals[0].labels.update({ + "host": "110", + "container_name": "minio", + }) + + route = service._route_legacy_ssh_write_command_to_mcp( + incident, + 'ssh {host} \'docker inspect {container} --format="{{.State.Health.Status}}" && docker restart {container}\'', + ) + + assert route is not None + assert route.tool_name == "ssh_docker_restart" + assert route.required_scope == "write" + assert route.params == { + "host": "192.168.0.110", + "container_name": "minio", + "trust_score": 0.85, + } + + def test_legacy_ssh_complex_restart_stays_blocked(self, service): + """Command substitution/fallback shell must not enter auto write MCP.""" + incident = create_test_incident(severity=Severity.P2) + incident.signals[0].labels.update({"host": "110", "container_name": "node-exporter"}) + + route = service._route_legacy_ssh_write_command_to_mcp( + incident, + 'ssh {host} \'docker restart $(docker ps -a --filter name=exporter --format "{{.Names}}" | head -1) || systemctl restart node_exporter\'', + ) + + assert route is None + @pytest.mark.asyncio async def test_execute_legacy_ssh_diagnostic_uses_mcp_gateway( self, @@ -508,6 +546,71 @@ class TestAutoRepairService: assert calls[0]["params"]["container_name"] == "momo-scheduler" assert calls[0]["params"]["_mcp_audit"]["flywheel_node"] == "execute" + @pytest.mark.asyncio + async def test_execute_legacy_ssh_docker_restart_uses_write_mcp_gateway( + self, + service, + monkeypatch, + ): + incident = create_test_incident( + severity=Severity.P2, + alert_category="infrastructure", + alert_name="DockerContainerUnhealthy", + ) + incident.signals[0].labels.update({ + "host": "110", + "container_name": "minio", + }) + step = RepairStep( + step_number=1, + action_type=ActionType.SSH_COMMAND, + command='ssh {host} \'docker inspect {container} --format="{{.State.Health.Status}}" && docker restart {container}\'', + risk_level=RiskLevel.MEDIUM, + requires_approval=False, + ) + calls = [] + redis_sets = [] + + class FakeRedis: + async def set(self, key, value, ex=None): + redis_sets.append({"key": key, "value": value, "ex": ex}) + return True + + class FakeGateway: + def __init__(self, db: object) -> None: + self.db = db + + async def call(self, ctx, params): + calls.append({"ctx": ctx, "params": params, "db": self.db}) + return MCPToolResult( + success=True, + execution_id="gw-write-ok", + output={"stdout": "minio"}, + ) + + monkeypatch.setattr("src.core.redis_client.get_redis", lambda: FakeRedis()) + monkeypatch.setattr("src.db.base.get_db_context", lambda _project_id: _DbContext()) + monkeypatch.setattr("src.plugins.mcp.gateway.McpGateway", FakeGateway) + + result = await service._execute_step(incident, step) + + assert result.startswith("SUCCESS: mcp:ssh_docker_restart") + assert calls + assert redis_sets + assert calls[0]["ctx"].agent_id == "auto_repair_executor" + assert calls[0]["ctx"].tool_name == "ssh_docker_restart" + assert calls[0]["ctx"].required_scope == "write" + assert calls[0]["ctx"].is_shadow is False + assert calls[0]["ctx"].run_id is not None + assert calls[0]["params"]["host"] == "192.168.0.110" + assert calls[0]["params"]["container_name"] == "minio" + assert calls[0]["params"]["trust_score"] == 0.85 + assert calls[0]["params"]["_mcp_audit"]["flywheel_node"] == "execute" + assert redis_sets[0]["key"].startswith( + "mcp_approval:awoooi:auto_repair_executor:ssh_docker_restart:" + ) + assert redis_sets[0]["value"] == "approved:auto_repair_policy" + @pytest.mark.asyncio async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service): """Test that LOW risk actions are allowed""" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index c2702ced..965908ec 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,48 @@ +## 2026-06-01|AI 自健診 W-1 SLO:auto_execute_success_rate 修復 + +**背景**: + +- Telegram 出現 `META-20260601130436`「飛輪核心異常」,W-1 SLO 違反 `auto_execute_success_rate`。 +- production `/api/v1/ai/slo` 顯示 7 日自動執行成功率約 `83.3%`,低於 `85%` 門檻;recent `auto_repair_executions` 指向 `PB-20260420-3F9C4C` 反覆失敗。 + +**根因**: + +- `Docker 容器 healthcheck 失敗` PlayBook 使用 legacy 指令: + `ssh {host} 'docker inspect {container} --format="{{.State.Health.Status}}" && docker restart {container}'` +- 只讀診斷已能走 `auto_repair_executor/ssh_diagnose/read` MCP Gateway,但此寫入指令沒有安全轉譯,落到 `HostRepairAgent.repair_by_uri()` 後被拒絕為 `unsupported scheme`。 +- production MCP grant 顯示 `auto_repair_executor` 只有 `ssh_diagnose/read`;`ssh_docker_restart/write` 僅授權 `approval_executor`,與 PlayBook `requires_approval=false` 的低/中風險自動修復意圖不一致。 + +**本次調整**: + +- `apps/api/src/services/auto_repair_service.py` + - 新增安全 legacy SSH write 路由:只允許簡單 `docker restart ` 或 `{container}` placeholder,且必須能解析出安全 container name。 + - 將該路由轉進 AwoooP MCP Gateway `ssh_docker_restart/write`,注入 `trust_score=0.85`、MCP audit context、`auto_repair_executor` agent 與 deterministic run id。 + - 對 write scope 先投影短效 Gate 5 key `approved:auto_repair_policy`;`requires_approval=true` 時仍 fail-closed,不會繞過人工審批。 + - 保留 read-only `ssh_diagnose` 原路徑;含 command substitution、pipe、fallback shell、systemd/prune 的複雜命令仍拒絕自動寫入。 +- `apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01.sql` + - 將 `auto_repair_executor` agent contract 升到 v1.1。 + - 僅新增 `ssh_docker_restart/write` grant,邊界寫入 contract:只允許安全 Docker container restart,其他寫入工具仍不授權。 +- `apps/api/tests/test_auto_repair_service.py` + - 補上 write MCP route、複雜 shell 封鎖與 Gate 5 approval projection 測試。 + +**驗證**: + +```text +PYTHONPATH=apps/api python -m py_compile apps/api/src/services/auto_repair_service.py +DATABASE_URL=sqlite+aiosqlite:///:memory: PYTHONPATH=apps/api python -m pytest apps/api/tests/test_auto_repair_service.py -q +→ 26 passed +DATABASE_URL=sqlite+aiosqlite:///:memory: PYTHONPATH=apps/api python -m pytest apps/api/tests/test_auto_repair_service.py apps/api/tests/test_approval_execution_mcp_audit.py -q +→ 29 passed +production owner-fallback rollback check: + apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01.sql + → migration rollback syntax ok with owner fallback +``` + +**進度邊界**: + +- 本輪修的是 W-1 的已知 executor gap,避免後續同一 PlayBook 再產生 `unsupported_action_scheme`。 +- 歷史 7 日 SLO 會等舊失敗列被新成功列稀釋或滾出視窗才完全回綠;部署後需監看下一輪 `auto_repair_executions` 是否出現 `SUCCESS: mcp:ssh_docker_restart` 與 AwoooP MCP Gateway audit。 + ## 2026-06-01|IwoooS 決策跑道 **背景**: