fix(api): route auto repair docker restart through mcp
Some checks failed
CD Pipeline / tests (push) Successful in 1m21s
Code Review / ai-code-review (push) Successful in 13s
run-migration / migrate (push) Successful in 9s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-06-01 14:35:06 +08:00
parent ea8e2b1106
commit 2faa167ed2
5 changed files with 503 additions and 6 deletions

View File

@@ -0,0 +1,159 @@
-- T24: grant the auto-repair executor the Docker-restart tool through the MCP Gateway.
-- Purpose: let safe container restarts that a PlayBook has already marked
--   requires_approval=false be executed and audited through the AwoooP MCP
--   Gateway and its Gate 5 policy projection.
-- Boundary: only ssh_docker_restart/write is granted; complex shell,
--   systemctl and prune must still never be executed automatically.
-- NOTE(review): digest() below is provided by the pgcrypto extension; this
--   script assumes it is already installed - confirm for new environments.
SELECT set_config('app.project_id', 'awoooi', FALSE);

WITH agent_body AS (
    -- Contract body for auto_repair_executor v1.1: adds the 'write' scope,
    -- restricted to the ssh_docker_restart tool.
    SELECT jsonb_build_object(
        'schema_version', 'awooop_agent_contract_v1',
        'agent_id', 'auto_repair_executor',
        'display_name', 'Auto Repair Executor',
        'project_id', 'awoooi',
        'purpose', 'Auto repair diagnostics and safe Docker container restart through AwoooP MCP Gateway',
        'allowed_scopes', jsonb_build_array('read', 'write'),
        'requires_gate5_for_scopes', jsonb_build_array('write'),
        'write_scope_constraints', jsonb_build_object(
            'allowed_tools', jsonb_build_array('ssh_docker_restart'),
            'required_playbook_requires_approval', false,
            'required_trust_score_min', 0.8,
            'forbidden_shell_patterns', jsonb_build_array('command_substitution', 'pipe', 'fallback_shell', 'systemd', 'prune')
        ),
        'stage', 't24_auto_repair_docker_restart_gateway'
    ) AS body_json
),
inserted_revision AS (
    -- Insert revision 1.1; a re-run hits the unique key and inserts nothing.
    INSERT INTO awooop_contract_revisions (
        project_id,
        contract_family,
        contract_id,
        version_major,
        version_minor,
        lifecycle_status,
        body_json,
        body_hash,
        body_schema_version,
        publisher_id,
        published_at
    )
    SELECT
        'awoooi',
        'agent',
        'auto_repair_executor',
        1,
        1,
        'active',
        body_json,
        encode(digest(body_json::text, 'sha256'), 'hex'),
        'v1.1',
        'migration:t24_auto_repair_docker_restart_gateway',
        NOW()
    FROM agent_body
    ON CONFLICT (project_id, contract_family, contract_id, version_major, version_minor)
    DO NOTHING
    RETURNING revision_id, project_id, contract_family, contract_id
),
chosen_revision AS (
    -- All sub-statements share one snapshot, so the row inserted above is not
    -- visible to a plain SELECT here. Combine the RETURNING output (first run)
    -- with the already-stored active v1.1 row (re-run).
    SELECT revision_id, project_id, contract_family, contract_id
    FROM inserted_revision
    UNION ALL
    SELECT revision_id, project_id, contract_family, contract_id
    FROM awooop_contract_revisions
    WHERE project_id = 'awoooi'
      AND contract_family = 'agent'
      AND contract_id = 'auto_repair_executor'
      AND version_major = 1
      AND version_minor = 1
      AND lifecycle_status = 'active'
),
upsert_pointer AS (
    -- Point the active-revision pointer at v1.1.
    INSERT INTO awooop_active_revisions (
        project_id,
        contract_family,
        contract_id,
        active_revision_id,
        updated_at
    )
    SELECT DISTINCT ON (project_id, contract_family, contract_id)
        project_id,
        contract_family,
        contract_id,
        revision_id,
        NOW()
    FROM chosen_revision
    ORDER BY project_id, contract_family, contract_id, revision_id
    ON CONFLICT (project_id, contract_family, contract_id)
    DO UPDATE SET
        active_revision_id = EXCLUDED.active_revision_id,
        updated_at = NOW()
    RETURNING contract_id
),
upsert_tool AS (
    -- Register (or refresh) the ssh_docker_restart tool as write-only.
    INSERT INTO awooop_mcp_tool_registry (
        project_id,
        tool_name,
        tool_type,
        description,
        allowed_scopes,
        environment_tags,
        is_active,
        updated_at
    )
    VALUES (
        'awoooi',
        'ssh_docker_restart',
        'mcp_server',
        'Policy-approved Docker container restart over SSH for auto-repair',
        '["write"]'::jsonb,
        '{"env": "prod"}'::jsonb,
        TRUE,
        NOW()
    )
    ON CONFLICT (project_id, tool_name)
    DO UPDATE SET
        description = EXCLUDED.description,
        allowed_scopes = EXCLUDED.allowed_scopes,
        environment_tags = EXCLUDED.environment_tags,
        is_active = TRUE,
        updated_at = NOW()
    RETURNING tool_id, allowed_scopes
),
upsert_grant AS (
    -- Grant the tool to auto_repair_executor, re-activating a revoked grant.
    INSERT INTO awooop_mcp_grants (
        project_id,
        agent_id,
        tool_id,
        granted_by,
        granted_scopes,
        expires_at,
        is_revoked,
        revoked_at,
        revoked_by
    )
    SELECT
        'awoooi',
        'auto_repair_executor',
        tool_id,
        'migration:t24_auto_repair_docker_restart_gateway',
        allowed_scopes,
        NULL,
        FALSE,
        NULL,
        NULL
    FROM upsert_tool
    -- Fail closed: never issue the write grant unless the v1.1 contract that
    -- constrains it actually became the active revision in this statement.
    WHERE EXISTS (SELECT 1 FROM upsert_pointer)
    ON CONFLICT (project_id, agent_id, tool_id)
    DO UPDATE SET
        granted_by = EXCLUDED.granted_by,
        granted_scopes = EXCLUDED.granted_scopes,
        expires_at = NULL,
        is_revoked = FALSE,
        revoked_at = NULL,
        revoked_by = NULL
    RETURNING grant_id
)
-- Summary row for the migration log; each count is expected to be 1.
SELECT
    'auto_repair_executor_docker_restart_gateway' AS migration_step,
    (SELECT count(*) FROM upsert_pointer) AS active_contract_rows,
    (SELECT count(*) FROM upsert_tool) AS tool_rows,
    (SELECT count(*) FROM upsert_grant) AS grant_rows;

View File

@@ -0,0 +1,37 @@
-- Rollback T24: revoke the auto_repair_executor Docker restart write grant and
-- move the active contract pointer back to revision 1.0.
-- Both changes run in a single statement so they apply atomically, and the
-- final SELECT reports what was changed (restored_contract_rows = 0 means no
-- active 1.0 revision was found and the pointer was left untouched).
SELECT set_config('app.project_id', 'awoooi', FALSE);

WITH revoked_grant AS (
    UPDATE awooop_mcp_grants
    SET
        is_revoked = TRUE,
        revoked_at = NOW(),
        revoked_by = 'rollback:t24_auto_repair_docker_restart_gateway'
    WHERE project_id = 'awoooi'
      AND agent_id = 'auto_repair_executor'
      AND granted_by = 'migration:t24_auto_repair_docker_restart_gateway'
      -- Skip grants that are already revoked so a re-run keeps the original
      -- revocation timestamp and actor.
      AND is_revoked IS NOT TRUE
    RETURNING grant_id
),
previous_revision AS (
    -- The revision that was active before T24 (version 1.0).
    SELECT revision_id, project_id, contract_family, contract_id
    FROM awooop_contract_revisions
    WHERE project_id = 'awoooi'
      AND contract_family = 'agent'
      AND contract_id = 'auto_repair_executor'
      AND version_major = 1
      AND version_minor = 0
      AND lifecycle_status = 'active'
    ORDER BY revision_id DESC
    LIMIT 1
),
restored_pointer AS (
    INSERT INTO awooop_active_revisions (
        project_id,
        contract_family,
        contract_id,
        active_revision_id,
        updated_at
    )
    SELECT project_id, contract_family, contract_id, revision_id, NOW()
    FROM previous_revision
    ON CONFLICT (project_id, contract_family, contract_id)
    DO UPDATE SET
        active_revision_id = EXCLUDED.active_revision_id,
        updated_at = NOW()
    RETURNING contract_id
)
SELECT
    'auto_repair_executor_docker_restart_gateway_rollback' AS rollback_step,
    (SELECT count(*) FROM revoked_grant) AS revoked_grant_rows,
    (SELECT count(*) FROM restored_pointer) AS restored_contract_rows;

View File

@@ -26,6 +26,7 @@ from collections.abc import Callable
from dataclasses import dataclass
import re
from typing import Any, Protocol
from uuid import NAMESPACE_URL, UUID, uuid5
import structlog
@@ -88,6 +89,7 @@ class _SshMcpRoute:
tool_name: str
params: dict[str, Any]
required_scope: str = "read"
_SHORT_HOST_MAP: dict[str, str] = {
@@ -130,6 +132,12 @@ _SSH_WRITE_KEYWORDS = (
"bash ",
)
_AUTO_REPAIR_GATEWAY_AGENT_ID = "auto_repair_executor"
_AUTO_REPAIR_GATEWAY_PROJECT_ID = "awoooi"
_AUTO_REPAIR_GATEWAY_APPROVAL_TTL_SECONDS = 600
_SAFE_DOCKER_CONTAINER_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_.-]{0,127}$")
_UNSAFE_LEGACY_WRITE_PATTERN = re.compile(r"[;|<>`\n]|(\$\{|\$\()")
# =============================================================================
# Auto Repair Service Interface
@@ -1003,6 +1011,52 @@ class AutoRepairService:
return _SshMcpRoute(tool_name="ssh_diagnose", params=params)
def _route_legacy_ssh_write_command_to_mcp(
    self,
    incident: Incident,
    command: str,
) -> _SshMcpRoute | None:
    """Translate a safe legacy Docker restart step into a write-scoped MCP route.

    Only the historical DockerContainerUnhealthy playbook shape is accepted:
    ``ssh {host} 'docker inspect {container} ... && docker restart {container}'``.

    Returns:
        A route for the ``ssh_docker_restart`` tool with ``required_scope="write"``,
        or ``None`` when the command is not an ssh/docker-restart command,
        matches the unsafe-pattern regex, has no resolvable host, has no safe
        container name, or does not end in a plain ``docker restart`` of that
        container. ``None`` leaves the step to the caller's other handling.
    """
    raw_command = (command or "").strip()
    lowered = raw_command.lower()

    looks_like_ssh_restart = lowered.startswith("ssh ") and "docker restart" in lowered
    if not looks_like_ssh_restart:
        return None

    # Reject shell metacharacters / substitutions before resolving anything.
    if _UNSAFE_LEGACY_WRITE_PATTERN.search(raw_command):
        return None

    host = self._resolve_ssh_host_for_incident(incident, raw_command)
    if not host:
        return None

    # Prefer the incident-derived name; fall back to the literal in the command.
    container_name = self._resolve_container_name_for_incident(incident, raw_command)
    if not container_name:
        container_name = self._extract_docker_restart_container(raw_command)

    name_is_safe = self._is_safe_docker_container_name(container_name)
    if not name_is_safe or not self._has_simple_docker_restart_tail(
        raw_command, container_name
    ):
        return None

    route_params = {
        "host": host,
        "container_name": container_name,
        "trust_score": 0.85,
    }
    return _SshMcpRoute(
        tool_name="ssh_docker_restart",
        params=route_params,
        required_scope="write",
    )
def preview_read_only_ssh_mcp_route(
self,
incident: Incident,
@@ -1077,11 +1131,35 @@ class AutoRepairService:
return value
match = re.search(
r"docker\s+(?:stats\s+--no-stream|inspect|logs|top|ps\s+-a\s+--filter\s+name=)\s+([a-zA-Z0-9._-]+)",
r"docker\s+(?:stats\s+--no-stream|inspect|logs|top|restart|ps\s+-a\s+--filter\s+name=)\s+([a-zA-Z0-9._-]+)",
command,
)
return match.group(1) if match else ""
@staticmethod
def _extract_docker_restart_container(command: str) -> str:
match = re.search(
r"docker\s+restart\s+([A-Za-z0-9][A-Za-z0-9_.-]{0,127})(?:\s*['\"])?\s*$",
command,
flags=re.IGNORECASE,
)
return match.group(1) if match else ""
@staticmethod
def _is_safe_docker_container_name(container_name: str) -> bool:
    """Tell whether ``container_name`` is non-empty and fully matches the safe-name regex."""
    if not container_name:
        return False
    return _SAFE_DOCKER_CONTAINER_RE.fullmatch(container_name) is not None
@staticmethod
def _has_simple_docker_restart_tail(command: str, container_name: str) -> bool:
target = rf"(?:{re.escape(container_name)}|\{{container\}}|\{{target\}})"
return bool(
re.search(
rf"docker\s+restart\s+{target}(?:\s*['\"])?\s*$",
command,
flags=re.IGNORECASE,
)
)
@staticmethod
def _incident_labels(incident: Incident) -> dict[str, Any]:
for signal in incident.signals or []:
@@ -1094,6 +1172,8 @@ class AutoRepairService:
self,
incident: Incident,
route: _SshMcpRoute,
*,
approved: bool = True,
) -> str:
"""Execute a routed SSH diagnostic through AwoooP MCP Gateway."""
@@ -1103,22 +1183,38 @@ class AutoRepairService:
from src.services.mcp_audit_context import with_mcp_audit_context
incident_id = incident.incident_id
run_id: UUID | None = None
if route.required_scope != "read":
if not approved:
return (
f"FAILED: mcp:{route.tool_name} approval required for "
f"{route.required_scope} scope"
)
run_id = self._mcp_auto_repair_run_id(incident_id, route)
approval_projected = await self._project_auto_repair_mcp_approval(
route=route,
run_id=run_id,
)
if not approval_projected:
return f"FAILED: mcp:{route.tool_name} approval projection failed"
params = with_mcp_audit_context(
route.params,
session_id=f"incident:{incident_id}:auto_repair_execute",
incident_id=incident_id,
flywheel_node="execute",
agent_role="auto_repair_executor",
agent_role=_AUTO_REPAIR_GATEWAY_AGENT_ID,
)
async with get_db_context("awoooi") as db:
ctx = GatewayContext(
project_id="awoooi",
agent_id="auto_repair_executor",
project_id=_AUTO_REPAIR_GATEWAY_PROJECT_ID,
agent_id=_AUTO_REPAIR_GATEWAY_AGENT_ID,
tool_name=route.tool_name,
run_id=run_id,
trace_id=incident_id,
is_shadow=False,
environment={"env": "prod"},
required_scope="read",
required_scope=route.required_scope,
)
result = await McpGateway(db).call(ctx, params)
except McpGatewayError as exc:
@@ -1137,6 +1233,55 @@ class AutoRepairService:
return f"SUCCESS: mcp:{route.tool_name} {preview}".strip()
return f"FAILED: mcp:{route.tool_name} {result.error or 'execution failed'}"
@staticmethod
def _mcp_auto_repair_run_id(incident_id: str, route: _SshMcpRoute) -> UUID:
    """Derive a deterministic run id for an auto-repair MCP call.

    The id is a name-based UUID (uuid5, URL namespace) over the project id,
    agent id, tool name, incident id and the route's ``host`` /
    ``container_name`` params, so identical inputs always give the same id.
    """
    target_host = route.params.get('host', '')
    target_container = route.params.get('container_name', '')
    stable_payload = "{}:{}:{}:{}:{}:{}".format(
        _AUTO_REPAIR_GATEWAY_PROJECT_ID,
        _AUTO_REPAIR_GATEWAY_AGENT_ID,
        route.tool_name,
        incident_id,
        target_host,
        target_container,
    )
    return uuid5(NAMESPACE_URL, stable_payload)
async def _project_auto_repair_mcp_approval(
    self,
    route: _SshMcpRoute,
    run_id: UUID,
) -> bool:
    """Write the short-lived Gate 5 approval key for a policy-approved repair.

    Stores ``approved:auto_repair_policy`` in Redis under the
    ``mcp_approval:<project>:<agent>:<tool>:<run_id>`` key with the module's
    approval TTL.

    Returns:
        ``True`` once the key has been written; ``False`` if anything raised
        (the failure is logged as a warning rather than propagated).
    """
    try:
        from src.core.redis_client import get_redis

        client = get_redis()
        key_prefix = (
            f"mcp_approval:{_AUTO_REPAIR_GATEWAY_PROJECT_ID}:"
            f"{_AUTO_REPAIR_GATEWAY_AGENT_ID}"
        )
        approval_key = f"{key_prefix}:{route.tool_name}:{run_id}"
        await client.set(
            approval_key,
            "approved:auto_repair_policy",
            ex=_AUTO_REPAIR_GATEWAY_APPROVAL_TTL_SECONDS,
        )
    except Exception as exc:
        logger.warning(
            "auto_repair_mcp_approval_projection_failed",
            tool=route.tool_name,
            required_scope=route.required_scope,
            run_id=str(run_id),
            error=str(exc),
        )
        return False
    else:
        # Logging stays guarded exactly as before: a failure here is still
        # reported as a projection failure.
        try:
            logger.info(
                "auto_repair_mcp_approval_projected",
                tool=route.tool_name,
                required_scope=route.required_scope,
                run_id=str(run_id),
            )
        except Exception as exc:
            logger.warning(
                "auto_repair_mcp_approval_projection_failed",
                tool=route.tool_name,
                required_scope=route.required_scope,
                run_id=str(run_id),
                error=str(exc),
            )
            return False
        return True
async def _execute_step(self, incident: Incident, step) -> str:
"""
執行單一修復步驟
@@ -1172,9 +1317,17 @@ class AutoRepairService:
if route is not None:
return await self._execute_ssh_mcp_route(incident, route)
approved = not getattr(step, "requires_approval", False)
route = self._route_legacy_ssh_write_command_to_mcp(incident, step.command)
if route is not None:
return await self._execute_ssh_mcp_route(
incident,
route,
approved=approved,
)
from src.services.host_repair_agent import HostRepairAgent
agent = HostRepairAgent()
approved = not getattr(step, "requires_approval", False)
result = await agent.repair_by_uri(step.command, approved=approved)
if result.success:
return f"SUCCESS: {result.output}"

View File

@@ -459,6 +459,44 @@ class TestAutoRepairService:
assert route is None
def test_legacy_ssh_docker_restart_routes_to_write_mcp_gateway(self, service):
"""Safe legacy Docker restart steps use the governed write MCP tool."""
incident = create_test_incident(
severity=Severity.P2,
alert_category="infrastructure",
alert_name="DockerContainerUnhealthy",
)
incident.signals[0].labels.update({
"host": "110",
"container_name": "minio",
})
route = service._route_legacy_ssh_write_command_to_mcp(
incident,
'ssh {host} \'docker inspect {container} --format="{{.State.Health.Status}}" && docker restart {container}\'',
)
assert route is not None
assert route.tool_name == "ssh_docker_restart"
assert route.required_scope == "write"
assert route.params == {
"host": "192.168.0.110",
"container_name": "minio",
"trust_score": 0.85,
}
def test_legacy_ssh_complex_restart_stays_blocked(self, service):
"""Command substitution/fallback shell must not enter auto write MCP."""
incident = create_test_incident(severity=Severity.P2)
incident.signals[0].labels.update({"host": "110", "container_name": "node-exporter"})
route = service._route_legacy_ssh_write_command_to_mcp(
incident,
'ssh {host} \'docker restart $(docker ps -a --filter name=exporter --format "{{.Names}}" | head -1) || systemctl restart node_exporter\'',
)
assert route is None
@pytest.mark.asyncio
async def test_execute_legacy_ssh_diagnostic_uses_mcp_gateway(
self,
@@ -508,6 +546,71 @@ class TestAutoRepairService:
assert calls[0]["params"]["container_name"] == "momo-scheduler"
assert calls[0]["params"]["_mcp_audit"]["flywheel_node"] == "execute"
@pytest.mark.asyncio
async def test_execute_legacy_ssh_docker_restart_uses_write_mcp_gateway(
self,
service,
monkeypatch,
):
incident = create_test_incident(
severity=Severity.P2,
alert_category="infrastructure",
alert_name="DockerContainerUnhealthy",
)
incident.signals[0].labels.update({
"host": "110",
"container_name": "minio",
})
step = RepairStep(
step_number=1,
action_type=ActionType.SSH_COMMAND,
command='ssh {host} \'docker inspect {container} --format="{{.State.Health.Status}}" && docker restart {container}\'',
risk_level=RiskLevel.MEDIUM,
requires_approval=False,
)
calls = []
redis_sets = []
class FakeRedis:
async def set(self, key, value, ex=None):
redis_sets.append({"key": key, "value": value, "ex": ex})
return True
class FakeGateway:
def __init__(self, db: object) -> None:
self.db = db
async def call(self, ctx, params):
calls.append({"ctx": ctx, "params": params, "db": self.db})
return MCPToolResult(
success=True,
execution_id="gw-write-ok",
output={"stdout": "minio"},
)
monkeypatch.setattr("src.core.redis_client.get_redis", lambda: FakeRedis())
monkeypatch.setattr("src.db.base.get_db_context", lambda _project_id: _DbContext())
monkeypatch.setattr("src.plugins.mcp.gateway.McpGateway", FakeGateway)
result = await service._execute_step(incident, step)
assert result.startswith("SUCCESS: mcp:ssh_docker_restart")
assert calls
assert redis_sets
assert calls[0]["ctx"].agent_id == "auto_repair_executor"
assert calls[0]["ctx"].tool_name == "ssh_docker_restart"
assert calls[0]["ctx"].required_scope == "write"
assert calls[0]["ctx"].is_shadow is False
assert calls[0]["ctx"].run_id is not None
assert calls[0]["params"]["host"] == "192.168.0.110"
assert calls[0]["params"]["container_name"] == "minio"
assert calls[0]["params"]["trust_score"] == 0.85
assert calls[0]["params"]["_mcp_audit"]["flywheel_node"] == "execute"
assert redis_sets[0]["key"].startswith(
"mcp_approval:awoooi:auto_repair_executor:ssh_docker_restart:"
)
assert redis_sets[0]["value"] == "approved:auto_repair_policy"
@pytest.mark.asyncio
async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service):
"""Test that LOW risk actions are allowed"""

View File

@@ -1,3 +1,48 @@
## 2026-06-01|AI 自健診 W-1 SLO:auto_execute_success_rate 修復
**背景**
- Telegram 出現 `META-20260601130436`「飛輪核心異常」,W-1 SLO 違反 `auto_execute_success_rate`。
- production `/api/v1/ai/slo` 顯示 7 日自動執行成功率約 `83.3%`,低於 `85%` 門檻;recent `auto_repair_executions` 指向 `PB-20260420-3F9C4C` 反覆失敗。
**根因**
- `Docker 容器 healthcheck 失敗` PlayBook 使用 legacy 指令:
`ssh {host} 'docker inspect {container} --format="{{.State.Health.Status}}" && docker restart {container}'`
- 只讀診斷已能走 `auto_repair_executor/ssh_diagnose/read` MCP Gateway,但此寫入指令沒有安全轉譯,落到 `HostRepairAgent.repair_by_uri()` 後被拒絕為 `unsupported scheme`。
- production MCP grant 顯示 `auto_repair_executor` 只有 `ssh_diagnose/read`;`ssh_docker_restart/write` 僅授權 `approval_executor`,與 PlayBook `requires_approval=false` 的低/中風險自動修復意圖不一致。
**本次調整**
- `apps/api/src/services/auto_repair_service.py`
- 新增安全 legacy SSH write 路由:只允許簡單 `docker restart <container>` 或 `{container}` placeholder,且必須能解析出安全 container name。
- 將該路由轉進 AwoooP MCP Gateway `ssh_docker_restart/write`,注入 `trust_score=0.85`、MCP audit context、`auto_repair_executor` agent 與 deterministic run id。
- 對 write scope 先投影短效 Gate 5 key `approved:auto_repair_policy`;`requires_approval=true` 時仍 fail-closed,不會繞過人工審批。
- 保留 read-only `ssh_diagnose` 原路徑;含 command substitution、pipe、fallback shell、systemd/prune 的複雜命令仍拒絕自動寫入。
- `apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01.sql`
- 將 `auto_repair_executor` agent contract 升到 v1.1。
- 僅新增 `ssh_docker_restart/write` grant,邊界寫入 contract:只允許安全 Docker container restart,其他寫入工具仍不授權。
- `apps/api/tests/test_auto_repair_service.py`
- 補上 write MCP route、複雜 shell 封鎖與 Gate 5 approval projection 測試。
**驗證**
```text
PYTHONPATH=apps/api python -m py_compile apps/api/src/services/auto_repair_service.py
DATABASE_URL=sqlite+aiosqlite:///:memory: PYTHONPATH=apps/api python -m pytest apps/api/tests/test_auto_repair_service.py -q
→ 26 passed
DATABASE_URL=sqlite+aiosqlite:///:memory: PYTHONPATH=apps/api python -m pytest apps/api/tests/test_auto_repair_service.py apps/api/tests/test_approval_execution_mcp_audit.py -q
→ 29 passed
production owner-fallback rollback check:
apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01.sql
→ migration rollback syntax ok with owner fallback
```
**進度邊界**
- 本輪修的是 W-1 的已知 executor gap,避免後續同一 PlayBook 再產生 `unsupported_action_scheme`。
- 歷史 7 日 SLO 會等舊失敗列被新成功列稀釋或滾出視窗才完全回綠;部署後需監看下一輪 `auto_repair_executions` 是否出現 `SUCCESS: mcp:ssh_docker_restart` 與 AwoooP MCP Gateway audit。
## 2026-06-01|IwoooS 決策跑道
**背景**