fix(api): route auto repair docker restart through mcp
Some checks failed
CD Pipeline / tests (push) Successful in 1m21s
Code Review / ai-code-review (push) Successful in 13s
run-migration / migrate (push) Successful in 9s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / tests (push) Successful in 1m21s
Code Review / ai-code-review (push) Successful in 13s
run-migration / migrate (push) Successful in 9s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -0,0 +1,159 @@
|
||||
-- T24: grant the auto-repair executor the Docker restart tool on the MCP Gateway.
-- Purpose: container restarts that a PlayBook already marked as
--   requires_approval=false are executed and audited through the
--   AwoooP MCP Gateway + Gate 5 policy projection.
-- Boundary: only ssh_docker_restart/write is granted; complex shell,
--   systemctl and prune must still never be executed automatically.

SELECT set_config('app.project_id', 'awoooi', FALSE);

WITH
-- Body of the v1.1 agent contract; also the input of body_hash below.
contract_body AS (
    SELECT jsonb_build_object(
        'schema_version', 'awooop_agent_contract_v1',
        'agent_id', 'auto_repair_executor',
        'display_name', 'Auto Repair Executor',
        'project_id', 'awoooi',
        'purpose', 'Auto repair diagnostics and safe Docker container restart through AwoooP MCP Gateway',
        'allowed_scopes', jsonb_build_array('read', 'write'),
        'requires_gate5_for_scopes', jsonb_build_array('write'),
        'write_scope_constraints', jsonb_build_object(
            'allowed_tools', jsonb_build_array('ssh_docker_restart'),
            'required_playbook_requires_approval', false,
            'required_trust_score_min', 0.8,
            'forbidden_shell_patterns', jsonb_build_array('command_substitution', 'pipe', 'fallback_shell', 'systemd', 'prune')
        ),
        'stage', 't24_auto_repair_docker_restart_gateway'
    ) AS body_json
),
-- Insert revision 1.1; yields no row when that version already exists.
new_revision AS (
    INSERT INTO awooop_contract_revisions (
        project_id,
        contract_family,
        contract_id,
        version_major,
        version_minor,
        lifecycle_status,
        body_json,
        body_hash,
        body_schema_version,
        publisher_id,
        published_at
    )
    SELECT
        'awoooi',
        'agent',
        'auto_repair_executor',
        1,
        1,
        'active',
        cb.body_json,
        encode(digest(cb.body_json::text, 'sha256'), 'hex'),
        'v1.1',
        'migration:t24_auto_repair_docker_restart_gateway',
        NOW()
    FROM contract_body AS cb
    ON CONFLICT (project_id, contract_family, contract_id, version_major, version_minor)
    DO NOTHING
    RETURNING revision_id, project_id, contract_family, contract_id
),
-- Revision the active pointer should reference: an already stored active
-- v1.1 row (re-run case) or the row inserted just above (first run).
target_revision AS (
    SELECT revision_id, project_id, contract_family, contract_id
    FROM awooop_contract_revisions
    WHERE project_id = 'awoooi'
      AND contract_family = 'agent'
      AND contract_id = 'auto_repair_executor'
      AND version_major = 1
      AND version_minor = 1
      AND lifecycle_status = 'active'
    UNION ALL
    SELECT revision_id, project_id, contract_family, contract_id
    FROM new_revision
),
-- Move (or create) the active-revision pointer for this contract.
moved_pointer AS (
    INSERT INTO awooop_active_revisions (
        project_id,
        contract_family,
        contract_id,
        active_revision_id,
        updated_at
    )
    SELECT DISTINCT ON (project_id, contract_family, contract_id)
        project_id,
        contract_family,
        contract_id,
        revision_id,
        NOW()
    FROM target_revision
    ORDER BY project_id, contract_family, contract_id, revision_id
    ON CONFLICT (project_id, contract_family, contract_id)
    DO UPDATE SET
        active_revision_id = EXCLUDED.active_revision_id,
        updated_at = NOW()
    RETURNING contract_id
),
-- Register (or refresh) the ssh_docker_restart tool with write scope only.
registered_tool AS (
    INSERT INTO awooop_mcp_tool_registry (
        project_id,
        tool_name,
        tool_type,
        description,
        allowed_scopes,
        environment_tags,
        is_active,
        updated_at
    )
    VALUES (
        'awoooi',
        'ssh_docker_restart',
        'mcp_server',
        'Policy-approved Docker container restart over SSH for auto-repair',
        CAST('["write"]' AS jsonb),
        CAST('{"env": "prod"}' AS jsonb),
        TRUE,
        NOW()
    )
    ON CONFLICT (project_id, tool_name)
    DO UPDATE SET
        description = EXCLUDED.description,
        allowed_scopes = EXCLUDED.allowed_scopes,
        environment_tags = EXCLUDED.environment_tags,
        is_active = TRUE,
        updated_at = NOW()
    RETURNING tool_id, allowed_scopes
),
-- Grant the tool to auto_repair_executor; an existing grant for the same
-- (project, agent, tool) is reset to non-revoked with no expiry.
issued_grant AS (
    INSERT INTO awooop_mcp_grants (
        project_id,
        agent_id,
        tool_id,
        granted_by,
        granted_scopes,
        expires_at,
        is_revoked,
        revoked_at,
        revoked_by
    )
    SELECT
        'awoooi',
        'auto_repair_executor',
        t.tool_id,
        'migration:t24_auto_repair_docker_restart_gateway',
        t.allowed_scopes,
        NULL,
        FALSE,
        NULL,
        NULL
    FROM registered_tool AS t
    ON CONFLICT (project_id, agent_id, tool_id)
    DO UPDATE SET
        granted_by = EXCLUDED.granted_by,
        granted_scopes = EXCLUDED.granted_scopes,
        expires_at = NULL,
        is_revoked = FALSE,
        revoked_at = NULL,
        revoked_by = NULL
    RETURNING grant_id
)
-- One summary row reporting how many rows each write step touched.
SELECT
    'auto_repair_executor_docker_restart_gateway',
    (SELECT count(*) FROM moved_pointer) AS active_contract_rows,
    (SELECT count(*) FROM registered_tool) AS tool_rows,
    (SELECT count(*) FROM issued_grant) AS grant_rows;
|
||||
@@ -0,0 +1,37 @@
|
||||
-- Rollback T24: revoke the auto_repair_executor Docker restart write grant
-- and point the agent contract back at its v1.0 revision.

SELECT set_config('app.project_id', 'awoooi', FALSE);

-- Step 1: revoke the grant that the T24 migration issued.
-- Rows that are already revoked are left untouched so that re-running this
-- script (or running it after a manual revocation) does not overwrite the
-- original revoked_at / revoked_by audit values.
UPDATE awooop_mcp_grants
SET is_revoked = TRUE,
    revoked_at = NOW(),
    revoked_by = 'rollback:t24_auto_repair_docker_restart_gateway'
WHERE project_id = 'awoooi'
  AND agent_id = 'auto_repair_executor'
  AND granted_by = 'migration:t24_auto_repair_docker_restart_gateway'
  AND is_revoked IS NOT TRUE;

-- Step 2: repoint the active-revision pointer at the newest active v1.0
-- revision. When no such revision exists the SELECT yields no row and the
-- pointer is left as it is.
WITH previous_revision AS (
    SELECT revision_id, project_id, contract_family, contract_id
    FROM awooop_contract_revisions
    WHERE project_id = 'awoooi'
      AND contract_family = 'agent'
      AND contract_id = 'auto_repair_executor'
      AND version_major = 1
      AND version_minor = 0
      AND lifecycle_status = 'active'
    ORDER BY revision_id DESC
    LIMIT 1
)
INSERT INTO awooop_active_revisions (
    project_id,
    contract_family,
    contract_id,
    active_revision_id,
    updated_at
)
SELECT project_id, contract_family, contract_id, revision_id, NOW()
FROM previous_revision
ON CONFLICT (project_id, contract_family, contract_id)
DO UPDATE SET
    active_revision_id = EXCLUDED.active_revision_id,
    updated_at = NOW();
|
||||
@@ -26,6 +26,7 @@ from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
import re
|
||||
from typing import Any, Protocol
|
||||
from uuid import NAMESPACE_URL, UUID, uuid5
|
||||
|
||||
import structlog
|
||||
|
||||
@@ -88,6 +89,7 @@ class _SshMcpRoute:
|
||||
|
||||
tool_name: str
|
||||
params: dict[str, Any]
|
||||
required_scope: str = "read"
|
||||
|
||||
|
||||
_SHORT_HOST_MAP: dict[str, str] = {
|
||||
@@ -130,6 +132,12 @@ _SSH_WRITE_KEYWORDS = (
|
||||
"bash ",
|
||||
)
|
||||
|
||||
_AUTO_REPAIR_GATEWAY_AGENT_ID = "auto_repair_executor"
|
||||
_AUTO_REPAIR_GATEWAY_PROJECT_ID = "awoooi"
|
||||
_AUTO_REPAIR_GATEWAY_APPROVAL_TTL_SECONDS = 600
|
||||
_SAFE_DOCKER_CONTAINER_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_.-]{0,127}$")
|
||||
_UNSAFE_LEGACY_WRITE_PATTERN = re.compile(r"[;|<>`\n]|(\$\{|\$\()")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Auto Repair Service Interface
|
||||
@@ -1003,6 +1011,52 @@ class AutoRepairService:
|
||||
|
||||
return _SshMcpRoute(tool_name="ssh_diagnose", params=params)
|
||||
|
||||
def _route_legacy_ssh_write_command_to_mcp(
    self,
    incident: Incident,
    command: str,
) -> _SshMcpRoute | None:
    """Translate a simple legacy ``ssh ... docker restart`` step into an MCP route.

    Only commands that start with ``ssh `` and contain ``docker restart`` are
    considered. The command is rejected (``None``) when it matches the unsafe
    shell pattern, when no host can be resolved, when no safe container name
    can be determined, or when the command does not end in a plain
    ``docker restart <container>``.

    Args:
        incident: Incident the step belongs to; passed to the host and
            container resolvers.
        command: Raw legacy step command; ``None``/empty is treated as "".

    Returns:
        A write-scoped ``ssh_docker_restart`` route carrying ``host``,
        ``container_name`` and ``trust_score``, or ``None`` when the command
        must not take this route.
    """
    stripped = (command or "").strip()
    folded = stripped.lower()

    looks_like_ssh_restart = folded.startswith("ssh ") and "docker restart" in folded
    if not looks_like_ssh_restart:
        return None
    if _UNSAFE_LEGACY_WRITE_PATTERN.search(stripped) is not None:
        return None

    target_host = self._resolve_ssh_host_for_incident(incident, stripped)
    if not target_host:
        return None

    # Prefer the incident-aware resolver; fall back to the command tail.
    container = self._resolve_container_name_for_incident(incident, stripped)
    if not container:
        container = self._extract_docker_restart_container(stripped)

    if not self._is_safe_docker_container_name(container):
        return None
    if not self._has_simple_docker_restart_tail(stripped, container):
        return None

    route_params = {
        "host": target_host,
        "container_name": container,
        "trust_score": 0.85,
    }
    return _SshMcpRoute(
        tool_name="ssh_docker_restart",
        params=route_params,
        required_scope="write",
    )
|
||||
|
||||
def preview_read_only_ssh_mcp_route(
|
||||
self,
|
||||
incident: Incident,
|
||||
@@ -1077,11 +1131,35 @@ class AutoRepairService:
|
||||
return value
|
||||
|
||||
match = re.search(
|
||||
r"docker\s+(?:stats\s+--no-stream|inspect|logs|top|ps\s+-a\s+--filter\s+name=)\s+([a-zA-Z0-9._-]+)",
|
||||
r"docker\s+(?:stats\s+--no-stream|inspect|logs|top|restart|ps\s+-a\s+--filter\s+name=)\s+([a-zA-Z0-9._-]+)",
|
||||
command,
|
||||
)
|
||||
return match.group(1) if match else ""
|
||||
|
||||
@staticmethod
|
||||
def _extract_docker_restart_container(command: str) -> str:
|
||||
match = re.search(
|
||||
r"docker\s+restart\s+([A-Za-z0-9][A-Za-z0-9_.-]{0,127})(?:\s*['\"])?\s*$",
|
||||
command,
|
||||
flags=re.IGNORECASE,
|
||||
)
|
||||
return match.group(1) if match else ""
|
||||
|
||||
@staticmethod
def _is_safe_docker_container_name(container_name: str) -> bool:
    """Tell whether ``container_name`` is non-empty and fully matches
    ``_SAFE_DOCKER_CONTAINER_RE``."""
    if not container_name:
        return False
    return _SAFE_DOCKER_CONTAINER_RE.fullmatch(container_name) is not None
|
||||
|
||||
@staticmethod
|
||||
def _has_simple_docker_restart_tail(command: str, container_name: str) -> bool:
|
||||
target = rf"(?:{re.escape(container_name)}|\{{container\}}|\{{target\}})"
|
||||
return bool(
|
||||
re.search(
|
||||
rf"docker\s+restart\s+{target}(?:\s*['\"])?\s*$",
|
||||
command,
|
||||
flags=re.IGNORECASE,
|
||||
)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _incident_labels(incident: Incident) -> dict[str, Any]:
|
||||
for signal in incident.signals or []:
|
||||
@@ -1094,6 +1172,8 @@ class AutoRepairService:
|
||||
self,
|
||||
incident: Incident,
|
||||
route: _SshMcpRoute,
|
||||
*,
|
||||
approved: bool = True,
|
||||
) -> str:
|
||||
"""Execute a routed SSH diagnostic through AwoooP MCP Gateway."""
|
||||
|
||||
@@ -1103,22 +1183,38 @@ class AutoRepairService:
|
||||
from src.services.mcp_audit_context import with_mcp_audit_context
|
||||
|
||||
incident_id = incident.incident_id
|
||||
run_id: UUID | None = None
|
||||
if route.required_scope != "read":
|
||||
if not approved:
|
||||
return (
|
||||
f"FAILED: mcp:{route.tool_name} approval required for "
|
||||
f"{route.required_scope} scope"
|
||||
)
|
||||
run_id = self._mcp_auto_repair_run_id(incident_id, route)
|
||||
approval_projected = await self._project_auto_repair_mcp_approval(
|
||||
route=route,
|
||||
run_id=run_id,
|
||||
)
|
||||
if not approval_projected:
|
||||
return f"FAILED: mcp:{route.tool_name} approval projection failed"
|
||||
|
||||
params = with_mcp_audit_context(
|
||||
route.params,
|
||||
session_id=f"incident:{incident_id}:auto_repair_execute",
|
||||
incident_id=incident_id,
|
||||
flywheel_node="execute",
|
||||
agent_role="auto_repair_executor",
|
||||
agent_role=_AUTO_REPAIR_GATEWAY_AGENT_ID,
|
||||
)
|
||||
async with get_db_context("awoooi") as db:
|
||||
ctx = GatewayContext(
|
||||
project_id="awoooi",
|
||||
agent_id="auto_repair_executor",
|
||||
project_id=_AUTO_REPAIR_GATEWAY_PROJECT_ID,
|
||||
agent_id=_AUTO_REPAIR_GATEWAY_AGENT_ID,
|
||||
tool_name=route.tool_name,
|
||||
run_id=run_id,
|
||||
trace_id=incident_id,
|
||||
is_shadow=False,
|
||||
environment={"env": "prod"},
|
||||
required_scope="read",
|
||||
required_scope=route.required_scope,
|
||||
)
|
||||
result = await McpGateway(db).call(ctx, params)
|
||||
except McpGatewayError as exc:
|
||||
@@ -1137,6 +1233,55 @@ class AutoRepairService:
|
||||
return f"SUCCESS: mcp:{route.tool_name} {preview}".strip()
|
||||
return f"FAILED: mcp:{route.tool_name} {result.error or 'execution failed'}"
|
||||
|
||||
@staticmethod
def _mcp_auto_repair_run_id(incident_id: str, route: _SshMcpRoute) -> UUID:
    """Derive a deterministic run id for an auto-repair gateway call.

    The id is a UUIDv5 (URL namespace) over the gateway project id, the
    gateway agent id, the tool name, the incident id and the route's
    ``host`` / ``container_name`` params (missing params count as ""), so
    the same inputs always produce the same id.
    """
    stable_payload = "{}:{}:{}:{}:{}:{}".format(
        _AUTO_REPAIR_GATEWAY_PROJECT_ID,
        _AUTO_REPAIR_GATEWAY_AGENT_ID,
        route.tool_name,
        incident_id,
        route.params.get("host", ""),
        route.params.get("container_name", ""),
    )
    return uuid5(NAMESPACE_URL, stable_payload)
|
||||
|
||||
async def _project_auto_repair_mcp_approval(
    self,
    route: _SshMcpRoute,
    run_id: UUID,
) -> bool:
    """Write the short-lived approval key for a policy-approved auto repair.

    Sets the Redis key
    ``mcp_approval:<project>:<agent>:<tool>:<run_id>`` to
    ``approved:auto_repair_policy`` with a TTL of
    ``_AUTO_REPAIR_GATEWAY_APPROVAL_TTL_SECONDS``.

    Args:
        route: Route whose tool name is part of the key.
        run_id: Run id that is part of the key.

    Returns:
        True once the key has been written; False when anything in the
        projection raised (the failure is logged as a warning, not re-raised).
    """
    log_fields = {
        "tool": route.tool_name,
        "required_scope": route.required_scope,
        "run_id": str(run_id),
    }
    try:
        # Imported lazily; an import failure is treated like any other
        # projection failure below.
        from src.core.redis_client import get_redis

        client = get_redis()
        key = f"mcp_approval:{_AUTO_REPAIR_GATEWAY_PROJECT_ID}:{_AUTO_REPAIR_GATEWAY_AGENT_ID}:{route.tool_name}:{run_id}"
        await client.set(
            key,
            "approved:auto_repair_policy",
            ex=_AUTO_REPAIR_GATEWAY_APPROVAL_TTL_SECONDS,
        )
    except Exception as exc:
        logger.warning(
            "auto_repair_mcp_approval_projection_failed",
            **log_fields,
            error=str(exc),
        )
        return False
    logger.info("auto_repair_mcp_approval_projected", **log_fields)
    return True
|
||||
|
||||
async def _execute_step(self, incident: Incident, step) -> str:
|
||||
"""
|
||||
執行單一修復步驟
|
||||
@@ -1172,9 +1317,17 @@ class AutoRepairService:
|
||||
if route is not None:
|
||||
return await self._execute_ssh_mcp_route(incident, route)
|
||||
|
||||
approved = not getattr(step, "requires_approval", False)
|
||||
route = self._route_legacy_ssh_write_command_to_mcp(incident, step.command)
|
||||
if route is not None:
|
||||
return await self._execute_ssh_mcp_route(
|
||||
incident,
|
||||
route,
|
||||
approved=approved,
|
||||
)
|
||||
|
||||
from src.services.host_repair_agent import HostRepairAgent
|
||||
agent = HostRepairAgent()
|
||||
approved = not getattr(step, "requires_approval", False)
|
||||
result = await agent.repair_by_uri(step.command, approved=approved)
|
||||
if result.success:
|
||||
return f"SUCCESS: {result.output}"
|
||||
|
||||
@@ -459,6 +459,44 @@ class TestAutoRepairService:
|
||||
|
||||
assert route is None
|
||||
|
||||
def test_legacy_ssh_docker_restart_routes_to_write_mcp_gateway(self, service):
    """A simple legacy inspect-then-restart step maps to the write MCP tool."""
    incident = create_test_incident(
        severity=Severity.P2,
        alert_category="infrastructure",
        alert_name="DockerContainerUnhealthy",
    )
    labels = incident.signals[0].labels
    labels["host"] = "110"
    labels["container_name"] = "minio"
    legacy_command = 'ssh {host} \'docker inspect {container} --format="{{.State.Health.Status}}" && docker restart {container}\''

    route = service._route_legacy_ssh_write_command_to_mcp(incident, legacy_command)

    assert route is not None
    assert route.tool_name == "ssh_docker_restart"
    assert route.required_scope == "write"
    expected_params = {
        "host": "192.168.0.110",
        "container_name": "minio",
        "trust_score": 0.85,
    }
    assert route.params == expected_params
|
||||
|
||||
def test_legacy_ssh_complex_restart_stays_blocked(self, service):
    """Restart commands using command substitution, pipes or a shell
    fallback must not be routed to the auto write MCP tool."""
    incident = create_test_incident(severity=Severity.P2)
    labels = incident.signals[0].labels
    labels["host"] = "110"
    labels["container_name"] = "node-exporter"
    complex_command = 'ssh {host} \'docker restart $(docker ps -a --filter name=exporter --format "{{.Names}}" | head -1) || systemctl restart node_exporter\''

    route = service._route_legacy_ssh_write_command_to_mcp(incident, complex_command)

    assert route is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_legacy_ssh_diagnostic_uses_mcp_gateway(
|
||||
self,
|
||||
@@ -508,6 +546,71 @@ class TestAutoRepairService:
|
||||
assert calls[0]["params"]["container_name"] == "momo-scheduler"
|
||||
assert calls[0]["params"]["_mcp_audit"]["flywheel_node"] == "execute"
|
||||
|
||||
@pytest.mark.asyncio
async def test_execute_legacy_ssh_docker_restart_uses_write_mcp_gateway(
    self,
    service,
    monkeypatch,
):
    """Executing a safe legacy restart step projects an approval key into
    Redis and calls the MCP Gateway with write scope."""
    incident = create_test_incident(
        severity=Severity.P2,
        alert_category="infrastructure",
        alert_name="DockerContainerUnhealthy",
    )
    labels = incident.signals[0].labels
    labels["host"] = "110"
    labels["container_name"] = "minio"
    step = RepairStep(
        step_number=1,
        action_type=ActionType.SSH_COMMAND,
        command='ssh {host} \'docker inspect {container} --format="{{.State.Health.Status}}" && docker restart {container}\'',
        risk_level=RiskLevel.MEDIUM,
        requires_approval=False,
    )
    # Recorders filled in by the fakes below.
    calls = []
    redis_sets = []

    class FakeRedis:
        async def set(self, key, value, ex=None):
            redis_sets.append({"key": key, "value": value, "ex": ex})
            return True

    class FakeGateway:
        def __init__(self, db: object) -> None:
            self.db = db

        async def call(self, ctx, params):
            calls.append({"ctx": ctx, "params": params, "db": self.db})
            return MCPToolResult(
                success=True,
                execution_id="gw-write-ok",
                output={"stdout": "minio"},
            )

    monkeypatch.setattr("src.core.redis_client.get_redis", lambda: FakeRedis())
    monkeypatch.setattr("src.db.base.get_db_context", lambda _project_id: _DbContext())
    monkeypatch.setattr("src.plugins.mcp.gateway.McpGateway", FakeGateway)

    result = await service._execute_step(incident, step)

    assert result.startswith("SUCCESS: mcp:ssh_docker_restart")
    assert calls
    assert redis_sets

    gateway_ctx = calls[0]["ctx"]
    assert gateway_ctx.agent_id == "auto_repair_executor"
    assert gateway_ctx.tool_name == "ssh_docker_restart"
    assert gateway_ctx.required_scope == "write"
    assert gateway_ctx.is_shadow is False
    assert gateway_ctx.run_id is not None

    gateway_params = calls[0]["params"]
    assert gateway_params["host"] == "192.168.0.110"
    assert gateway_params["container_name"] == "minio"
    assert gateway_params["trust_score"] == 0.85
    assert gateway_params["_mcp_audit"]["flywheel_node"] == "execute"

    approval = redis_sets[0]
    assert approval["key"].startswith(
        "mcp_approval:awoooi:auto_repair_executor:ssh_docker_restart:"
    )
    assert approval["value"] == "approved:auto_repair_policy"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service):
|
||||
"""Test that LOW risk actions are allowed"""
|
||||
|
||||
@@ -1,3 +1,48 @@
|
||||
## 2026-06-01|AI 自健診 W-1 SLO:auto_execute_success_rate 修復
|
||||
|
||||
**背景**:
|
||||
|
||||
- Telegram 出現 `META-20260601130436`「飛輪核心異常」,W-1 SLO 違反 `auto_execute_success_rate`。
|
||||
- production `/api/v1/ai/slo` 顯示 7 日自動執行成功率約 `83.3%`,低於 `85%` 門檻;recent `auto_repair_executions` 指向 `PB-20260420-3F9C4C` 反覆失敗。
|
||||
|
||||
**根因**:
|
||||
|
||||
- `Docker 容器 healthcheck 失敗` PlayBook 使用 legacy 指令:
|
||||
`ssh {host} 'docker inspect {container} --format="{{.State.Health.Status}}" && docker restart {container}'`
|
||||
- 只讀診斷已能走 `auto_repair_executor/ssh_diagnose/read` MCP Gateway,但此寫入指令沒有安全轉譯,落到 `HostRepairAgent.repair_by_uri()` 後被拒絕為 `unsupported scheme`。
|
||||
- production MCP grant 顯示 `auto_repair_executor` 只有 `ssh_diagnose/read`;`ssh_docker_restart/write` 僅授權 `approval_executor`,與 PlayBook `requires_approval=false` 的低/中風險自動修復意圖不一致。
|
||||
|
||||
**本次調整**:
|
||||
|
||||
- `apps/api/src/services/auto_repair_service.py`
|
||||
- 新增安全 legacy SSH write 路由:只允許簡單 `docker restart <container>` 或 `{container}` placeholder,且必須能解析出安全 container name。
|
||||
- 將該路由轉進 AwoooP MCP Gateway `ssh_docker_restart/write`,注入 `trust_score=0.85`、MCP audit context、`auto_repair_executor` agent 與 deterministic run id。
|
||||
- 對 write scope 先投影短效 Gate 5 key `approved:auto_repair_policy`;`requires_approval=true` 時仍 fail-closed,不會繞過人工審批。
|
||||
- 保留 read-only `ssh_diagnose` 原路徑;含 command substitution、pipe、fallback shell、systemd/prune 的複雜命令仍拒絕自動寫入。
|
||||
- `apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01.sql`
|
||||
- 將 `auto_repair_executor` agent contract 升到 v1.1。
|
||||
- 僅新增 `ssh_docker_restart/write` grant,邊界寫入 contract:只允許安全 Docker container restart,其他寫入工具仍不授權。
|
||||
- `apps/api/tests/test_auto_repair_service.py`
|
||||
- 補上 write MCP route、複雜 shell 封鎖與 Gate 5 approval projection 測試。
|
||||
|
||||
**驗證**:
|
||||
|
||||
```text
|
||||
PYTHONPATH=apps/api python -m py_compile apps/api/src/services/auto_repair_service.py
|
||||
DATABASE_URL=sqlite+aiosqlite:///:memory: PYTHONPATH=apps/api python -m pytest apps/api/tests/test_auto_repair_service.py -q
|
||||
→ 26 passed
|
||||
DATABASE_URL=sqlite+aiosqlite:///:memory: PYTHONPATH=apps/api python -m pytest apps/api/tests/test_auto_repair_service.py apps/api/tests/test_approval_execution_mcp_audit.py -q
|
||||
→ 29 passed
|
||||
production owner-fallback rollback check:
|
||||
apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_docker_restart_2026-06-01.sql
|
||||
→ migration rollback syntax ok with owner fallback
|
||||
```
|
||||
|
||||
**進度邊界**:
|
||||
|
||||
- 本輪修的是 W-1 的已知 executor gap,避免後續同一 PlayBook 再產生 `unsupported_action_scheme`。
|
||||
- 歷史 7 日 SLO 會等舊失敗列被新成功列稀釋或滾出視窗才完全回綠;部署後需監看下一輪 `auto_repair_executions` 是否出現 `SUCCESS: mcp:ssh_docker_restart` 與 AwoooP MCP Gateway audit。
|
||||
|
||||
## 2026-06-01|IwoooS 決策跑道
|
||||
|
||||
**背景**:
|
||||
|
||||
Reference in New Issue
Block a user