feat(auto-repair): route ssh diagnostics through mcp gateway
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
run-migration / migrate (push) Successful in 9s
CD Pipeline / tests (push) Successful in 1m11s
CD Pipeline / build-and-deploy (push) Successful in 3m17s
CD Pipeline / post-deploy-checks (push) Successful in 1m16s
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
run-migration / migrate (push) Successful in 9s
CD Pipeline / tests (push) Successful in 1m11s
CD Pipeline / build-and-deploy (push) Successful in 3m17s
CD Pipeline / post-deploy-checks (push) Successful in 1m16s
This commit is contained in:
@@ -0,0 +1,166 @@
|
||||
-- T23: auto-repair executor read-only MCP Gateway seed
|
||||
-- 目的:讓 YAML_RULE/PlayBook 的只讀 SSH 診斷步驟經過 AwoooP MCP Gateway。
|
||||
-- 邊界:只授權 read scope;write/admin SSH 工具仍必須走 approval_executor + Gate 5。
|
||||
|
||||
SELECT set_config('app.project_id', 'awoooi', FALSE);
|
||||
|
||||
WITH agent_body AS (
|
||||
SELECT jsonb_build_object(
|
||||
'schema_version', 'awooop_agent_contract_v1',
|
||||
'agent_id', 'auto_repair_executor',
|
||||
'display_name', 'Auto Repair Executor',
|
||||
'project_id', 'awoooi',
|
||||
'purpose', 'Read-only auto-repair diagnostics through AwoooP MCP Gateway',
|
||||
'allowed_scopes', jsonb_build_array('read'),
|
||||
'forbidden_scopes', jsonb_build_array('write', 'admin'),
|
||||
'stage', 't23_auto_repair_diagnostic_gateway'
|
||||
) AS body_json
|
||||
),
|
||||
inserted_revision AS (
|
||||
INSERT INTO awooop_contract_revisions (
|
||||
project_id,
|
||||
contract_family,
|
||||
contract_id,
|
||||
version_major,
|
||||
version_minor,
|
||||
lifecycle_status,
|
||||
body_json,
|
||||
body_hash,
|
||||
body_schema_version,
|
||||
publisher_id,
|
||||
published_at
|
||||
)
|
||||
SELECT
|
||||
'awoooi',
|
||||
'agent',
|
||||
'auto_repair_executor',
|
||||
1,
|
||||
0,
|
||||
'active',
|
||||
body_json,
|
||||
encode(digest(body_json::text, 'sha256'), 'hex'),
|
||||
'v1.0',
|
||||
'migration:t23_auto_repair_executor_read_gateway',
|
||||
NOW()
|
||||
FROM agent_body
|
||||
ON CONFLICT (project_id, contract_family, contract_id, version_major, version_minor)
|
||||
DO NOTHING
|
||||
RETURNING revision_id, project_id, contract_family, contract_id
|
||||
),
|
||||
chosen_revision AS (
|
||||
SELECT revision_id, project_id, contract_family, contract_id
|
||||
FROM inserted_revision
|
||||
UNION ALL
|
||||
SELECT revision_id, project_id, contract_family, contract_id
|
||||
FROM awooop_contract_revisions
|
||||
WHERE project_id = 'awoooi'
|
||||
AND contract_family = 'agent'
|
||||
AND contract_id = 'auto_repair_executor'
|
||||
AND version_major = 1
|
||||
AND version_minor = 0
|
||||
AND lifecycle_status = 'active'
|
||||
),
|
||||
upsert_pointer AS (
|
||||
INSERT INTO awooop_active_revisions (
|
||||
project_id,
|
||||
contract_family,
|
||||
contract_id,
|
||||
active_revision_id,
|
||||
updated_at
|
||||
)
|
||||
SELECT DISTINCT ON (project_id, contract_family, contract_id)
|
||||
project_id,
|
||||
contract_family,
|
||||
contract_id,
|
||||
revision_id,
|
||||
NOW()
|
||||
FROM chosen_revision
|
||||
ORDER BY project_id, contract_family, contract_id, revision_id
|
||||
ON CONFLICT (project_id, contract_family, contract_id)
|
||||
DO UPDATE SET
|
||||
active_revision_id = EXCLUDED.active_revision_id,
|
||||
updated_at = NOW()
|
||||
RETURNING contract_id
|
||||
)
|
||||
SELECT 'auto_repair_executor_active_contracts', count(*) FROM upsert_pointer;
|
||||
|
||||
WITH read_tools(tool_name, description) AS (
|
||||
VALUES
|
||||
('ssh_diagnose', 'SSH host/container diagnosis read'),
|
||||
('ssh_get_top_processes', 'SSH top processes read'),
|
||||
('ssh_get_disk_usage', 'SSH disk usage read'),
|
||||
('ssh_get_memory_info', 'SSH memory info read'),
|
||||
('ssh_get_container_logs', 'SSH container logs read'),
|
||||
('ssh_get_container_status', 'SSH container status read'),
|
||||
('ssh_get_service_status', 'SSH service status read'),
|
||||
('ssh_check_port', 'SSH port check read'),
|
||||
('ssh_get_nginx_error_log', 'SSH nginx error log read'),
|
||||
('ssh_get_swap_info', 'SSH swap info read')
|
||||
),
|
||||
upsert_tools AS (
|
||||
INSERT INTO awooop_mcp_tool_registry (
|
||||
project_id,
|
||||
tool_name,
|
||||
tool_type,
|
||||
description,
|
||||
allowed_scopes,
|
||||
environment_tags,
|
||||
is_active,
|
||||
updated_at
|
||||
)
|
||||
SELECT
|
||||
'awoooi',
|
||||
tool_name,
|
||||
'mcp_server',
|
||||
description,
|
||||
'["read"]'::jsonb,
|
||||
'{"env": "prod"}'::jsonb,
|
||||
TRUE,
|
||||
NOW()
|
||||
FROM read_tools
|
||||
ON CONFLICT (project_id, tool_name)
|
||||
DO UPDATE SET
|
||||
description = EXCLUDED.description,
|
||||
allowed_scopes = EXCLUDED.allowed_scopes,
|
||||
environment_tags = EXCLUDED.environment_tags,
|
||||
is_active = TRUE,
|
||||
updated_at = NOW()
|
||||
RETURNING tool_id, tool_name, allowed_scopes
|
||||
),
|
||||
upsert_grants AS (
|
||||
INSERT INTO awooop_mcp_grants (
|
||||
project_id,
|
||||
agent_id,
|
||||
tool_id,
|
||||
granted_by,
|
||||
granted_scopes,
|
||||
expires_at,
|
||||
is_revoked,
|
||||
revoked_at,
|
||||
revoked_by
|
||||
)
|
||||
SELECT
|
||||
'awoooi',
|
||||
'auto_repair_executor',
|
||||
tool_id,
|
||||
'migration:t23_auto_repair_executor_read_gateway',
|
||||
allowed_scopes,
|
||||
NULL,
|
||||
FALSE,
|
||||
NULL,
|
||||
NULL
|
||||
FROM upsert_tools
|
||||
ON CONFLICT (project_id, agent_id, tool_id)
|
||||
DO UPDATE SET
|
||||
granted_by = EXCLUDED.granted_by,
|
||||
granted_scopes = EXCLUDED.granted_scopes,
|
||||
expires_at = NULL,
|
||||
is_revoked = FALSE,
|
||||
revoked_at = NULL,
|
||||
revoked_by = NULL
|
||||
RETURNING grant_id
|
||||
)
|
||||
SELECT
|
||||
'auto_repair_executor_read_gateway',
|
||||
(SELECT count(*) FROM upsert_tools) AS tool_rows,
|
||||
(SELECT count(*) FROM upsert_grants) AS grant_rows;
|
||||
@@ -0,0 +1,24 @@
|
||||
-- Rollback T23 auto-repair executor read-only MCP Gateway grant.
|
||||
|
||||
SELECT set_config('app.project_id', 'awoooi', FALSE);
|
||||
|
||||
UPDATE awooop_mcp_grants
|
||||
SET is_revoked = TRUE,
|
||||
revoked_at = NOW(),
|
||||
revoked_by = 'rollback:t23_auto_repair_executor_read_gateway'
|
||||
WHERE project_id = 'awoooi'
|
||||
AND agent_id = 'auto_repair_executor'
|
||||
AND granted_by = 'migration:t23_auto_repair_executor_read_gateway';
|
||||
|
||||
DELETE FROM awooop_active_revisions
|
||||
WHERE project_id = 'awoooi'
|
||||
AND contract_family = 'agent'
|
||||
AND contract_id = 'auto_repair_executor';
|
||||
|
||||
UPDATE awooop_contract_revisions
|
||||
SET lifecycle_status = 'retired'
|
||||
WHERE project_id = 'awoooi'
|
||||
AND contract_family = 'agent'
|
||||
AND contract_id = 'auto_repair_executor'
|
||||
AND publisher_id = 'migration:t23_auto_repair_executor_read_gateway'
|
||||
AND lifecycle_status = 'active';
|
||||
@@ -65,6 +65,12 @@ DEFAULT_HOST_USERS = {
|
||||
# AI/Web host is operated by the ollama account in the current topology.
|
||||
"192.168.0.188": "ollama",
|
||||
}
|
||||
SHORT_HOST_MAP = {
|
||||
"110": "192.168.0.110",
|
||||
"120": "192.168.0.120",
|
||||
"121": "192.168.0.121",
|
||||
"188": "192.168.0.188",
|
||||
}
|
||||
DIAG_TIMEOUT = 10 # 診斷類超時(秒)
|
||||
OP_TIMEOUT = 60 # 操作類超時(秒)
|
||||
|
||||
@@ -127,7 +133,9 @@ def _normalize_ssh_host(value: str) -> str:
|
||||
if host.count(":") == 1:
|
||||
maybe_host, maybe_port = host.rsplit(":", 1)
|
||||
if maybe_port.isdigit():
|
||||
return maybe_host
|
||||
host = maybe_host
|
||||
if host in SHORT_HOST_MAP:
|
||||
return SHORT_HOST_MAP[host]
|
||||
return host
|
||||
|
||||
|
||||
@@ -240,6 +248,10 @@ class SSHProvider(MCPToolProvider):
|
||||
),
|
||||
input_schema={"type": "object", "properties": {
|
||||
"host": {"type": "string", "description": "Target host IP"},
|
||||
"container_name": {
|
||||
"type": "string",
|
||||
"description": "Optional Docker container name for container-focused diagnostics",
|
||||
},
|
||||
}, "required": ["host"]},
|
||||
server_name=self.name,
|
||||
),
|
||||
@@ -542,12 +554,23 @@ class SSHProvider(MCPToolProvider):
|
||||
# 所有接受用戶字串的工具,必須先通過 _validate_param() 白名單驗證
|
||||
if tool_name == "ssh_diagnose":
|
||||
# 2026-04-27 Claude Sonnet 4.6: 主機告警自動診斷 — 只讀,不修改任何狀態
|
||||
return (
|
||||
command = (
|
||||
"echo '=== CPU TOP ===' && ps aux --sort=-%cpu | head -15 && "
|
||||
"echo '=== MEMORY ===' && free -h && "
|
||||
"echo '=== DISK ===' && df -h && "
|
||||
"echo '=== LOAD ===' && uptime"
|
||||
)
|
||||
container_name = params.get("container_name")
|
||||
if container_name:
|
||||
name = _validate_param("container_name", str(container_name))
|
||||
command = (
|
||||
f"{command} && "
|
||||
f"echo '=== DOCKER STATS {name} ===' && "
|
||||
f"docker stats --no-stream {name} 2>&1 && "
|
||||
f"echo '=== DOCKER INSPECT {name} ===' && "
|
||||
f"docker inspect {name} 2>&1 | head -80"
|
||||
)
|
||||
return command
|
||||
|
||||
if tool_name == "ssh_get_top_processes":
|
||||
return "ps aux --sort=-%cpu | head -15"
|
||||
|
||||
@@ -22,9 +22,10 @@ Phase 8: 自動化層實作
|
||||
- P0/P1 嚴重度 Incident 需要人工確認
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from collections.abc import Callable
|
||||
from typing import Protocol
|
||||
from dataclasses import dataclass
|
||||
import re
|
||||
from typing import Any, Protocol
|
||||
|
||||
import structlog
|
||||
|
||||
@@ -81,6 +82,55 @@ class AutoRepairResult:
|
||||
execution_time_ms: int = 0
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class _SshMcpRoute:
|
||||
"""Route a legacy SSH playbook command to a governed MCP tool."""
|
||||
|
||||
tool_name: str
|
||||
params: dict[str, Any]
|
||||
|
||||
|
||||
_SHORT_HOST_MAP: dict[str, str] = {
|
||||
"110": "192.168.0.110",
|
||||
"120": "192.168.0.120",
|
||||
"121": "192.168.0.121",
|
||||
"188": "192.168.0.188",
|
||||
}
|
||||
|
||||
_SSH_DIAGNOSTIC_KEYWORDS = (
|
||||
"ps aux",
|
||||
"docker stats",
|
||||
"docker inspect",
|
||||
"docker logs",
|
||||
"docker ps",
|
||||
"docker top",
|
||||
"df -h",
|
||||
"du -",
|
||||
"free -h",
|
||||
"journalctl",
|
||||
"systemctl show",
|
||||
"tail ",
|
||||
"top ",
|
||||
"uptime",
|
||||
)
|
||||
|
||||
_SSH_WRITE_KEYWORDS = (
|
||||
"docker restart",
|
||||
"docker start",
|
||||
"docker stop",
|
||||
"docker rm",
|
||||
"docker prune",
|
||||
"systemctl restart",
|
||||
"systemctl stop",
|
||||
"systemctl start",
|
||||
"truncate ",
|
||||
" rm ",
|
||||
"rm -",
|
||||
"certbot renew",
|
||||
"bash ",
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Auto Repair Service Interface
|
||||
# =============================================================================
|
||||
@@ -918,6 +968,152 @@ class AutoRepairService:
|
||||
# 安全降級:檢查失敗 → 保守拒絕
|
||||
return False
|
||||
|
||||
def _route_legacy_ssh_command_to_mcp(
|
||||
self,
|
||||
incident: Incident,
|
||||
command: str,
|
||||
) -> _SshMcpRoute | None:
|
||||
"""Map read-only legacy ``ssh {host} '...'`` steps to MCP Gateway.
|
||||
|
||||
YAML_RULE playbooks predate the URI executor and can contain compound
|
||||
shell diagnostics. Those commands should not bypass the newer
|
||||
scheme-based HostRepairAgent or loosen its shell safety guard; read-only
|
||||
diagnostics are instead routed to the governed SSH MCP provider.
|
||||
"""
|
||||
|
||||
raw_command = (command or "").strip()
|
||||
lowered = raw_command.lower()
|
||||
if not lowered.startswith("ssh "):
|
||||
return None
|
||||
|
||||
if any(token in lowered for token in _SSH_WRITE_KEYWORDS):
|
||||
return None
|
||||
|
||||
if not any(token in lowered for token in _SSH_DIAGNOSTIC_KEYWORDS):
|
||||
return None
|
||||
|
||||
host = self._resolve_ssh_host_for_incident(incident, raw_command)
|
||||
if not host:
|
||||
return None
|
||||
|
||||
params: dict[str, Any] = {"host": host}
|
||||
container_name = self._resolve_container_name_for_incident(incident, raw_command)
|
||||
if container_name:
|
||||
params["container_name"] = container_name
|
||||
|
||||
return _SshMcpRoute(tool_name="ssh_diagnose", params=params)
|
||||
|
||||
def _resolve_ssh_host_for_incident(self, incident: Incident, command: str) -> str:
|
||||
"""Resolve ``{host}``, short host labels, and exporter instance ports."""
|
||||
|
||||
labels = self._incident_labels(incident)
|
||||
raw_host = ""
|
||||
match = re.match(r"ssh\s+([^\s'\"]+)", command.strip(), flags=re.IGNORECASE)
|
||||
if match:
|
||||
raw_host = match.group(1)
|
||||
|
||||
if not raw_host or "{" in raw_host or "}" in raw_host:
|
||||
raw_host = (
|
||||
str(labels.get("host") or "")
|
||||
or str(labels.get("instance") or "")
|
||||
or str(labels.get("node") or "")
|
||||
or str(labels.get("exported_instance") or "")
|
||||
)
|
||||
|
||||
return self._normalize_ssh_host(raw_host)
|
||||
|
||||
@staticmethod
|
||||
def _normalize_ssh_host(raw_host: str) -> str:
|
||||
host = (raw_host or "").strip()
|
||||
if host.startswith("ssh://"):
|
||||
host = host.removeprefix("ssh://")
|
||||
if "@" in host:
|
||||
host = host.rsplit("@", 1)[1]
|
||||
if host.startswith("[") and "]" in host:
|
||||
host = host[1:host.index("]")]
|
||||
if host.count(":") == 1:
|
||||
maybe_host, maybe_port = host.rsplit(":", 1)
|
||||
if maybe_port.isdigit():
|
||||
host = maybe_host
|
||||
if host in _SHORT_HOST_MAP:
|
||||
return _SHORT_HOST_MAP[host]
|
||||
match = re.fullmatch(r"(?:node-exporter-|host-)?(110|120|121|188)", host)
|
||||
if match:
|
||||
return _SHORT_HOST_MAP[match.group(1)]
|
||||
return host
|
||||
|
||||
def _resolve_container_name_for_incident(
|
||||
self,
|
||||
incident: Incident,
|
||||
command: str,
|
||||
) -> str:
|
||||
labels = self._incident_labels(incident)
|
||||
for key in ("container_name", "container", "name"):
|
||||
value = str(labels.get(key) or "").strip()
|
||||
if value and "{" not in value and "}" not in value:
|
||||
return value
|
||||
|
||||
match = re.search(
|
||||
r"docker\s+(?:stats\s+--no-stream|inspect|logs|top|ps\s+-a\s+--filter\s+name=)\s+([a-zA-Z0-9._-]+)",
|
||||
command,
|
||||
)
|
||||
return match.group(1) if match else ""
|
||||
|
||||
@staticmethod
|
||||
def _incident_labels(incident: Incident) -> dict[str, Any]:
|
||||
for signal in incident.signals or []:
|
||||
labels = getattr(signal, "labels", None)
|
||||
if labels:
|
||||
return labels
|
||||
return {}
|
||||
|
||||
async def _execute_ssh_mcp_route(
|
||||
self,
|
||||
incident: Incident,
|
||||
route: _SshMcpRoute,
|
||||
) -> str:
|
||||
"""Execute a routed SSH diagnostic through AwoooP MCP Gateway."""
|
||||
|
||||
try:
|
||||
from src.db.base import get_db_context
|
||||
from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError
|
||||
from src.services.mcp_audit_context import with_mcp_audit_context
|
||||
|
||||
incident_id = incident.incident_id
|
||||
params = with_mcp_audit_context(
|
||||
route.params,
|
||||
session_id=f"incident:{incident_id}:auto_repair_execute",
|
||||
incident_id=incident_id,
|
||||
flywheel_node="execute",
|
||||
agent_role="auto_repair_executor",
|
||||
)
|
||||
async with get_db_context("awoooi") as db:
|
||||
ctx = GatewayContext(
|
||||
project_id="awoooi",
|
||||
agent_id="auto_repair_executor",
|
||||
tool_name=route.tool_name,
|
||||
trace_id=incident_id,
|
||||
is_shadow=False,
|
||||
environment={"env": "prod"},
|
||||
required_scope="read",
|
||||
)
|
||||
result = await McpGateway(db).call(ctx, params)
|
||||
except McpGatewayError as exc:
|
||||
return f"FAILED: mcp:{route.tool_name} {exc.error_code}: {exc}"
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"auto_repair_ssh_mcp_route_failed",
|
||||
incident_id=incident.incident_id,
|
||||
tool=route.tool_name,
|
||||
error=str(exc),
|
||||
)
|
||||
return f"FAILED: mcp:{route.tool_name} {exc}"
|
||||
|
||||
if result.success:
|
||||
preview = str(result.output or "")[:500]
|
||||
return f"SUCCESS: mcp:{route.tool_name} {preview}".strip()
|
||||
return f"FAILED: mcp:{route.tool_name} {result.error or 'execution failed'}"
|
||||
|
||||
async def _execute_step(self, incident: Incident, step) -> str:
|
||||
"""
|
||||
執行單一修復步驟
|
||||
@@ -949,6 +1145,10 @@ class AutoRepairService:
|
||||
|
||||
# 2026-04-06 Claude Code: Sprint 3 — repair_by_uri (URI scheme 路由)
|
||||
if step.action_type == ActionType.SSH_COMMAND:
|
||||
route = self._route_legacy_ssh_command_to_mcp(incident, step.command)
|
||||
if route is not None:
|
||||
return await self._execute_ssh_mcp_route(incident, route)
|
||||
|
||||
from src.services.host_repair_agent import HostRepairAgent
|
||||
agent = HostRepairAgent()
|
||||
approved = not getattr(step, "requires_approval", False)
|
||||
|
||||
@@ -34,6 +34,7 @@ MASTER §3.1 L6×D1
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import structlog
|
||||
@@ -236,6 +237,7 @@ class PostExecutionVerifier:
|
||||
"pod_name": labels.get("pod", labels.get("name", "")),
|
||||
"deployment": labels.get("deployment", ""),
|
||||
"host": labels.get("instance", "").split(":")[0] or labels.get("host", ""),
|
||||
"query": _build_prometheus_query(alertname, labels),
|
||||
}
|
||||
|
||||
async def _call_one(reg) -> tuple[str, Any]:
|
||||
@@ -501,16 +503,95 @@ def _get_incident_id(incident: "Incident") -> str:
|
||||
|
||||
def _get_alertname(incident: "Incident") -> str:
|
||||
if incident.signals:
|
||||
return incident.signals[0].labels.get("alertname", "")
|
||||
signal = incident.signals[0]
|
||||
labels = signal.labels or {}
|
||||
return labels.get("alertname", "") or getattr(signal, "alert_name", "")
|
||||
return ""
|
||||
|
||||
|
||||
def _get_labels(incident: "Incident") -> dict[str, Any]:
|
||||
if incident.signals:
|
||||
return incident.signals[0].labels
|
||||
return incident.signals[0].labels or {}
|
||||
return {}
|
||||
|
||||
|
||||
def _build_prometheus_query(alertname: str, labels: dict[str, Any]) -> str:
|
||||
"""Build a non-empty PromQL probe for post-execution metric sensors."""
|
||||
|
||||
alert = (alertname or "").lower()
|
||||
namespace = _safe_label_value(str(labels.get("namespace") or "awoooi-prod"))
|
||||
pod_name = _safe_label_value(str(labels.get("pod") or labels.get("name") or ""))
|
||||
host = _safe_label_value(str(labels.get("host") or _short_instance(labels.get("instance")) or ""))
|
||||
container = _safe_label_value(str(
|
||||
labels.get("container_name")
|
||||
or labels.get("container")
|
||||
or labels.get("name")
|
||||
or ""
|
||||
))
|
||||
|
||||
if alert.startswith("dockercontainer"):
|
||||
selector = _selector({
|
||||
"host": host,
|
||||
"container_name": container,
|
||||
})
|
||||
if "memory" in alert:
|
||||
return (
|
||||
f"docker_container_memory_usage_bytes{selector} / "
|
||||
f"docker_container_memory_limit_bytes{selector}"
|
||||
)
|
||||
if "restart" in alert:
|
||||
return (
|
||||
f"increase(docker_container_inspect_restart_count{selector}[15m]) "
|
||||
f"or increase(docker_container_restart_count{selector}[15m])"
|
||||
)
|
||||
if "cpu" in alert:
|
||||
return f"docker_container_cpu_cores{selector}"
|
||||
return f"docker_container_info{selector}"
|
||||
|
||||
if any(key in alert for key in ("host", "node")):
|
||||
instance_selector = _selector({"instance": str(labels.get("instance") or "")})
|
||||
if "memory" in alert or "oom" in alert:
|
||||
return f"node_memory_MemAvailable_bytes{instance_selector}"
|
||||
if "disk" in alert or "storage" in alert:
|
||||
return f"node_filesystem_avail_bytes{instance_selector}"
|
||||
if "load" in alert or "cpu" in alert:
|
||||
return f"node_load5{instance_selector}"
|
||||
return f"up{instance_selector}"
|
||||
|
||||
pod_filter = f',pod=~"{pod_name}.*"' if pod_name else ""
|
||||
if any(key in alert for key in ("memory", "mem", "oom")):
|
||||
return (
|
||||
f'avg(container_memory_working_set_bytes{{namespace="{namespace}"{pod_filter}}}) '
|
||||
"/ 1048576"
|
||||
)
|
||||
if any(key in alert for key in ("cpu", "load", "throttl")):
|
||||
return f'avg(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"{pod_filter}}}[5m]))'
|
||||
if any(key in alert for key in ("crash", "restart", "backoff")):
|
||||
return f'sum(increase(kube_pod_container_status_restarts_total{{namespace="{namespace}"}}[15m]))'
|
||||
if any(key in alert for key in ("http", "error", "5xx", "probe", "down", "unhealthy")):
|
||||
return "1 - avg(probe_success)"
|
||||
return f'up{{namespace="{namespace}"}}'
|
||||
|
||||
|
||||
def _selector(labels: dict[str, str]) -> str:
|
||||
parts = [f'{key}="{value}"' for key, value in labels.items() if value]
|
||||
return "{" + ",".join(parts) + "}" if parts else ""
|
||||
|
||||
|
||||
def _short_instance(instance: Any) -> str:
|
||||
raw = str(instance or "")
|
||||
if raw.count(":") == 1:
|
||||
return raw.rsplit(":", 1)[0]
|
||||
return raw
|
||||
|
||||
|
||||
def _safe_label_value(value: str) -> str:
|
||||
cleaned = (value or "").strip()
|
||||
if re.fullmatch(r"[a-zA-Z0-9_.:-]{1,128}", cleaned):
|
||||
return cleaned
|
||||
return ""
|
||||
|
||||
|
||||
async def _update_snapshot(
|
||||
snapshot: EvidenceSnapshot,
|
||||
post_state: dict[str, Any],
|
||||
|
||||
@@ -19,6 +19,7 @@ from src.models.playbook import (
|
||||
RiskLevel,
|
||||
SymptomPattern,
|
||||
)
|
||||
from src.plugins.mcp.interfaces import MCPToolResult
|
||||
from src.services.auto_repair_service import AutoRepairService
|
||||
from src.utils.timezone import now_taipei
|
||||
|
||||
@@ -120,6 +121,14 @@ async def _no_cooldown(*args, **kwargs) -> tuple[bool, str]:
|
||||
return True, "允許自動修復 (test bypass)"
|
||||
|
||||
|
||||
class _DbContext:
|
||||
async def __aenter__(self) -> object:
|
||||
return object()
|
||||
|
||||
async def __aexit__(self, *_args: object) -> None:
|
||||
return None
|
||||
|
||||
|
||||
class TestAutoRepairService:
|
||||
"""Auto Repair Service unit tests"""
|
||||
|
||||
@@ -415,6 +424,90 @@ class TestAutoRepairService:
|
||||
|
||||
assert service._should_escalate_failed_verification(incident, playbook) is False
|
||||
|
||||
def test_legacy_ssh_diagnostic_routes_to_mcp_gateway(self, service):
|
||||
"""Legacy YAML_RULE SSH diagnostics become governed read-only MCP calls."""
|
||||
incident = create_test_incident(
|
||||
severity=Severity.P2,
|
||||
alert_category="infrastructure",
|
||||
alert_name="DockerContainerMemoryLimitPressure",
|
||||
)
|
||||
incident.signals[0].labels.update({
|
||||
"host": "110",
|
||||
"container_name": "sentry-self-hosted-clickhouse-1",
|
||||
})
|
||||
|
||||
route = service._route_legacy_ssh_command_to_mcp(
|
||||
incident,
|
||||
'ssh {host} \'echo "=== LOAD ==="; uptime; docker stats --no-stream\'',
|
||||
)
|
||||
|
||||
assert route is not None
|
||||
assert route.tool_name == "ssh_diagnose"
|
||||
assert route.params == {
|
||||
"host": "192.168.0.110",
|
||||
"container_name": "sentry-self-hosted-clickhouse-1",
|
||||
}
|
||||
|
||||
def test_legacy_ssh_write_action_does_not_route_to_read_mcp(self, service):
|
||||
"""Write operations must stay out of the read-only diagnostic grant."""
|
||||
incident = create_test_incident(severity=Severity.P2)
|
||||
|
||||
route = service._route_legacy_ssh_command_to_mcp(
|
||||
incident,
|
||||
"ssh {host} 'docker restart minio'",
|
||||
)
|
||||
|
||||
assert route is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_legacy_ssh_diagnostic_uses_mcp_gateway(
|
||||
self,
|
||||
service,
|
||||
monkeypatch,
|
||||
):
|
||||
incident = create_test_incident(
|
||||
severity=Severity.P2,
|
||||
alert_category="infrastructure",
|
||||
alert_name="DockerContainerMemoryLimitPressure",
|
||||
)
|
||||
incident.signals[0].labels.update({
|
||||
"host": "110",
|
||||
"container_name": "momo-scheduler",
|
||||
})
|
||||
step = RepairStep(
|
||||
step_number=1,
|
||||
action_type=ActionType.SSH_COMMAND,
|
||||
command='ssh {host} \'echo "=== LOAD ==="; uptime; docker stats --no-stream\'',
|
||||
risk_level=RiskLevel.LOW,
|
||||
)
|
||||
calls = []
|
||||
|
||||
class FakeGateway:
|
||||
def __init__(self, db: object) -> None:
|
||||
self.db = db
|
||||
|
||||
async def call(self, ctx, params):
|
||||
calls.append({"ctx": ctx, "params": params, "db": self.db})
|
||||
return MCPToolResult(
|
||||
success=True,
|
||||
execution_id="gw-ok",
|
||||
output={"stdout": "=== CPU TOP ==="},
|
||||
)
|
||||
|
||||
monkeypatch.setattr("src.db.base.get_db_context", lambda _project_id: _DbContext())
|
||||
monkeypatch.setattr("src.plugins.mcp.gateway.McpGateway", FakeGateway)
|
||||
|
||||
result = await service._execute_step(incident, step)
|
||||
|
||||
assert result.startswith("SUCCESS: mcp:ssh_diagnose")
|
||||
assert calls
|
||||
assert calls[0]["ctx"].agent_id == "auto_repair_executor"
|
||||
assert calls[0]["ctx"].required_scope == "read"
|
||||
assert calls[0]["ctx"].is_shadow is False
|
||||
assert calls[0]["params"]["host"] == "192.168.0.110"
|
||||
assert calls[0]["params"]["container_name"] == "momo-scheduler"
|
||||
assert calls[0]["params"]["_mcp_audit"]["flywheel_node"] == "execute"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service):
|
||||
"""Test that LOW risk actions are allowed"""
|
||||
|
||||
@@ -384,6 +384,25 @@ class _FakeRegistry:
|
||||
]
|
||||
|
||||
|
||||
class _PrometheusRegistry:
|
||||
def __init__(self, provider: _CaptureProvider) -> None:
|
||||
self.provider = provider
|
||||
|
||||
def suggest_tools(self, alertname: str = "", incident_labels: dict | None = None) -> list[RegisteredTool]:
|
||||
return [
|
||||
RegisteredTool(
|
||||
tool=MCPTool(
|
||||
name="prometheus_query",
|
||||
description="",
|
||||
input_schema={},
|
||||
server_name="capture",
|
||||
),
|
||||
provider=self.provider,
|
||||
dimensions=[SensorDimension.D3_METRICS],
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
class _DbContext:
|
||||
async def __aenter__(self) -> object:
|
||||
return object()
|
||||
@@ -448,3 +467,39 @@ class TestCollectPostStateAuditContext:
|
||||
assert ctx.required_scope == "read"
|
||||
assert calls[0]["parameters"]["_mcp_audit"]["incident_id"] == "INC-TEST"
|
||||
assert calls[0]["parameters"]["_mcp_audit"]["flywheel_node"] == "verify"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_collect_post_state_sends_prometheus_query_parameter(self):
|
||||
provider = _CaptureProvider()
|
||||
verifier = PostExecutionVerifier()
|
||||
verifier._registry = _PrometheusRegistry(provider)
|
||||
incident = _stub_incident(alertname="DockerContainerMemoryLimitPressure")
|
||||
incident.signals[0].labels.update({
|
||||
"host": "110",
|
||||
"container_name": "momo-scheduler",
|
||||
})
|
||||
|
||||
await verifier._collect_post_state(incident)
|
||||
|
||||
assert provider.seen_parameters is not None
|
||||
query = provider.seen_parameters["query"]
|
||||
assert "docker_container_memory_usage_bytes" in query
|
||||
assert 'host="110"' in query
|
||||
assert 'container_name="momo-scheduler"' in query
|
||||
|
||||
|
||||
class TestPrometheusQueryBuilder:
|
||||
def test_docker_memory_alert_query_is_not_empty(self):
|
||||
query = pev_module._build_prometheus_query(
|
||||
"DockerContainerMemoryLimitPressure",
|
||||
{"host": "110", "container_name": "momo-scheduler"},
|
||||
)
|
||||
|
||||
assert "docker_container_memory_usage_bytes" in query
|
||||
assert "docker_container_memory_limit_bytes" in query
|
||||
|
||||
def test_canary_alert_uses_generic_non_empty_query(self):
|
||||
query = pev_module._build_prometheus_query("AwoooPAutoRepairCanaryT16", {})
|
||||
|
||||
assert query
|
||||
assert "up" in query
|
||||
|
||||
@@ -21,6 +21,18 @@ async def test_ssh_diagnose_is_registered_read_only_tool():
|
||||
assert "df -h" in command
|
||||
|
||||
|
||||
def test_ssh_diagnose_can_include_container_read_only_context():
|
||||
provider = SSHProvider()
|
||||
|
||||
command = provider._build_command(
|
||||
"ssh_diagnose",
|
||||
{"container_name": "sentry-self-hosted-clickhouse-1"},
|
||||
)
|
||||
|
||||
assert "docker stats --no-stream sentry-self-hosted-clickhouse-1" in command
|
||||
assert "docker inspect sentry-self-hosted-clickhouse-1" in command
|
||||
|
||||
|
||||
def test_ssh_provider_uses_ollama_user_for_188():
|
||||
provider = SSHProvider()
|
||||
|
||||
@@ -32,6 +44,8 @@ def test_ssh_provider_uses_ollama_user_for_188():
|
||||
"raw,expected",
|
||||
[
|
||||
("192.168.0.110:9100", "192.168.0.110"),
|
||||
("110:9100", "192.168.0.110"),
|
||||
("188", "192.168.0.188"),
|
||||
("wooo@192.168.0.110", "192.168.0.110"),
|
||||
("ssh://wooo@192.168.0.110:22", "192.168.0.110"),
|
||||
("192.168.0.188", "192.168.0.188"),
|
||||
|
||||
Reference in New Issue
Block a user