diff --git a/apps/api/src/core/config.py b/apps/api/src/core/config.py index 2ca78a20..89f1eee2 100644 --- a/apps/api/src/core/config.py +++ b/apps/api/src/core/config.py @@ -516,6 +516,29 @@ class Settings(BaseSettings): return [] return [repo.strip() for repo in raw.split(",") if repo.strip()] + # ========================================================================== + # MCP Phase 2b: Prometheus MCP Server (ADR-071, 2026-04-11 Claude Sonnet 4.6) + # ========================================================================== + PROMETHEUS_URL: str = Field( + default="http://192.168.0.188:9090", + description="Prometheus server URL", + ) + PROMETHEUS_MCP_ENABLED: bool = Field( + default=True, + description="啟用 Prometheus MCP Provider", + ) + + # MCP Phase 2a: SSH MCP Server (ADR-071, 2026-04-11 Claude Sonnet 4.6) + # ========================================================================== + SSH_MCP_ENABLED: bool = Field( + default=False, + description="啟用 SSH MCP Provider(需 K8s Secret ssh-mcp-key 掛載)", + ) + SSH_MCP_ALLOWED_HOSTS: str = Field( + default="192.168.0.188,192.168.0.110,192.168.0.111", + description="允許 SSH 的主機 IP 清單(逗號分隔)", + ) + # ========================================================================== # Phase 13.2: Grafana MCP Tool (#83) # ========================================================================== diff --git a/apps/api/src/plugins/mcp/providers/prometheus_provider.py b/apps/api/src/plugins/mcp/providers/prometheus_provider.py new file mode 100644 index 00000000..ad28a18d --- /dev/null +++ b/apps/api/src/plugins/mcp/providers/prometheus_provider.py @@ -0,0 +1,287 @@ +""" +Prometheus MCP Tool Provider — MCP Phase 2b +============================================= +提供三個 Prometheus 查詢工具,供 AI 情報收集使用: + + prometheus_query — 即時查詢(PromQL instant query) + prometheus_query_range — 範圍查詢(step 預設 60s) + prometheus_get_alert_history — 告警觸發歷史(最近 N 筆) + +設計原則: +- 唯讀(只呼叫 /api/v1/query 和 /api/v1/query_range) +- 所有 HTTP 錯誤回傳 
class PrometheusProvider(MCPToolProvider):
    """Read-only MCP tool provider backed by the Prometheus HTTP API.

    Tools exposed:
        prometheus_query             -- PromQL instant query
        prometheus_query_range       -- PromQL range query over the last N minutes
        prometheus_get_alert_history -- recent firing samples of one alert

    Only ``/api/v1/query`` and ``/api/v1/query_range`` are called.
    ``execute`` never raises: every failure (HTTP error, Prometheus error,
    invalid parameter) is returned as ``MCPToolResult(success=False)``.
    Settings are imported lazily inside the methods to avoid a circular
    import with the config module.
    """

    @property
    def name(self) -> str:
        """Provider name, also used as ``server_name`` of every tool."""
        return "prometheus"

    @property
    def enabled(self) -> bool:
        """Whether the provider is switched on (``PROMETHEUS_MCP_ENABLED``)."""
        from src.core.config import settings
        return settings.PROMETHEUS_MCP_ENABLED

    def _base_url(self) -> str:
        """Return ``PROMETHEUS_URL`` without a trailing slash."""
        from src.core.config import settings
        return settings.PROMETHEUS_URL.rstrip("/")

    async def list_tools(self) -> list[MCPTool]:
        """Describe the three tools and their JSON input schemas."""
        return [
            MCPTool(
                name="prometheus_query",
                description=(
                    "Execute a PromQL instant query against Prometheus. "
                    "Returns current value(s) for the given expression."
                ),
                input_schema={
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "PromQL expression, e.g. 'up{job=\"awoooi-api\"}'",
                        },
                        "time": {
                            "type": "string",
                            "description": "Evaluation timestamp (RFC3339 or Unix). Default: now.",
                        },
                    },
                    "required": ["query"],
                },
                server_name=self.name,
            ),
            MCPTool(
                name="prometheus_query_range",
                description=(
                    "Execute a PromQL range query. "
                    "Returns time-series data over the last N minutes."
                ),
                input_schema={
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "PromQL expression",
                        },
                        "window_minutes": {
                            "type": "integer",
                            "description": "Look-back window in minutes (default: 15)",
                        },
                        "step_seconds": {
                            "type": "integer",
                            "description": "Step interval in seconds (default: 60)",
                        },
                    },
                    "required": ["query"],
                },
                server_name=self.name,
            ),
            MCPTool(
                name="prometheus_get_alert_history",
                description=(
                    "Get recent firing history for a specific alertname. "
                    "Returns timestamps and labels of the last N firings."
                ),
                input_schema={
                    "type": "object",
                    "properties": {
                        "alertname": {
                            "type": "string",
                            "description": "Alert name (e.g. HostHighCpuLoad)",
                        },
                        "window_hours": {
                            "type": "integer",
                            "description": "Look-back window in hours (default: 24)",
                        },
                        "limit": {
                            "type": "integer",
                            "description": "Max number of results (default: 20)",
                        },
                    },
                    "required": ["alertname"],
                },
                server_name=self.name,
            ),
        ]

    async def execute(
        self,
        tool_name: str,
        parameters: dict[str, Any],
    ) -> MCPToolResult:
        """Run one tool and wrap its outcome in an ``MCPToolResult``.

        Any exception raised by a helper (including the ``ValueError`` for an
        unknown tool or an invalid parameter) is logged and converted into a
        failed result, so this method itself does not raise.
        """
        execution_id = str(uuid.uuid4())
        start = datetime.now(UTC)

        try:
            if tool_name == "prometheus_query":
                output = await self._instant_query(parameters)
            elif tool_name == "prometheus_query_range":
                output = await self._range_query(parameters)
            elif tool_name == "prometheus_get_alert_history":
                output = await self._alert_history(parameters)
            else:
                raise ValueError(f"Unknown tool: {tool_name}")

            duration = (datetime.now(UTC) - start).total_seconds()
            return MCPToolResult(
                success=True,
                execution_id=execution_id,
                output=output,
                duration=duration,
            )

        except Exception as e:
            # Tool boundary: report the failure to the caller instead of raising.
            duration = (datetime.now(UTC) - start).total_seconds()
            logger.warning(
                "prometheus_tool_failed",
                tool=tool_name,
                error=str(e),
            )
            return MCPToolResult(
                success=False,
                execution_id=execution_id,
                error=str(e),
                duration=duration,
            )

    async def health_check(self) -> bool:
        """Return True when ``GET /-/ready`` answers 200 within 5 seconds."""
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                resp = await client.get(f"{self._base_url()}/-/ready")
                return resp.status_code == 200
        except Exception:
            # Best-effort probe: any connection problem simply means "not healthy".
            return False

    # =========================================================================
    # Internal helpers
    # =========================================================================

    @staticmethod
    def _require_str(params: dict, key: str) -> str:
        """Return the non-empty string parameter *key* or raise ``ValueError``.

        A plain ``params[key]`` would surface as the unhelpful error text
        ``'query'`` (the ``str()`` of a ``KeyError``).
        """
        value = params.get(key)
        if not isinstance(value, str) or not value.strip():
            raise ValueError(f"Missing or empty required parameter: '{key}'")
        return value

    @staticmethod
    def _int_param(params: dict, key: str, default: int, minimum: int = 1) -> int:
        """Read an optional integer parameter, validating type and lower bound.

        ``None`` (absent or explicit null) yields *default*. Values below
        *minimum* are rejected here with a clear message rather than being
        forwarded to Prometheus as a zero/negative window or step.
        """
        raw = params.get(key)
        if raw is None:
            return default
        try:
            value = int(raw)
        except (TypeError, ValueError):
            raise ValueError(f"Parameter '{key}' must be an integer, got {raw!r}") from None
        if value < minimum:
            raise ValueError(f"Parameter '{key}' must be >= {minimum}, got {value}")
        return value

    async def _get_json(self, path: str, query_params: dict[str, Any]) -> dict:
        """GET *path* on Prometheus and return the ``data`` member of the reply.

        Raises:
            RuntimeError: Prometheus reported an error, or the reply is not a
                ``status == "success"`` document.
            httpx.HTTPStatusError: non-2xx reply without a usable JSON error body.
        """
        url = f"{self._base_url()}{path}"
        async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT) as client:
            resp = await client.get(url, params=query_params)

        try:
            body = resp.json()
        except ValueError:
            body = None  # e.g. an HTML error page from a proxy

        is_doc = isinstance(body, dict)
        # Prefer Prometheus's own error text over the generic HTTP status
        # message: bad PromQL is answered with 4xx plus a JSON error document.
        if is_doc and body.get("status") != "success" and body.get("error"):
            raise RuntimeError(f"Prometheus error: {body['error']}")
        resp.raise_for_status()
        if not is_doc or body.get("status") != "success":
            raise RuntimeError("Prometheus error: unknown")
        return body.get("data") or {}

    async def _instant_query(self, params: dict) -> dict:
        """Run an instant query; ``time`` is forwarded only when provided."""
        query = self._require_str(params, "query")
        payload: dict[str, Any] = {"query": query}
        time_param = params.get("time", "")
        if time_param:
            payload["time"] = time_param

        data = await self._get_json("/api/v1/query", payload)
        result = data.get("result") or []
        return {
            "query": query,
            "result_type": data.get("resultType", ""),
            "result": result,
            "result_count": len(result),
        }

    async def _range_query(self, params: dict) -> dict:
        """Run a range query covering the last ``window_minutes`` up to now."""
        query = self._require_str(params, "query")
        window_minutes = self._int_param(params, "window_minutes", 15)
        step_seconds = self._int_param(params, "step_seconds", 60)

        now = datetime.now(UTC)
        payload = {
            "query": query,
            "start": str((now - timedelta(minutes=window_minutes)).timestamp()),
            "end": str(now.timestamp()),
            "step": f"{step_seconds}s",
        }

        data = await self._get_json("/api/v1/query_range", payload)
        result = data.get("result") or []
        return {
            "query": query,
            "window_minutes": window_minutes,
            "step_seconds": step_seconds,
            "result_type": data.get("resultType", ""),
            "result": result,
            "series_count": len(result),
        }

    async def _alert_history(self, params: dict) -> dict:
        """Return the most recent firing samples of one alert.

        Queries the ``ALERTS`` series at 5-minute resolution; every sample
        with a value > 0 counts as one entry. Entries from all label sets
        are merged, ordered newest first, then cut to ``limit``.
        """
        alertname = self._require_str(params, "alertname")
        window_hours = self._int_param(params, "window_hours", 24)
        limit = self._int_param(params, "limit", 20)

        # Escape for a double-quoted PromQL string so the value cannot break
        # out of the label matcher and inject extra selectors.
        escaped = (
            alertname.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
        )
        query = f'ALERTS{{alertname="{escaped}",alertstate="firing"}}'

        range_result = await self._range_query({
            "query": query,
            "window_minutes": window_hours * 60,
            "step_seconds": 300,  # 5-minute granularity
        })

        # Collect samples from every series first: truncating the series list
        # before sorting would drop recent samples of later label sets.
        samples: list[tuple[float, dict, float]] = []
        for series in range_result.get("result", []):
            labels = {
                k: v for k, v in series.get("metric", {}).items() if k != "__name__"
            }
            for ts, val in series.get("values", []):
                value = float(val)
                if value > 0:
                    samples.append((float(ts), labels, value))

        # Newest first, ordered by the numeric timestamp.
        samples.sort(key=lambda sample: sample[0], reverse=True)
        firings = [
            {
                "timestamp": datetime.fromtimestamp(ts, tz=UTC).isoformat(),
                "labels": dict(labels),
                "value": value,
            }
            for ts, labels, value in samples[:limit]
        ]

        return {
            "alertname": alertname,
            "window_hours": window_hours,
            "firing_count": len(firings),
            "firings": firings,
        }
# =============================================================================
# Security constants
# =============================================================================

SSH_KEY_PATH = "/run/secrets/ssh_mcp_key"
SSH_USER = "wooo"
DIAG_TIMEOUT = 10  # timeout for read-only diagnostic tools (seconds)
OP_TIMEOUT = 60  # timeout for state-changing tools (seconds)

# Hard-coded deny-list applied to every string parameter. It is only a coarse
# first filter (it does not cover ';', '&&', newlines, ...); the allow-list
# patterns below plus shell quoting are what actually prevent injection.
FORBIDDEN_PATTERNS = [
    r"rm\s+-rf",  # recursive delete
    r"/etc/passwd",  # system accounts
    r"authorized_keys",  # SSH keys
    r"sudoers",  # privilege configuration
    r"\$\(",  # command substitution
    r"`",  # backtick execution
    r"\|.*rm",  # pipe into delete
    r">\s*/etc/",  # redirect into the system directory
]

# Allow-lists for values that end up inside a shell command line. Each value
# must match completely and must start with an alphanumeric character, which
# also rules out option injection through a leading '-'.
_CONTAINER_NAME_RE = re.compile(r"[A-Za-z0-9][A-Za-z0-9_.-]*")
_SERVICE_NAME_RE = re.compile(r"[A-Za-z0-9][A-Za-z0-9_.@:-]*")
_CERT_NAME_RE = re.compile(r"[A-Za-z0-9][A-Za-z0-9.-]*")
_COMPOSE_DIR_RE = re.compile(r"/[A-Za-z0-9_./-]*")  # absolute path only
_MAX_LOG_LINES = 10000

# Hosts used when the settings object has no SSH_MCP_ALLOWED_HOSTS attribute.
_DEFAULT_ALLOWED_HOSTS = ("192.168.0.188", "192.168.0.110", "192.168.0.111")

# Group A (read-only)
GROUP_A_TOOLS = {
    "ssh_get_top_processes",
    "ssh_get_disk_usage",
    "ssh_get_memory_info",
    "ssh_get_container_logs",
    "ssh_get_container_status",
    "ssh_get_service_status",
    "ssh_check_port",
    "ssh_get_nginx_error_log",
    "ssh_get_swap_info",
}

# Group B (state-changing operations, gated by trust_score)
GROUP_B_TOOLS = {
    "ssh_docker_restart",
    "ssh_docker_compose_restart",
    "ssh_systemctl_restart",
    "ssh_clear_docker_logs",
    "ssh_renew_ssl",
    "ssh_reload_nginx",
}

ALL_TOOLS = GROUP_A_TOOLS | GROUP_B_TOOLS

MIN_TRUST_SCORE_FOR_GROUP_B = 0.8


class SSHProvider(MCPToolProvider):
    """SSH MCP provider: runs a fixed set of commands on allow-listed hosts.

    Every call passes four guards before anything is executed:
      1. the tool name is in ``ALL_TOOLS``;
      2. the host is in the configured allow-list;
      3. no string parameter matches ``FORBIDDEN_PATTERNS``;
      4. group B tools require ``trust_score >= MIN_TRUST_SCORE_FOR_GROUP_B``.
    In addition, every parameter that is placed on a command line is
    validated against an allow-list pattern and shell-quoted.

    The provider is switched on with the ``SSH_MCP_ENABLED`` setting.
    """

    @property
    def name(self) -> str:
        """Provider name, also used as ``server_name`` of every tool."""
        return "ssh_host"

    @property
    def enabled(self) -> bool:
        """Whether the provider is switched on (defaults to False)."""
        from src.core.config import settings
        return getattr(settings, "SSH_MCP_ENABLED", False)

    def _allowed_hosts(self) -> list[str]:
        """Return the hosts this provider may connect to.

        The built-in defaults apply only when the setting is absent. An
        explicitly empty value yields an empty list (fail closed) instead of
        silently re-enabling the defaults.
        """
        from src.core.config import settings
        raw = getattr(settings, "SSH_MCP_ALLOWED_HOSTS", None)
        if raw is None:
            return list(_DEFAULT_ALLOWED_HOSTS)
        if isinstance(raw, list):
            return raw
        return [h.strip() for h in raw.split(",") if h.strip()]

    # =========================================================================
    # list_tools
    # =========================================================================

    async def list_tools(self) -> list[MCPTool]:
        """Describe the 9 read-only (group A) and 6 operational (group B) tools."""

        def tool(name: str, description: str, properties: dict, required: list[str]) -> MCPTool:
            return MCPTool(
                name=name,
                description=description,
                input_schema={
                    "type": "object",
                    "properties": properties,
                    "required": required,
                },
                server_name=self.name,
            )

        return [
            # ---- Group A ----
            tool(
                "ssh_get_top_processes",
                "Get top CPU/memory consuming processes on the target host (ps aux --sort=-%cpu | head 15). Read-only.",
                {"host": {"type": "string", "description": "Target host IP (e.g. 192.168.0.188)"}},
                ["host"],
            ),
            tool(
                "ssh_get_disk_usage",
                "Get disk usage on the target host (df -h && du -sh /var/lib/docker). Read-only.",
                {"host": {"type": "string"}},
                ["host"],
            ),
            tool(
                "ssh_get_memory_info",
                "Get memory info (free -h). Read-only.",
                {"host": {"type": "string"}},
                ["host"],
            ),
            tool(
                "ssh_get_container_logs",
                "Get Docker container logs (last 50 lines). Read-only.",
                {
                    "host": {"type": "string"},
                    "container_name": {"type": "string", "description": "Docker container name"},
                    "tail": {"type": "integer", "description": "Number of lines (default: 50)"},
                },
                ["host", "container_name"],
            ),
            tool(
                "ssh_get_container_status",
                "Get Docker container status filtered by name. Read-only.",
                {
                    "host": {"type": "string"},
                    "filter_name": {"type": "string", "description": "Container name filter"},
                },
                ["host", "filter_name"],
            ),
            tool(
                "ssh_get_service_status",
                "Get systemd service status. Read-only.",
                {
                    "host": {"type": "string"},
                    "service": {"type": "string", "description": "Service name (e.g. ollama)"},
                },
                ["host", "service"],
            ),
            tool(
                "ssh_check_port",
                "Check if a port is listening on the target host. Read-only.",
                {
                    "host": {"type": "string"},
                    "port": {"type": "integer", "description": "Port number"},
                },
                ["host", "port"],
            ),
            tool(
                "ssh_get_nginx_error_log",
                "Get last 50 lines of nginx error log. Read-only.",
                {
                    "host": {"type": "string"},
                    "lines": {"type": "integer", "description": "Number of lines (default: 50)"},
                },
                ["host"],
            ),
            tool(
                "ssh_get_swap_info",
                "Get swap and memory info (swapon --show && free -h). Read-only.",
                {"host": {"type": "string"}},
                ["host"],
            ),
            # ---- Group B ----
            tool(
                "ssh_docker_restart",
                "Restart a Docker container (docker restart ). Requires trust_score >= 0.8.",
                {
                    "host": {"type": "string"},
                    "container_name": {"type": "string"},
                    "trust_score": {"type": "number", "description": "Current trust score (0.0-1.0)"},
                },
                ["host", "container_name", "trust_score"],
            ),
            tool(
                "ssh_docker_compose_restart",
                "Restart a service via docker compose. Requires trust_score >= 0.8.",
                {
                    "host": {"type": "string"},
                    "compose_dir": {"type": "string", "description": "Directory containing docker-compose.yml"},
                    "service": {"type": "string", "description": "Service name in compose file"},
                    "trust_score": {"type": "number"},
                },
                ["host", "compose_dir", "service", "trust_score"],
            ),
            tool(
                "ssh_systemctl_restart",
                "Restart a systemd service (systemctl restart ). Requires trust_score >= 0.8.",
                {
                    "host": {"type": "string"},
                    "service": {"type": "string"},
                    "trust_score": {"type": "number"},
                },
                ["host", "service", "trust_score"],
            ),
            tool(
                "ssh_clear_docker_logs",
                "Truncate Docker container log file to free disk space. Requires trust_score >= 0.8.",
                {
                    "host": {"type": "string"},
                    "container_name": {"type": "string"},
                    "trust_score": {"type": "number"},
                },
                ["host", "container_name", "trust_score"],
            ),
            tool(
                "ssh_renew_ssl",
                "Renew SSL certificate via certbot. Requires trust_score >= 0.8.",
                {
                    "host": {"type": "string"},
                    "domain": {"type": "string", "description": "Certificate name (certbot --cert-name)"},
                    "trust_score": {"type": "number"},
                },
                ["host", "domain", "trust_score"],
            ),
            tool(
                "ssh_reload_nginx",
                "Test and reload nginx config (nginx -t && systemctl reload nginx). Requires trust_score >= 0.8.",
                {
                    "host": {"type": "string"},
                    "trust_score": {"type": "number"},
                },
                ["host", "trust_score"],
            ),
        ]

    # =========================================================================
    # execute
    # =========================================================================

    async def execute(
        self,
        tool_name: str,
        parameters: dict[str, Any],
    ) -> MCPToolResult:
        """Apply the four guards, then run the tool over SSH.

        Never raises: guard rejections, invalid parameters and SSH failures
        are all returned as ``MCPToolResult(success=False)``.
        """
        execution_id = str(uuid.uuid4())
        start = datetime.now(UTC)

        # Guard 1: tool allow-list
        if tool_name not in ALL_TOOLS:
            return MCPToolResult(
                success=False,
                execution_id=execution_id,
                error=f"Unknown tool: {tool_name}",
            )

        host = parameters.get("host", "")

        # Guard 2: host allow-list
        if host not in self._allowed_hosts():
            return MCPToolResult(
                success=False,
                execution_id=execution_id,
                error=f"Host '{host}' not in SSH_MCP_ALLOWED_HOSTS",
            )

        # Guard 3: deny-list scan of all string parameters
        security_error = self._check_params_safety(parameters)
        if security_error:
            logger.warning(
                "ssh_mcp_forbidden_pattern",
                tool=tool_name,
                host=host,
                reason=security_error,
            )
            return MCPToolResult(
                success=False,
                execution_id=execution_id,
                error=f"Security guard blocked: {security_error}",
            )

        # Guard 4: trust score for group B.
        # NOTE(security): trust_score is supplied by the caller in the tool
        # parameters, so this guard is only as trustworthy as the caller.
        # Consider sourcing the score from a server-side trust service.
        if tool_name in GROUP_B_TOOLS:
            try:
                trust_score = float(parameters.get("trust_score", 0.0))
            except (TypeError, ValueError):
                trust_score = 0.0  # unparsable score counts as no trust
            # "not >=" rather than "<": NaN compares False both ways, so a
            # "<" test would let a NaN score through.
            if not trust_score >= MIN_TRUST_SCORE_FOR_GROUP_B:
                return MCPToolResult(
                    success=False,
                    execution_id=execution_id,
                    error=(
                        f"Group B tool '{tool_name}' requires trust_score >= "
                        f"{MIN_TRUST_SCORE_FOR_GROUP_B}, got {trust_score:.2f}. "
                        "Consider manual execution."
                    ),
                )

        # Execute
        try:
            is_group_b = tool_name in GROUP_B_TOOLS
            timeout = OP_TIMEOUT if is_group_b else DIAG_TIMEOUT
            output = await self._run_tool(tool_name, parameters, host, timeout)
            duration = (datetime.now(UTC) - start).total_seconds()
            logger.info("ssh_mcp_executed", tool=tool_name, host=host, duration=duration)
            return MCPToolResult(
                success=True,
                execution_id=execution_id,
                output=output,
                duration=duration,
            )
        except Exception as e:
            # Tool boundary: report the failure to the caller instead of raising.
            duration = (datetime.now(UTC) - start).total_seconds()
            logger.warning("ssh_mcp_failed", tool=tool_name, host=host, error=str(e))
            return MCPToolResult(
                success=False,
                execution_id=execution_id,
                error=str(e),
                duration=duration,
            )

    async def health_check(self) -> bool:
        """Check only that the SSH key file exists; no connection is opened."""
        import os
        return os.path.exists(SSH_KEY_PATH)

    # =========================================================================
    # Security guards
    # =========================================================================

    def _check_params_safety(self, params: dict) -> str | None:
        """Scan the string parameters against ``FORBIDDEN_PATTERNS``.

        Returns an error message for the first match, or None when clean.
        Non-string values are skipped.
        """
        for value in params.values():
            if not isinstance(value, str):
                continue
            for pattern in FORBIDDEN_PATTERNS:
                if re.search(pattern, value, re.IGNORECASE):
                    return f"Forbidden pattern '{pattern}' found in parameter"
        return None

    @staticmethod
    def _shell_arg(params: dict, key: str, pattern: "re.Pattern[str] | None" = None) -> str:
        """Return parameter *key* as a single, safely quoted shell word.

        The value must be a non-empty string and, when *pattern* is given,
        match it completely. Values that pass an allow-list pattern contain
        no shell-special characters, so quoting leaves them unchanged.

        Raises:
            ValueError: the parameter is missing, not a string, or rejected.
        """
        import shlex

        value = params.get(key)
        if not isinstance(value, str) or not value:
            raise ValueError(f"Missing or invalid parameter '{key}'")
        if pattern is not None and not pattern.fullmatch(value):
            raise ValueError(f"Parameter '{key}' contains characters that are not allowed")
        return shlex.quote(value)

    @staticmethod
    def _bounded_int(params: dict, key: str, default: int | None, low: int, high: int) -> int:
        """Read an integer parameter and enforce ``low <= value <= high``.

        ``default=None`` marks the parameter as required.

        Raises:
            ValueError: missing required value, non-integer, or out of range.
        """
        raw = params.get(key)
        if raw is None:
            if default is None:
                raise ValueError(f"Missing required parameter '{key}'")
            return default
        try:
            value = int(raw)
        except (TypeError, ValueError):
            raise ValueError(f"Parameter '{key}' must be an integer") from None
        if not low <= value <= high:
            raise ValueError(
                f"Parameter '{key}' must be between {low} and {high}, got {value}"
            )
        return value

    # =========================================================================
    # Tool execution
    # =========================================================================

    async def _run_tool(
        self,
        tool_name: str,
        params: dict,
        host: str,
        timeout: int,
    ) -> dict:
        """Build the command for *tool_name*, run it on *host*, return the output."""
        cmd = self._build_command(tool_name, params)
        stdout, stderr = await self._ssh_exec(host, cmd, timeout)
        return {
            "host": host,
            "tool": tool_name,
            "command": cmd,
            "stdout": stdout,
            "stderr": stderr,
        }

    def _build_command(self, tool_name: str, params: dict) -> str:
        """Return the shell command line for *tool_name*.

        Caller-supplied values are validated and shell-quoted before they are
        interpolated; numeric values are range-checked.

        Raises:
            ValueError: invalid parameter, or no builder for *tool_name*.
        """
        arg = self._shell_arg

        if tool_name == "ssh_get_top_processes":
            return "ps aux --sort=-%cpu | head -15"

        if tool_name == "ssh_get_disk_usage":
            return "df -h && echo '---' && du -sh /var/lib/docker 2>/dev/null || true"

        if tool_name == "ssh_get_memory_info":
            return "free -h"

        if tool_name == "ssh_get_container_logs":
            name = arg(params, "container_name", _CONTAINER_NAME_RE)
            tail = self._bounded_int(params, "tail", 50, 1, _MAX_LOG_LINES)
            return f"docker logs {name} --tail {tail} 2>&1"

        if tool_name == "ssh_get_container_status":
            # The filter may be a partial name, so no pattern is enforced;
            # quoting alone keeps it a single argument.
            name = arg(params, "filter_name")
            return f"docker ps -a --filter name={name}"

        if tool_name == "ssh_get_service_status":
            svc = arg(params, "service", _SERVICE_NAME_RE)
            return f"systemctl status {svc} --no-pager -l 2>&1 | head -30"

        if tool_name == "ssh_check_port":
            port = self._bounded_int(params, "port", None, 1, 65535)
            # Require a non-digit (or end of line) after the port so that
            # e.g. port 80 does not also match ":8080".
            return f"ss -tlnp | grep -E ':{port}([^0-9]|$)'"

        if tool_name == "ssh_get_nginx_error_log":
            lines = self._bounded_int(params, "lines", 50, 1, _MAX_LOG_LINES)
            return f"tail -n {lines} /var/log/nginx/error.log 2>/dev/null || echo 'Log not found'"

        if tool_name == "ssh_get_swap_info":
            return "swapon --show; echo '---'; free -h"

        if tool_name == "ssh_docker_restart":
            name = arg(params, "container_name", _CONTAINER_NAME_RE)
            return f"docker restart {name}"

        if tool_name == "ssh_docker_compose_restart":
            compose_dir = arg(params, "compose_dir", _COMPOSE_DIR_RE)
            if ".." in params["compose_dir"].split("/"):
                raise ValueError("Parameter 'compose_dir' must not contain '..'")
            service = arg(params, "service", _CONTAINER_NAME_RE)
            return f"cd {compose_dir} && docker compose restart {service}"

        if tool_name == "ssh_systemctl_restart":
            svc = arg(params, "service", _SERVICE_NAME_RE)
            return f"systemctl restart {svc}"

        if tool_name == "ssh_clear_docker_logs":
            name = arg(params, "container_name", _CONTAINER_NAME_RE)
            # Resolve the log path through docker inspect, then truncate it.
            return (
                f"LOG_PATH=$(docker inspect --format='{{{{.LogPath}}}}' {name} 2>/dev/null) "
                f"&& [ -n \"$LOG_PATH\" ] && truncate -s 0 \"$LOG_PATH\" && echo 'Cleared' "
                f"|| echo 'Container not found'"
            )

        if tool_name == "ssh_renew_ssl":
            domain = arg(params, "domain", _CERT_NAME_RE)
            return f"/snap/bin/certbot renew --cert-name {domain} --non-interactive 2>&1"

        if tool_name == "ssh_reload_nginx":
            return "nginx -t 2>&1 && systemctl reload nginx && echo 'Nginx reloaded'"

        raise ValueError(f"No command builder for tool: {tool_name}")

    async def _ssh_exec(self, host: str, cmd: str, timeout: int) -> tuple[str, str]:
        """Run *cmd* on *host* through asyncssh and return ``(stdout, stderr)``.

        A non-zero exit status does not raise (``check=False``).

        Raises:
            RuntimeError: asyncssh is not installed or the key file is missing.
        """
        try:
            import asyncssh
        except ImportError as e:
            raise RuntimeError(
                "asyncssh is not installed. "
                "Add 'asyncssh' to pyproject.toml dependencies."
            ) from e

        import os
        if not os.path.exists(SSH_KEY_PATH):
            raise RuntimeError(
                f"SSH key not found at {SSH_KEY_PATH}. "
                "Ensure K8s Secret 'ssh-mcp-key' is mounted correctly."
            )

        async with asyncssh.connect(
            host,
            username=SSH_USER,
            client_keys=[SSH_KEY_PATH],
            # NOTE(security): host key verification is disabled (internal
            # network is trusted). This leaves the connection open to
            # man-in-the-middle attacks; mount a known_hosts file and pass
            # its path here to close that gap.
            known_hosts=None,
            connect_timeout=timeout,
        ) as conn:
            # run() is a method of the connection; asyncssh has no
            # module-level run() function.
            result = await conn.run(cmd, check=False, timeout=timeout)
        return (result.stdout or ""), (result.stderr or "")
將公鑰加入允許主機的 authorized_keys: +# ssh-copy-id -i /tmp/ssh-mcp-key.pub wooo@192.168.0.188 +# ssh-copy-id -i /tmp/ssh-mcp-key.pub wooo@192.168.0.110 +# ssh-copy-id -i /tmp/ssh-mcp-key.pub wooo@192.168.0.111 +# +# 3. 建立 K8s Secret: +# kubectl create secret generic ssh-mcp-key \ +# --from-file=ssh_mcp_key=/tmp/ssh-mcp-key \ +# -n awoooi-prod +# +# 4. 刪除暫存 key: +# rm /tmp/ssh-mcp-key /tmp/ssh-mcp-key.pub +# +# 5. 在 K8s Deployment 加入 volume mount (參考 apps/api/k8s-deployment.yaml) +# volumeMounts: +# - name: ssh-mcp-key +# mountPath: /run/secrets/ssh_mcp_key +# subPath: ssh_mcp_key +# readOnly: true +# volumes: +# - name: ssh-mcp-key +# secret: +# secretName: ssh-mcp-key +# defaultMode: 0400 +# +# 6. 設定環境變數 SSH_MCP_ENABLED=true(在 K8s ConfigMap 或 Deployment env 加入) +# +# ⚠️ 此檔案是範本,實際 Secret 由 CI/CD 注入,禁止提交私鑰到 Git +# ⚠️ 04-ssh-mcp-secret.yaml 已加入 .gitignore + +apiVersion: v1 +kind: Secret +metadata: + name: ssh-mcp-key + namespace: awoooi-prod + annotations: + # MCP Phase 2a (ADR-071, 2026-04-11) + description: "SSH private key for MCP SSH Provider — allows API pods to diagnose/repair host-layer services" +type: Opaque +data: + # Base64 encoded private key + # 使用方式: echo "$(cat /tmp/ssh-mcp-key | base64 -w 0)" + ssh_mcp_key: "CHANGE_ME_BASE64_ENCODED_PRIVATE_KEY" diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 4435d870..566d3408 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -39,6 +39,10 @@ groups: layer: systemd-188 team: ops auto_repair: "true" + # MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP 路由標籤 + mcp_provider: "ssh_host" + host_type: "bare_metal" + alert_category: "host_resource" annotations: summary: "主機 {{ $labels.host }} CPU 高負載" description: "CPU 使用率超過 80%" @@ -51,6 +55,9 @@ groups: layer: systemd-188 team: ops auto_repair: "false" + mcp_provider: "ssh_host" + host_type: "bare_metal" + alert_category: "host_resource" annotations: summary: "主機 {{ $labels.host }} 記憶體不足" 
description: "記憶體使用率超過 85%" @@ -63,6 +70,9 @@ groups: layer: systemd-188 team: ops auto_repair: "false" + mcp_provider: "ssh_host" + host_type: "bare_metal" + alert_category: "host_resource" annotations: summary: "主機 {{ $labels.host }} 磁碟空間不足" description: "磁碟使用率超過 85%" @@ -329,6 +339,9 @@ groups: host: "188" team: ops auto_repair: "true" + mcp_provider: "ssh_host" + target_host: "192.168.0.188" + alert_category: "devops_tool" annotations: summary: "SignOz 服務離線" description: "SignOz (192.168.0.188:3301) 已離線超過 2 分鐘" @@ -344,6 +357,9 @@ groups: host: "110" team: ops auto_repair: "true" + mcp_provider: "ssh_host" + target_host: "192.168.0.110" + alert_category: "devops_tool" annotations: summary: "Sentry 服務離線" description: "Sentry (192.168.0.110:9000) 已離線超過 2 分鐘" @@ -358,6 +374,9 @@ groups: host: "110" team: ops auto_repair: "true" + mcp_provider: "ssh_host" + target_host: "192.168.0.110" + alert_category: "devops_tool" annotations: summary: "Harbor Registry 離線" description: "Harbor (192.168.0.110:5000) 已離線超過 2 分鐘,CD pipeline 將無法拉取映像" @@ -372,6 +391,9 @@ groups: host: "110" team: ops auto_repair: "true" + mcp_provider: "ssh_host" + target_host: "192.168.0.110" + alert_category: "devops_tool" annotations: summary: "Gitea Git 服務離線" description: "Gitea (192.168.0.110:3001) 已離線超過 2 分鐘,CD pipeline 失效" @@ -494,6 +516,8 @@ groups: layer: docker team: ops auto_repair: "true" + mcp_provider: "ssh_host" + alert_category: "devops_tool" annotations: summary: "容器 {{ $labels.container }} 健康檢查失敗" description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 健康狀態異常,持續 2 分鐘" @@ -506,6 +530,8 @@ groups: layer: docker team: ops auto_repair: "true" + mcp_provider: "ssh_host" + alert_category: "devops_tool" annotations: summary: "容器 {{ $labels.container }} 已停止" description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead,持續 1 分鐘"