diff --git a/apps/api/src/plugins/mcp/providers/argocd_provider.py b/apps/api/src/plugins/mcp/providers/argocd_provider.py index 9bb6495b..ca800469 100644 --- a/apps/api/src/plugins/mcp/providers/argocd_provider.py +++ b/apps/api/src/plugins/mcp/providers/argocd_provider.py @@ -33,6 +33,12 @@ _HTTP_TIMEOUT = 10.0 _RE_SAFE_APP_NAME = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9._-]{0,62}$') +def _tls_verify() -> bool: + """P2 fix 2026-04-11: 讀取 ARGOCD_VERIFY_TLS env,預設 False(因 self-signed cert)。""" + import os + return os.environ.get("ARGOCD_VERIFY_TLS", "false").lower() in ("true", "1", "yes") + + def _validate_app_name(name: str) -> str: if not _RE_SAFE_APP_NAME.match(name): raise ValueError(f"Unsafe app name: {name!r}") @@ -152,7 +158,7 @@ class ArgoCDProvider(MCPToolProvider): if ns_filter: params["appNamespace"] = ns_filter - async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client: + async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client: resp = await client.get(url, headers=self._headers(), params=params) resp.raise_for_status() data = resp.json() @@ -183,7 +189,7 @@ class ArgoCDProvider(MCPToolProvider): app_name = _validate_app_name(parameters["app_name"]) url = f"{self._base_url()}/api/v1/applications/{app_name}" - async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client: + async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client: resp = await client.get(url, headers=self._headers()) resp.raise_for_status() app = resp.json() @@ -229,7 +235,7 @@ class ArgoCDProvider(MCPToolProvider): # ArgoCD history endpoint history_url = f"{self._base_url()}/api/v1/applications/{app_name}" - async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client: + async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client: resp = await client.get(history_url, headers=self._headers()) resp.raise_for_status() app = resp.json() @@ -255,7 +261,7 @@ class ArgoCDProvider(MCPToolProvider): async def health_check(self) -> bool: try: url = f"{self._base_url()}/api/v1/applicationsets" - async with httpx.AsyncClient(timeout=5.0, verify=False) as client: + async with httpx.AsyncClient(timeout=5.0, verify=_tls_verify()) as client: resp = await client.get(url, headers=self._headers()) return resp.status_code < 500 except Exception: diff --git a/apps/api/src/plugins/mcp/providers/prometheus_provider.py b/apps/api/src/plugins/mcp/providers/prometheus_provider.py index ad28a18d..e2636a92 100644 --- a/apps/api/src/plugins/mcp/providers/prometheus_provider.py +++ b/apps/api/src/plugins/mcp/providers/prometheus_provider.py @@ -19,6 +19,7 @@ Prometheus MCP Tool Provider — MCP Phase 2b @see docs/superpowers/specs/2026-04-10-infra-rebuild-sprint-abc-design.md §MCP-2b """ +import re import uuid from datetime import UTC, datetime, timedelta from typing import Any @@ -26,6 +27,9 @@ from typing import Any import httpx import structlog +# P1 fix 2026-04-11: alertname 白名單防止 PromQL label injection +_RE_SAFE_ALERTNAME = re.compile(r"^[a-zA-Z0-9_-]{1,64}$") + from src.plugins.mcp.interfaces import MCPTool, MCPToolProvider, MCPToolResult logger = structlog.get_logger(__name__) @@ -248,6 +252,9 @@ class PrometheusProvider(MCPToolProvider): async def _alert_history(self, params: dict) -> dict: alertname = params["alertname"] + # P1 fix 2026-04-11: 白名單驗證防止 PromQL label injection + if not _RE_SAFE_ALERTNAME.match(alertname): + raise ValueError(f"Unsafe alertname: {alertname!r}") window_hours = int(params.get("window_hours", 24)) limit = int(params.get("limit", 20)) diff --git a/apps/api/src/plugins/mcp/providers/sentry_provider.py b/apps/api/src/plugins/mcp/providers/sentry_provider.py index 4edf3182..21535ad8 100644 --- a/apps/api/src/plugins/mcp/providers/sentry_provider.py +++ b/apps/api/src/plugins/mcp/providers/sentry_provider.py @@ -31,6 +31,11 @@ logger = structlog.get_logger(__name__) _HTTP_TIMEOUT = 10.0 _RE_SAFE_ISSUE_ID = re.compile(r'^\d{1,20}$') +# P2 fix 2026-04-11: Sentry query 語意白名單 — 只允許已知安全的 Sentry 搜尋語法 +# 允許: 純文字識別字、is:unresolved、level:error、project:xxx、assigned:me 等 +_RE_SAFE_SENTRY_QUERY = re.compile( + r'^[\w\s:.\-/]+$' # alphanumeric, space, colon, dot, dash, slash — Sentry filter 所需 +) class SentryProvider(MCPToolProvider): @@ -222,7 +227,10 @@ class SentryProvider(MCPToolProvider): return MCPToolResult(success=True, data=result) async def _search_issues(self, parameters: dict) -> MCPToolResult: - query = str(parameters["query"])[:200] # 限制長度防止注入 + query = str(parameters["query"])[:200] + # P2 fix 2026-04-11: 語意白名單驗證,拒絕含特殊字元的 query + if not _RE_SAFE_SENTRY_QUERY.match(query): + return MCPToolResult(success=False, error=f"Unsafe sentry query: {query!r}") limit = max(1, min(int(parameters.get("limit", 10)), 25)) url = f"{self._base_url()}/api/0/projects/{self._org()}/{self._project()}/issues/" diff --git a/apps/api/src/plugins/mcp/providers/ssh_provider.py b/apps/api/src/plugins/mcp/providers/ssh_provider.py index bfe3b9a3..2bcab0e4 100644 --- a/apps/api/src/plugins/mcp/providers/ssh_provider.py +++ b/apps/api/src/plugins/mcp/providers/ssh_provider.py @@ -360,6 +360,21 @@ class SSHProvider(MCPToolProvider): ), ) + # P1 fix 2026-04-11: 群組 B 寫入工具必須有 known_hosts,否則拒絕執行 + # 防止 MITM — 讀取工具(Group A)允許 known_hosts=None 方便診斷;寫入操作不行 + import os as _os + if tool_name in GROUP_B_TOOLS: + _kh = _os.environ.get("SSH_MCP_KNOWN_HOSTS_FILE") + if not _kh or not _os.path.exists(_kh): + return MCPToolResult( + success=False, + execution_id=execution_id, + error=( + "Group B write tool refused: SSH_MCP_KNOWN_HOSTS_FILE not set or missing. " + "Set up known_hosts per docs/runbooks/ssh-mcp-setup.md before write operations." + ), + ) + # 執行 try: is_group_b = tool_name in GROUP_B_TOOLS diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index aaa302ef..a3b9617c 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -41,6 +41,23 @@ logger = structlog.get_logger(__name__) # Phase 7.5: Playbook 優先閾值 PLAYBOOK_SIMILARITY_THRESHOLD = 0.85 # 相似度 >= 85% 直接使用 Playbook +# P1 fix 2026-04-11: background task GC guard — keep strong refs until done +_background_tasks: set[asyncio.Task] = set() + +def _fire_and_forget(coro) -> asyncio.Task: + """Create a background task with GC protection via _background_tasks.""" + task = asyncio.create_task(coro) + _background_tasks.add(task) + task.add_done_callback(_background_tasks.discard) + return task + +# P1 fix 2026-04-11: kubectl action dangerous char whitelist +import re as _re_module +_ALLOWED_KUBECTL_PATTERN = _re_module.compile( + r"^kubectl\s+(rollout restart|rollout undo|scale|delete pod|get|describe|logs)" + r"\s+[a-zA-Z0-9_./-]+(\s+(-n|--namespace)\s+[a-zA-Z0-9_-]+)?$" +) + # ============================================================================= # Phase 31 (ADR-067 2026-04-10): Log 異常摘要 — NemoTron deepseek-r1:14b @@ -238,13 +255,13 @@ async def _push_decision_to_telegram( # Phase 31 (ADR-067 2026-04-10): Log 異常摘要 — NemoTron deepseek-r1:14b # 非同步執行,不阻塞主流程 - asyncio.create_task(_send_log_summary(incident)) + _fire_and_forget(_send_log_summary(incident)) # MCP Phase 4a: NemoClaw second opinion (2026-04-11 Claude Sonnet 4.6) # 若 proposal_data 有 advisory_note,用 NemoClaw bot 身分追加一條訊息 _advisory_note = proposal_data.get("advisory_note", "") if _advisory_note: - asyncio.create_task( + _fire_and_forget( gateway.send_as_nemotron( f"🤔 NemoClaw 第二意見 (信心={confidence:.2f})\n" f"{_advisory_note}" @@ -446,37 +463,29 @@ async def _fetch_metrics_snapshot(incident: Incident) -> dict: if alertname in ("HostHighCpuLoad", "HostOutOfMemory"): if instance: host = instance.split(":")[0] - r = await prom._instant_query( - f'100 - (avg by(instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{host}.*"}}[5m])) * 100)' - ) - if r.get("status") == "success": - for item in r.get("data", {}).get("result", []): - snapshots["cpu_pct"] = round(float(item["value"][1]), 1) - r2 = await prom._instant_query( + # P0 fix 2026-04-11: _instant_query 要求 dict,回傳 {"result": [...]} + r = await prom._instant_query({"query": f'100 - (avg by(instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{host}.*"}}[5m])) * 100)'}) + for item in r.get("result", []): + snapshots["cpu_pct"] = round(float(item["value"][1]), 1) + cpu_query = ( f'(1 - (node_memory_MemAvailable_bytes{{instance=~"{instance}"}} / node_memory_MemTotal_bytes{{instance=~"{instance}"}})) * 100' if instance else "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)" ) - if r2.get("status") == "success": - for item in r2.get("data", {}).get("result", []): - snapshots["mem_pct"] = round(float(item["value"][1]), 1) + r2 = await prom._instant_query({"query": cpu_query}) + for item in r2.get("result", []): + snapshots["mem_pct"] = round(float(item["value"][1]), 1) elif alertname == "HostOutOfDiskSpace": - r = await prom._instant_query( - 'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))' - ) - if r.get("status") == "success": - for item in r.get("data", {}).get("result", []): - snapshots["disk_pct"] = round(float(item["value"][1]), 1) + r = await prom._instant_query({"query": 'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))'}) + for item in r.get("result", []): + snapshots["disk_pct"] = round(float(item["value"][1]), 1) elif alertname in ("PodRestartingTooMuch", "PodCrashLoopBackOff"): pod = labels.get("pod", labels.get("component", "")) if pod: - r = await prom._instant_query( - f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod}.*"}})' - ) - if r.get("status") == "success": - for item in r.get("data", {}).get("result", []): - snapshots["restart_count"] = int(float(item["value"][1])) + r = await prom._instant_query({"query": f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod}.*"}})'}) + for item in r.get("result", []): + snapshots["restart_count"] = int(float(item["value"][1])) return snapshots except Exception as _e: @@ -953,12 +962,12 @@ class DecisionManager: await self._save_token(token) # 觸發自動執行 (非阻塞) - asyncio.create_task( + _fire_and_forget( self._auto_execute(incident, token) ) else: # 需人工審核: 推送到 Telegram - asyncio.create_task( + _fire_and_forget( _push_decision_to_telegram(incident, token.proposal_data) ) @@ -989,18 +998,20 @@ class DecisionManager: # 另外:若 target 等於 alertname,代表 LLM 把告警名稱填入 deployment_name,也拒絕 _alertname = incident.signals[0].labels.get("alertname", "") if incident.signals else "" _target_is_alertname = bool(_alertname and _target == _alertname) - if "unknown" in action or _re.search(r"[<{][^>}]+[>}]", action) or _target_is_alertname: + # P1 fix 2026-04-11: kubectl action 危險字元白名單 — 防止 && || ; > | 注入 + _action_safe = bool(_ALLOWED_KUBECTL_PATTERN.match(action.strip())) + if "unknown" in action or _re.search(r"[<{][^>}]+[>}]", action) or _target_is_alertname or not _action_safe: logger.warning( "auto_execute_blocked_unresolved_placeholder", incident_id=incident.incident_id, action=action, target=_target, - reason="action 含未解析的 placeholder、unknown、或 target==alertname,拒絕執行", + reason="action 含未解析的 placeholder、unknown、target==alertname、或危險字元,拒絕執行", ) token.state = DecisionState.ERROR token.error = f"Auto-execute blocked: unresolved placeholder in action: {action[:80]}" await self._save_token(token) - asyncio.create_task( + _fire_and_forget( _push_auto_repair_result(incident, action, success=False, error="無法確認 deployment 名稱,請人工確認後手動執行") ) @@ -1057,7 +1068,7 @@ class DecisionManager: ) # 2026-04-09 Claude Sonnet 4.6: 執行成功 → 發 Telegram 結果通知 - asyncio.create_task( + _fire_and_forget( _push_auto_repair_result(incident, action, success=True) ) @@ -1072,10 +1083,10 @@ class DecisionManager: await self._save_token(token) # 2026-04-09 Claude Sonnet 4.6: 執行失敗 → 發 Telegram 失敗通知 + fallback 人工 - asyncio.create_task( + _fire_and_forget( _push_auto_repair_result(incident, action, success=False, error=str(e)) ) - asyncio.create_task( + _fire_and_forget( _push_decision_to_telegram(incident, token.proposal_data) ) @@ -1151,7 +1162,7 @@ class DecisionManager: return playbook_result # MCP Phase 4c: Playbook 無命中 → 非同步產生 AI 草稿 Playbook (2026-04-11 Claude Sonnet 4.6) - asyncio.create_task(_generate_playbook_draft_if_new(incident)) + _fire_and_forget(_generate_playbook_draft_if_new(incident)) # Expert System 同步執行 (立即可用) expert_result = expert_analyze(incident)