diff --git a/apps/api/src/plugins/mcp/providers/argocd_provider.py b/apps/api/src/plugins/mcp/providers/argocd_provider.py
index 9bb6495b..ca800469 100644
--- a/apps/api/src/plugins/mcp/providers/argocd_provider.py
+++ b/apps/api/src/plugins/mcp/providers/argocd_provider.py
@@ -33,6 +33,12 @@ _HTTP_TIMEOUT = 10.0
_RE_SAFE_APP_NAME = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9._-]{0,62}$')
+def _tls_verify() -> bool:
+ """P2 fix 2026-04-11: 讀取 ARGOCD_VERIFY_TLS env,預設 False(因 self-signed cert)。"""
+ import os
+ return os.environ.get("ARGOCD_VERIFY_TLS", "false").lower() in ("true", "1", "yes")
+
+
def _validate_app_name(name: str) -> str:
if not _RE_SAFE_APP_NAME.match(name):
raise ValueError(f"Unsafe app name: {name!r}")
@@ -152,7 +158,7 @@ class ArgoCDProvider(MCPToolProvider):
if ns_filter:
params["appNamespace"] = ns_filter
- async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client:
+ async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client:
resp = await client.get(url, headers=self._headers(), params=params)
resp.raise_for_status()
data = resp.json()
@@ -183,7 +189,7 @@ class ArgoCDProvider(MCPToolProvider):
app_name = _validate_app_name(parameters["app_name"])
url = f"{self._base_url()}/api/v1/applications/{app_name}"
- async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client:
+ async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client:
resp = await client.get(url, headers=self._headers())
resp.raise_for_status()
app = resp.json()
@@ -229,7 +235,7 @@ class ArgoCDProvider(MCPToolProvider):
# ArgoCD history endpoint
history_url = f"{self._base_url()}/api/v1/applications/{app_name}"
- async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client:
+ async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client:
resp = await client.get(history_url, headers=self._headers())
resp.raise_for_status()
app = resp.json()
@@ -255,7 +261,7 @@ class ArgoCDProvider(MCPToolProvider):
async def health_check(self) -> bool:
try:
url = f"{self._base_url()}/api/v1/applicationsets"
- async with httpx.AsyncClient(timeout=5.0, verify=False) as client:
+ async with httpx.AsyncClient(timeout=5.0, verify=_tls_verify()) as client:
resp = await client.get(url, headers=self._headers())
return resp.status_code < 500
except Exception:
diff --git a/apps/api/src/plugins/mcp/providers/prometheus_provider.py b/apps/api/src/plugins/mcp/providers/prometheus_provider.py
index ad28a18d..e2636a92 100644
--- a/apps/api/src/plugins/mcp/providers/prometheus_provider.py
+++ b/apps/api/src/plugins/mcp/providers/prometheus_provider.py
@@ -19,6 +19,7 @@ Prometheus MCP Tool Provider — MCP Phase 2b
@see docs/superpowers/specs/2026-04-10-infra-rebuild-sprint-abc-design.md §MCP-2b
"""
+import re
import uuid
from datetime import UTC, datetime, timedelta
from typing import Any
@@ -26,6 +27,9 @@ from typing import Any
import httpx
import structlog
+# P1 fix 2026-04-11: alertname 白名單防止 PromQL label injection
+_RE_SAFE_ALERTNAME = re.compile(r"^[a-zA-Z0-9_-]{1,64}$")
+
from src.plugins.mcp.interfaces import MCPTool, MCPToolProvider, MCPToolResult
logger = structlog.get_logger(__name__)
@@ -248,6 +252,9 @@ class PrometheusProvider(MCPToolProvider):
async def _alert_history(self, params: dict) -> dict:
alertname = params["alertname"]
+ # P1 fix 2026-04-11: 白名單驗證防止 PromQL label injection
+ if not _RE_SAFE_ALERTNAME.match(alertname):
+ raise ValueError(f"Unsafe alertname: {alertname!r}")
window_hours = int(params.get("window_hours", 24))
limit = int(params.get("limit", 20))
diff --git a/apps/api/src/plugins/mcp/providers/sentry_provider.py b/apps/api/src/plugins/mcp/providers/sentry_provider.py
index 4edf3182..21535ad8 100644
--- a/apps/api/src/plugins/mcp/providers/sentry_provider.py
+++ b/apps/api/src/plugins/mcp/providers/sentry_provider.py
@@ -31,6 +31,11 @@ logger = structlog.get_logger(__name__)
_HTTP_TIMEOUT = 10.0
_RE_SAFE_ISSUE_ID = re.compile(r'^\d{1,20}$')
+# P2 fix 2026-04-11: Sentry query 語意白名單 — 只允許已知安全的 Sentry 搜尋語法
+# 允許: 純文字識別字、is:unresolved、level:error、project:xxx、assigned:me 等
+_RE_SAFE_SENTRY_QUERY = re.compile(
+ r'^[\w\s:.\-/]+$' # alphanumeric, space, colon, dot, dash, slash — Sentry filter 所需
+)
class SentryProvider(MCPToolProvider):
@@ -222,7 +227,10 @@ class SentryProvider(MCPToolProvider):
return MCPToolResult(success=True, data=result)
async def _search_issues(self, parameters: dict) -> MCPToolResult:
- query = str(parameters["query"])[:200] # 限制長度防止注入
+ query = str(parameters["query"])[:200]
+ # P2 fix 2026-04-11: 語意白名單驗證,拒絕含特殊字元的 query
+ if not _RE_SAFE_SENTRY_QUERY.match(query):
+ return MCPToolResult(success=False, error=f"Unsafe sentry query: {query!r}")
limit = max(1, min(int(parameters.get("limit", 10)), 25))
url = f"{self._base_url()}/api/0/projects/{self._org()}/{self._project()}/issues/"
diff --git a/apps/api/src/plugins/mcp/providers/ssh_provider.py b/apps/api/src/plugins/mcp/providers/ssh_provider.py
index bfe3b9a3..2bcab0e4 100644
--- a/apps/api/src/plugins/mcp/providers/ssh_provider.py
+++ b/apps/api/src/plugins/mcp/providers/ssh_provider.py
@@ -360,6 +360,21 @@ class SSHProvider(MCPToolProvider):
),
)
+ # P1 fix 2026-04-11: 群組 B 寫入工具必須有 known_hosts,否則拒絕執行
+ # 防止 MITM — 讀取工具(Group A)允許 known_hosts=None 方便診斷;寫入操作不行
+ import os as _os
+ if tool_name in GROUP_B_TOOLS:
+ _kh = _os.environ.get("SSH_MCP_KNOWN_HOSTS_FILE")
+ if not _kh or not _os.path.exists(_kh):
+ return MCPToolResult(
+ success=False,
+ execution_id=execution_id,
+ error=(
+ "Group B write tool refused: SSH_MCP_KNOWN_HOSTS_FILE not set or missing. "
+ "Set up known_hosts per docs/runbooks/ssh-mcp-setup.md before write operations."
+ ),
+ )
+
# 執行
try:
is_group_b = tool_name in GROUP_B_TOOLS
diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py
index aaa302ef..a3b9617c 100644
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -41,6 +41,23 @@ logger = structlog.get_logger(__name__)
# Phase 7.5: Playbook 優先閾值
PLAYBOOK_SIMILARITY_THRESHOLD = 0.85 # 相似度 >= 85% 直接使用 Playbook
+# P1 fix 2026-04-11: background task GC guard — keep strong refs until done
+_background_tasks: set[asyncio.Task] = set()
+
+def _fire_and_forget(coro) -> asyncio.Task:
+ """Create a background task with GC protection via _background_tasks."""
+ task = asyncio.create_task(coro)
+ _background_tasks.add(task)
+ task.add_done_callback(_background_tasks.discard)
+ return task
+
+# P1 fix 2026-04-11: kubectl action dangerous char whitelist
+import re as _re_module
+_ALLOWED_KUBECTL_PATTERN = _re_module.compile(
+ r"^kubectl\s+(rollout restart|rollout undo|scale|delete pod|get|describe|logs)"
+ r"\s+[a-zA-Z0-9_./-]+(\s+(-n|--namespace)\s+[a-zA-Z0-9_-]+)?$"
+)
+
# =============================================================================
# Phase 31 (ADR-067 2026-04-10): Log 異常摘要 — NemoTron deepseek-r1:14b
@@ -238,13 +255,13 @@ async def _push_decision_to_telegram(
# Phase 31 (ADR-067 2026-04-10): Log 異常摘要 — NemoTron deepseek-r1:14b
# 非同步執行,不阻塞主流程
- asyncio.create_task(_send_log_summary(incident))
+ _fire_and_forget(_send_log_summary(incident))
# MCP Phase 4a: NemoClaw second opinion (2026-04-11 Claude Sonnet 4.6)
# 若 proposal_data 有 advisory_note,用 NemoClaw bot 身分追加一條訊息
_advisory_note = proposal_data.get("advisory_note", "")
if _advisory_note:
- asyncio.create_task(
+ _fire_and_forget(
gateway.send_as_nemotron(
f"🤔 NemoClaw 第二意見 (信心={confidence:.2f})\n"
f"{_advisory_note}"
@@ -446,37 +463,29 @@ async def _fetch_metrics_snapshot(incident: Incident) -> dict:
if alertname in ("HostHighCpuLoad", "HostOutOfMemory"):
if instance:
host = instance.split(":")[0]
- r = await prom._instant_query(
- f'100 - (avg by(instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{host}.*"}}[5m])) * 100)'
- )
- if r.get("status") == "success":
- for item in r.get("data", {}).get("result", []):
- snapshots["cpu_pct"] = round(float(item["value"][1]), 1)
- r2 = await prom._instant_query(
+ # P0 fix 2026-04-11: _instant_query 要求 dict,回傳 {"result": [...]}
+ r = await prom._instant_query({"query": f'100 - (avg by(instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{host}.*"}}[5m])) * 100)'})
+ for item in r.get("result", []):
+ snapshots["cpu_pct"] = round(float(item["value"][1]), 1)
+ cpu_query = (
f'(1 - (node_memory_MemAvailable_bytes{{instance=~"{instance}"}} / node_memory_MemTotal_bytes{{instance=~"{instance}"}})) * 100'
if instance else "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)"
)
- if r2.get("status") == "success":
- for item in r2.get("data", {}).get("result", []):
- snapshots["mem_pct"] = round(float(item["value"][1]), 1)
+ r2 = await prom._instant_query({"query": cpu_query})
+ for item in r2.get("result", []):
+ snapshots["mem_pct"] = round(float(item["value"][1]), 1)
elif alertname == "HostOutOfDiskSpace":
- r = await prom._instant_query(
- 'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))'
- )
- if r.get("status") == "success":
- for item in r.get("data", {}).get("result", []):
- snapshots["disk_pct"] = round(float(item["value"][1]), 1)
+ r = await prom._instant_query({"query": 'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))'})
+ for item in r.get("result", []):
+ snapshots["disk_pct"] = round(float(item["value"][1]), 1)
elif alertname in ("PodRestartingTooMuch", "PodCrashLoopBackOff"):
pod = labels.get("pod", labels.get("component", ""))
if pod:
- r = await prom._instant_query(
- f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod}.*"}})'
- )
- if r.get("status") == "success":
- for item in r.get("data", {}).get("result", []):
- snapshots["restart_count"] = int(float(item["value"][1]))
+ r = await prom._instant_query({"query": f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod}.*"}})'})
+ for item in r.get("result", []):
+ snapshots["restart_count"] = int(float(item["value"][1]))
return snapshots
except Exception as _e:
@@ -953,12 +962,12 @@ class DecisionManager:
await self._save_token(token)
# 觸發自動執行 (非阻塞)
- asyncio.create_task(
+ _fire_and_forget(
self._auto_execute(incident, token)
)
else:
# 需人工審核: 推送到 Telegram
- asyncio.create_task(
+ _fire_and_forget(
_push_decision_to_telegram(incident, token.proposal_data)
)
@@ -989,18 +998,20 @@ class DecisionManager:
# 另外:若 target 等於 alertname,代表 LLM 把告警名稱填入 deployment_name,也拒絕
_alertname = incident.signals[0].labels.get("alertname", "") if incident.signals else ""
_target_is_alertname = bool(_alertname and _target == _alertname)
- if "unknown" in action or _re.search(r"[<{][^>}]+[>}]", action) or _target_is_alertname:
+ # P1 fix 2026-04-11: kubectl action 危險字元白名單 — 防止 && || ; > | 注入
+ _action_safe = bool(_ALLOWED_KUBECTL_PATTERN.match(action.strip()))
+ if "unknown" in action or _re.search(r"[<{][^>}]+[>}]", action) or _target_is_alertname or not _action_safe:
logger.warning(
"auto_execute_blocked_unresolved_placeholder",
incident_id=incident.incident_id,
action=action,
target=_target,
- reason="action 含未解析的 placeholder、unknown、或 target==alertname,拒絕執行",
+ reason="action 含未解析的 placeholder、unknown、target==alertname、或危險字元,拒絕執行",
)
token.state = DecisionState.ERROR
token.error = f"Auto-execute blocked: unresolved placeholder in action: {action[:80]}"
await self._save_token(token)
- asyncio.create_task(
+ _fire_and_forget(
_push_auto_repair_result(incident, action, success=False,
error="無法確認 deployment 名稱,請人工確認後手動執行")
)
@@ -1057,7 +1068,7 @@ class DecisionManager:
)
# 2026-04-09 Claude Sonnet 4.6: 執行成功 → 發 Telegram 結果通知
- asyncio.create_task(
+ _fire_and_forget(
_push_auto_repair_result(incident, action, success=True)
)
@@ -1072,10 +1083,10 @@ class DecisionManager:
await self._save_token(token)
# 2026-04-09 Claude Sonnet 4.6: 執行失敗 → 發 Telegram 失敗通知 + fallback 人工
- asyncio.create_task(
+ _fire_and_forget(
_push_auto_repair_result(incident, action, success=False, error=str(e))
)
- asyncio.create_task(
+ _fire_and_forget(
_push_decision_to_telegram(incident, token.proposal_data)
)
@@ -1151,7 +1162,7 @@ class DecisionManager:
return playbook_result
# MCP Phase 4c: Playbook 無命中 → 非同步產生 AI 草稿 Playbook (2026-04-11 Claude Sonnet 4.6)
- asyncio.create_task(_generate_playbook_draft_if_new(incident))
+ _fire_and_forget(_generate_playbook_draft_if_new(incident))
# Expert System 同步執行 (立即可用)
expert_result = expert_analyze(incident)