fix(security): Code Review P0+P1+P2 全修補 — MCP Phase 2b-3 + decision_manager
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

P0: decision_manager _fetch_metrics_snapshot 參數型別錯誤
  - prom._instant_query(str) → prom._instant_query({"query": str})
  - 結果解析 r.get("status")=="success" → r.get("result", [])

P1: prometheus_provider — alertname PromQL injection 防範
  - 新增 _RE_SAFE_ALERTNAME 白名單正則

P1: decision_manager — kubectl action 危險字元注入防範
  - 新增 _ALLOWED_KUBECTL_PATTERN 白名單,非法指令格式直接拒絕

P1: decision_manager — 6 個 asyncio.create_task() GC 風險
  - 新增 _background_tasks: set + _fire_and_forget() helper
  - 所有 bare create_task 改用 _fire_and_forget

P1: ssh_provider — Group B 寫入工具強制需要 known_hosts
  - known_hosts 未設定或檔案不存在時拒絕執行,防 MITM

P2: sentry_provider — query 語意白名單驗證
  - 新增 _RE_SAFE_SENTRY_QUERY,拒絕含特殊字元的 query

P2: argocd_provider — verify=False 改為 ARGOCD_VERIFY_TLS 環境變數開關
  - 新增 _tls_verify() helper,預設 false(self-signed cert)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-11 20:10:33 +08:00
parent 083b1a5449
commit f3236338a5
5 changed files with 85 additions and 38 deletions

View File

@@ -33,6 +33,12 @@ _HTTP_TIMEOUT = 10.0
_RE_SAFE_APP_NAME = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9._-]{0,62}$')
def _tls_verify() -> bool:
"""P2 fix 2026-04-11: 讀取 ARGOCD_VERIFY_TLS env預設 False因 self-signed cert"""
import os
return os.environ.get("ARGOCD_VERIFY_TLS", "false").lower() in ("true", "1", "yes")
def _validate_app_name(name: str) -> str:
if not _RE_SAFE_APP_NAME.match(name):
raise ValueError(f"Unsafe app name: {name!r}")
@@ -152,7 +158,7 @@ class ArgoCDProvider(MCPToolProvider):
if ns_filter:
params["appNamespace"] = ns_filter
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client:
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client:
resp = await client.get(url, headers=self._headers(), params=params)
resp.raise_for_status()
data = resp.json()
@@ -183,7 +189,7 @@ class ArgoCDProvider(MCPToolProvider):
app_name = _validate_app_name(parameters["app_name"])
url = f"{self._base_url()}/api/v1/applications/{app_name}"
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client:
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client:
resp = await client.get(url, headers=self._headers())
resp.raise_for_status()
app = resp.json()
@@ -229,7 +235,7 @@ class ArgoCDProvider(MCPToolProvider):
# ArgoCD history endpoint
history_url = f"{self._base_url()}/api/v1/applications/{app_name}"
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client:
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client:
resp = await client.get(history_url, headers=self._headers())
resp.raise_for_status()
app = resp.json()
@@ -255,7 +261,7 @@ class ArgoCDProvider(MCPToolProvider):
async def health_check(self) -> bool:
try:
url = f"{self._base_url()}/api/v1/applicationsets"
async with httpx.AsyncClient(timeout=5.0, verify=False) as client:
async with httpx.AsyncClient(timeout=5.0, verify=_tls_verify()) as client:
resp = await client.get(url, headers=self._headers())
return resp.status_code < 500
except Exception:

View File

@@ -19,6 +19,7 @@ Prometheus MCP Tool Provider — MCP Phase 2b
@see docs/superpowers/specs/2026-04-10-infra-rebuild-sprint-abc-design.md §MCP-2b
"""
import re
import uuid
from datetime import UTC, datetime, timedelta
from typing import Any
@@ -26,6 +27,9 @@ from typing import Any
import httpx
import structlog
# P1 fix 2026-04-11: alertname 白名單防止 PromQL label injection
_RE_SAFE_ALERTNAME = re.compile(r"^[a-zA-Z0-9_-]{1,64}$")
from src.plugins.mcp.interfaces import MCPTool, MCPToolProvider, MCPToolResult
logger = structlog.get_logger(__name__)
@@ -248,6 +252,9 @@ class PrometheusProvider(MCPToolProvider):
async def _alert_history(self, params: dict) -> dict:
alertname = params["alertname"]
# P1 fix 2026-04-11: 白名單驗證防止 PromQL label injection
if not _RE_SAFE_ALERTNAME.match(alertname):
raise ValueError(f"Unsafe alertname: {alertname!r}")
window_hours = int(params.get("window_hours", 24))
limit = int(params.get("limit", 20))

View File

@@ -31,6 +31,11 @@ logger = structlog.get_logger(__name__)
_HTTP_TIMEOUT = 10.0
_RE_SAFE_ISSUE_ID = re.compile(r'^\d{1,20}$')
# P2 fix 2026-04-11: Sentry query 語意白名單 — 只允許已知安全的 Sentry 搜尋語法
# 允許: 純文字識別字、is:unresolved、level:error、project:xxx、assigned:me 等
_RE_SAFE_SENTRY_QUERY = re.compile(
r'^[\w\s:.\-/]+$' # alphanumeric, space, colon, dot, dash, slash — Sentry filter 所需
)
class SentryProvider(MCPToolProvider):
@@ -222,7 +227,10 @@ class SentryProvider(MCPToolProvider):
return MCPToolResult(success=True, data=result)
async def _search_issues(self, parameters: dict) -> MCPToolResult:
query = str(parameters["query"])[:200] # 限制長度防止注入
query = str(parameters["query"])[:200]
# P2 fix 2026-04-11: 語意白名單驗證,拒絕含特殊字元的 query
if not _RE_SAFE_SENTRY_QUERY.match(query):
return MCPToolResult(success=False, error=f"Unsafe sentry query: {query!r}")
limit = max(1, min(int(parameters.get("limit", 10)), 25))
url = f"{self._base_url()}/api/0/projects/{self._org()}/{self._project()}/issues/"

View File

@@ -360,6 +360,21 @@ class SSHProvider(MCPToolProvider):
),
)
# P1 fix 2026-04-11: 群組 B 寫入工具必須有 known_hosts否則拒絕執行
# 防止 MITM — 讀取工具Group A允許 known_hosts=None 方便診斷;寫入操作不行
import os as _os
if tool_name in GROUP_B_TOOLS:
_kh = _os.environ.get("SSH_MCP_KNOWN_HOSTS_FILE")
if not _kh or not _os.path.exists(_kh):
return MCPToolResult(
success=False,
execution_id=execution_id,
error=(
"Group B write tool refused: SSH_MCP_KNOWN_HOSTS_FILE not set or missing. "
"Set up known_hosts per docs/runbooks/ssh-mcp-setup.md before write operations."
),
)
# 執行
try:
is_group_b = tool_name in GROUP_B_TOOLS

View File

@@ -41,6 +41,23 @@ logger = structlog.get_logger(__name__)
# Phase 7.5: Playbook 優先閾值
PLAYBOOK_SIMILARITY_THRESHOLD = 0.85 # 相似度 >= 85% 直接使用 Playbook
# P1 fix 2026-04-11: background task GC guard — keep strong refs until done
_background_tasks: set[asyncio.Task] = set()
def _fire_and_forget(coro) -> asyncio.Task:
"""Create a background task with GC protection via _background_tasks."""
task = asyncio.create_task(coro)
_background_tasks.add(task)
task.add_done_callback(_background_tasks.discard)
return task
# P1 fix 2026-04-11: kubectl action dangerous char whitelist
import re as _re_module
_ALLOWED_KUBECTL_PATTERN = _re_module.compile(
r"^kubectl\s+(rollout restart|rollout undo|scale|delete pod|get|describe|logs)"
r"\s+[a-zA-Z0-9_./-]+(\s+(-n|--namespace)\s+[a-zA-Z0-9_-]+)?$"
)
# =============================================================================
# Phase 31 (ADR-067 2026-04-10): Log 異常摘要 — NemoTron deepseek-r1:14b
@@ -238,13 +255,13 @@ async def _push_decision_to_telegram(
# Phase 31 (ADR-067 2026-04-10): Log 異常摘要 — NemoTron deepseek-r1:14b
# 非同步執行,不阻塞主流程
asyncio.create_task(_send_log_summary(incident))
_fire_and_forget(_send_log_summary(incident))
# MCP Phase 4a: NemoClaw second opinion (2026-04-11 Claude Sonnet 4.6)
# 若 proposal_data 有 advisory_note用 NemoClaw bot 身分追加一條訊息
_advisory_note = proposal_data.get("advisory_note", "")
if _advisory_note:
asyncio.create_task(
_fire_and_forget(
gateway.send_as_nemotron(
f"🤔 <b>NemoClaw 第二意見</b> (信心={confidence:.2f})\n"
f"<i>{_advisory_note}</i>"
@@ -446,37 +463,29 @@ async def _fetch_metrics_snapshot(incident: Incident) -> dict:
if alertname in ("HostHighCpuLoad", "HostOutOfMemory"):
if instance:
host = instance.split(":")[0]
r = await prom._instant_query(
f'100 - (avg by(instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{host}.*"}}[5m])) * 100)'
)
if r.get("status") == "success":
for item in r.get("data", {}).get("result", []):
snapshots["cpu_pct"] = round(float(item["value"][1]), 1)
r2 = await prom._instant_query(
# P0 fix 2026-04-11: _instant_query 要求 dict回傳 {"result": [...]}
r = await prom._instant_query({"query": f'100 - (avg by(instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{host}.*"}}[5m])) * 100)'})
for item in r.get("result", []):
snapshots["cpu_pct"] = round(float(item["value"][1]), 1)
cpu_query = (
f'(1 - (node_memory_MemAvailable_bytes{{instance=~"{instance}"}} / node_memory_MemTotal_bytes{{instance=~"{instance}"}})) * 100'
if instance else "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)"
)
if r2.get("status") == "success":
for item in r2.get("data", {}).get("result", []):
snapshots["mem_pct"] = round(float(item["value"][1]), 1)
r2 = await prom._instant_query({"query": cpu_query})
for item in r2.get("result", []):
snapshots["mem_pct"] = round(float(item["value"][1]), 1)
elif alertname == "HostOutOfDiskSpace":
r = await prom._instant_query(
'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))'
)
if r.get("status") == "success":
for item in r.get("data", {}).get("result", []):
snapshots["disk_pct"] = round(float(item["value"][1]), 1)
r = await prom._instant_query({"query": 'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))'})
for item in r.get("result", []):
snapshots["disk_pct"] = round(float(item["value"][1]), 1)
elif alertname in ("PodRestartingTooMuch", "PodCrashLoopBackOff"):
pod = labels.get("pod", labels.get("component", ""))
if pod:
r = await prom._instant_query(
f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod}.*"}})'
)
if r.get("status") == "success":
for item in r.get("data", {}).get("result", []):
snapshots["restart_count"] = int(float(item["value"][1]))
r = await prom._instant_query({"query": f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod}.*"}})'})
for item in r.get("result", []):
snapshots["restart_count"] = int(float(item["value"][1]))
return snapshots
except Exception as _e:
@@ -953,12 +962,12 @@ class DecisionManager:
await self._save_token(token)
# 觸發自動執行 (非阻塞)
asyncio.create_task(
_fire_and_forget(
self._auto_execute(incident, token)
)
else:
# 需人工審核: 推送到 Telegram
asyncio.create_task(
_fire_and_forget(
_push_decision_to_telegram(incident, token.proposal_data)
)
@@ -989,18 +998,20 @@ class DecisionManager:
# 另外:若 target 等於 alertname代表 LLM 把告警名稱填入 deployment_name也拒絕
_alertname = incident.signals[0].labels.get("alertname", "") if incident.signals else ""
_target_is_alertname = bool(_alertname and _target == _alertname)
if "unknown" in action or _re.search(r"[<{][^>}]+[>}]", action) or _target_is_alertname:
# P1 fix 2026-04-11: kubectl action 危險字元白名單 — 防止 && || ; > | 注入
_action_safe = bool(_ALLOWED_KUBECTL_PATTERN.match(action.strip()))
if "unknown" in action or _re.search(r"[<{][^>}]+[>}]", action) or _target_is_alertname or not _action_safe:
logger.warning(
"auto_execute_blocked_unresolved_placeholder",
incident_id=incident.incident_id,
action=action,
target=_target,
reason="action 含未解析的 placeholder、unknown、target==alertname拒絕執行",
reason="action 含未解析的 placeholder、unknown、target==alertname、或危險字元,拒絕執行",
)
token.state = DecisionState.ERROR
token.error = f"Auto-execute blocked: unresolved placeholder in action: {action[:80]}"
await self._save_token(token)
asyncio.create_task(
_fire_and_forget(
_push_auto_repair_result(incident, action, success=False,
error="無法確認 deployment 名稱,請人工確認後手動執行")
)
@@ -1057,7 +1068,7 @@ class DecisionManager:
)
# 2026-04-09 Claude Sonnet 4.6: 執行成功 → 發 Telegram 結果通知
asyncio.create_task(
_fire_and_forget(
_push_auto_repair_result(incident, action, success=True)
)
@@ -1072,10 +1083,10 @@ class DecisionManager:
await self._save_token(token)
# 2026-04-09 Claude Sonnet 4.6: 執行失敗 → 發 Telegram 失敗通知 + fallback 人工
asyncio.create_task(
_fire_and_forget(
_push_auto_repair_result(incident, action, success=False, error=str(e))
)
asyncio.create_task(
_fire_and_forget(
_push_decision_to_telegram(incident, token.proposal_data)
)
@@ -1151,7 +1162,7 @@ class DecisionManager:
return playbook_result
# MCP Phase 4c: Playbook 無命中 → 非同步產生 AI 草稿 Playbook (2026-04-11 Claude Sonnet 4.6)
asyncio.create_task(_generate_playbook_draft_if_new(incident))
_fire_and_forget(_generate_playbook_draft_if_new(incident))
# Expert System 同步執行 (立即可用)
expert_result = expert_analyze(incident)