fix(security): Code Review P0+P1+P2 全修補 — MCP Phase 2b-3 + decision_manager
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
P0: decision_manager _fetch_metrics_snapshot 參數型別錯誤
- prom._instant_query(str) → prom._instant_query({"query": str})
- 結果解析 r.get("status")=="success" → r.get("result", [])
P1: prometheus_provider — alertname PromQL injection 防範
- 新增 _RE_SAFE_ALERTNAME 白名單正則
P1: decision_manager — kubectl action 危險字元注入防範
- 新增 _ALLOWED_KUBECTL_PATTERN 白名單,非法指令格式直接拒絕
P1: decision_manager — 6 個 asyncio.create_task() GC 風險
- 新增 _background_tasks: set + _fire_and_forget() helper
- 所有 bare create_task 改用 _fire_and_forget
P1: ssh_provider — Group B 寫入工具強制需要 known_hosts
- known_hosts 未設定或檔案不存在時拒絕執行,防 MITM
P2: sentry_provider — query 語意白名單驗證
- 新增 _RE_SAFE_SENTRY_QUERY,拒絕含特殊字元的 query
P2: argocd_provider — verify=False 改為 ARGOCD_VERIFY_TLS 環境變數開關
- 新增 _tls_verify() helper,預設 false(self-signed cert)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -33,6 +33,12 @@ _HTTP_TIMEOUT = 10.0
|
||||
_RE_SAFE_APP_NAME = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9._-]{0,62}$')
|
||||
|
||||
|
||||
def _tls_verify() -> bool:
|
||||
"""P2 fix 2026-04-11: 讀取 ARGOCD_VERIFY_TLS env,預設 False(因 self-signed cert)。"""
|
||||
import os
|
||||
return os.environ.get("ARGOCD_VERIFY_TLS", "false").lower() in ("true", "1", "yes")
|
||||
|
||||
|
||||
def _validate_app_name(name: str) -> str:
|
||||
if not _RE_SAFE_APP_NAME.match(name):
|
||||
raise ValueError(f"Unsafe app name: {name!r}")
|
||||
@@ -152,7 +158,7 @@ class ArgoCDProvider(MCPToolProvider):
|
||||
if ns_filter:
|
||||
params["appNamespace"] = ns_filter
|
||||
|
||||
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client:
|
||||
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client:
|
||||
resp = await client.get(url, headers=self._headers(), params=params)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
@@ -183,7 +189,7 @@ class ArgoCDProvider(MCPToolProvider):
|
||||
app_name = _validate_app_name(parameters["app_name"])
|
||||
url = f"{self._base_url()}/api/v1/applications/{app_name}"
|
||||
|
||||
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client:
|
||||
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client:
|
||||
resp = await client.get(url, headers=self._headers())
|
||||
resp.raise_for_status()
|
||||
app = resp.json()
|
||||
@@ -229,7 +235,7 @@ class ArgoCDProvider(MCPToolProvider):
|
||||
|
||||
# ArgoCD history endpoint
|
||||
history_url = f"{self._base_url()}/api/v1/applications/{app_name}"
|
||||
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=False) as client:
|
||||
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT, verify=_tls_verify()) as client:
|
||||
resp = await client.get(history_url, headers=self._headers())
|
||||
resp.raise_for_status()
|
||||
app = resp.json()
|
||||
@@ -255,7 +261,7 @@ class ArgoCDProvider(MCPToolProvider):
|
||||
async def health_check(self) -> bool:
|
||||
try:
|
||||
url = f"{self._base_url()}/api/v1/applicationsets"
|
||||
async with httpx.AsyncClient(timeout=5.0, verify=False) as client:
|
||||
async with httpx.AsyncClient(timeout=5.0, verify=_tls_verify()) as client:
|
||||
resp = await client.get(url, headers=self._headers())
|
||||
return resp.status_code < 500
|
||||
except Exception:
|
||||
|
||||
@@ -19,6 +19,7 @@ Prometheus MCP Tool Provider — MCP Phase 2b
|
||||
@see docs/superpowers/specs/2026-04-10-infra-rebuild-sprint-abc-design.md §MCP-2b
|
||||
"""
|
||||
|
||||
import re
|
||||
import uuid
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from typing import Any
|
||||
@@ -26,6 +27,9 @@ from typing import Any
|
||||
import httpx
|
||||
import structlog
|
||||
|
||||
# P1 fix 2026-04-11: alertname 白名單防止 PromQL label injection
|
||||
_RE_SAFE_ALERTNAME = re.compile(r"^[a-zA-Z0-9_-]{1,64}$")
|
||||
|
||||
from src.plugins.mcp.interfaces import MCPTool, MCPToolProvider, MCPToolResult
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
@@ -248,6 +252,9 @@ class PrometheusProvider(MCPToolProvider):
|
||||
|
||||
async def _alert_history(self, params: dict) -> dict:
|
||||
alertname = params["alertname"]
|
||||
# P1 fix 2026-04-11: 白名單驗證防止 PromQL label injection
|
||||
if not _RE_SAFE_ALERTNAME.match(alertname):
|
||||
raise ValueError(f"Unsafe alertname: {alertname!r}")
|
||||
window_hours = int(params.get("window_hours", 24))
|
||||
limit = int(params.get("limit", 20))
|
||||
|
||||
|
||||
@@ -31,6 +31,11 @@ logger = structlog.get_logger(__name__)
|
||||
|
||||
_HTTP_TIMEOUT = 10.0
|
||||
_RE_SAFE_ISSUE_ID = re.compile(r'^\d{1,20}$')
|
||||
# P2 fix 2026-04-11: Sentry query 語意白名單 — 只允許已知安全的 Sentry 搜尋語法
|
||||
# 允許: 純文字識別字、is:unresolved、level:error、project:xxx、assigned:me 等
|
||||
_RE_SAFE_SENTRY_QUERY = re.compile(
|
||||
r'^[\w\s:.\-/]+$' # alphanumeric, space, colon, dot, dash, slash — Sentry filter 所需
|
||||
)
|
||||
|
||||
|
||||
class SentryProvider(MCPToolProvider):
|
||||
@@ -222,7 +227,10 @@ class SentryProvider(MCPToolProvider):
|
||||
return MCPToolResult(success=True, data=result)
|
||||
|
||||
async def _search_issues(self, parameters: dict) -> MCPToolResult:
|
||||
query = str(parameters["query"])[:200] # 限制長度防止注入
|
||||
query = str(parameters["query"])[:200]
|
||||
# P2 fix 2026-04-11: 語意白名單驗證,拒絕含特殊字元的 query
|
||||
if not _RE_SAFE_SENTRY_QUERY.match(query):
|
||||
return MCPToolResult(success=False, error=f"Unsafe sentry query: {query!r}")
|
||||
limit = max(1, min(int(parameters.get("limit", 10)), 25))
|
||||
|
||||
url = f"{self._base_url()}/api/0/projects/{self._org()}/{self._project()}/issues/"
|
||||
|
||||
@@ -360,6 +360,21 @@ class SSHProvider(MCPToolProvider):
|
||||
),
|
||||
)
|
||||
|
||||
# P1 fix 2026-04-11: 群組 B 寫入工具必須有 known_hosts,否則拒絕執行
|
||||
# 防止 MITM — 讀取工具(Group A)允許 known_hosts=None 方便診斷;寫入操作不行
|
||||
import os as _os
|
||||
if tool_name in GROUP_B_TOOLS:
|
||||
_kh = _os.environ.get("SSH_MCP_KNOWN_HOSTS_FILE")
|
||||
if not _kh or not _os.path.exists(_kh):
|
||||
return MCPToolResult(
|
||||
success=False,
|
||||
execution_id=execution_id,
|
||||
error=(
|
||||
"Group B write tool refused: SSH_MCP_KNOWN_HOSTS_FILE not set or missing. "
|
||||
"Set up known_hosts per docs/runbooks/ssh-mcp-setup.md before write operations."
|
||||
),
|
||||
)
|
||||
|
||||
# 執行
|
||||
try:
|
||||
is_group_b = tool_name in GROUP_B_TOOLS
|
||||
|
||||
@@ -41,6 +41,23 @@ logger = structlog.get_logger(__name__)
|
||||
# Phase 7.5: Playbook 優先閾值
|
||||
PLAYBOOK_SIMILARITY_THRESHOLD = 0.85 # 相似度 >= 85% 直接使用 Playbook
|
||||
|
||||
# P1 fix 2026-04-11: background task GC guard — keep strong refs until done
|
||||
_background_tasks: set[asyncio.Task] = set()
|
||||
|
||||
def _fire_and_forget(coro) -> asyncio.Task:
|
||||
"""Create a background task with GC protection via _background_tasks."""
|
||||
task = asyncio.create_task(coro)
|
||||
_background_tasks.add(task)
|
||||
task.add_done_callback(_background_tasks.discard)
|
||||
return task
|
||||
|
||||
# P1 fix 2026-04-11: kubectl action dangerous char whitelist
|
||||
import re as _re_module
|
||||
_ALLOWED_KUBECTL_PATTERN = _re_module.compile(
|
||||
r"^kubectl\s+(rollout restart|rollout undo|scale|delete pod|get|describe|logs)"
|
||||
r"\s+[a-zA-Z0-9_./-]+(\s+(-n|--namespace)\s+[a-zA-Z0-9_-]+)?$"
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Phase 31 (ADR-067 2026-04-10): Log 異常摘要 — NemoTron deepseek-r1:14b
|
||||
@@ -238,13 +255,13 @@ async def _push_decision_to_telegram(
|
||||
|
||||
# Phase 31 (ADR-067 2026-04-10): Log 異常摘要 — NemoTron deepseek-r1:14b
|
||||
# 非同步執行,不阻塞主流程
|
||||
asyncio.create_task(_send_log_summary(incident))
|
||||
_fire_and_forget(_send_log_summary(incident))
|
||||
|
||||
# MCP Phase 4a: NemoClaw second opinion (2026-04-11 Claude Sonnet 4.6)
|
||||
# 若 proposal_data 有 advisory_note,用 NemoClaw bot 身分追加一條訊息
|
||||
_advisory_note = proposal_data.get("advisory_note", "")
|
||||
if _advisory_note:
|
||||
asyncio.create_task(
|
||||
_fire_and_forget(
|
||||
gateway.send_as_nemotron(
|
||||
f"🤔 <b>NemoClaw 第二意見</b> (信心={confidence:.2f})\n"
|
||||
f"<i>{_advisory_note}</i>"
|
||||
@@ -446,37 +463,29 @@ async def _fetch_metrics_snapshot(incident: Incident) -> dict:
|
||||
if alertname in ("HostHighCpuLoad", "HostOutOfMemory"):
|
||||
if instance:
|
||||
host = instance.split(":")[0]
|
||||
r = await prom._instant_query(
|
||||
f'100 - (avg by(instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{host}.*"}}[5m])) * 100)'
|
||||
)
|
||||
if r.get("status") == "success":
|
||||
for item in r.get("data", {}).get("result", []):
|
||||
snapshots["cpu_pct"] = round(float(item["value"][1]), 1)
|
||||
r2 = await prom._instant_query(
|
||||
# P0 fix 2026-04-11: _instant_query 要求 dict,回傳 {"result": [...]}
|
||||
r = await prom._instant_query({"query": f'100 - (avg by(instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{host}.*"}}[5m])) * 100)'})
|
||||
for item in r.get("result", []):
|
||||
snapshots["cpu_pct"] = round(float(item["value"][1]), 1)
|
||||
cpu_query = (
|
||||
f'(1 - (node_memory_MemAvailable_bytes{{instance=~"{instance}"}} / node_memory_MemTotal_bytes{{instance=~"{instance}"}})) * 100'
|
||||
if instance else "100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)"
|
||||
)
|
||||
if r2.get("status") == "success":
|
||||
for item in r2.get("data", {}).get("result", []):
|
||||
snapshots["mem_pct"] = round(float(item["value"][1]), 1)
|
||||
r2 = await prom._instant_query({"query": cpu_query})
|
||||
for item in r2.get("result", []):
|
||||
snapshots["mem_pct"] = round(float(item["value"][1]), 1)
|
||||
|
||||
elif alertname == "HostOutOfDiskSpace":
|
||||
r = await prom._instant_query(
|
||||
'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))'
|
||||
)
|
||||
if r.get("status") == "success":
|
||||
for item in r.get("data", {}).get("result", []):
|
||||
snapshots["disk_pct"] = round(float(item["value"][1]), 1)
|
||||
r = await prom._instant_query({"query": 'max(100 - ((node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100))'})
|
||||
for item in r.get("result", []):
|
||||
snapshots["disk_pct"] = round(float(item["value"][1]), 1)
|
||||
|
||||
elif alertname in ("PodRestartingTooMuch", "PodCrashLoopBackOff"):
|
||||
pod = labels.get("pod", labels.get("component", ""))
|
||||
if pod:
|
||||
r = await prom._instant_query(
|
||||
f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod}.*"}})'
|
||||
)
|
||||
if r.get("status") == "success":
|
||||
for item in r.get("data", {}).get("result", []):
|
||||
snapshots["restart_count"] = int(float(item["value"][1]))
|
||||
r = await prom._instant_query({"query": f'sum(kube_pod_container_status_restarts_total{{namespace="awoooi-prod",pod=~"{pod}.*"}})'})
|
||||
for item in r.get("result", []):
|
||||
snapshots["restart_count"] = int(float(item["value"][1]))
|
||||
|
||||
return snapshots
|
||||
except Exception as _e:
|
||||
@@ -953,12 +962,12 @@ class DecisionManager:
|
||||
await self._save_token(token)
|
||||
|
||||
# 觸發自動執行 (非阻塞)
|
||||
asyncio.create_task(
|
||||
_fire_and_forget(
|
||||
self._auto_execute(incident, token)
|
||||
)
|
||||
else:
|
||||
# 需人工審核: 推送到 Telegram
|
||||
asyncio.create_task(
|
||||
_fire_and_forget(
|
||||
_push_decision_to_telegram(incident, token.proposal_data)
|
||||
)
|
||||
|
||||
@@ -989,18 +998,20 @@ class DecisionManager:
|
||||
# 另外:若 target 等於 alertname,代表 LLM 把告警名稱填入 deployment_name,也拒絕
|
||||
_alertname = incident.signals[0].labels.get("alertname", "") if incident.signals else ""
|
||||
_target_is_alertname = bool(_alertname and _target == _alertname)
|
||||
if "unknown" in action or _re.search(r"[<{][^>}]+[>}]", action) or _target_is_alertname:
|
||||
# P1 fix 2026-04-11: kubectl action 危險字元白名單 — 防止 && || ; > | 注入
|
||||
_action_safe = bool(_ALLOWED_KUBECTL_PATTERN.match(action.strip()))
|
||||
if "unknown" in action or _re.search(r"[<{][^>}]+[>}]", action) or _target_is_alertname or not _action_safe:
|
||||
logger.warning(
|
||||
"auto_execute_blocked_unresolved_placeholder",
|
||||
incident_id=incident.incident_id,
|
||||
action=action,
|
||||
target=_target,
|
||||
reason="action 含未解析的 placeholder、unknown、或 target==alertname,拒絕執行",
|
||||
reason="action 含未解析的 placeholder、unknown、target==alertname、或危險字元,拒絕執行",
|
||||
)
|
||||
token.state = DecisionState.ERROR
|
||||
token.error = f"Auto-execute blocked: unresolved placeholder in action: {action[:80]}"
|
||||
await self._save_token(token)
|
||||
asyncio.create_task(
|
||||
_fire_and_forget(
|
||||
_push_auto_repair_result(incident, action, success=False,
|
||||
error="無法確認 deployment 名稱,請人工確認後手動執行")
|
||||
)
|
||||
@@ -1057,7 +1068,7 @@ class DecisionManager:
|
||||
)
|
||||
|
||||
# 2026-04-09 Claude Sonnet 4.6: 執行成功 → 發 Telegram 結果通知
|
||||
asyncio.create_task(
|
||||
_fire_and_forget(
|
||||
_push_auto_repair_result(incident, action, success=True)
|
||||
)
|
||||
|
||||
@@ -1072,10 +1083,10 @@ class DecisionManager:
|
||||
await self._save_token(token)
|
||||
|
||||
# 2026-04-09 Claude Sonnet 4.6: 執行失敗 → 發 Telegram 失敗通知 + fallback 人工
|
||||
asyncio.create_task(
|
||||
_fire_and_forget(
|
||||
_push_auto_repair_result(incident, action, success=False, error=str(e))
|
||||
)
|
||||
asyncio.create_task(
|
||||
_fire_and_forget(
|
||||
_push_decision_to_telegram(incident, token.proposal_data)
|
||||
)
|
||||
|
||||
@@ -1151,7 +1162,7 @@ class DecisionManager:
|
||||
return playbook_result
|
||||
|
||||
# MCP Phase 4c: Playbook 無命中 → 非同步產生 AI 草稿 Playbook (2026-04-11 Claude Sonnet 4.6)
|
||||
asyncio.create_task(_generate_playbook_draft_if_new(incident))
|
||||
_fire_and_forget(_generate_playbook_draft_if_new(incident))
|
||||
|
||||
# Expert System 同步執行 (立即可用)
|
||||
expert_result = expert_analyze(incident)
|
||||
|
||||
Reference in New Issue
Block a user