Adds the Group B SSH MCP tool ssh_docker_prune (image + volume + builder prune, gated on ≥75% disk usage) and routes "docker prune" actions through it. Flips HostDiskUsageHigh from auto_repair=false to true, with mcp_provider routing labels, so the flywheel can self-heal on the next disk-full event without hitting the emergency_channel Telegram path. Trigger: the 2026-05-01 → 05-02 Telegram alert storm (peak 53/hr), caused by an empty ssh-mcp-key/known_hosts secret that rejected all SSH and forced every disk-full alert through the "Host key is not trusted → escalate" loop. known_hosts was patched live; this commit closes the playbook gap so the next occurrence resolves without manual intervention. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
137 lines
5.1 KiB
Python
137 lines
5.1 KiB
Python
"""
Routing tests for DecisionManager._ssh_execute "docker prune" actions
=====================================================================
ADR-068 flywheel — disk full SOP (2026-05-02 ogt + Claude Sonnet 4.6)

Verifies that when an LLM-proposed action contains "docker prune", it is routed
to the ssh_provider's ssh_docker_prune tool, carrying trust_score=0.85 and
host=instance label.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from types import SimpleNamespace
|
||
from unittest.mock import AsyncMock, MagicMock, patch
|
||
|
||
import pytest
|
||
|
||
from src.plugins.mcp.interfaces import MCPToolResult
|
||
from src.services.decision_manager import DecisionManager
|
||
|
||
|
||
def _fake_incident(host: str = "192.168.0.110") -> SimpleNamespace:
    """Build a minimal Incident stub with only the fields ``_ssh_execute`` uses.

    Args:
        host: Value stored under the ``instance`` label of the stub's single signal.

    Returns:
        A ``SimpleNamespace`` exposing ``incident_id`` and a one-element
        ``signals`` list whose entry carries ``labels={"instance": host}``.
    """
    signal = SimpleNamespace(labels={"instance": host})
    return SimpleNamespace(
        incident_id="INC-TEST-PRUNE",
        signals=[signal],
    )
|
||
|
||
|
||
def _fake_token() -> SimpleNamespace:
    """Build a minimal DecisionToken stub.

    Returns:
        A ``SimpleNamespace`` with ``state=None``, an empty ``proposal_data``
        dict (a fresh dict on every call) and ``error=None``.
    """
    return SimpleNamespace(
        state=None,
        proposal_data={},
        error=None,
    )
|
||
|
||
|
||
@pytest.fixture
def manager(monkeypatch):
    """Provide a ``DecisionManager`` without instantiating heavy dependencies.

    The original note lists openclaw / redis / knowledge_service as the heavy
    dependencies being avoided. Construction happens under ``patch`` for the
    openclaw getter, the knowledge-service getter and the K8s / SSH providers;
    afterwards the SSH client, token persistence and the module-level
    notification helpers are replaced with stubs.

    Args:
        monkeypatch: pytest's built-in fixture, used for the module-attribute
            stubs and the ``SSH_MCP_ALLOWED_HOSTS`` environment variable.

    Returns:
        The stubbed ``DecisionManager``; ``mgr._ssh.execute`` is an
        ``AsyncMock`` resolving to a successful ``MCPToolResult``.
    """
    # get_knowledge_service is imported inside __init__, so patch it on its
    # original module rather than on decision_manager.
    with patch("src.services.decision_manager.get_openclaw"), \
         patch("src.services.knowledge_service.get_knowledge_service"), \
         patch("src.plugins.mcp.providers.k8s_provider.K8sProvider"), \
         patch("src.plugins.mcp.providers.ssh_provider.SSHProvider"):
        mgr = DecisionManager()

    # Intercept SSH execute, token persistence and the telegram push with mocks.
    mgr._ssh = MagicMock()
    mgr._ssh.execute = AsyncMock(
        return_value=MCPToolResult(success=True, execution_id="exec-1", output={"stdout": "pruned"})
    )
    mgr._save_token = AsyncMock()
    monkeypatch.setattr(
        "src.services.decision_manager._fire_and_forget",
        lambda *a, **k: None,
    )
    # _fire_and_forget is already a no-op, but *calling* the two push helpers
    # below would still create a coroutine. Replace them with synchronous stubs
    # returning None to avoid "coroutine was never awaited" warnings.
    monkeypatch.setattr(
        "src.services.decision_manager._push_decision_to_telegram",
        lambda *a, **k: None,
    )
    monkeypatch.setattr(
        "src.services.decision_manager._push_auto_repair_result",
        lambda *a, **k: None,
    )
    monkeypatch.setenv("SSH_MCP_ALLOWED_HOSTS", "192.168.0.110,192.168.0.188")
    return mgr
|
||
|
||
|
||
class TestDockerPruneActionRouting:
    """Routing of LLM action strings to the ``ssh_docker_prune`` tool.

    Every test drives ``DecisionManager._ssh_execute`` through the ``manager``
    fixture and inspects the keyword arguments passed to the mocked
    ``manager._ssh.execute``.
    """

    @pytest.mark.asyncio
    async def test_ssh_docker_prune_action_routes_to_tool(self, manager):
        """A full ``ssh <host> 'docker ... prune ...'`` action selects ssh_docker_prune."""
        incident = _fake_incident()
        token = _fake_token()

        await manager._ssh_execute(
            incident=incident,
            token=token,
            action="ssh 192.168.0.110 'docker image prune -a -f && docker volume prune -f'",
            target="root",
        )

        manager._ssh.execute.assert_awaited_once()
        execute_call = manager._ssh.execute.call_args
        assert execute_call.kwargs["tool_name"] == "ssh_docker_prune", (
            f"expected ssh_docker_prune, got {execute_call.kwargs.get('tool_name')}"
        )
        params = execute_call.kwargs["parameters"]
        assert params["host"] == "192.168.0.110"
        # Group B tools must carry trust_score >= 0.8.
        assert params.get("trust_score", 0) >= 0.8

    @pytest.mark.asyncio
    async def test_short_form_docker_prune_routes(self, manager):
        """A terse ``ssh user@host docker prune -a`` action also selects ssh_docker_prune."""
        # Note: the action must still start with "ssh" to hit the existing
        # routing family; per the original note, actions without that prefix
        # take a different branch.
        incident = _fake_incident()
        token = _fake_token()
        await manager._ssh_execute(
            incident=incident,
            token=token,
            action="ssh wooo@192.168.0.110 docker prune -a",
            target="docker",
        )
        # call_args is None when the mock was never called; assert the await
        # first so a missed route fails with a clear assertion message.
        manager._ssh.execute.assert_awaited()
        assert manager._ssh.execute.call_args.kwargs["tool_name"] == "ssh_docker_prune"

    @pytest.mark.asyncio
    async def test_docker_restart_still_routes_to_docker_restart(self, manager):
        """Adding the prune branch must not break the existing docker restart routing."""
        incident = _fake_incident()
        token = _fake_token()
        await manager._ssh_execute(
            incident=incident,
            token=token,
            action="ssh 192.168.0.110 docker restart awoooi-api",
            target="awoooi-api",
        )
        manager._ssh.execute.assert_awaited()
        # Codex changed _tool to the full SSH MCP name (the earlier short name
        # did not line up with ssh_provider).
        assert manager._ssh.execute.call_args.kwargs["tool_name"] == "ssh_docker_restart"

    @pytest.mark.asyncio
    async def test_diagnose_still_routes_to_ssh_diagnose(self, manager):
        """Adding the prune branch must not break the diagnose routing."""
        incident = _fake_incident()
        token = _fake_token()
        await manager._ssh_execute(
            incident=incident,
            token=token,
            action="ssh 192.168.0.110 'df -h && free -h'",
            target="unknown",
        )
        manager._ssh.execute.assert_awaited()
        assert manager._ssh.execute.call_args.kwargs["tool_name"] == "ssh_diagnose"
|