From 8e49f2ea887566f7841c46a409fcb3f4ff09da1e Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 1 May 2026 17:18:32 +0800 Subject: [PATCH] fix(ci): preserve ssh mcp known hosts [skip ci] --- .agents/skills/04-awoooi-devops-commander.md | 10 +++++----- .agents/skills/05-awoooi-sre-qa.md | 1 + .gitea/workflows/cd.yaml | 12 ++++++++---- docs/LOGBOOK.md | 1 + docs/adr/ADR-058-host-auto-repair-ssh-whitelist.md | 3 +++ docs/runbooks/ssh-mcp-setup.md | 9 +++++++-- 6 files changed, 25 insertions(+), 11 deletions(-) diff --git a/.agents/skills/04-awoooi-devops-commander.md b/.agents/skills/04-awoooi-devops-commander.md index 0c764af7..1676868a 100644 --- a/.agents/skills/04-awoooi-devops-commander.md +++ b/.agents/skills/04-awoooi-devops-commander.md @@ -1235,9 +1235,9 @@ links = DeepLinking.get_all_links( |------|-------|------| | Dockerfile | `openssh-client` | 生產 stage 必須安裝,ssh binary 才存在 | | K8s Pod securityContext | `fsGroup: 1000` | 讓 appuser 有 group read on 0400 Secret | -| NetworkPolicy egress | port 22 → 110 + 188 | 預設拒絕,必須明確開放 | +| NetworkPolicy egress | port 22 → 110/120/121/188 | 預設拒絕,必須明確開放 | | Secret defaultMode | `0400` (八進位) | SSH 要求 owner-only,group read 靠 fsGroup | -| known_hosts Secret | `awoooi-repair-known-hosts` | optional: true,含 110+188 hashed 指紋 | +| known_hosts Secret | `awoooi-repair-known-hosts` + `ssh-mcp-key.known_hosts` | optional: true,含 110/120/121/188 指紋;`ssh-mcp-key` 給 asyncssh 使用 | ### repair-bot 白名單 (當前完整清單) @@ -1277,7 +1277,7 @@ links = DeepLinking.get_all_links( 1. 在目標主機建立 `~/bin/repair-bot-{host}.sh`(複製模板) 2. 將 `awoooi-repair-ssh-key.pub` 加入 `~/.ssh/authorized_keys`(加 `command=` 限制) -3. `ssh-keyscan -H {host_ip}` → 更新 `awoooi-repair-known-hosts` Secret +3. `ssh-keyscan {host_ip}` → 更新 `awoooi-repair-known-hosts` Secret 與 `ssh-mcp-key.known_hosts` 4. NetworkPolicy 新增 `{host_ip}:22` egress 5. `LAYER_SSH_CONFIG` 新增 layer 設定(`host_repair_agent.py`) 6. service-registry.yaml 新增服務分級 @@ -1291,8 +1291,8 @@ links = DeepLinking.get_all_links( ❌ kubectl apply 06-deployment-api.yaml → IMAGE_TAG_PLACEHOLDER 覆蓋真實 SHA → ImagePullBackOff ✅ 修改 K8s Deployment 配置用 kubectl patch,不用 kubectl apply -❌ known_hosts hashed 格式,grep IP 會得 0 → 以為沒寫進去 -✅ 用 wc -l 或 ssh 實測驗證,hashed 格式是正常的 +❌ ssh-mcp-key known_hosts 是空檔或只更新 Secret 未重啟 subPath pod → asyncssh `Host key is not trusted` +✅ 用 `wc -c /etc/ssh-mcp/known_hosts` 驗證非 0;subPath 掛載更新後 rollout restart API/worker ❌ StrictHostKeyChecking=no(舊設定) ✅ known_hosts Secret 已建立,改用 StrictHostKeyChecking=yes diff --git a/.agents/skills/05-awoooi-sre-qa.md b/.agents/skills/05-awoooi-sre-qa.md index 4bd32deb..1b9b9cee 100644 --- a/.agents/skills/05-awoooi-sre-qa.md +++ b/.agents/skills/05-awoooi-sre-qa.md @@ -785,6 +785,7 @@ kubectl -n awoooi-prod logs -l app=awoooi-api --tail=50 | \ | `ssh: command not found` | Dockerfile 缺 openssh-client | Pod exec `which ssh` | | `Permission denied (publickey)` | known_hosts 缺少該主機 | Pod exec SSH 看錯誤訊息 | | `Permission denied (publickey)` only on `192.168.0.188` | 188 需要 `ollama` 使用者,不是預設 `wooo` | 查 `SSH_MCP_HOST_USERS=192.168.0.188=ollama`,用 `ollama@192.168.0.188` 測 | +| `Host key is not trusted for host ...` | `/etc/ssh-mcp/known_hosts` 空檔、過期,或 Secret 已 patch 但 subPath pod 未重啟 | patch `ssh-mcp-key.known_hosts`,rollout restart API/worker,再用 `ssh_diagnose` 驗證 | | `Load key ... Permission denied` | fsGroup 未設定 | Pod exec `ls -la /etc/repair-ssh/` | | `Connection refused/timeout` | NetworkPolicy 封鎖 22 | Pod exec `ssh -v` 看連線過程 | | `forbidden_shell_metachar` 且 action 是 `ssh ... '...'` | host/backup category 沒在 DecisionManager kubectl parser 前路由 SSH | 查 `alert_category` 是否為 `backup_failure`,確認 `_is_host_layer_ssh_category()` 覆蓋 | diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index bb0dd48f..1e64b79a 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -563,7 +563,10 @@ jobs: # 2026-04-06 Claude Code: Sprint 3 T2 — known_hosts Secret (Security Fix A1) # 替換 StrictHostKeyChecking=no,讓 SSH 修復路徑使用已知主機指紋 - ssh-keyscan -H 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188 > /tmp/known_hosts_repair 2>/dev/null + # asyncssh reads /etc/ssh-mcp/known_hosts and requires a non-empty + # OpenSSH known_hosts file. Keep hosts unhashed so both asyncssh and + # CLI diagnostics can trust the same secret. + ssh-keyscan 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188 > /tmp/known_hosts_repair 2>/dev/null if [ -s /tmp/known_hosts_repair ]; then sudo kubectl create secret generic awoooi-repair-known-hosts \ -n awoooi-prod \ @@ -571,9 +574,10 @@ jobs: --dry-run=client -o yaml | sudo kubectl apply -f - \ && echo "✅ awoooi-repair-known-hosts Secret 已建立/更新" \ || echo "⚠️ awoooi-repair-known-hosts Secret 建立失敗 (非致命)" - sudo kubectl patch secret ssh-mcp-key -n awoooi-prod --type='json' -p='[ - {"op":"add","path":"/data/known_hosts","value":"'$(base64 -w 0 /tmp/known_hosts_repair)'"} - ]' && echo "✅ ssh-mcp-key known_hosts 已更新" || echo "⚠️ ssh-mcp-key known_hosts 更新失敗 (非致命)" + sudo kubectl patch secret ssh-mcp-key -n awoooi-prod --type=merge \ + -p='{"data":{"known_hosts":"'$(base64 -w 0 /tmp/known_hosts_repair)'"}}' \ + && echo "✅ ssh-mcp-key known_hosts 已更新" \ + || echo "⚠️ ssh-mcp-key known_hosts 更新失敗 (非致命)" rm -f /tmp/known_hosts_repair else echo "⚠️ ssh-keyscan 掃描失敗,跳過 known_hosts Secret" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 86625694..9e6b1c76 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -25,6 +25,7 @@ - YAML parse:`callback_action_spec.yaml`、`04-configmap.yaml`、`08-deployment-worker.yaml`、`.gitea/workflows/cd.yaml` 通過。 - `cd apps/api && DATABASE_URL=postgresql://test:test@localhost:5432/test pytest tests/test_ssh_provider_tools.py tests/test_callback_dispatcher.py tests/test_action_parsing.py tests/test_action_parser_safety.py tests/test_alertmanager_rule_bypass.py tests/test_auto_repair_service.py tests/test_telegram_button_consistency.py tests/test_openclaw_cache_key.py -q` → 138 passed。 - Live SSH 基準:API pod 使用 `/etc/ssh-mcp/known_hosts` 可連 `wooo@110/120/121` 與 `ollama@188`;`wooo@188` 會 publickey denied,確認 host user override 是必要修復。 +- Live 補驗:`ssh-mcp-key.known_hosts` 原先未寫入,subPath pod 內為 0 bytes;已 live patch non-empty known_hosts、rolling restart API/worker,並驗證 `SSHProvider.execute("ssh_diagnose", {"host": "192.168.0.188"})` success、username=`ollama`。CD workflow 改用 non-hashed `ssh-keyscan` + merge patch 防回歸。 ## 2026-05-01 | Gitea host runner graceful shutdown guard diff --git a/docs/adr/ADR-058-host-auto-repair-ssh-whitelist.md b/docs/adr/ADR-058-host-auto-repair-ssh-whitelist.md index 19410384..9ba8827b 100644 --- a/docs/adr/ADR-058-host-auto-repair-ssh-whitelist.md +++ b/docs/adr/ADR-058-host-auto-repair-ssh-whitelist.md @@ -171,6 +171,9 @@ MoWoooWorkDown → Jaccard 匹配 momo-app-down-repair → SSH ollama@192.168.0. - `ollama@192.168.0.188` - Runtime config: - `SSH_MCP_HOST_USERS=192.168.0.188=ollama` +- Runtime known_hosts: + - `ssh-mcp-key.data.known_hosts` must be non-empty and mounted at `/etc/ssh-mcp/known_hosts` + - Because the file is mounted with `subPath`, updating the Secret requires rolling API/worker pods before asyncssh sees the new trust store - NetworkPolicy egress: - `192.168.0.110:22` - `192.168.0.120:22` diff --git a/docs/runbooks/ssh-mcp-setup.md b/docs/runbooks/ssh-mcp-setup.md index fb81960f..aba25ac2 100644 --- a/docs/runbooks/ssh-mcp-setup.md +++ b/docs/runbooks/ssh-mcp-setup.md @@ -61,7 +61,7 @@ ssh-copy-id -i /tmp/ssh-mcp-key.pub wooo@192.168.0.121 ### 3. 生成 known_hosts ```bash -ssh-keyscan -H 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188 > /tmp/ssh-mcp-known_hosts +ssh-keyscan 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188 > /tmp/ssh-mcp-known_hosts ``` ### 4. 建立 K8s Secret @@ -72,6 +72,10 @@ kubectl create secret generic ssh-mcp-key \ --from-literal=known_hosts="$(cat /tmp/ssh-mcp-known_hosts)" \ -n awoooi-prod +# 更新既有 Secret 時,用 merge patch,避免 json add 在 key 狀態漂移時失敗 +kubectl patch secret ssh-mcp-key -n awoooi-prod --type=merge \ + -p "{\"data\":{\"known_hosts\":\"$(base64 -w 0 /tmp/ssh-mcp-known_hosts)\"}}" + # 清除暫存 rm /tmp/ssh-mcp-key /tmp/ssh-mcp-key.pub /tmp/ssh-mcp-known_hosts ``` @@ -115,6 +119,7 @@ kubectl exec -n awoooi-prod deploy/awoooi-api -- ls -la /run/secrets/ssh_mcp_key # 確認 known_hosts 掛載 kubectl exec -n awoooi-prod deploy/awoooi-api -- ls -la /etc/ssh-mcp/known_hosts +kubectl exec -n awoooi-prod deploy/awoooi-api -- wc -c /etc/ssh-mcp/known_hosts # 確認 provider 已啟用 kubectl logs -n awoooi-prod deploy/awoooi-api | grep '"name": "ssh_host"' @@ -154,6 +159,6 @@ kubectl rollout restart deploy/awoooi-api -n awoooi-prod | 症狀 | 原因 | 解決 | |------|------|------| | `ssh_host` provider enabled=false | SSH_MCP_ENABLED 未設定 | 確認 ConfigMap | -| known_hosts WARNING | SSH_MCP_KNOWN_HOSTS_FILE 指向空檔 | 確認 Secret 有 known_hosts key | +| known_hosts WARNING | SSH_MCP_KNOWN_HOSTS_FILE 指向空檔 | 確認 Secret 有 known_hosts key;若用 subPath 掛載,patch 後需 rollout restart API/worker | | Connection refused | authorized_keys 未加入公鑰 | 重做步驟 2 | | Host key verification failed | known_hosts 過期 | 重做步驟 3+4 |