From 297afb699822716adadd3c5c7e386a2be74f6cbc Mon Sep 17 00:00:00 2001
From: Your Name
Date: Sat, 2 May 2026 17:14:23 +0800
Subject: [PATCH] fix(ci): require all 4 host keys before overwriting ssh-mcp-key secret

When ssh-keyscan partially fails (e.g. one host is momentarily unreachable),
the previous logic only checked that the output file was non-empty, so it
patched ssh-mcp-key/known_hosts with an incomplete set. asyncssh then rejected
any SSH to the missing host with "Host key is not trusted", which routed every
host disk-full / docker alert into the emergency escalation channel and
spammed Telegram (today's regression on 192.168.0.110).

Now we explicitly verify that all four target IPs (192.168.0.110, .120, .121
and .188) appear in the scan output before patching. If any of them is
missing, the patch is skipped, the previously-good secret is left untouched,
and the ssh-keyscan stderr is logged to help debug intermittent network
issues.

Co-Authored-By: Claude Sonnet 4.6
---
 .gitea/workflows/cd.yaml | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml
index 1e64b79a..4e9b071d 100644
--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -566,8 +566,22 @@ jobs:
  # asyncssh reads /etc/ssh-mcp/known_hosts and requires a non-empty
  # OpenSSH known_hosts file. Keep hosts unhashed so both asyncssh and
  # CLI diagnostics can trust the same secret.
- ssh-keyscan 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188 > /tmp/known_hosts_repair 2>/dev/null
- if [ -s /tmp/known_hosts_repair ]; then
+ # 2026-05-02 ogt + Claude Sonnet 4.6: require all 4 hosts in the scan output.
+ # Root cause: a partial scan (e.g. 110 times out, the rest succeed) still passed
+ #   `[ -s file ]`, so an incomplete known_hosts was patched in and asyncssh rejected SSH.
+ # Fix: after the scan, match each host exactly (awk, first field); if any host is
+ #   missing, abort without overwriting the secret so the SSH auto-repair path keeps working.
+ ssh-keyscan 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188 > /tmp/known_hosts_repair 2>/tmp/known_hosts_scan_err || true
+ EXPECTED_HOSTS=4
+ PRESENT=0
+ for ip in 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188; do
+   if awk -v ip="$ip" '$1 == ip { found = 1 } END { exit !found }' /tmp/known_hosts_repair 2>/dev/null; then
+     PRESENT=$((PRESENT + 1))
+   else
+     echo "⚠️ ssh-keyscan 缺主機 ${ip}"
+   fi
+ done
+ if [ "$PRESENT" -eq "$EXPECTED_HOSTS" ]; then
    sudo kubectl create secret generic awoooi-repair-known-hosts \
      -n awoooi-prod \
      --from-file=known_hosts=/tmp/known_hosts_repair \
@@ -576,11 +590,13 @@ jobs:
      || echo "⚠️ awoooi-repair-known-hosts Secret 建立失敗 (非致命)"
    sudo kubectl patch secret ssh-mcp-key -n awoooi-prod --type=merge \
      -p='{"data":{"known_hosts":"'$(base64 -w 0 /tmp/known_hosts_repair)'"}}' \
-     && echo "✅ ssh-mcp-key known_hosts 已更新" \
+     && echo "✅ ssh-mcp-key known_hosts 已更新(4 台主機完整)" \
      || echo "⚠️ ssh-mcp-key known_hosts 更新失敗 (非致命)"
-   rm -f /tmp/known_hosts_repair
+   rm -f /tmp/known_hosts_repair /tmp/known_hosts_scan_err
  else
-   echo "⚠️ ssh-keyscan 掃描失敗,跳過 known_hosts Secret"
+   echo "❌ ssh-keyscan 只抓到 ${PRESENT}/${EXPECTED_HOSTS} 台主機,跳過 patch(保留現有 secret)"
+   head -n 10 /tmp/known_hosts_scan_err 2>/dev/null || true
+   rm -f /tmp/known_hosts_repair /tmp/known_hosts_scan_err
  fi
 
  echo "✅ 所有 Secrets 注入完成"