From 80e6ec1a6761a2e24663858219d3b9d4df0f7949 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 13 Jun 2026 01:12:21 +0800 Subject: [PATCH] fix(ci): avoid clobbering runner known hosts --- .gitea/workflows/cd-dev.yaml | 16 ++++++++++------ .gitea/workflows/cd.yaml | 25 +++++++++++++++++-------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/.gitea/workflows/cd-dev.yaml b/.gitea/workflows/cd-dev.yaml index fa3203e1..765b6163 100644 --- a/.gitea/workflows/cd-dev.yaml +++ b/.gitea/workflows/cd-dev.yaml @@ -145,9 +145,12 @@ jobs: mkdir -p ~/.ssh write_deploy_key - ssh-keyscan -T 5 -t ed25519,rsa,ecdsa 192.168.0.120 > "${HOME}/.ssh/known_hosts" 2>/dev/null - test -s "${HOME}/.ssh/known_hosts" || { echo "❌ K8S host keyscan failed: 192.168.0.120"; exit 1; } - SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -i ~/.ssh/deploy_key" + # Keep deploy-time host keys separate from the runner user's global + # known_hosts, which is also used by reboot/cold-start checks. + DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts" + ssh-keyscan -T 5 -t ed25519,rsa,ecdsa 192.168.0.120 > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null + test -s "${DEPLOY_KNOWN_HOSTS}" || { echo "❌ K8S host keyscan failed: 192.168.0.120"; exit 1; } + SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -i ~/.ssh/deploy_key" # 2026-05-05 Codex: kubectl runs on 120 control-plane. 121 is a # worker and its local kubeconfig points at 127.0.0.1:6443. ssh $SSH_OPTS wooo@192.168.0.120 << SECRETS @@ -177,9 +180,10 @@ jobs: # 部署到 awoooi-dev - name: Deploy to Dev K8s run: | - ssh-keyscan -T 5 -t ed25519,rsa,ecdsa 192.168.0.120 > "${HOME}/.ssh/known_hosts" 2>/dev/null - test -s "${HOME}/.ssh/known_hosts" || { echo "❌ K8S host keyscan failed: 192.168.0.120"; exit 1; } - SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -i ~/.ssh/deploy_key" + DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts" + ssh-keyscan -T 5 -t ed25519,rsa,ecdsa 192.168.0.120 > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null + test -s "${DEPLOY_KNOWN_HOSTS}" || { echo "❌ K8S host keyscan failed: 192.168.0.120"; exit 1; } + SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -i ~/.ssh/deploy_key" cat k8s/awoooi-dev/02-configmap.yaml | \ ssh $SSH_OPTS wooo@192.168.0.120 \ "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -" diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 26a21c24..21fe857a 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -607,9 +607,14 @@ jobs: # 2026-05-13 Codex: keyscan must include ED25519 explicitly. Some # OpenSSH builds otherwise record only RSA/ECDSA, then strict deploy # SSH fails with "No ED25519 host key is known" after image push. - ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${HOME}/.ssh/known_hosts" 2>/dev/null - test -s "${HOME}/.ssh/known_hosts" || { echo "❌ K8S host keyscan failed: ${K8S_SSH_HOST}"; exit 1; } - SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -o ConnectTimeout=10" + # 2026-06-13 Codex: keep deploy-time host keys in a dedicated file. + # The runner user's global known_hosts is shared by cold-start and + # backup checks for 120/188; overwriting it here caused strict SSH + # recovery gates to flap after every CD run. + DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts" + ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null + test -s "${DEPLOY_KNOWN_HOSTS}" || { echo "❌ K8S host keyscan failed: ${K8S_SSH_HOST}"; exit 1; } + SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10" ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" << SECRETS set -e K8S_API_SERVER="${{ env.K8S_API_SERVER }}" @@ -855,9 +860,12 @@ jobs: write_deploy_key # 2026-05-13 Codex: mirror Inject K8s Secrets host-key handling so the # deploy job never reaches SSH with a known_hosts file missing ED25519. - ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${HOME}/.ssh/known_hosts" 2>/dev/null - test -s "${HOME}/.ssh/known_hosts" || { echo "❌ K8S host keyscan failed: ${K8S_SSH_HOST}"; exit 1; } - SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -o ConnectTimeout=10" + # 2026-06-13 Codex: use the deploy-only known_hosts file so this + # stage cannot wipe cold-start/backup host trust for 120/188. + DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts" + ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null + test -s "${DEPLOY_KNOWN_HOSTS}" || { echo "❌ K8S host keyscan failed: ${K8S_SSH_HOST}"; exit 1; } + SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10" IMAGE_TAG="${{ github.sha }}" HARBOR=192.168.0.110:5000 @@ -1270,8 +1278,9 @@ jobs: EVENT_EXPORTER_STATUSES="" write_deploy_key - if ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${HOME}/.ssh/known_hosts" 2>/dev/null && test -s "${HOME}/.ssh/known_hosts"; then - SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -o ConnectTimeout=10" + DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts" + if ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null && test -s "${DEPLOY_KNOWN_HOSTS}"; then + SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10" if ! OTEL_COLLECTOR_STATUSES="$(capture_observability_statuses otel-collector)"; then OTEL_COLLECTOR_ERROR="$(printf '%s' "${OTEL_COLLECTOR_STATUSES}" | tail -1 | head -c 200)" OTEL_COLLECTOR_STATUSES=""