fix(recovery): bound cold-start monitor probes
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 55s
CD Pipeline / build-and-deploy (push) Failing after 4m2s
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-06-30 09:02:57 +08:00
parent 4cb2de32fe
commit eb137bb4e0
7 changed files with 139 additions and 8 deletions

View File

@@ -4,7 +4,14 @@
set -uo pipefail
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)
SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-45}"
SSH_OPTS=(
-o BatchMode=yes
-o ConnectTimeout=6
-o ConnectionAttempts=1
-o ServerAliveInterval=5
-o ServerAliveCountMax=1
)
SEND_ALERT_TEST=0
MONITOR_READ_ONLY=0
NO_COLOR_FLAG=0
@@ -129,10 +136,12 @@ ssh_cmd() {
local user_host="$1"
local cmd="$2"
local prefix=""
local quoted_cmd=""
if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
printf -v prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD"
fi
ssh "${SSH_OPTS[@]}" "$user_host" "${prefix}${cmd}"
printf -v quoted_cmd '%q' "$cmd"
ssh "${SSH_OPTS[@]}" "$user_host" "${prefix}if command -v timeout >/dev/null 2>&1; then timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_cmd}; else bash -lc ${quoted_cmd}; fi"
}
host_has_ip() {

View File

@@ -9,6 +9,7 @@ PROMETHEUS_URL="${PROMETHEUS_URL:-http://192.168.0.110:9090}"
ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://192.168.0.110:9093}"
SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}"
SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}"
REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}"
REQUIRE_CORE=0
REQUIRE_DR=0
@@ -56,7 +57,14 @@ while [ "$#" -gt 0 ]; do
esac
done
ssh_opts=(-o BatchMode="$SSH_BATCH_MODE" -o ConnectTimeout=6 -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING")
ssh_opts=(
-o BatchMode="$SSH_BATCH_MODE"
-o ConnectTimeout=6
-o ConnectionAttempts=1
-o ServerAliveInterval=5
-o ServerAliveCountMax=1
-o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING"
)
status_value() {
local key="$1"
@@ -98,6 +106,13 @@ except Exception:
PY
}
# Run a command on $REMOTE_110 over SSH, bounding its runtime with the remote
# `timeout` utility when the remote host provides one.
#
# Globals:
#   ssh_opts (read)                       - array of ssh client options
#   REMOTE_110 (read)                     - ssh destination
#   REMOTE_COMMAND_TIMEOUT_SECONDS (read) - runtime limit, in seconds
# Arguments:
#   $1 - command line; executed remotely through `bash -lc`
# Outputs:
#   whatever ssh relays from the remote command; a diagnostic on stderr when
#   the timeout value is rejected
# Returns:
#   2 when REMOTE_COMMAND_TIMEOUT_SECONDS is not a plain number (ssh is not
#   started in that case); otherwise the exit status of ssh
remote_110_read() {
local command="$1"
local quoted_command=""
local number_re='^[0-9]+([.][0-9]+)?$'
# The limit is pasted verbatim into the remote shell command line below, so
# anything other than a plain number could change what the remote shell runs.
if [[ ! "${REMOTE_COMMAND_TIMEOUT_SECONDS:-}" =~ $number_re ]]; then
printf 'remote_110_read: REMOTE_COMMAND_TIMEOUT_SECONDS must be a plain number of seconds, got: %q\n' "${REMOTE_COMMAND_TIMEOUT_SECONDS:-}" >&2
return 2
fi
# %q turns the command into a single shell word so it survives the extra
# round of parsing done by the remote shell before `bash -lc` receives it.
printf -v quoted_command '%q' "$command"
# When the remote side has no `timeout`, the command still runs, unbounded.
ssh "${ssh_opts[@]}" "$REMOTE_110" "if command -v timeout >/dev/null 2>&1; then timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_command}; else bash -lc ${quoted_command}; fi"
}
echo "AWOOOI full-stack recovery scorecard"
date '+%Y-%m-%d %H:%M:%S %Z'
echo
@@ -127,7 +142,7 @@ else
status_value BACKUP_GAP_ALERT_VISIBILITY 0
fi
evidence_report="$(ssh "${ssh_opts[@]}" "$REMOTE_110" '/backup/scripts/offsite-escrow-evidence-report.sh --no-color' 2>/tmp/awoooi-scorecard-offsite-evidence.err || true)"
evidence_report="$(remote_110_read '/backup/scripts/offsite-escrow-evidence-report.sh --no-color' 2>/tmp/awoooi-scorecard-offsite-evidence.err || true)"
extract_report_value() {
local key="$1"

View File

@@ -0,0 +1,47 @@
from __future__ import annotations
from pathlib import Path
# Anchor directory for the script paths below: the fourth ancestor of this
# file's resolved path (parents[0] is the directory that contains this file).
ROOT = Path(__file__).resolve().parents[3]
# Shell scripts whose source text the tests in this module inspect.
COLD_START_CHECK = ROOT / "scripts" / "reboot-recovery" / "full-stack-cold-start-check.sh"
RECOVERY_SCORECARD = (
ROOT / "scripts" / "reboot-recovery" / "full-stack-recovery-scorecard.sh"
)
VERIFY_DEPLOY = ROOT / "scripts" / "reboot-recovery" / "verify-cold-start-monitor-deploy.sh"
def test_full_stack_cold_start_check_bounds_ssh_probes() -> None:
    """Assert the cold-start check script text contains every SSH-bounding snippet."""
    script_source = COLD_START_CHECK.read_text(encoding="utf-8")
    # Each entry must appear verbatim somewhere in the script source.
    required_snippets = (
        'SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-45}"',
        "-o ConnectionAttempts=1",
        "-o ServerAliveInterval=5",
        "-o ServerAliveCountMax=1",
        "timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc",
        "printf -v quoted_cmd '%q' \"$cmd\"",
    )
    for snippet in required_snippets:
        assert snippet in script_source
def test_recovery_scorecard_bounds_offsite_evidence_ssh() -> None:
    """Assert the recovery scorecard script text contains every SSH-bounding snippet."""
    script_source = RECOVERY_SCORECARD.read_text(encoding="utf-8")
    # Each entry must appear verbatim somewhere in the script source.
    required_snippets = (
        'REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}"',
        "-o ConnectionAttempts=1",
        "-o ServerAliveInterval=5",
        "-o ServerAliveCountMax=1",
        "remote_110_read()",
        "timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc",
        "offsite-escrow-evidence-report.sh --no-color",
    )
    for snippet in required_snippets:
        assert snippet in script_source
def test_cold_start_deploy_parity_verifier_bounds_ssh_readback() -> None:
    """Assert the deploy verifier script text contains every SSH-bounding snippet."""
    script_source = VERIFY_DEPLOY.read_text(encoding="utf-8")
    # Each entry must appear verbatim somewhere in the script source.
    required_snippets = (
        'REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}"',
        "-o ConnectionAttempts=1",
        "-o ServerAliveInterval=5",
        "-o ServerAliveCountMax=1",
        "remote_read()",
        "timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc",
        'remote_read "sha256sum',
        'if remote_read "grep -Fq',
    )
    for snippet in required_snippets:
        assert snippet in script_source

View File

@@ -7,9 +7,17 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
REMOTE="${REMOTE:-wooo@192.168.0.110}"
SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}"
SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}"
REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}"
PROMETHEUS_URL="${PROMETHEUS_URL:-http://192.168.0.110:9090}"
ssh_opts=(-o BatchMode="$SSH_BATCH_MODE" -o ConnectTimeout=6 -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING")
ssh_opts=(
-o BatchMode="$SSH_BATCH_MODE"
-o ConnectTimeout=6
-o ConnectionAttempts=1
-o ServerAliveInterval=5
-o ServerAliveCountMax=1
-o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING"
)
local_sha256() {
if command -v sha256sum >/dev/null 2>&1; then
@@ -20,7 +28,14 @@ local_sha256() {
}
remote_sha256() {
ssh "${ssh_opts[@]}" "$REMOTE" "sha256sum '$1' 2>/dev/null | awk '{print \$1}'"
remote_read "sha256sum '$1' 2>/dev/null | awk '{print \$1}'"
}
# Run a command on $REMOTE over SSH, bounding its runtime with the remote
# `timeout` utility when the remote host provides one.
#
# Globals:
#   ssh_opts (read)                       - array of ssh client options
#   REMOTE (read)                         - ssh destination
#   REMOTE_COMMAND_TIMEOUT_SECONDS (read) - runtime limit, in seconds
# Arguments:
#   $1 - command line; executed remotely through `bash -lc`
# Outputs:
#   whatever ssh relays from the remote command; a diagnostic on stderr when
#   the timeout value is rejected
# Returns:
#   2 when REMOTE_COMMAND_TIMEOUT_SECONDS is not a plain number (ssh is not
#   started in that case); otherwise the exit status of ssh
remote_read() {
local command="$1"
local quoted_command=""
local number_re='^[0-9]+([.][0-9]+)?$'
# The limit is pasted verbatim into the remote shell command line below, so
# anything other than a plain number could change what the remote shell runs.
if [[ ! "${REMOTE_COMMAND_TIMEOUT_SECONDS:-}" =~ $number_re ]]; then
printf 'remote_read: REMOTE_COMMAND_TIMEOUT_SECONDS must be a plain number of seconds, got: %q\n' "${REMOTE_COMMAND_TIMEOUT_SECONDS:-}" >&2
return 2
fi
# %q turns the command into a single shell word so it survives the extra
# round of parsing done by the remote shell before `bash -lc` receives it.
printf -v quoted_command '%q' "$command"
# When the remote side has no `timeout`, the command still runs, unbounded.
ssh "${ssh_opts[@]}" "$REMOTE" "if command -v timeout >/dev/null 2>&1; then timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_command}; else bash -lc ${quoted_command}; fi"
}
require_same_hash() {
@@ -46,7 +61,7 @@ require_remote_pattern() {
local pattern="$1"
local path="$2"
local label="$3"
if ssh "${ssh_opts[@]}" "$REMOTE" "grep -Fq '$pattern' '$path'"; then
if remote_read "grep -Fq '$pattern' '$path'"; then
echo "OK $label"
else
echo "BLOCKED $label missing in $path" >&2