fix(recovery): bound cold-start monitor probes
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 55s
CD Pipeline / build-and-deploy (push) Failing after 4m2s
CD Pipeline / post-deploy-checks (push) Has been skipped
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 55s
CD Pipeline / build-and-deploy (push) Failing after 4m2s
CD Pipeline / post-deploy-checks (push) Has been skipped
This commit is contained in:
@@ -4,7 +4,14 @@
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)
|
||||
SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-45}"
|
||||
SSH_OPTS=(
|
||||
-o BatchMode=yes
|
||||
-o ConnectTimeout=6
|
||||
-o ConnectionAttempts=1
|
||||
-o ServerAliveInterval=5
|
||||
-o ServerAliveCountMax=1
|
||||
)
|
||||
SEND_ALERT_TEST=0
|
||||
MONITOR_READ_ONLY=0
|
||||
NO_COLOR_FLAG=0
|
||||
# Run a command on a remote host over SSH with a bounded runtime.
#
# Arguments:
#   $1 - SSH destination passed straight to ssh (e.g. user@host).
#   $2 - command string; executed remotely through `bash -lc`.
# Globals (read):
#   SSH_OPTS                    - array of ssh client options.
#   SSH_COMMAND_TIMEOUT_SECONDS - limit handed to the remote `timeout`.
#   REMOTE_SUDO_PASSWORD        - optional; exported remotely when non-empty.
# Outputs/Returns:
#   Whatever ssh and the remote command write and return.
ssh_cmd() {
  local user_host="$1"
  local cmd="$2"
  local prefix=""
  local quoted_cmd=""
  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
    # The remote script starts with `if`. An assignment word placed directly
    # in front of `if` stops the shell from treating it as a reserved word
    # (syntax error near `then`), so export the variable as a separate
    # statement instead of using a command-prefix assignment.
    printf -v prefix 'export REMOTE_SUDO_PASSWORD=%q; ' "$REMOTE_SUDO_PASSWORD"
  fi
  # Quote the command once so it survives the remote shell's word splitting
  # and reaches `bash -lc` as a single argument.
  printf -v quoted_cmd '%q' "$cmd"
  # Use the remote `timeout` when it exists; otherwise run unbounded rather
  # than failing the probe outright.
  ssh "${SSH_OPTS[@]}" "$user_host" "${prefix}if command -v timeout >/dev/null 2>&1; then timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_cmd}; else bash -lc ${quoted_cmd}; fi"
}
|
||||
|
||||
host_has_ip() {
|
||||
|
||||
@@ -9,6 +9,7 @@ PROMETHEUS_URL="${PROMETHEUS_URL:-http://192.168.0.110:9090}"
|
||||
ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://192.168.0.110:9093}"
|
||||
SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}"
|
||||
SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}"
|
||||
REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}"
|
||||
REQUIRE_CORE=0
|
||||
REQUIRE_DR=0
|
||||
|
||||
@@ -56,7 +57,14 @@ while [ "$#" -gt 0 ]; do
|
||||
esac
|
||||
done
|
||||
|
||||
ssh_opts=(-o BatchMode="$SSH_BATCH_MODE" -o ConnectTimeout=6 -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING")
|
||||
ssh_opts=(
|
||||
-o BatchMode="$SSH_BATCH_MODE"
|
||||
-o ConnectTimeout=6
|
||||
-o ConnectionAttempts=1
|
||||
-o ServerAliveInterval=5
|
||||
-o ServerAliveCountMax=1
|
||||
-o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING"
|
||||
)
|
||||
|
||||
status_value() {
|
||||
local key="$1"
|
||||
@@ -98,6 +106,13 @@ except Exception:
|
||||
PY
|
||||
}
|
||||
|
||||
# Run one command on $REMOTE_110 over SSH, wrapped in the remote `timeout`
# when that tool is available.
#
# Arguments:
#   $1 - command string; executed remotely through `bash -lc`.
# Globals (read):
#   ssh_opts, REMOTE_110, REMOTE_COMMAND_TIMEOUT_SECONDS
# Outputs/Returns:
#   Whatever ssh and the remote command write and return.
remote_110_read() {
  local remote_script="$1"
  local escaped=""
  local bounded=""

  # Escape once so the remote shell hands the script to bash as one argument.
  printf -v escaped '%q' "$remote_script"

  bounded="if command -v timeout >/dev/null 2>&1; then timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc ${escaped}; else bash -lc ${escaped}; fi"
  ssh "${ssh_opts[@]}" "$REMOTE_110" "$bounded"
}
|
||||
|
||||
echo "AWOOOI full-stack recovery scorecard"
|
||||
date '+%Y-%m-%d %H:%M:%S %Z'
|
||||
echo
|
||||
@@ -127,7 +142,7 @@ else
|
||||
status_value BACKUP_GAP_ALERT_VISIBILITY 0
|
||||
fi
|
||||
|
||||
evidence_report="$(ssh "${ssh_opts[@]}" "$REMOTE_110" '/backup/scripts/offsite-escrow-evidence-report.sh --no-color' 2>/tmp/awoooi-scorecard-offsite-evidence.err || true)"
|
||||
evidence_report="$(remote_110_read '/backup/scripts/offsite-escrow-evidence-report.sh --no-color' 2>/tmp/awoooi-scorecard-offsite-evidence.err || true)"
|
||||
|
||||
extract_report_value() {
|
||||
local key="$1"
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[3]
|
||||
COLD_START_CHECK = ROOT / "scripts" / "reboot-recovery" / "full-stack-cold-start-check.sh"
|
||||
RECOVERY_SCORECARD = (
|
||||
ROOT / "scripts" / "reboot-recovery" / "full-stack-recovery-scorecard.sh"
|
||||
)
|
||||
VERIFY_DEPLOY = ROOT / "scripts" / "reboot-recovery" / "verify-cold-start-monitor-deploy.sh"
|
||||
|
||||
|
||||
def test_full_stack_cold_start_check_bounds_ssh_probes() -> None:
    """The cold-start check script must contain its SSH bounds and timeout wrapper."""
    script = COLD_START_CHECK.read_text(encoding="utf-8")

    # Literal fragments that must appear verbatim in the script source.
    required_fragments = (
        'SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-45}"',
        "-o ConnectionAttempts=1",
        "-o ServerAliveInterval=5",
        "-o ServerAliveCountMax=1",
        "timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc",
        "printf -v quoted_cmd '%q' \"$cmd\"",
    )
    for fragment in required_fragments:
        assert fragment in script
|
||||
|
||||
|
||||
def test_recovery_scorecard_bounds_offsite_evidence_ssh() -> None:
    """The recovery scorecard must route its offsite-evidence read through a bounded SSH helper."""
    script = RECOVERY_SCORECARD.read_text(encoding="utf-8")

    # Literal fragments that must appear verbatim in the script source.
    required_fragments = (
        'REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}"',
        "-o ConnectionAttempts=1",
        "-o ServerAliveInterval=5",
        "-o ServerAliveCountMax=1",
        "remote_110_read()",
        "timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc",
        "offsite-escrow-evidence-report.sh --no-color",
    )
    for fragment in required_fragments:
        assert fragment in script
|
||||
|
||||
|
||||
def test_cold_start_deploy_parity_verifier_bounds_ssh_readback() -> None:
    """The deploy-parity verifier must do its remote readbacks through the bounded helper."""
    script = VERIFY_DEPLOY.read_text(encoding="utf-8")

    # Literal fragments that must appear verbatim in the script source.
    required_fragments = (
        'REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}"',
        "-o ConnectionAttempts=1",
        "-o ServerAliveInterval=5",
        "-o ServerAliveCountMax=1",
        "remote_read()",
        "timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc",
        'remote_read "sha256sum',
        'if remote_read "grep -Fq',
    )
    for fragment in required_fragments:
        assert fragment in script
|
||||
@@ -7,9 +7,17 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
||||
REMOTE="${REMOTE:-wooo@192.168.0.110}"
|
||||
SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}"
|
||||
SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}"
|
||||
REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}"
|
||||
PROMETHEUS_URL="${PROMETHEUS_URL:-http://192.168.0.110:9090}"
|
||||
|
||||
ssh_opts=(-o BatchMode="$SSH_BATCH_MODE" -o ConnectTimeout=6 -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING")
|
||||
ssh_opts=(
|
||||
-o BatchMode="$SSH_BATCH_MODE"
|
||||
-o ConnectTimeout=6
|
||||
-o ConnectionAttempts=1
|
||||
-o ServerAliveInterval=5
|
||||
-o ServerAliveCountMax=1
|
||||
-o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING"
|
||||
)
|
||||
|
||||
local_sha256() {
|
||||
if command -v sha256sum >/dev/null 2>&1; then
|
||||
@@ -20,7 +28,14 @@ local_sha256() {
|
||||
}
|
||||
|
||||
# Print the SHA-256 hex digest of a file on the remote host.
#
# Arguments:
#   $1 - path of the file on the remote side.
# Outputs:
#   The first field of the remote `sha256sum` output; nothing when
#   sha256sum produces no output (its stderr is discarded).
# Notes:
#   The remote call goes through remote_read exactly once. The path is
#   shell-quoted with %q instead of being spliced between literal single
#   quotes, so a path containing a quote cannot break or alter the command.
#   `--` keeps a path that starts with `-` from being parsed as an option.
remote_sha256() {
  local quoted_path=""
  printf -v quoted_path '%q' "$1"
  remote_read "sha256sum -- ${quoted_path} 2>/dev/null | awk '{print \$1}'"
}
|
||||
|
||||
# Run one command on $REMOTE over SSH, wrapped in the remote `timeout`
# when that tool is available.
#
# Arguments:
#   $1 - command string; executed remotely through `bash -lc`.
# Globals (read):
#   ssh_opts, REMOTE, REMOTE_COMMAND_TIMEOUT_SECONDS
# Outputs/Returns:
#   Whatever ssh and the remote command write and return.
remote_read() {
  local remote_script="$1"
  local escaped=""
  local bounded=""

  # Escape once so the remote shell hands the script to bash as one argument.
  printf -v escaped '%q' "$remote_script"

  bounded="if command -v timeout >/dev/null 2>&1; then timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc ${escaped}; else bash -lc ${escaped}; fi"
  ssh "${ssh_opts[@]}" "$REMOTE" "$bounded"
}
|
||||
|
||||
require_same_hash() {
|
||||
@@ -46,7 +61,7 @@ require_remote_pattern() {
|
||||
local pattern="$1"
|
||||
local path="$2"
|
||||
local label="$3"
|
||||
if ssh "${ssh_opts[@]}" "$REMOTE" "grep -Fq '$pattern' '$path'"; then
|
||||
if remote_read "grep -Fq '$pattern' '$path'"; then
|
||||
echo "OK $label"
|
||||
else
|
||||
echo "BLOCKED $label missing in $path" >&2
|
||||
|
||||
Reference in New Issue
Block a user