fix(ci): feed observability pod status into alert smoke
All checks were successful
Code Review / ai-code-review (push) Successful in 11s

This commit is contained in:
Your Name
2026-05-19 14:58:34 +08:00
parent 842069a1fd
commit d6c941ea39
2 changed files with 162 additions and 35 deletions

View File

@@ -1001,6 +1001,57 @@ jobs:
- name: Alert Chain Smoke Test
id: alert_chain_smoke
run: |
# Write the deploy SSH private key to ~/.ssh/deploy_key so the ssh calls in
# this step can authenticate with it (see the -i option in SSH_OPTS).
write_deploy_key() {
mkdir -p "${HOME}/.ssh"
# Tighten the file-creation mask before the key is written. The function runs
# in the current shell, so this umask stays in effect after it returns.
umask 077
# The heredoc delimiter is quoted, so the shell performs no expansion on the
# key text. The secret expression below is presumably substituted by the
# workflow runner before the script executes -- confirm against the runner docs.
cat > "${HOME}/.ssh/deploy_key" <<'AWOOOI_DEPLOY_KEY'
${{ secrets.DEPLOY_SSH_KEY }}
AWOOOI_DEPLOY_KEY
# Explicit owner-only permissions, independent of the umask above.
chmod 600 "${HOME}/.ssh/deploy_key"
}
# Print the .status.phase of every pod in the observability namespace whose
# app.kubernetes.io/name label equals $1, one phase per line, by running
# kubectl on the K8s host over ssh. The exit status is that of ssh.
collect_observability_statuses() {
  local component="$1"
  # Expanded locally; the finished string is what the remote shell executes.
  local remote_cmd="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER} get pods -n observability -l app.kubernetes.io/name=${component} --no-headers -o custom-columns=STATUS:.status.phase"
  # SSH_OPTS is deliberately unquoted so it word-splits into separate options.
  ssh $SSH_OPTS "wooo@${K8S_SSH_HOST}" "${remote_cmd}"
}
# Run collect_observability_statuses for component $1 and echo its combined
# stdout+stderr without a trailing newline. Returns 0 when the collection
# succeeded and 1 otherwise, so the caller can tell statuses from error text.
capture_observability_statuses() {
  local component="$1"
  local output
  local rc=0
  # Assign separately from `local` so the substitution's exit status is seen.
  output="$(collect_observability_statuses "${component}" 2>&1)" || rc=1
  printf '%s' "${output}"
  return "${rc}"
}
# 2026-05-19 Codex: the smoke test runs inside CI_IMAGE, but the
# observability pod checks need the K3s host kubectl context. Capture
# those read-only statuses on the host and pass them into the
# container, instead of making the container own kube credentials.
# Start with empty results; each *_ERROR / *_STATUSES pair is filled in below
# and later forwarded to the container as AWOOOI_* environment variables.
OBSERVABILITY_PREFLIGHT_ERROR=""
OTEL_COLLECTOR_ERROR=""
EVENT_EXPORTER_ERROR=""
OTEL_COLLECTOR_STATUSES=""
EVENT_EXPORTER_STATUSES=""
write_deploy_key
# Scan the host keys into a fresh known_hosts file; proceed only if the scan
# succeeded and actually produced a non-empty file.
# NOTE(review): the known_hosts file is rebuilt from the network on every run,
# so StrictHostKeyChecking=yes only pins whatever key was just scanned --
# confirm whether a pre-provisioned host key is wanted here.
if ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${HOME}/.ssh/known_hosts" 2>/dev/null && test -s "${HOME}/.ssh/known_hosts"; then
# BatchMode disables interactive prompts; ConnectTimeout bounds a dead host.
SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -o ConnectTimeout=10"
# On failure the captured text is the error output: keep its last line,
# truncated to 200 bytes, as the error and clear the statuses.
if ! OTEL_COLLECTOR_STATUSES="$(capture_observability_statuses otel-collector)"; then
OTEL_COLLECTOR_ERROR="$(printf '%s' "${OTEL_COLLECTOR_STATUSES}" | tail -1 | head -c 200)"
OTEL_COLLECTOR_STATUSES=""
fi
# Same handling for the event-exporter component.
if ! EVENT_EXPORTER_STATUSES="$(capture_observability_statuses event-exporter)"; then
EVENT_EXPORTER_ERROR="$(printf '%s' "${EVENT_EXPORTER_STATUSES}" | tail -1 | head -c 200)"
EVENT_EXPORTER_STATUSES=""
fi
else
# Without a usable host key no ssh query is attempted; both components
# report the same preflight error.
OBSERVABILITY_PREFLIGHT_ERROR="K8s host keyscan failed"
OTEL_COLLECTOR_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
EVENT_EXPORTER_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
fi
# 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
# Host runner launches the CI image explicitly to avoid act RWLayer=nil.
if docker run --rm \
@@ -1010,6 +1061,10 @@ jobs:
-v "$PWD:/workspace" \
-v awoooi-api-venv-cache:/opt/api-venv \
-w /workspace \
-e AWOOOI_OTEL_COLLECTOR_STATUSES="${OTEL_COLLECTOR_STATUSES}" \
-e AWOOOI_OTEL_COLLECTOR_ERROR="${OTEL_COLLECTOR_ERROR}" \
-e AWOOOI_EVENT_EXPORTER_STATUSES="${EVENT_EXPORTER_STATUSES}" \
-e AWOOOI_EVENT_EXPORTER_ERROR="${EVENT_EXPORTER_ERROR}" \
"${{ env.CI_IMAGE }}" \
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url ${{ env.ALERT_CHAIN_API_URL }} --json | tee /tmp/alert_chain_result.json'; then
echo "alert_chain_status=pass" >> $GITHUB_OUTPUT

View File

@@ -27,6 +27,8 @@ from __future__ import annotations
import argparse
import json
import os
import shlex
import sys
import time
from dataclasses import dataclass, field
@@ -52,6 +54,64 @@ MAX_ALERT_CHAIN_SILENCE_SECONDS = 2 * 60 * 60
TIMEOUT = 10 # 秒
def _statuses_from_env(env_name: str) -> list[str] | None:
    """Read newline-separated pod statuses from the environment variable *env_name*.

    Returns ``None`` when the variable is not set at all, which tells the
    caller to query kubectl itself. When the variable is set, returns its
    non-blank lines with surrounding whitespace removed; a variable that is
    set but empty therefore yields ``[]``.
    """
    raw = os.environ.get(env_name)
    if raw is None:
        return None
    trimmed = (line.strip() for line in raw.splitlines())
    return [status for status in trimmed if status]
def _status_error_from_env(env_name: str) -> str | None:
    """Return the stripped error text stored in *env_name*, or ``None``.

    A variable that is unset, empty, or whitespace-only counts as "no error".
    """
    message = os.environ.get(env_name, "").strip()
    if not message:
        return None
    return message
def _check_running_statuses(
    name: str,
    statuses: list[str],
    empty_message: str,
) -> CheckResult:
    """Build the check result for *name* from a list of pod phase strings.

    The check passes when at least one entry equals ``"Running"`` exactly;
    the message then reports how many did. Otherwise it fails with
    *empty_message*.
    """
    running_count = sum(1 for status in statuses if status == "Running")
    if running_count:
        return CheckResult(name, True, f"{running_count} Pod(s) Running")
    return CheckResult(name, False, empty_message)
def _kubectl_base_command() -> list[str]:
    """Return the argv prefix used to invoke kubectl.

    CI may provide a full safe wrapper through ``AWOOOI_KUBECTL_CMD``, such as:
    ``sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=https://...``
    The value is split with shell-like quoting rules.

    Falls back to ``["kubectl"]`` when the variable is unset, empty, or
    whitespace-only. CI commonly forwards variables as ``-e NAME="${VAR}"``,
    which sets them to an empty string; without the fallback the caller would
    end up executing ``get pods ...`` with no kubectl binary in front.

    Raises:
        ValueError: if the value contains unbalanced quotes (from ``shlex.split``).
    """
    parts = shlex.split(os.environ.get("AWOOOI_KUBECTL_CMD", ""))
    return parts or ["kubectl"]
def _run_kubectl_status_query(label: str) -> list[str] | None:
    """Query kubectl for the phases of pods labelled ``app.kubernetes.io/name=<label>``.

    Runs the kubectl base command against the ``observability`` namespace with
    a 15-second timeout. Returns the non-blank, stripped output lines (one pod
    phase per line), or ``None`` when kubectl exits with a non-zero status.
    Exceptions raised by ``subprocess.run`` (for example on timeout or a
    missing binary) are not caught here.
    """
    import subprocess

    command = _kubectl_base_command() + [
        "get",
        "pods",
        "-n",
        "observability",
        "-l",
        f"app.kubernetes.io/name={label}",
        "--no-headers",
        "-o",
        "custom-columns=STATUS:.status.phase",
    ]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=15,
    )
    if completed.returncode != 0:
        return None
    trimmed = (line.strip() for line in completed.stdout.splitlines())
    return [phase for phase in trimmed if phase]
# =============================================================================
# 測試結果
# =============================================================================
@@ -206,29 +266,34 @@ def check_signoz_reachable(signoz_url: str) -> CheckResult:
def check_otel_collector() -> CheckResult:
"""Check 5: OTEL Collector DaemonSet 是否在 K3s 運行"""
try:
import subprocess
result = subprocess.run(
["kubectl", "get", "pods", "-n", "observability",
"-l", "app.kubernetes.io/name=otel-collector",
"--no-headers", "-o", "custom-columns=STATUS:.status.phase"],
capture_output=True, text=True, timeout=15
preflight_error = _status_error_from_env("AWOOOI_OTEL_COLLECTOR_ERROR")
if preflight_error:
return CheckResult(
"OTEL Collector",
False,
f"host kubectl preflight failed: {preflight_error}",
critical=False,
)
if result.returncode != 0:
preflight_statuses = _statuses_from_env("AWOOOI_OTEL_COLLECTOR_STATUSES")
if preflight_statuses is not None:
return _check_running_statuses(
"OTEL Collector",
preflight_statuses,
"沒有 Running 的 OTEL Collector Pod",
)
try:
statuses = _run_kubectl_status_query("otel-collector")
if statuses is None:
return CheckResult(
"OTEL Collector", False, "kubectl 查詢失敗", critical=False
)
statuses = result.stdout.strip().split("\n")
running = [s for s in statuses if s.strip() == "Running"]
if len(running) == 0:
return CheckResult(
"OTEL Collector", False, "沒有 Running 的 OTEL Collector Pod"
)
return CheckResult(
"OTEL Collector", True, f"{len(running)} Pod(s) Running"
return _check_running_statuses(
"OTEL Collector",
statuses,
"沒有 Running 的 OTEL Collector Pod",
)
except Exception as e:
return CheckResult(
@@ -238,28 +303,35 @@ def check_otel_collector() -> CheckResult:
def check_event_exporter() -> CheckResult:
"""Check 6: Event Exporter 是否在 K3s 運行"""
try:
import subprocess
result = subprocess.run(
["kubectl", "get", "pods", "-n", "observability",
"-l", "app.kubernetes.io/name=event-exporter",
"--no-headers", "-o", "custom-columns=STATUS:.status.phase"],
capture_output=True, text=True, timeout=15
preflight_error = _status_error_from_env("AWOOOI_EVENT_EXPORTER_ERROR")
if preflight_error:
return CheckResult(
"Event Exporter",
False,
f"host kubectl preflight failed: {preflight_error}",
critical=False,
)
if result.returncode != 0:
preflight_statuses = _statuses_from_env("AWOOOI_EVENT_EXPORTER_STATUSES")
if preflight_statuses is not None:
return _check_running_statuses(
"Event Exporter",
preflight_statuses,
"沒有 Running 的 Event Exporter Pod",
)
try:
statuses = _run_kubectl_status_query("event-exporter")
if statuses is None:
return CheckResult(
"Event Exporter", False, "kubectl 查詢失敗", critical=False
)
statuses = result.stdout.strip().split("\n")
running = [s for s in statuses if s.strip() == "Running"]
if len(running) == 0:
return CheckResult(
"Event Exporter", False, "沒有 Running 的 Event Exporter Pod"
)
return CheckResult("Event Exporter", True, f"{len(running)} Pod(s) Running")
return _check_running_statuses(
"Event Exporter",
statuses,
"沒有 Running 的 Event Exporter Pod",
)
except Exception as e:
return CheckResult(
"Event Exporter", False, f"無法檢查: {e}", critical=False