fix(ci): feed observability pod status into alert smoke
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
This commit is contained in:
@@ -1001,6 +1001,57 @@ jobs:
|
||||
- name: Alert Chain Smoke Test
|
||||
id: alert_chain_smoke
|
||||
run: |
|
||||
# Install the CI deploy key at ~/.ssh/deploy_key (mode 600) for the ssh
# calls made later in this step.
#
# The key material is placed inside the heredoc by the workflow's secret
# expression; the quoted delimiter stops the shell from expanding anything
# inside it.
#
# The function body is a subshell "( ... )" so that the restrictive umask
# only applies while the key is being written and does not leak into the
# rest of the step. The umask is set before mkdir so that a freshly created
# ~/.ssh directory is private as well (an existing directory is untouched).
write_deploy_key() (
  umask 077
  mkdir -p "${HOME}/.ssh"
  cat > "${HOME}/.ssh/deploy_key" <<'AWOOOI_DEPLOY_KEY'
${{ secrets.DEPLOY_SSH_KEY }}
AWOOOI_DEPLOY_KEY
  # Explicit chmod also tightens a key file left over from an earlier run.
  chmod 600 "${HOME}/.ssh/deploy_key"
)
|
||||
# Print the phase of every pod in the "observability" namespace whose
# app.kubernetes.io/name label equals the given component, one per line,
# by running kubectl on the remote host over ssh.
#   $1 - component label value
# Reads SSH_OPTS, K8S_SSH_HOST and K8S_API_SERVER from the enclosing script.
# Exit status is that of the ssh command.
collect_observability_statuses() {
  local component="$1"
  local remote_cmd="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER} get pods -n observability -l app.kubernetes.io/name=${component} --no-headers -o custom-columns=STATUS:.status.phase"
  # SSH_OPTS is deliberately unquoted: it carries several space-separated options.
  ssh $SSH_OPTS "wooo@${K8S_SSH_HOST}" "${remote_cmd}"
}
|
||||
# Run collect_observability_statuses for one component and echo everything
# it produced (stdout and stderr merged) without a trailing newline.
#   $1 - component label value
# Returns 0 when the collection succeeded and 1 when it failed; in the
# failure case the echoed text is the error output.
capture_observability_statuses() {
  local component="$1"
  # Declared separately from the assignment so `local` does not mask the
  # exit status of the command substitution.
  local output
  local status=0
  output="$(collect_observability_statuses "${component}" 2>&1)" || status=1
  printf '%s' "${output}"
  return "${status}"
}
|
||||
|
||||
# 2026-05-19 Codex: the smoke test runs inside CI_IMAGE, but the
# observability pod checks need the K3s host kubectl context. Capture
# those read-only statuses on the host and pass them into the
# container, instead of making the container own kube credentials.

# Start from empty values so every variable is defined on all paths below.
OTEL_COLLECTOR_STATUSES=""
OTEL_COLLECTOR_ERROR=""
EVENT_EXPORTER_STATUSES=""
EVENT_EXPORTER_ERROR=""
OBSERVABILITY_PREFLIGHT_ERROR=""

write_deploy_key

# The host key scan must both succeed and leave a non-empty known_hosts
# file; otherwise both components are marked with the same preflight error.
if ! ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${HOME}/.ssh/known_hosts" 2>/dev/null ||
   [ ! -s "${HOME}/.ssh/known_hosts" ]; then
  OBSERVABILITY_PREFLIGHT_ERROR="K8s host keyscan failed"
  OTEL_COLLECTOR_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
  EVENT_EXPORTER_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
else
  SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -o ConnectTimeout=10"

  # On failure the captured text is the error output: keep its last line
  # (capped at 200 bytes) as the error and clear the statuses.
  OTEL_COLLECTOR_STATUSES="$(capture_observability_statuses otel-collector)" || {
    OTEL_COLLECTOR_ERROR="$(printf '%s' "${OTEL_COLLECTOR_STATUSES}" | tail -1 | head -c 200)"
    OTEL_COLLECTOR_STATUSES=""
  }

  EVENT_EXPORTER_STATUSES="$(capture_observability_statuses event-exporter)" || {
    EVENT_EXPORTER_ERROR="$(printf '%s' "${EVENT_EXPORTER_STATUSES}" | tail -1 | head -c 200)"
    EVENT_EXPORTER_STATUSES=""
  }
fi
|
||||
|
||||
# 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
|
||||
# Host runner launches the CI image explicitly to avoid act RWLayer=nil.
|
||||
if docker run --rm \
|
||||
@@ -1010,6 +1061,10 @@ jobs:
|
||||
-v "$PWD:/workspace" \
|
||||
-v awoooi-api-venv-cache:/opt/api-venv \
|
||||
-w /workspace \
|
||||
-e AWOOOI_OTEL_COLLECTOR_STATUSES="${OTEL_COLLECTOR_STATUSES}" \
|
||||
-e AWOOOI_OTEL_COLLECTOR_ERROR="${OTEL_COLLECTOR_ERROR}" \
|
||||
-e AWOOOI_EVENT_EXPORTER_STATUSES="${EVENT_EXPORTER_STATUSES}" \
|
||||
-e AWOOOI_EVENT_EXPORTER_ERROR="${EVENT_EXPORTER_ERROR}" \
|
||||
"${{ env.CI_IMAGE }}" \
|
||||
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url ${{ env.ALERT_CHAIN_API_URL }} --json | tee /tmp/alert_chain_result.json'; then
|
||||
echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
|
||||
|
||||
@@ -27,6 +27,8 @@ from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shlex
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
@@ -52,6 +54,64 @@ MAX_ALERT_CHAIN_SILENCE_SECONDS = 2 * 60 * 60
|
||||
TIMEOUT = 10 # 秒
|
||||
|
||||
|
||||
def _statuses_from_env(env_name: str) -> list[str] | None:
|
||||
"""Return preflight pod statuses supplied by CI, or None to use kubectl."""
|
||||
if env_name not in os.environ:
|
||||
return None
|
||||
return [
|
||||
line.strip()
|
||||
for line in os.environ[env_name].splitlines()
|
||||
if line.strip()
|
||||
]
|
||||
|
||||
|
||||
def _status_error_from_env(env_name: str) -> str | None:
|
||||
value = os.environ.get(env_name, "").strip()
|
||||
return value or None
|
||||
|
||||
|
||||
def _check_running_statuses(
    name: str,
    statuses: list[str],
    empty_message: str,
) -> CheckResult:
    """Build the check result for a list of pod phase strings.

    Args:
        name: Check name placed in the result.
        statuses: Pod phase strings, one entry per pod.
        empty_message: Failure message used when no entry is ``Running``.

    Returns:
        A failed ``CheckResult`` carrying ``empty_message`` when no entry is
        ``Running`` (including an empty list); otherwise a passing result
        whose message reports how many pods are running.
    """
    # Compare the stripped value so callers need not pre-clean each entry.
    running_count = sum(1 for status in statuses if status.strip() == "Running")
    if not running_count:
        return CheckResult(name, False, empty_message)
    return CheckResult(name, True, f"{running_count} Pod(s) Running")
|
||||
|
||||
|
||||
def _kubectl_base_command() -> list[str]:
    """Return the argv prefix used to invoke kubectl.

    CI may provide a full safe wrapper through ``AWOOOI_KUBECTL_CMD``, such as:
    ``sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=https://...``

    Returns:
        The variable's value split with shell-style quoting rules, or
        ``["kubectl"]`` when the variable is unset, empty, or only whitespace.

    Raises:
        ValueError: If the value contains unbalanced quotes (raised by
            ``shlex.split``).
    """
    # An empty override would split to [], leaving callers that append
    # kubectl arguments with no program name at the front of the command.
    return shlex.split(os.environ.get("AWOOOI_KUBECTL_CMD", "")) or ["kubectl"]
|
||||
|
||||
|
||||
def _run_kubectl_status_query(label: str) -> list[str] | None:
    """Query the phases of observability pods carrying the given name label.

    Args:
        label: Value matched against the ``app.kubernetes.io/name`` label.

    Returns:
        The non-blank, stripped lines of kubectl's stdout, or ``None`` when
        kubectl exits with a non-zero status.
    """
    import subprocess

    command = _kubectl_base_command() + [
        "get",
        "pods",
        "-n",
        "observability",
        "-l",
        f"app.kubernetes.io/name={label}",
        "--no-headers",
        "-o",
        "custom-columns=STATUS:.status.phase",
    ]
    # subprocess.run raises TimeoutExpired if kubectl runs past 15 seconds.
    completed = subprocess.run(command, capture_output=True, text=True, timeout=15)
    if completed.returncode != 0:
        return None
    cleaned = (line.strip() for line in completed.stdout.splitlines())
    return [phase for phase in cleaned if phase]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 測試結果
|
||||
# =============================================================================
|
||||
@@ -206,29 +266,34 @@ def check_signoz_reachable(signoz_url: str) -> CheckResult:
|
||||
|
||||
def check_otel_collector() -> CheckResult:
|
||||
"""Check 5: OTEL Collector DaemonSet 是否在 K3s 運行"""
|
||||
try:
|
||||
import subprocess
|
||||
result = subprocess.run(
|
||||
["kubectl", "get", "pods", "-n", "observability",
|
||||
"-l", "app.kubernetes.io/name=otel-collector",
|
||||
"--no-headers", "-o", "custom-columns=STATUS:.status.phase"],
|
||||
capture_output=True, text=True, timeout=15
|
||||
preflight_error = _status_error_from_env("AWOOOI_OTEL_COLLECTOR_ERROR")
|
||||
if preflight_error:
|
||||
return CheckResult(
|
||||
"OTEL Collector",
|
||||
False,
|
||||
f"host kubectl preflight failed: {preflight_error}",
|
||||
critical=False,
|
||||
)
|
||||
if result.returncode != 0:
|
||||
|
||||
preflight_statuses = _statuses_from_env("AWOOOI_OTEL_COLLECTOR_STATUSES")
|
||||
if preflight_statuses is not None:
|
||||
return _check_running_statuses(
|
||||
"OTEL Collector",
|
||||
preflight_statuses,
|
||||
"沒有 Running 的 OTEL Collector Pod",
|
||||
)
|
||||
|
||||
try:
|
||||
statuses = _run_kubectl_status_query("otel-collector")
|
||||
if statuses is None:
|
||||
return CheckResult(
|
||||
"OTEL Collector", False, "kubectl 查詢失敗", critical=False
|
||||
)
|
||||
|
||||
statuses = result.stdout.strip().split("\n")
|
||||
running = [s for s in statuses if s.strip() == "Running"]
|
||||
|
||||
if len(running) == 0:
|
||||
return CheckResult(
|
||||
"OTEL Collector", False, "沒有 Running 的 OTEL Collector Pod"
|
||||
)
|
||||
|
||||
return CheckResult(
|
||||
"OTEL Collector", True, f"{len(running)} Pod(s) Running"
|
||||
return _check_running_statuses(
|
||||
"OTEL Collector",
|
||||
statuses,
|
||||
"沒有 Running 的 OTEL Collector Pod",
|
||||
)
|
||||
except Exception as e:
|
||||
return CheckResult(
|
||||
@@ -238,28 +303,35 @@ def check_otel_collector() -> CheckResult:
|
||||
|
||||
def check_event_exporter() -> CheckResult:
|
||||
"""Check 6: Event Exporter 是否在 K3s 運行"""
|
||||
try:
|
||||
import subprocess
|
||||
result = subprocess.run(
|
||||
["kubectl", "get", "pods", "-n", "observability",
|
||||
"-l", "app.kubernetes.io/name=event-exporter",
|
||||
"--no-headers", "-o", "custom-columns=STATUS:.status.phase"],
|
||||
capture_output=True, text=True, timeout=15
|
||||
preflight_error = _status_error_from_env("AWOOOI_EVENT_EXPORTER_ERROR")
|
||||
if preflight_error:
|
||||
return CheckResult(
|
||||
"Event Exporter",
|
||||
False,
|
||||
f"host kubectl preflight failed: {preflight_error}",
|
||||
critical=False,
|
||||
)
|
||||
if result.returncode != 0:
|
||||
|
||||
preflight_statuses = _statuses_from_env("AWOOOI_EVENT_EXPORTER_STATUSES")
|
||||
if preflight_statuses is not None:
|
||||
return _check_running_statuses(
|
||||
"Event Exporter",
|
||||
preflight_statuses,
|
||||
"沒有 Running 的 Event Exporter Pod",
|
||||
)
|
||||
|
||||
try:
|
||||
statuses = _run_kubectl_status_query("event-exporter")
|
||||
if statuses is None:
|
||||
return CheckResult(
|
||||
"Event Exporter", False, "kubectl 查詢失敗", critical=False
|
||||
)
|
||||
|
||||
statuses = result.stdout.strip().split("\n")
|
||||
running = [s for s in statuses if s.strip() == "Running"]
|
||||
|
||||
if len(running) == 0:
|
||||
return CheckResult(
|
||||
"Event Exporter", False, "沒有 Running 的 Event Exporter Pod"
|
||||
)
|
||||
|
||||
return CheckResult("Event Exporter", True, f"{len(running)} Pod(s) Running")
|
||||
return _check_running_statuses(
|
||||
"Event Exporter",
|
||||
statuses,
|
||||
"沒有 Running 的 Event Exporter Pod",
|
||||
)
|
||||
except Exception as e:
|
||||
return CheckResult(
|
||||
"Event Exporter", False, f"無法檢查: {e}", critical=False
|
||||
|
||||
Reference in New Issue
Block a user