# Alert-chain smoke test: collect observability pod statuses on the K3s host
# over SSH during CD and hand them to the smoke-test container through
# AWOOOI_* environment variables, so the container needs no kube credentials.
# The Python script reports a non-critical failure when a preflight error is
# supplied, uses the supplied statuses when the variable is present, and only
# falls back to running kubectl (optionally wrapped via AWOOOI_KUBECTL_CMD)
# when the status variable is absent.
#
# Review notes:
# - Line breaks were restored from a whitespace-collapsed copy of this patch.
#   Hunk line counts match the @@ headers, but leading indentation of the
#   YAML/shell lines is reconstructed by convention; use
#   `git apply --ignore-whitespace` if context lines do not match.
# - _kubectl_base_command() falls back to plain `kubectl` when
#   AWOOOI_KUBECTL_CMD is set but empty or blank (shlex.split("") returns []),
#   which would otherwise run `get pods ...` with no executable.
diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml
index f7d61fc7..424240cc 100644
--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -1001,6 +1001,57 @@ jobs:
       - name: Alert Chain Smoke Test
         id: alert_chain_smoke
         run: |
+          write_deploy_key() {
+            mkdir -p "${HOME}/.ssh"
+            umask 077
+            cat > "${HOME}/.ssh/deploy_key" <<'AWOOOI_DEPLOY_KEY'
+          ${{ secrets.DEPLOY_SSH_KEY }}
+          AWOOOI_DEPLOY_KEY
+            chmod 600 "${HOME}/.ssh/deploy_key"
+          }
+          collect_observability_statuses() {
+            local component="$1"
+            ssh $SSH_OPTS "wooo@${K8S_SSH_HOST}" \
+              "sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER} get pods -n observability -l app.kubernetes.io/name=${component} --no-headers -o custom-columns=STATUS:.status.phase"
+          }
+          capture_observability_statuses() {
+            local component="$1"
+            local output
+            if output="$(collect_observability_statuses "${component}" 2>&1)"; then
+              printf '%s' "${output}"
+              return 0
+            fi
+            printf '%s' "${output}"
+            return 1
+          }
+
+          # 2026-05-19 Codex: the smoke test runs inside CI_IMAGE, but the
+          # observability pod checks need the K3s host kubectl context. Capture
+          # those read-only statuses on the host and pass them into the
+          # container, instead of making the container own kube credentials.
+          OBSERVABILITY_PREFLIGHT_ERROR=""
+          OTEL_COLLECTOR_ERROR=""
+          EVENT_EXPORTER_ERROR=""
+          OTEL_COLLECTOR_STATUSES=""
+          EVENT_EXPORTER_STATUSES=""
+
+          write_deploy_key
+          if ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${HOME}/.ssh/known_hosts" 2>/dev/null && test -s "${HOME}/.ssh/known_hosts"; then
+            SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${HOME}/.ssh/known_hosts -o ConnectTimeout=10"
+            if ! OTEL_COLLECTOR_STATUSES="$(capture_observability_statuses otel-collector)"; then
+              OTEL_COLLECTOR_ERROR="$(printf '%s' "${OTEL_COLLECTOR_STATUSES}" | tail -1 | head -c 200)"
+              OTEL_COLLECTOR_STATUSES=""
+            fi
+            if ! EVENT_EXPORTER_STATUSES="$(capture_observability_statuses event-exporter)"; then
+              EVENT_EXPORTER_ERROR="$(printf '%s' "${EVENT_EXPORTER_STATUSES}" | tail -1 | head -c 200)"
+              EVENT_EXPORTER_STATUSES=""
+            fi
+          else
+            OBSERVABILITY_PREFLIGHT_ERROR="K8s host keyscan failed"
+            OTEL_COLLECTOR_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
+            EVENT_EXPORTER_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
+          fi
+
           # 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
           # Host runner launches the CI image explicitly to avoid act RWLayer=nil.
           if docker run --rm \
@@ -1010,6 +1061,10 @@ jobs:
             -v "$PWD:/workspace" \
             -v awoooi-api-venv-cache:/opt/api-venv \
             -w /workspace \
+            -e AWOOOI_OTEL_COLLECTOR_STATUSES="${OTEL_COLLECTOR_STATUSES}" \
+            -e AWOOOI_OTEL_COLLECTOR_ERROR="${OTEL_COLLECTOR_ERROR}" \
+            -e AWOOOI_EVENT_EXPORTER_STATUSES="${EVENT_EXPORTER_STATUSES}" \
+            -e AWOOOI_EVENT_EXPORTER_ERROR="${EVENT_EXPORTER_ERROR}" \
             "${{ env.CI_IMAGE }}" \
             bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url ${{ env.ALERT_CHAIN_API_URL }} --json | tee /tmp/alert_chain_result.json'; then
             echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
diff --git a/scripts/alert_chain_smoke_test.py b/scripts/alert_chain_smoke_test.py
index 0ac02c46..2eb559e0 100644
--- a/scripts/alert_chain_smoke_test.py
+++ b/scripts/alert_chain_smoke_test.py
@@ -27,6 +27,8 @@ from __future__ import annotations
 
 import argparse
 import json
+import os
+import shlex
 import sys
 import time
 from dataclasses import dataclass, field
@@ -52,6 +54,64 @@ MAX_ALERT_CHAIN_SILENCE_SECONDS = 2 * 60 * 60
 TIMEOUT = 10  # 秒
 
 
+def _statuses_from_env(env_name: str) -> list[str] | None:
+    """Return preflight pod statuses supplied by CI, or None to use kubectl."""
+    if env_name not in os.environ:
+        return None
+    return [
+        line.strip()
+        for line in os.environ[env_name].splitlines()
+        if line.strip()
+    ]
+
+
+def _status_error_from_env(env_name: str) -> str | None:
+    value = os.environ.get(env_name, "").strip()
+    return value or None
+
+
+def _check_running_statuses(
+    name: str,
+    statuses: list[str],
+    empty_message: str,
+) -> CheckResult:
+    running = [s for s in statuses if s == "Running"]
+    if len(running) == 0:
+        return CheckResult(name, False, empty_message)
+    return CheckResult(name, True, f"{len(running)} Pod(s) Running")
+
+
+def _kubectl_base_command() -> list[str]:
+    # CI may provide a full safe wrapper such as:
+    # sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=https://...
+    return shlex.split(os.environ.get("AWOOOI_KUBECTL_CMD", "")) or ["kubectl"]
+
+
+def _run_kubectl_status_query(label: str) -> list[str] | None:
+    import subprocess
+
+    result = subprocess.run(
+        [
+            *_kubectl_base_command(),
+            "get",
+            "pods",
+            "-n",
+            "observability",
+            "-l",
+            f"app.kubernetes.io/name={label}",
+            "--no-headers",
+            "-o",
+            "custom-columns=STATUS:.status.phase",
+        ],
+        capture_output=True,
+        text=True,
+        timeout=15,
+    )
+    if result.returncode != 0:
+        return None
+    return [line.strip() for line in result.stdout.splitlines() if line.strip()]
+
+
 # =============================================================================
 # 測試結果
 # =============================================================================
@@ -206,29 +266,34 @@ def check_signoz_reachable(signoz_url: str) -> CheckResult:
 
 def check_otel_collector() -> CheckResult:
     """Check 5: OTEL Collector DaemonSet 是否在 K3s 運行"""
-    try:
-        import subprocess
-        result = subprocess.run(
-            ["kubectl", "get", "pods", "-n", "observability",
-             "-l", "app.kubernetes.io/name=otel-collector",
-             "--no-headers", "-o", "custom-columns=STATUS:.status.phase"],
-            capture_output=True, text=True, timeout=15
+    preflight_error = _status_error_from_env("AWOOOI_OTEL_COLLECTOR_ERROR")
+    if preflight_error:
+        return CheckResult(
+            "OTEL Collector",
+            False,
+            f"host kubectl preflight failed: {preflight_error}",
+            critical=False,
         )
-        if result.returncode != 0:
+
+    preflight_statuses = _statuses_from_env("AWOOOI_OTEL_COLLECTOR_STATUSES")
+    if preflight_statuses is not None:
+        return _check_running_statuses(
+            "OTEL Collector",
+            preflight_statuses,
+            "沒有 Running 的 OTEL Collector Pod",
+        )
+
+    try:
+        statuses = _run_kubectl_status_query("otel-collector")
+        if statuses is None:
             return CheckResult(
                 "OTEL Collector", False, "kubectl 查詢失敗", critical=False
             )
 
-        statuses = result.stdout.strip().split("\n")
-        running = [s for s in statuses if s.strip() == "Running"]
-
-        if len(running) == 0:
-            return CheckResult(
-                "OTEL Collector", False, "沒有 Running 的 OTEL Collector Pod"
-            )
-
-        return CheckResult(
-            "OTEL Collector", True, f"{len(running)} Pod(s) Running"
+        return _check_running_statuses(
+            "OTEL Collector",
+            statuses,
+            "沒有 Running 的 OTEL Collector Pod",
         )
     except Exception as e:
         return CheckResult(
@@ -238,28 +303,35 @@ def check_otel_collector() -> CheckResult:
 
 def check_event_exporter() -> CheckResult:
     """Check 6: Event Exporter 是否在 K3s 運行"""
-    try:
-        import subprocess
-        result = subprocess.run(
-            ["kubectl", "get", "pods", "-n", "observability",
-             "-l", "app.kubernetes.io/name=event-exporter",
-             "--no-headers", "-o", "custom-columns=STATUS:.status.phase"],
-            capture_output=True, text=True, timeout=15
+    preflight_error = _status_error_from_env("AWOOOI_EVENT_EXPORTER_ERROR")
+    if preflight_error:
+        return CheckResult(
+            "Event Exporter",
+            False,
+            f"host kubectl preflight failed: {preflight_error}",
+            critical=False,
         )
-        if result.returncode != 0:
+
+    preflight_statuses = _statuses_from_env("AWOOOI_EVENT_EXPORTER_STATUSES")
+    if preflight_statuses is not None:
+        return _check_running_statuses(
+            "Event Exporter",
+            preflight_statuses,
+            "沒有 Running 的 Event Exporter Pod",
+        )
+
+    try:
+        statuses = _run_kubectl_status_query("event-exporter")
+        if statuses is None:
             return CheckResult(
                 "Event Exporter", False, "kubectl 查詢失敗", critical=False
             )
 
-        statuses = result.stdout.strip().split("\n")
-        running = [s for s in statuses if s.strip() == "Running"]
-
-        if len(running) == 0:
-            return CheckResult(
-                "Event Exporter", False, "沒有 Running 的 Event Exporter Pod"
-            )
-
-        return CheckResult("Event Exporter", True, f"{len(running)} Pod(s) Running")
+        return _check_running_statuses(
+            "Event Exporter",
+            statuses,
+            "沒有 Running 的 Event Exporter Pod",
+        )
     except Exception as e:
         return CheckResult(
             "Event Exporter", False, f"無法檢查: {e}", critical=False