diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml
index dd81dae4..0b6252f4 100644
--- a/.github/workflows/cd.yaml
+++ b/.github/workflows/cd.yaml
@@ -124,11 +124,31 @@ jobs:
             -d chat_id="${{ secrets.OPENCLAW_TG_CHAT_ID }}" \
             -d text="❌ AWOOOI Pre-flight 失敗%0A%0A🔗 ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true
 
+  # ==================== Wave C.2: monitoring coverage check (ADR-037) ====================
+  # 2026-03-29 Claude Code: gate on monitoring coverage >= 90% ('%25' below = URL-encoded '%' for curl -d)
+  monitoring-coverage:
+    name: "Monitoring Coverage"
+    runs-on: [self-hosted, harbor, k8s]
+    needs: pre-flight-check
+    timeout-minutes: 2
+    steps:
+      - uses: actions/checkout@v4
+      - name: "Check Monitoring Coverage"
+        run: |
+          python3 ops/monitoring/generate_monitoring.py --validate-only --ci
+          echo "✅ 監控覆蓋率檢查通過 (>= 90%)"
+      - name: "Notify Coverage Failure"
+        if: failure()
+        run: |
+          curl -sf -X POST "https://api.telegram.org/bot${{ secrets.OPENCLAW_TG_BOT_TOKEN }}/sendMessage" \
+            -d chat_id="${{ secrets.OPENCLAW_TG_CHAT_ID }}" \
+            -d text="⚠️ 監控覆蓋率低於 90%25,請更新 service-registry.yaml%0A%0A🔗 ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true
+
   # ==================== 路徑偵測 (使用 dorny/paths-filter) ====================
   detect-changes:
     name: Detect Changes
     runs-on: [self-hosted, harbor, k8s]
-    needs: pre-flight-check
+    needs: [pre-flight-check, monitoring-coverage]
     timeout-minutes: 1
     outputs:
       api: ${{ inputs.force_deploy == true && 'true' || steps.filter.outputs.api }}
@@ -426,10 +446,22 @@ jobs:
           # 3. SignOz Webhook Health
           try:
               r = httpx.get(f'{BASE}/api/v1/webhooks/signoz/health', timeout=TIMEOUT)
-              results.append(('signoz', r.status_code == 200))
+              results.append(('signoz_health', r.status_code == 200))
           except Exception as e:
-              results.append(('signoz', False))
-              print(f'SignOz: {e}')
+              results.append(('signoz_health', False))
+              print(f'SignOz Health: {e}')
+
+          # 4. SignOz Webhook POST (P0-1 fix, 2026-03-29)
+          try:
+              r = httpx.post(f'{BASE}/api/v1/webhooks/signoz/alert', json={
+                  'alertname': 'E2E_CD_TEST', 'status': 'firing',
+                  'labels': {'severity': 'info', 'service_name': 'cd-test'},
+                  'annotations': {'summary': 'CD Pipeline E2E Test'}
+              }, timeout=TIMEOUT)
+              results.append(('signoz_post', r.status_code == 200))
+          except Exception as e:
+              results.append(('signoz_post', False))
+              print(f'SignOz POST: {e}')
 
           # Summary
           passed = sum(1 for _, ok in results if ok)
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 8ef88d14..09dffcd0 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -31,6 +31,13 @@ jobs:
     runs-on: [self-hosted, harbor, k8s]
     timeout-minutes: 1
     steps:
+      # 2026-03-29 Claude Code: fix _diag/pages file conflicts
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+
       - name: Quick sanity check
         run: |
           echo "✅ Runner 可用"
@@ -44,6 +51,13 @@ jobs:
     needs: pre-flight
     timeout-minutes: 10
     steps:
+      # 2026-03-29 Claude Code: fix _diag/pages file conflicts
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+
       - uses: actions/checkout@v4
         with:
           clean: true
@@ -124,6 +138,11 @@ jobs:
     needs: lint
     timeout-minutes: 15
     steps:
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
       - uses: actions/checkout@v4
 
       - name: Setup pnpm
@@ -157,6 +176,11 @@ jobs:
     needs: lint
     timeout-minutes: 15
     steps:
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
       - uses: actions/checkout@v4
 
       - name: Setup pnpm
@@ -199,6 +223,11 @@ jobs:
     needs: pre-flight
     timeout-minutes: 5
     steps:
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
       - uses: actions/checkout@v4
 
       - name: Setup Python
@@ -227,6 +256,11 @@ jobs:
     needs: api-lint
     timeout-minutes: 10
     steps:
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
       - uses: actions/checkout@v4
 
       - name: Setup Python
@@ -255,6 +289,11 @@ jobs:
     timeout-minutes: 5
     continue-on-error: true  # 不阻塞主 Pipeline
     steps:
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
       - uses: actions/checkout@v4
 
       - name: Test Ollama Connectivity
@@ -320,6 +359,11 @@ jobs:
     needs: pre-flight
     timeout-minutes: 3
     steps:
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
       - uses: actions/checkout@v4
 
       - name: Setup Node.js
@@ -346,6 +390,11 @@ jobs:
       matrix:
         app: [web, api]
     steps:
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
       - uses: actions/checkout@v4
 
       - name: Set up Docker Buildx
diff --git a/.github/workflows/daily-e2e-health.yaml b/.github/workflows/daily-e2e-health.yaml
index ed2e4c40..e45fee1c 100644
--- a/.github/workflows/daily-e2e-health.yaml
+++ b/.github/workflows/daily-e2e-health.yaml
@@ -48,6 +48,13 @@ jobs:
     runs-on: [self-hosted, harbor, k8s]
     timeout-minutes: 15
     steps:
+      # 2026-03-29 Claude Code: fix _diag/pages file conflicts
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+
       - uses: actions/checkout@v4
 
       - name: Setup Python
diff --git a/.github/workflows/nightly-llm.yaml b/.github/workflows/nightly-llm.yaml
index 8e4492d4..f29bff0c 100644
--- a/.github/workflows/nightly-llm.yaml
+++ b/.github/workflows/nightly-llm.yaml
@@ -38,6 +38,13 @@ jobs:
     runs-on: [self-hosted, harbor, k8s]
     timeout-minutes: 60  # 1 小時超時
     steps:
+      # 2026-03-29 Claude Code: fix _diag/pages file conflicts
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+
       - uses: actions/checkout@v4
 
       - name: Setup Python
diff --git a/.github/workflows/runner-healthcheck.yml b/.github/workflows/runner-healthcheck.yml
index 71929907..7c60529e 100644
--- a/.github/workflows/runner-healthcheck.yml
+++ b/.github/workflows/runner-healthcheck.yml
@@ -36,6 +36,17 @@ jobs:
     timeout-minutes: 5
 
     steps:
+      # =================================================================
+      # 2026-03-29 Claude Code: fix _diag/pages file conflicts
+      # Root cause: the GitHub Actions runner does not clean up its diagnostic files automatically
+      # =================================================================
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          echo "✅ Cleaned _diag/pages"
+
       - name: "Check Docker Service"
         id: docker
         run: |
@@ -83,7 +94,7 @@ jobs:
         id: k8s
         run: |
           if timeout 10 ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 \
-            wooo@192.168.0.120 "kubectl get nodes" 2>/dev/null; then
+            wooo@192.168.0.125 "kubectl get nodes" 2>/dev/null; then
             echo "k8s_status=healthy" >> $GITHUB_OUTPUT
           else
             echo "k8s_status=unhealthy" >> $GITHUB_OUTPUT
@@ -131,6 +142,12 @@ jobs:
     runs-on: [self-hosted, harbor, k8s]
 
     steps:
+      - name: "Clean Runner Diagnostics"
+        run: |
+          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
+          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
+
       - name: "Attempt Docker Cleanup"
         run: |
           echo "=== 嘗試清理 Docker ==="