From ee2cc2bfc3335fd0955c397ad10923ed2a109050 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 12 Jun 2026 11:03:07 +0800 Subject: [PATCH] =?UTF-8?q?fix(alerts):=20=E6=94=B6=E6=96=82=20Telegram=20?= =?UTF-8?q?=E5=91=8A=E8=AD=A6=E5=88=B0=20SRE=20=E6=88=B0=E6=83=85=E5=AE=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitea/workflows/agent-market-watch.yaml | 4 +-- .gitea/workflows/cd-dev.yaml | 14 ++++---- .gitea/workflows/cd.yaml | 20 ++++++----- .gitea/workflows/code-review.yaml | 14 ++++---- .gitea/workflows/deploy-alerts.yaml | 4 +-- .gitea/workflows/e2e-health.yaml | 6 ++-- .gitea/workflows/run-migration.yml | 8 ++--- apps/api/src/api/v1/telegram.py | 2 +- apps/api/src/jobs/capacity_forecaster_job.py | 2 +- apps/api/src/jobs/compliance_scanner_job.py | 2 +- apps/api/src/jobs/coverage_evaluator_job.py | 2 +- apps/api/src/jobs/hermes_rule_quality_job.py | 2 +- apps/api/src/services/ai_rate_limiter.py | 4 +-- apps/api/src/services/approval_execution.py | 2 +- .../converged_alert_recurrence_notifier.py | 18 +--------- apps/api/src/services/failover_alerter.py | 2 +- apps/api/src/services/notification_matrix.py | 14 ++++---- .../src/services/notifications/telegram.py | 6 ++-- .../src/services/post_execution_verifier.py | 2 +- apps/api/src/services/telegram_gateway.py | 28 ++++++++-------- .../tests/test_alert_converged_recurrence.py | 10 ++---- .../test_notification_matrix_group_cutover.py | 4 +-- .../tests/test_telegram_button_consistency.py | 24 ++++++++++++++ docs/LOGBOOK.md | 29 ++++++++++++++++ ...AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md | 7 ++-- scripts/ci/check-gitea-step-env-secrets.js | 33 +++++++++++++++++-- scripts/ops/backup-from-110.sh | 2 +- scripts/ops/deploy-alertmanager-config.sh | 3 +- scripts/ops/deploy-docker-health-monitor.sh | 1 - scripts/ops/docker-health-monitor.sh | 5 ++- scripts/ops/dr-drill.sh | 2 +- scripts/ops/pg-backup.sh | 2 +- 32 files changed, 165 insertions(+), 113 
deletions(-) diff --git a/.gitea/workflows/agent-market-watch.yaml b/.gitea/workflows/agent-market-watch.yaml index 3605e88d..7bd2e648 100644 --- a/.gitea/workflows/agent-market-watch.yaml +++ b/.gitea/workflows/agent-market-watch.yaml @@ -14,7 +14,7 @@ on: env: GITEA_ACTIONS_URL: http://192.168.0.110:3001/wooo/awoooi/actions - TELEGRAM_ALERT_CHAT_ID: "-1003711974679" + SRE_GROUP_CHAT_ID: "-1003711974679" jobs: market-watch: @@ -499,7 +499,7 @@ jobs: - name: Summarize actionable change or failure if: always() env: - TG_CHAT_ID: ${{ env.TELEGRAM_ALERT_CHAT_ID }} + TG_CHAT_ID: ${{ env.SRE_GROUP_CHAT_ID }} JOB_STATUS: ${{ job.status }} CANDIDATE_COUNT: ${{ steps.watch.outputs.candidate_count }} SOURCE_COUNT: ${{ steps.watch.outputs.source_count }} diff --git a/.gitea/workflows/cd-dev.yaml b/.gitea/workflows/cd-dev.yaml index df1bf900..fa3203e1 100644 --- a/.gitea/workflows/cd-dev.yaml +++ b/.gitea/workflows/cd-dev.yaml @@ -19,7 +19,7 @@ concurrency: env: HARBOR: 192.168.0.110:5000 HARBOR_MIRROR: 192.168.0.110:5001 - TELEGRAM_ALERT_CHAT_ID: "-1003711974679" + SRE_GROUP_CHAT_ID: "-1003711974679" OTEL_EXPORTER_OTLP_ENDPOINT: http://192.168.0.188:24318 OTEL_SERVICE_NAME: awoooi-cd-dev OTEL_RESOURCE_ATTRIBUTES: service.version=${{ github.sha }},deployment.environment=dev @@ -52,7 +52,7 @@ jobs: echo "Dev deploy start notification mirrored through AWOOI API" else printf '%b' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ - -d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \ + -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \ -d "parse_mode=HTML" \ --data-urlencode "text@-" fi @@ -130,9 +130,9 @@ jobs: ${{ secrets.TELEGRAM_BOT_TOKEN }} AWOOOI_SECRET_TG_BOT_TOKEN )" - TG_CHAT_ID_B64="$(secret_b64 <<'AWOOOI_SECRET_TG_CHAT_ID' - ${{ secrets.TELEGRAM_CHAT_ID }} - AWOOOI_SECRET_TG_CHAT_ID + TG_CHAT_ID_B64="$(secret_b64 <<'AWOOOI_SECRET_SRE_GROUP_CHAT_ID_COMPAT' + ${{ secrets.SRE_GROUP_CHAT_ID }} + 
AWOOOI_SECRET_SRE_GROUP_CHAT_ID_COMPAT )" NVIDIA_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_NVIDIA_API_KEY' ${{ secrets.NVIDIA_API_KEY }} @@ -235,7 +235,7 @@ jobs: echo "Dev deploy success notification mirrored through AWOOI API" else printf '%b' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ - -d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \ + -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \ -d "parse_mode=HTML" \ --data-urlencode "text@-" fi @@ -256,7 +256,7 @@ jobs: echo "Dev deploy failure notification mirrored through AWOOI API" else printf '%b' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ - -d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \ + -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \ -d "parse_mode=HTML" \ --data-urlencode "text@-" fi diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index c3e6fcba..26a21c24 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -39,7 +39,7 @@ concurrency: env: HARBOR: 192.168.0.110:5000 - TELEGRAM_ALERT_CHAT_ID: "-1003711974679" + SRE_GROUP_CHAT_ID: "-1003711974679" # Harbor Proxy Cache (指向 DockerHub 的內部 Mirror,避免拉取限額) HARBOR_MIRROR: 192.168.0.110:5001 # OTEL CI/CD 監控 (2026-03-31 #46c - 遷移到 Gitea) @@ -111,7 +111,7 @@ jobs: echo "✅ CI/CD start notification mirrored through AWOOI API" else curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ - -d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \ + -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \ -d "parse_mode=HTML" \ --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?" 
fi @@ -303,7 +303,7 @@ jobs: echo "✅ CI/CD tests failure notification mirrored through AWOOI API" else curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ - -d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \ + -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \ -d "parse_mode=HTML" \ --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?" fi @@ -509,9 +509,9 @@ jobs: ${{ secrets.TELEGRAM_BOT_TOKEN }} AWOOOI_SECRET_TG_BOT_TOKEN )" - TG_CHAT_ID_B64="$(secret_b64 <<'AWOOOI_SECRET_TG_CHAT_ID' - ${{ secrets.TELEGRAM_CHAT_ID }} - AWOOOI_SECRET_TG_CHAT_ID + TG_CHAT_ID_B64="$(secret_b64 <<'AWOOOI_SECRET_SRE_GROUP_CHAT_ID_COMPAT' + ${{ secrets.SRE_GROUP_CHAT_ID }} + AWOOOI_SECRET_SRE_GROUP_CHAT_ID_COMPAT )" NVIDIA_API_KEY_B64="$(secret_b64 <<'AWOOOI_SECRET_NVIDIA_API_KEY' ${{ secrets.NVIDIA_API_KEY }} @@ -616,6 +616,8 @@ jobs: KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=\${K8S_API_SERVER}" # 注入 Telegram Secrets (ADR-035 鐵律) + # 2026-06-12 Codex: OPENCLAW_TG_CHAT_ID 僅作舊欄位相容, + # 實際值必須與 SRE_GROUP_CHAT_ID 一致,避免正式告警旁路到其他群組。 \$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[ {"op":"add","path":"/data/OPENCLAW_TG_BOT_TOKEN","value":"${TG_BOT_TOKEN_B64}"}, {"op":"add","path":"/data/OPENCLAW_TG_CHAT_ID","value":"${TG_CHAT_ID_B64}"} @@ -1182,7 +1184,7 @@ jobs: echo "✅ CI/CD build failure notification mirrored through AWOOI API" else curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ - -d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \ + -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \ -d "parse_mode=HTML" \ --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?" 
fi @@ -1527,7 +1529,7 @@ jobs: echo "✅ CI/CD success notification mirrored through AWOOI API" else printf '%b' "$TG_MSG" | curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ - -d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \ + -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \ --data-urlencode "text@-" || echo "TG notify warning (non-fatal)" fi @@ -1550,7 +1552,7 @@ jobs: echo "✅ CI/CD post-deploy failure notification mirrored through AWOOI API" else curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ - -d "chat_id=${{ env.TELEGRAM_ALERT_CHAT_ID }}" \ + -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \ -d "parse_mode=HTML" \ --data-urlencode "text=${MSG}" || echo "TG notify failed (non-fatal): exit=$?" fi diff --git a/.gitea/workflows/code-review.yaml b/.gitea/workflows/code-review.yaml index d422f636..675ac189 100644 --- a/.gitea/workflows/code-review.yaml +++ b/.gitea/workflows/code-review.yaml @@ -19,7 +19,7 @@ concurrency: env: REPORT_URL: https://mo.wooo.work/code-review/ GITEA_ACTIONS_URL: http://192.168.0.110:3001/wooo/awoooi/actions - TELEGRAM_ALERT_CHAT_ID: "-1003711974679" + SRE_GROUP_CHAT_ID: "-1003711974679" jobs: ai-code-review: @@ -105,7 +105,7 @@ jobs: - name: Notify Code Review Start if: steps.stale.outputs.skip != 'true' env: - TG_CHAT_ID: ${{ env.TELEGRAM_ALERT_CHAT_ID }} + SRE_GROUP_CHAT_ID: ${{ env.SRE_GROUP_CHAT_ID }} SHORT_SHA: ${{ steps.ctx.outputs.short_sha }} BRANCH: ${{ steps.ctx.outputs.branch }} COMMIT_MSG: ${{ steps.ctx.outputs.commit_msg }} @@ -130,13 +130,13 @@ jobs: scripts/ci/notify-awoooi-cicd.sh; then echo "Code review start notification mirrored through AWOOI API" else - if [ -z "${TG_BOT_TOKEN:-}" ] || [ -z "${TG_CHAT_ID:-}" ]; then + if [ -z "${TG_BOT_TOKEN:-}" ] || [ -z "${SRE_GROUP_CHAT_ID:-}" ]; then echo "Telegram secret missing and AWOOI API notify failed; skip start notification" exit 0 fi curl -fsS -X POST 
"https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \ -H "Content-Type: application/json" \ - -d "$(jq -n --arg c "$TG_CHAT_ID" --arg t "$MSG" '{chat_id:$c,text:$t,parse_mode:"HTML",disable_web_page_preview:true}')" \ + -d "$(jq -n --arg c "$SRE_GROUP_CHAT_ID" --arg t "$MSG" '{chat_id:$c,text:$t,parse_mode:"HTML",disable_web_page_preview:true}')" \ >/dev/null fi @@ -156,7 +156,7 @@ jobs: - name: Notify Code Review Completion if: always() && steps.stale.outputs.skip != 'true' env: - TG_CHAT_ID: ${{ env.TELEGRAM_ALERT_CHAT_ID }} + SRE_GROUP_CHAT_ID: ${{ env.SRE_GROUP_CHAT_ID }} SHORT_SHA: ${{ steps.ctx.outputs.short_sha }} run: | set -euo pipefail @@ -209,12 +209,12 @@ jobs: scripts/ci/notify-awoooi-cicd.sh; then echo "Code review completion notification mirrored through AWOOI API" else - if [ -z "${TG_BOT_TOKEN:-}" ] || [ -z "${TG_CHAT_ID:-}" ]; then + if [ -z "${TG_BOT_TOKEN:-}" ] || [ -z "${SRE_GROUP_CHAT_ID:-}" ]; then echo "Telegram secret missing and AWOOI API notify failed; skip completion notification" exit 0 fi curl -fsS -X POST "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \ -H "Content-Type: application/json" \ - -d "$(jq -n --arg c "$TG_CHAT_ID" --arg t "$MSG" '{chat_id:$c,text:$t,parse_mode:"HTML",disable_web_page_preview:true}')" \ + -d "$(jq -n --arg c "$SRE_GROUP_CHAT_ID" --arg t "$MSG" '{chat_id:$c,text:$t,parse_mode:"HTML",disable_web_page_preview:true}')" \ >/dev/null fi diff --git a/.gitea/workflows/deploy-alerts.yaml b/.gitea/workflows/deploy-alerts.yaml index c220036e..63a4d1e1 100644 --- a/.gitea/workflows/deploy-alerts.yaml +++ b/.gitea/workflows/deploy-alerts.yaml @@ -17,7 +17,7 @@ on: workflow_dispatch: env: - TELEGRAM_ALERT_CHAT_ID: "-1003711974679" + SRE_GROUP_CHAT_ID: "-1003711974679" jobs: deploy-alerts: @@ -67,6 +67,6 @@ jobs: echo "Alert rule deploy notification mirrored through AWOOI API" else curl -fS -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ - -d "chat_id=${{ 
env.TELEGRAM_ALERT_CHAT_ID }}" \ + -d "chat_id=${{ env.SRE_GROUP_CHAT_ID }}" \ --data-urlencode "text=${MSG}" || true fi diff --git a/.gitea/workflows/e2e-health.yaml b/.gitea/workflows/e2e-health.yaml index 09e1ccf6..91d55877 100644 --- a/.gitea/workflows/e2e-health.yaml +++ b/.gitea/workflows/e2e-health.yaml @@ -19,7 +19,7 @@ env: OTEL_EXPORTER_OTLP_ENDPOINT: http://192.168.0.188:24318 OTEL_SERVICE_NAME: awoooi-e2e OTEL_RESOURCE_ATTRIBUTES: deployment.environment=production - TELEGRAM_ALERT_CHAT_ID: "-1003711974679" + SRE_GROUP_CHAT_ID: "-1003711974679" jobs: e2e-health: @@ -95,8 +95,8 @@ jobs: scripts/ci/notify-awoooi-cicd.sh; then echo "E2E failure notification mirrored through AWOOI API" else - curl -s -X POST "https://api.telegram.org/bot${{ secrets.OPENCLAW_TG_BOT_TOKEN }}/sendMessage" \ - -d chat_id="${{ env.TELEGRAM_ALERT_CHAT_ID }}" \ + curl -s -X POST "https://api.telegram.org/bot${{ secrets.TELEGRAM_BOT_TOKEN }}/sendMessage" \ + -d chat_id="${{ env.SRE_GROUP_CHAT_ID }}" \ -d parse_mode="HTML" \ -d text="🔴 [E2E Health Check] 失敗%0A%0A📅 $(TZ=Asia/Taipei date '+%Y-%m-%d %H:%M')%0A🔗 API 健康檢查未通過%0A%0A請檢查 K3s 叢集狀態" fi diff --git a/.gitea/workflows/run-migration.yml b/.gitea/workflows/run-migration.yml index cb2002fc..db43646e 100644 --- a/.gitea/workflows/run-migration.yml +++ b/.gitea/workflows/run-migration.yml @@ -20,7 +20,7 @@ on: workflow_dispatch: env: - TELEGRAM_ALERT_CHAT_ID: "-1003711974679" + SRE_GROUP_CHAT_ID: "-1003711974679" jobs: migrate: @@ -188,8 +188,6 @@ jobs: - name: Notify Telegram (if configured) if: always() - env: - TG_CHAT: ${{ env.TELEGRAM_ALERT_CHAT_ID }} run: | TG_TOKEN="$(cat <<'AWOOOI_SECRET_TG_TOKEN' ${{ secrets.TELEGRAM_BOT_TOKEN }} @@ -207,10 +205,10 @@ jobs: echo "Migration notification mirrored through AWOOI API" exit 0 fi - if [ -n "$TG_TOKEN" ] && [ -n "$TG_CHAT" ]; then + if [ -n "$TG_TOKEN" ] && [ -n "${{ env.SRE_GROUP_CHAT_ID }}" ]; then MSG="🗄️ Migration CI: \`${STATUS}\` — commit ${{ github.sha }}" curl -s -X POST 
"https://api.telegram.org/bot${TG_TOKEN}/sendMessage" \ - -d chat_id="${TG_CHAT}" \ + -d chat_id="${{ env.SRE_GROUP_CHAT_ID }}" \ -d parse_mode="Markdown" \ -d text="${MSG}" || true fi diff --git a/apps/api/src/api/v1/telegram.py b/apps/api/src/api/v1/telegram.py index d0b6d304..5aa208c4 100644 --- a/apps/api/src/api/v1/telegram.py +++ b/apps/api/src/api/v1/telegram.py @@ -454,7 +454,7 @@ async def telegram_health() -> dict: "mode": "long_polling", # Phase 5.5: 已從 webhook 切換至 long_polling "polling_active": gateway._polling_active, "bot_token_set": bool(settings.OPENCLAW_TG_BOT_TOKEN), - "chat_id_set": bool(settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID), + "chat_id_set": bool(settings.SRE_GROUP_CHAT_ID), "sre_group_chat_id_set": bool(settings.SRE_GROUP_CHAT_ID), "whitelist_count": len(settings.OPENCLAW_TG_USER_WHITELIST), "last_update_id": gateway._last_update_id, diff --git a/apps/api/src/jobs/capacity_forecaster_job.py b/apps/api/src/jobs/capacity_forecaster_job.py index 18b27c2c..67657921 100644 --- a/apps/api/src/jobs/capacity_forecaster_job.py +++ b/apps/api/src/jobs/capacity_forecaster_job.py @@ -326,7 +326,7 @@ async def _send_telegram_forecast( from src.services.ai_advisory_helpers import build_ai_advisory_keyboard, is_snoozed from src.services.telegram_gateway import get_telegram_gateway - target_chat_id = settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID + target_chat_id = settings.SRE_GROUP_CHAT_ID if not target_chat_id: return False diff --git a/apps/api/src/jobs/compliance_scanner_job.py b/apps/api/src/jobs/compliance_scanner_job.py index bfe71dca..8453f331 100644 --- a/apps/api/src/jobs/compliance_scanner_job.py +++ b/apps/api/src/jobs/compliance_scanner_job.py @@ -474,7 +474,7 @@ async def _send_telegram_posture( from src.services.ai_advisory_helpers import build_ai_advisory_keyboard, is_snoozed from src.services.telegram_gateway import get_telegram_gateway - target_chat_id = settings.SRE_GROUP_CHAT_ID or 
settings.OPENCLAW_TG_CHAT_ID + target_chat_id = settings.SRE_GROUP_CHAT_ID if not target_chat_id: return diff --git a/apps/api/src/jobs/coverage_evaluator_job.py b/apps/api/src/jobs/coverage_evaluator_job.py index e08011e3..8d9843db 100644 --- a/apps/api/src/jobs/coverage_evaluator_job.py +++ b/apps/api/src/jobs/coverage_evaluator_job.py @@ -299,7 +299,7 @@ async def _send_telegram_gaps( from src.services.ai_advisory_helpers import build_ai_advisory_keyboard, is_snoozed from src.services.telegram_gateway import get_telegram_gateway - target_chat_id = settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID + target_chat_id = settings.SRE_GROUP_CHAT_ID if not target_chat_id: return diff --git a/apps/api/src/jobs/hermes_rule_quality_job.py b/apps/api/src/jobs/hermes_rule_quality_job.py index 4d342843..cc20fa75 100644 --- a/apps/api/src/jobs/hermes_rule_quality_job.py +++ b/apps/api/src/jobs/hermes_rule_quality_job.py @@ -316,7 +316,7 @@ async def _send_telegram_summary( from src.services.ai_advisory_helpers import build_ai_advisory_keyboard, is_snoozed from src.services.telegram_gateway import get_telegram_gateway - target_chat_id = settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID + target_chat_id = settings.SRE_GROUP_CHAT_ID if not target_chat_id: logger.info("hermes_telegram_skip_no_chat_id") return False diff --git a/apps/api/src/services/ai_rate_limiter.py b/apps/api/src/services/ai_rate_limiter.py index 0c02d030..09b2eda2 100644 --- a/apps/api/src/services/ai_rate_limiter.py +++ b/apps/api/src/services/ai_rate_limiter.py @@ -276,7 +276,7 @@ class AIRateLimiter: from src.core.config import settings from src.services.telegram_gateway import get_telegram_gateway - target_chat_id = settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID + target_chat_id = settings.SRE_GROUP_CHAT_ID if not settings.OPENCLAW_TG_BOT_TOKEN or not target_chat_id: logger.warning("telegram_not_configured_for_cost_alert") return @@ -328,7 +328,7 @@ class AIRateLimiter: from 
src.core.config import settings from src.services.telegram_gateway import get_telegram_gateway - target_chat_id = settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID + target_chat_id = settings.SRE_GROUP_CHAT_ID if not settings.OPENCLAW_TG_BOT_TOKEN or not target_chat_id: return diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index 88a67bcf..809752c4 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -1119,7 +1119,7 @@ class ApprovalExecutionService: from src.services.telegram_gateway import get_telegram_gateway settings = get_settings() gateway = get_telegram_gateway() - target_chat_id = settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID + target_chat_id = settings.SRE_GROUP_CHAT_ID if not target_chat_id: logger.warning( "push_execution_result_no_target_chat", diff --git a/apps/api/src/services/converged_alert_recurrence_notifier.py b/apps/api/src/services/converged_alert_recurrence_notifier.py index a8df6f90..054f0330 100644 --- a/apps/api/src/services/converged_alert_recurrence_notifier.py +++ b/apps/api/src/services/converged_alert_recurrence_notifier.py @@ -3,7 +3,6 @@ import hashlib import html -from src.core.config import settings from src.core.logging import get_logger from src.core.redis_client import get_redis from src.services.telegram_gateway import get_telegram_gateway @@ -179,21 +178,6 @@ async def notify_converged_alert_recurrence( error=str(exc), ) - private_chat_id = settings.OPENCLAW_TG_CHAT_ID - if private_chat_id and private_chat_id != gateway.alert_chat_id: - try: - await gateway.send_notification(text, chat_id=private_chat_id) - sent_count += 1 - except Exception as exc: - failures.append(f"private:{type(exc).__name__}") - logger.warning( - "converged_alert_recurrence_private_mirror_failed", - source=source, - approval_id=approval_id, - recurrence_stage=recurrence_stage, - error=str(exc), - ) - if sent_count: 
logger.info( "converged_alert_recurrence_sent", @@ -201,7 +185,7 @@ async def notify_converged_alert_recurrence( hit_count=hit_count, approval_id=approval_id, recurrence_stage=recurrence_stage, - mirrored_to_private=bool(private_chat_id and private_chat_id != gateway.alert_chat_id), + mirrored_to_private=False, sent_count=sent_count, ) else: diff --git a/apps/api/src/services/failover_alerter.py b/apps/api/src/services/failover_alerter.py index f2e2f58e..3c945642 100644 --- a/apps/api/src/services/failover_alerter.py +++ b/apps/api/src/services/failover_alerter.py @@ -252,7 +252,7 @@ class FailoverAlerter: from src.services.telegram_gateway import get_telegram_gateway settings = get_settings() - chat_id = getattr(settings, "SRE_GROUP_CHAT_ID", None) or getattr(settings, "OPENCLAW_TG_CHAT_ID", None) + chat_id = getattr(settings, "SRE_GROUP_CHAT_ID", None) if not chat_id: logger.warning("telegram_chat_id_missing_failover_alert") return diff --git a/apps/api/src/services/notification_matrix.py b/apps/api/src/services/notification_matrix.py index 4d6f15b1..fa6f2739 100644 --- a/apps/api/src/services/notification_matrix.py +++ b/apps/api/src/services/notification_matrix.py @@ -4,8 +4,8 @@ Notification routing matrix — ADR-093 單一矩陣決定每種通知類型的發送目標,取代 telegram_gateway.py 內 24 處硬碼 chat_id。 設計原則: -- 正式告警目的地一律 SRE_GROUP_CHAT_ID 優先 -- OPENCLAW_TG_CHAT_ID 只在 SRE_GROUP_CHAT_ID 缺失時作 fail-soft fallback +- 正式告警目的地一律為 SRE_GROUP_CHAT_ID +- SRE_GROUP_CHAT_ID 缺失時必須顯示配置缺口,不得旁路到個人或其他群組 - 未知通知類型預設發群組 2026-04-25 ogt + Claude Sonnet 4.6 @@ -16,7 +16,7 @@ from enum import Enum class Destination(str, Enum): - DM = "dm" # OPENCLAW_TG_CHAT_ID (僅缺群組設定時 fallback) + DM = "dm" # legacy alias: 2026-06-12 起不再旁路至 DM GROUP = "group" # SRE_GROUP_CHAT_ID BOTH = "both" # legacy alias: 2026-04-30 起視為 group-first @@ -28,7 +28,7 @@ class RoutingRule: # ADR-093 D1-D4 路由矩陣 -# 2026-04-30 Codex: 所有告警類型群組優先,DM 只作缺群組設定 fallback。 +# 2026-06-12 Codex: 所有正式告警只送 AwoooI SRE 戰情室;缺群組設定時回空清單。 NOTIFICATION_ROUTING: dict[str, 
RoutingRule] = { "TYPE-1": RoutingRule(Destination.GROUP), "TYPE-2": RoutingRule(Destination.GROUP), @@ -60,7 +60,5 @@ def resolve_chat_ids( 回傳此通知應發送的 chat_id 清單。 tg_group_cutover 僅保留為舊 caller 相容參數;正式策略永遠群組優先。 """ - rule = get_routing_rule(notification_type) - if rule.destination == Destination.DM and not group_chat_id: - return [dm_chat_id] if dm_chat_id else [] - return [group_chat_id or dm_chat_id] if (group_chat_id or dm_chat_id) else [] + _ = get_routing_rule(notification_type) + return [group_chat_id] if group_chat_id else [] diff --git a/apps/api/src/services/notifications/telegram.py b/apps/api/src/services/notifications/telegram.py index 1aea558b..4a21673a 100644 --- a/apps/api/src/services/notifications/telegram.py +++ b/apps/api/src/services/notifications/telegram.py @@ -29,10 +29,8 @@ class TelegramWebhookProvider(NotificationProvider): @property def enabled(self) -> bool: - """檢查 Telegram bot token 與 chat ID 是否配置""" - return bool(settings.OPENCLAW_TG_BOT_TOKEN) and bool( - settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID - ) + """檢查 Telegram bot token 與 AwoooI SRE 戰情室是否配置。""" + return bool(settings.OPENCLAW_TG_BOT_TOKEN) and bool(settings.SRE_GROUP_CHAT_ID) def _format(self, msg: NotificationMessage) -> str: """格式化執行結果為 Telegram 訊息""" diff --git a/apps/api/src/services/post_execution_verifier.py b/apps/api/src/services/post_execution_verifier.py index 04da2cf4..b22fea42 100644 --- a/apps/api/src/services/post_execution_verifier.py +++ b/apps/api/src/services/post_execution_verifier.py @@ -429,7 +429,7 @@ async def _send_rollback_proposal_alert( f"此為提案,不會自動執行 Rollback" ) - target_chat_id = _settings.SRE_GROUP_CHAT_ID or _settings.OPENCLAW_TG_CHAT_ID + target_chat_id = _settings.SRE_GROUP_CHAT_ID await gateway._send_request( "sendMessage", { diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 10d7247c..d6513ec0 100644 --- a/apps/api/src/services/telegram_gateway.py +++ 
b/apps/api/src/services/telegram_gateway.py @@ -3375,8 +3375,8 @@ class TelegramGateway: logger.warning("telegram_gateway_disabled", reason="Bot token not configured") return False - if not settings.OPENCLAW_TG_CHAT_ID and not settings.SRE_GROUP_CHAT_ID: - logger.warning("telegram_gateway_disabled", reason="No Telegram chat ID configured") + if not settings.SRE_GROUP_CHAT_ID: + logger.warning("telegram_gateway_disabled", reason="SRE_GROUP_CHAT_ID not configured") return False # 2026-04-03 ogt: timeout 改用 httpx.Timeout 分開設定 @@ -3400,13 +3400,13 @@ class TelegramGateway: @property def chat_id(self) -> str: - """取得 Chat ID""" - return settings.OPENCLAW_TG_CHAT_ID + """取得正式產品告警 Chat ID。""" + return settings.SRE_GROUP_CHAT_ID @property def alert_chat_id(self) -> str: - """告警訊息收件人:SRE 群組優先,缺設定時才回退個人頻道。""" - return settings.SRE_GROUP_CHAT_ID or settings.OPENCLAW_TG_CHAT_ID + """告警訊息收件人:正式產品告警只送 AwoooI SRE 戰情室。""" + return settings.SRE_GROUP_CHAT_ID def _summarize_callback_data_for_audit(self, callback_data: str) -> dict[str, str | None]: """Return a redaction-safe summary of callback_data without persisting nonce.""" @@ -9338,16 +9338,14 @@ class TelegramGateway: text = report_to_telegram_html(report) - # 只發到 SRE 戰情室群組 - if settings.SRE_GROUP_CHAT_ID: - await self.send_to_group(text=text) - else: - # SRE_GROUP_CHAT_ID 未注入時,fallback 到個人頻道並加警告 - fallback = ( - "⚠️ SRE_GROUP_CHAT_ID 未設定,心跳報告暫發到個人頻道\n\n" - + text + # 只發到 AwoooI SRE 戰情室;缺設定時不得旁路到個人或其他群組。 + if not settings.SRE_GROUP_CHAT_ID: + logger.warning( + "telegram_heartbeat_skipped", + reason="SRE_GROUP_CHAT_ID not configured", ) - await self.send_notification(fallback) + return False + await self.send_to_group(text=text) self._last_message_time = datetime.now(UTC) logger.info( diff --git a/apps/api/tests/test_alert_converged_recurrence.py b/apps/api/tests/test_alert_converged_recurrence.py index a2409fba..5bab1772 100644 --- a/apps/api/tests/test_alert_converged_recurrence.py +++ 
b/apps/api/tests/test_alert_converged_recurrence.py @@ -126,7 +126,7 @@ async def test_converged_recurrence_falls_back_to_milestones(monkeypatch): @pytest.mark.asyncio -async def test_converged_recurrence_mirrors_to_private_chat(monkeypatch): +async def test_converged_recurrence_does_not_mirror_to_private_chat(monkeypatch): gateway = _FakeGateway() async def _always_notify(*, fingerprint, hit_count): @@ -134,7 +134,6 @@ async def test_converged_recurrence_mirrors_to_private_chat(monkeypatch): monkeypatch.setattr(notifier, "should_notify_converged_alert_recurrence", _always_notify) monkeypatch.setattr(notifier, "get_telegram_gateway", lambda: gateway) - monkeypatch.setattr(notifier.settings, "OPENCLAW_TG_CHAT_ID", "private-chat") await notifier.notify_converged_alert_recurrence( source="alertmanager", @@ -151,9 +150,4 @@ async def test_converged_recurrence_mirrors_to_private_chat(monkeypatch): ) assert len(gateway.primary_messages) == 1 - assert gateway.private_messages == [ - { - "text": gateway.primary_messages[0], - "chat_id": "private-chat", - } - ] + assert gateway.private_messages == [] diff --git a/apps/api/tests/test_notification_matrix_group_cutover.py b/apps/api/tests/test_notification_matrix_group_cutover.py index 1b086a06..ca3a6b8d 100644 --- a/apps/api/tests/test_notification_matrix_group_cutover.py +++ b/apps/api/tests/test_notification_matrix_group_cutover.py @@ -22,10 +22,10 @@ def test_all_alert_types_resolve_to_sre_group_first() -> None: ) == ["-1003711974679"] -def test_dm_is_only_fail_soft_fallback_when_group_missing() -> None: +def test_dm_is_not_used_when_group_missing() -> None: assert resolve_chat_ids( "TYPE-3", dm_chat_id="5619078117", group_chat_id="", tg_group_cutover=True, - ) == ["5619078117"] + ) == [] diff --git a/apps/api/tests/test_telegram_button_consistency.py b/apps/api/tests/test_telegram_button_consistency.py index 660f459d..f96cad5f 100644 --- a/apps/api/tests/test_telegram_button_consistency.py +++ 
b/apps/api/tests/test_telegram_button_consistency.py @@ -195,6 +195,30 @@ class TestSREGroupCutover: assert "_send_approval_card_to_group(" not in fn_body assert "asyncio.create_task" not in fn_body + def test_alert_chat_id_is_sre_only(self): + source = _read_gateway() + match = re.search( + r"def alert_chat_id\(self\).*?(?=\n def _summarize_callback_data_for_audit)", + source, + re.DOTALL, + ) + assert match, "找不到 alert_chat_id property" + fn_body = match.group(0) + assert "return settings.SRE_GROUP_CHAT_ID" in fn_body + assert "or settings.OPENCLAW_TG_CHAT_ID" not in fn_body + + def test_default_chat_id_is_sre_only(self): + source = _read_gateway() + match = re.search( + r"def chat_id\(self\).*?(?=\n @property\n def alert_chat_id)", + source, + re.DOTALL, + ) + assert match, "找不到 chat_id property" + fn_body = match.group(0) + assert "return settings.SRE_GROUP_CHAT_ID" in fn_body + assert "return settings.OPENCLAW_TG_CHAT_ID" not in fn_body + # ============================================================================= # Test: callback handler 完整性(鬼魂按鈕鐵律) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index c7b6f33e..1a39ae18 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,32 @@ +## 2026-06-12|P2-403K AwoooI SRE 戰情室路由收斂 + +**背景**:統帥指出產品告警不應分散到其他 Telegram Bot 或群組,所有 AWOOOI 產品告警必須集中到 **AwoooI SRE 戰情室**。P2-403J 已先補報表真相、日週月報、風險自動化與 TG 旁路審查;本段開始把實際 workflow、API service、ops script 與 CI guard 收斂到單一正式出口。 + +**完成**: + +- Gitea workflow 告警路由移除舊 `TELEGRAM_ALERT_CHAT_ID` / `TELEGRAM_CHAT_ID` 路徑,direct Telegram fallback 全部改用 `SRE_GROUP_CHAT_ID`;`e2e-health` 舊 `OPENCLAW_TG_BOT_TOKEN` direct fallback 改用正式 `TELEGRAM_BOT_TOKEN` + `SRE_GROUP_CHAT_ID`。 +- CD / dev CD secret 注入已讓舊相容欄位 `OPENCLAW_TG_CHAT_ID` 取自 `SRE_GROUP_CHAT_ID`,避免舊程式碼誤用相容欄位時旁路到其他群組。 +- `telegram_gateway` 的 `chat_id` / `alert_chat_id` 預設收件人收斂為 `settings.SRE_GROUP_CHAT_ID`;缺 `SRE_GROUP_CHAT_ID` 時 Telegram Gateway / heartbeat report 不再 fallback 到個人或舊群組。 +- `notification_matrix`、Telegram 
provider、recurrence notifier、capacity / coverage / Hermes rule quality / compliance jobs、approval execution、AI rate limiter、post execution verifier、failover alerter 與 `/api/v1/telegram/health` 全部改為 SRE-only;recurrence notifier 移除 private mirror。 +- ops scripts `docker-health-monitor`、DR drill、PostgreSQL backup、110 backup 與 Alertmanager config deploy fallback 全部改用 `SRE_GROUP_CHAT_ID`。 +- `check-gitea-step-env-secrets.js` 新增路由 guard:禁止 workflow 重新引用 `TELEGRAM_ALERT_CHAT_ID` / `TELEGRAM_CHAT_ID`,並擋下 direct Telegram fallback 未指向 `SRE_GROUP_CHAT_ID` 的路徑。 + +**本地驗證**: + +- `node scripts/ci/check-gitea-step-env-secrets.js`:`no Gitea step env/with secrets or legacy Telegram routes`。 +- 路由殘留掃描:`.gitea` / `apps/api/src` / `apps/api/tests` / `scripts/ops` / `k8s/awoooi-prod` 未命中舊 `TELEGRAM_ALERT_CHAT_ID`、舊 `TELEGRAM_CHAT_ID`、SRE/OpenClaw chat fallback 混用、個人 fallback 或 direct OpenClaw bot sendMessage。 +- `python3.11 -m py_compile`:Telegram gateway、notification matrix、Telegram provider、recurrence notifier、failover alerter、post verifier、rate limiter、approval execution、Telegram API 與相關 jobs 通過。 +- `bash -n`:相關 ops scripts 與 CI notify script 通過。 +- `DATABASE_URL='postgresql+asyncpg://test:test@localhost/test' PYTHONPATH=. 
python3.11 -m pytest -q tests/test_notification_matrix_group_cutover.py tests/test_alert_converged_recurrence.py tests/test_failover_alerter.py tests/test_telegram_button_consistency.py`:`39 passed`。 + +**完成度同步**: + +- P2-403K AwoooI SRE 戰情室路由程式收斂:本地 `100%`,正式站待部署驗證。 +- 三 Agent 主動溝通、學習與成長證據:維持最新工作清單口徑 `100%`;本段提高告警出口一致性,但 Telegram live send、Gateway queue write、receipt worker、KM / PlayBook / timeline / replay score 寫入與 runtime worker 仍未開 gate。 +- AI Agent automation backlog 維持 `92%`;IwoooS 整體仍維持 `64%`;active runtime gate 仍 `0`。 + +**邊界**:本段未讀取 secret value、未發真實 Telegram 測試訊息、未改 Prometheus / Alertmanager route 或 receiver、未改 CronJob、未 SSH、未 active scan、未啟動 runtime repair / verifier worker、未把 SRE 路由收斂解讀為資安 owner response 或 runtime 授權。 + ## 2026-06-12|P2-403J 日週月報與風險自動化 Review **背景**:統帥要求 AI Agent 產生日報、週報、月報,能看到每個 AI Agent 做了哪些工作與工作量,並以數據化、圖表化報告呈現;Agent 看過報告後要能自動分析評估並提出解決方案,高風險需統帥審核,中 / 低風險則朝自動處理並告警回報前進。本段先建立只讀 review、API、治理頁與測試,不直接開啟 runtime 執行。 diff --git a/docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md b/docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md index dba05220..4943d286 100644 --- a/docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md +++ b/docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md @@ -12,7 +12,7 @@ | Nemotron 實際整合應用 | 30% | 完整回放前仍被關卡擋下 | `blocked_needs_evidence`,下一關是 `refresh_source_evidence_then_5_record_smoke_only` | | 工具 / 服務 / 套件 AI 自動化 | 92% | P0 已完成;P1 服務 / runtime / 監控 / provider / service health / 備份 / DR / 套件與供應鏈只讀基線已完成;P1-007 失敗限定通知合約與前端 redaction 合約已完成;下一主線是 P2-004 依賴 / 供應鏈漂移監控 | 狀態分類、盤點 schema、權限矩陣、靜態盤點種子、只讀 API、UI 骨架、驗證、自動化待辦 schema / 快照 / API / 分組 UI、Backup / DR 目標盤點、準備度矩陣、備份通知政策、Backup / DR 證據 UI、復原演練批准包模板、異地 / escrow 準備度狀態、任務批准邊界、確定性進度彙總、Python 套件 / 供應鏈只讀基線、JS pnpm/npm 只讀基線、Docker build surface 只讀基線、CVE / license / drift 嚴重度政策、定期依賴漂移與外部資料來源檢查設計、依賴升級批准包模板、runtime_surface_inventory_v1 schema / snapshot / API / UI、gitea_workflow_runner_health_v1 schema / snapshot / API / UI、observability_contract_matrix_v1 schema / snapshot / API / 
UI、ai_provider_route_matrix_v1 schema / snapshot / API / UI、service_health_gap_matrix_v1 schema / snapshot / API / UI、service health evidence cards UI、service_health_failure_notification_policy_v1 schema / snapshot / API / UI 已完成 | | OpenClaw / Hermes / NemoTron 佈建布局 | 45% | P1-401 / P1-402 已完成;仍是只讀 layout 與治理頁顯示,不是 runtime deploy | `ai_agent_deployment_layout_v1` schema、`ai_agent_deployment_layout_2026-06-11.json`、`GET /api/v1/agents/agent-deployment-layout`、治理頁自動化盤點 UI、`AI_AGENT_DEPLOYMENT_LAYOUT_2026-06-11.md` | -| OpenClaw / Hermes / NemoTron 主動溝通、學習與成長證據 | 100% | P2-401A 已完成只讀 contract;P2-403A 已完成互動 / 接手 / 學習 / 成長證據面板;P2-403B 已完成 AgentSession / Redis Streams live read model gate;P2-403C 已完成 Redis Streams consumer group dry-run、handoff envelope、ack / dead-letter / replay gate;P2-403D 已完成 learning writeback approval package;P2-403E 已完成 Telegram receipt approval package;P2-403F 已完成 owner-approved learning dry-run preview、人工操作選項與 fixture-only dry-run 總包;P2-403G 已完成 runtime write gate review,固定雙重批准、dry-run hash、post-write verifier 與 redaction 欄位;P2-403H 已完成 post-write verifier implementation package、rollback lane、failure lane 與人工操作選項;P2-403I 已完成 runtime verifier evidence implementation review;P2-403J 已完成報表真相、告警有效性、日報、週報、月報、每個 Agent 工作量、圖表化報告、AI 分析建議與高 / 中 / 低風險自動化政策審查。runtime worker、DB migration、production Redis consumer group、Telegram 實發、Telegram route change、report delivery、AI analysis runtime、中低風險 auto worker、KM / PlayBook trust / timeline / replay score 寫入、SDK / 付費服務仍未開 gate | `ai_agent_communication_learning_contract_v1`、`ai_agent_interaction_learning_proof_v1`、`ai_agent_live_read_model_gate_v1`、`ai_agent_redis_dry_run_gate_v1`、`ai_agent_learning_writeback_approval_package_v1`、`ai_agent_telegram_receipt_approval_package_v1`、`ai_agent_owner_approved_learning_dry_run_v1`、`ai_agent_owner_approved_fixture_dry_run_v1`、`GET /api/v1/agents/agent-communication-learning-contract`、`GET /api/v1/agents/agent-interaction-learning-proof`、`GET 
/api/v1/agents/agent-live-read-model-gate`、`GET /api/v1/agents/agent-redis-dry-run-gate`、`GET /api/v1/agents/agent-learning-writeback-approval-package`、`GET /api/v1/agents/agent-telegram-receipt-approval-package`、`GET /api/v1/agents/agent-owner-approved-learning-dry-run`、`GET /api/v1/agents/agent-owner-approved-fixture-dry-run`、`ai_agent_runtime_write_gate_review_v1`、`GET /api/v1/agents/agent-runtime-write-gate-review`、`ai_agent_post_write_verifier_package_v1`、`GET /api/v1/agents/agent-post-write-verifier-package`、`ai_agent_runtime_verifier_evidence_review_v1`、`GET /api/v1/agents/agent-runtime-verifier-evidence-review`、`ai_agent_report_truth_actionability_review_v1`、`GET /api/v1/agents/agent-report-truth-actionability-review`、`ai_agent_report_automation_review_v1`、`GET /api/v1/agents/agent-report-automation-review`、`/zh-TW/governance?tab=automation-inventory`、MASTER §3.2.1b / §3.2.1d / §3.4.3 | +| OpenClaw / Hermes / NemoTron 主動溝通、學習與成長證據 | 100% | P2-401A 已完成只讀 contract;P2-403A 已完成互動 / 接手 / 學習 / 成長證據面板;P2-403B 已完成 AgentSession / Redis Streams live read model gate;P2-403C 已完成 Redis Streams consumer group dry-run、handoff envelope、ack / dead-letter / replay gate;P2-403D 已完成 learning writeback approval package;P2-403E 已完成 Telegram receipt approval package;P2-403F 已完成 owner-approved learning dry-run preview、人工操作選項與 fixture-only dry-run 總包;P2-403G 已完成 runtime write gate review,固定雙重批准、dry-run hash、post-write verifier 與 redaction 欄位;P2-403H 已完成 post-write verifier implementation package、rollback lane、failure lane 與人工操作選項;P2-403I 已完成 runtime verifier evidence implementation review;P2-403J 已完成報表真相、告警有效性、日報、週報、月報、每個 Agent 工作量、圖表化報告、AI 分析建議與高 / 中 / 低風險自動化政策審查;P2-403K 已完成本地程式層 SRE 戰情室路由收斂,移除 Gitea / API / ops script 的舊群組與 private mirror fallback。runtime worker、DB migration、production Redis consumer group、Telegram 實發、delivery receipt E2E、report delivery、AI analysis runtime、中低風險 auto worker、KM / PlayBook trust / timeline / replay score 寫入、SDK / 付費服務仍未開 gate | 
`ai_agent_communication_learning_contract_v1`、`ai_agent_interaction_learning_proof_v1`、`ai_agent_live_read_model_gate_v1`、`ai_agent_redis_dry_run_gate_v1`、`ai_agent_learning_writeback_approval_package_v1`、`ai_agent_telegram_receipt_approval_package_v1`、`ai_agent_owner_approved_learning_dry_run_v1`、`ai_agent_owner_approved_fixture_dry_run_v1`、`GET /api/v1/agents/agent-communication-learning-contract`、`GET /api/v1/agents/agent-interaction-learning-proof`、`GET /api/v1/agents/agent-live-read-model-gate`、`GET /api/v1/agents/agent-redis-dry-run-gate`、`GET /api/v1/agents/agent-learning-writeback-approval-package`、`GET /api/v1/agents/agent-telegram-receipt-approval-package`、`GET /api/v1/agents/agent-owner-approved-learning-dry-run`、`GET /api/v1/agents/agent-owner-approved-fixture-dry-run`、`ai_agent_runtime_write_gate_review_v1`、`GET /api/v1/agents/agent-runtime-write-gate-review`、`ai_agent_post_write_verifier_package_v1`、`GET /api/v1/agents/agent-post-write-verifier-package`、`ai_agent_runtime_verifier_evidence_review_v1`、`GET /api/v1/agents/agent-runtime-verifier-evidence-review`、`ai_agent_report_truth_actionability_review_v1`、`GET /api/v1/agents/agent-report-truth-actionability-review`、`ai_agent_report_automation_review_v1`、`GET /api/v1/agents/agent-report-automation-review`、`/zh-TW/governance?tab=automation-inventory`、MASTER §3.2.1b / §3.2.1d / §3.4.3 | | AI Agent 主動營運委派與版本生命週期 | 100% | P2-402A / P2-402B / P2-402C / P2-402D / P2-402E / P2-402F / P2-402G 已完成;已建立 repo-only 版本新鮮度快照、工具採用批准包、Telegram action-required digest policy、Gitea PR 草案 lane、host / K3s / stateful 版本只讀盤點、API 與 governance UI。定期排程、外部版本查詢、工具安裝、CI 變更、套件升級、主機更新、container pull、實際 PR creation、auto merge、Telegram 實發、SSH、kubectl、重啟仍未開 gate | `ai_agent_proactive_operations_contract_v1`、`ai_agent_version_freshness_snapshot_v1`、`ai_agent_tool_adoption_approval_package_v1`、`ai_agent_telegram_action_required_digest_policy_v1`、`ai_agent_gitea_pr_draft_lane_v1`、`ai_agent_host_stateful_version_inventory_v1`、`GET 
/api/v1/agents/agent-proactive-operations-contract`、`GET /api/v1/agents/agent-version-freshness-snapshot`、`GET /api/v1/agents/agent-tool-adoption-approval-package`、`GET /api/v1/agents/agent-telegram-action-required-digest-policy`、`GET /api/v1/agents/agent-gitea-pr-draft-lane`、`GET /api/v1/agents/agent-host-stateful-version-inventory`、`/zh-TW/governance?tab=automation-inventory`、MASTER §3.2.1c | | 本工作清單與分析報告 | 100% | 已完成 | 本 MD 文件 | @@ -20,9 +20,9 @@ AI Agent 自動化工作包目前完成度:**92%**。本工作清單文件本 三 Agent 佈建布局目前完成度:**45%**。第一波已完成只讀 schema / snapshot / API / 測試 / 報告,第二波已接入治理頁自動化盤點 UI;正式 runtime 佈署、Telegram E2E 發送與 AgentSession 工作流仍需逐項 gate。 -三 Agent 主動溝通、學習與成長證據目前完成度:**100%**。已完成只讀契約、互動 / 接手 / 學習 / 成長證據面板、P2-403B live read model gate、P2-403C Redis dry-run gate、P2-403D learning writeback approval package、P2-403E Telegram receipt approval package、P2-403F owner-approved learning dry-run preview、人工操作選項與 fixture-only dry-run 總包、P2-403G runtime write gate review、P2-403H post-write verifier implementation package、P2-403I runtime verifier evidence implementation review、P2-403J 報表真相 / 告警有效性 / 日週月報 / Agent 工作量 / 圖表化報告 / AI 建議 / 風險自動化政策審查、API、治理頁顯示、測試與 MASTER 同步;目前 live AgentSession、Agent message、handoff、learning write、Telegram receipt、Gateway queue write、runtime verifier execution、report delivery、AI analysis runtime、中低風險 auto worker、Telegram route change 與 Telegram send 仍全部為 `0`,下一步依優先順序推 `P2-403K` unified report truth service / 中低風險 runtime guard / SRE 戰情室路由遷移批准包,但在批准前仍不得啟動 runtime loop。 +三 Agent 主動溝通、學習與成長證據目前完成度:**100%**。已完成只讀契約、互動 / 接手 / 學習 / 成長證據面板、P2-403B live read model gate、P2-403C Redis dry-run gate、P2-403D learning writeback approval package、P2-403E Telegram receipt approval package、P2-403F owner-approved learning dry-run preview、人工操作選項與 fixture-only dry-run 總包、P2-403G runtime write gate review、P2-403H post-write verifier implementation package、P2-403I runtime verifier evidence implementation review、P2-403J 報表真相 / 告警有效性 / 日週月報 / Agent 工作量 / 圖表化報告 / AI 建議 / 風險自動化政策審查、P2-403K 
SRE 戰情室路由程式收斂、API、治理頁顯示、測試與 MASTER 同步;目前 live AgentSession、Agent message、handoff、learning write、Telegram receipt、Gateway queue write、runtime verifier execution、report delivery、AI analysis runtime、中低風險 auto worker、Telegram 實發與 delivery receipt E2E 仍全部為 `0`,下一步依優先順序推 `P2-403L` delivery receipt / queue write E2E,但仍不得跳過 runtime gate。 -AI Agent 主動營運委派與版本生命週期目前完成度:**100%**。已完成 12 類版本 domain、24 類可委派能力、5 種 cadence、8 類 MCP、4 類 RAG memory、只讀 API、`P2-402B` repo-only daily version freshness snapshot、`P2-402C` Renovate / OSV-Scanner / Trivy / Syft / Grype 工具採用批准包、`P2-402D` Telegram action-required digest policy、`P2-402E` Gitea PR 草案 lane、`P2-402F` host OS / K3s / stateful services 版本只讀盤點,以及 `P2-402G` governance UI 顯示可委派能力;`P2-403A`、`P2-403B`、`P2-403C`、`P2-403D`、`P2-403E`、`P2-403F` 、`P2-403G`、`P2-403H`、`P2-403I` 與 `P2-403J` 已先補互動、學習證據面、live read model gate、Redis dry-run gate、learning writeback approval package、Telegram receipt approval package、owner-approved learning dry-run preview、runtime write gate review、post-write verifier package、runtime verifier evidence review、報表真相、TG 戰情室收斂、日週月報、Agent 工作量、圖表化報告與風險自動化政策審查。下一步是 `P2-403K` unified report truth service / 中低風險 runtime guard / SRE 戰情室路由遷移批准包,外部 registry / package source / host probe / SSH / kubectl / 工具安裝 / CI 變更 / 實際 PR creation / Telegram 實發與 learning write 仍需 gate。 +AI Agent 主動營運委派與版本生命週期目前完成度:**100%**。已完成 12 類版本 domain、24 類可委派能力、5 種 cadence、8 類 MCP、4 類 RAG memory、只讀 API、`P2-402B` repo-only daily version freshness snapshot、`P2-402C` Renovate / OSV-Scanner / Trivy / Syft / Grype 工具採用批准包、`P2-402D` Telegram action-required digest policy、`P2-402E` Gitea PR 草案 lane、`P2-402F` host OS / K3s / stateful services 版本只讀盤點,以及 `P2-402G` governance UI 顯示可委派能力;`P2-403A`、`P2-403B`、`P2-403C`、`P2-403D`、`P2-403E`、`P2-403F`、`P2-403G`、`P2-403H`、`P2-403I`、`P2-403J` 與 `P2-403K` 已先補互動、學習證據面、live read model gate、Redis dry-run gate、learning writeback approval package、Telegram receipt approval package、owner-approved learning dry-run preview、runtime 
write gate review、post-write verifier package、runtime verifier evidence review、報表真相、TG 戰情室收斂、日週月報、Agent 工作量、圖表化報告、風險自動化政策審查與 SRE 戰情室路由程式收斂。下一步是 `P2-403L` delivery receipt / queue write E2E,外部 registry / package source / host probe / SSH / kubectl / 工具安裝 / CI 變更 / 實際 PR creation / Telegram 實發與 learning write 仍需 gate。 完成度計算模型: @@ -968,6 +968,7 @@ UI: | P2-403G | 完成 | 100 | OpenClaw | Runtime write gate review、雙重批准、dry-run hash、post-write verifier 與 redaction gate | `ai_agent_runtime_write_gate_review_v1` / snapshot / 只讀 API / governance UI;4 個 write target、4 個 approval gate、9 個必填欄位與 live write total `0` | 不寫 KM、不更新 PlayBook trust、不寫 timeline / replay score、不發 Telegram;runtime write 仍未授權 | | P2-403H | 完成 | 100 | OpenClaw | Post-write verifier implementation package、rollback lane、failure lane 與人工操作選項 | `ai_agent_post_write_verifier_package_v1` / snapshot / 只讀 API / governance UI;4 個 verification target、3 個 failure lane、4 個 operator action 與 live verifier execution `0` | 不讀 canonical target、不寫 rollback work item、不發 Telegram、不寫 KM / PlayBook trust / timeline / replay score;runtime verifier 仍未授權 | | P2-403J | 完成 | 100 | Hermes + OpenClaw | 報表真相、告警有效性、日週月報、每個 Agent 工作量、圖表化報告、AI 分析建議與風險自動化政策審查;高風險需審核,中低風險目前只定義 policy | `ai_agent_report_truth_actionability_review_v1` + `ai_agent_report_automation_review_v1` / snapshot / 只讀 API / governance UI;5 個真相缺口、3 個日週月契約、4 個 actionability lane、4 條 TG 旁路風險、3 個報表週期、3 個 Agent 工作量、4 個 chart package、5 個 recommendation | 不發 Telegram、不改 CronJob、不改 Prometheus / Alertmanager、不改 route / receiver、不讀 secret、不寫 work item / KM / PlayBook trust、不開 runtime worker、不排程實發、不啟動中低風險 auto worker、不執行生產優化 | +| P2-403K | 本地完成,正式站待驗證 | 100 | OpenClaw | AwoooI SRE 戰情室路由程式收斂;移除 Gitea / API / ops script 舊群組與 private mirror fallback | Gitea workflow 使用 `SRE_GROUP_CHAT_ID`;CD 舊相容欄位取自 SRE group;Telegram Gateway / notification matrix / jobs / ops scripts SRE-only;CI guard 擋舊 `TELEGRAM_ALERT_CHAT_ID` / `TELEGRAM_CHAT_ID` 與非 SRE direct fallback | 未讀 secret value、未發 
Telegram live 測試、未改 Prometheus / Alertmanager route、未開 Gateway queue write / receipt worker / runtime gate | | P2-101 | 待辦 | 0 | OpenClaw | 定義操作類別權限模型 | 操作政策 schema | HITL 關卡 | | P2-102 | 待辦 | 0 | OpenClaw | 所有候選操作都要有 dry-run 證據 | dry-run 合約 | 不直接 apply | | P2-103 | 待辦 | 0 | Hermes | 把任務結果接回 KM / LOGBOOK / 稽核軌跡 | 證據寫入器 | 不洩漏 secret | diff --git a/scripts/ci/check-gitea-step-env-secrets.js b/scripts/ci/check-gitea-step-env-secrets.js index 2a692592..24b44002 100755 --- a/scripts/ci/check-gitea-step-env-secrets.js +++ b/scripts/ci/check-gitea-step-env-secrets.js @@ -10,6 +10,7 @@ const path = require("path"); const root = path.resolve(__dirname, "../.."); const workflowDir = path.join(root, ".gitea", "workflows"); const violations = []; +const routeViolations = []; for (const fileName of fs.readdirSync(workflowDir).sort()) { if (!fileName.endsWith(".yml") && !fileName.endsWith(".yaml")) { @@ -17,9 +18,29 @@ for (const fileName of fs.readdirSync(workflowDir).sort()) { } const filePath = path.join(workflowDir, fileName); - const lines = fs.readFileSync(filePath, "utf8").split(/\r?\n/); + const content = fs.readFileSync(filePath, "utf8"); + const lines = content.split(/\r?\n/); let block = null; + if (content.includes("TELEGRAM_ALERT_CHAT_ID")) { + routeViolations.push(`${filePath}: legacy TELEGRAM_ALERT_CHAT_ID is not allowed; use SRE_GROUP_CHAT_ID`); + } + + if (content.includes("TELEGRAM_CHAT_ID")) { + routeViolations.push(`${filePath}: legacy TELEGRAM_CHAT_ID is not allowed for alert routing; use SRE_GROUP_CHAT_ID`); + } + + let lineOffset = 0; + lines.forEach((line, index) => { + if ( + line.includes("api.telegram.org/bot") + && !content.slice(Math.max(0, lineOffset - 700), lineOffset + line.length + 1200).includes("SRE_GROUP_CHAT_ID") + ) { + routeViolations.push(`${filePath}:${index + 1}: direct Telegram fallback must target SRE_GROUP_CHAT_ID`); + } + lineOffset += line.length + 1; + }); + lines.forEach((line, index) => { const indent = 
line.match(/^\s*/)[0].length; const trimmed = line.trim(); @@ -51,4 +72,12 @@ if (violations.length > 0) { process.exit(1); } -console.log("no Gitea step env/with secrets"); +if (routeViolations.length > 0) { + console.error("Gitea workflow Telegram route must converge on AwoooI SRE war room:"); + for (const violation of routeViolations) { + console.error(` - ${violation}`); + } + process.exit(1); +} + +console.log("no Gitea step env/with secrets or legacy Telegram routes"); diff --git a/scripts/ops/backup-from-110.sh b/scripts/ops/backup-from-110.sh index 9c0dff33..8b5575ec 100644 --- a/scripts/ops/backup-from-110.sh +++ b/scripts/ops/backup-from-110.sh @@ -59,7 +59,7 @@ notify_awoooi_ops() { notify_telegram_fallback() { local msg="$1" local tg_token="${TG_BOT_TOKEN:-${TELEGRAM_BOT_TOKEN:-}}" - local tg_chat="${TELEGRAM_ALERT_CHAT_ID:-${SRE_GROUP_CHAT_ID:--1003711974679}}" + local tg_chat="${SRE_GROUP_CHAT_ID:--1003711974679}" if [ -n "$tg_token" ] && [ -n "$tg_chat" ]; then curl -s -X POST "https://api.telegram.org/bot${tg_token}/sendMessage" \ -d "chat_id=${tg_chat}" \ diff --git a/scripts/ops/deploy-alertmanager-config.sh b/scripts/ops/deploy-alertmanager-config.sh index 01bd73cd..2f71e2a2 100755 --- a/scripts/ops/deploy-alertmanager-config.sh +++ b/scripts/ops/deploy-alertmanager-config.sh @@ -85,8 +85,7 @@ TELEGRAM_BOT_TOKEN="$( SRE_GROUP_CHAT_ID="$( read_secret_first_available \ "${SRE_GROUP_CHAT_ID:-}" \ - SRE_GROUP_CHAT_ID \ - TELEGRAM_ALERT_CHAT_ID + SRE_GROUP_CHAT_ID )" || die "missing SRE_GROUP_CHAT_ID" [[ "$SRE_GROUP_CHAT_ID" =~ ^-?[0-9]+$ ]] || die "SRE_GROUP_CHAT_ID must be a Telegram numeric chat id" diff --git a/scripts/ops/deploy-docker-health-monitor.sh b/scripts/ops/deploy-docker-health-monitor.sh index a7222ad8..60ec0eb6 100755 --- a/scripts/ops/deploy-docker-health-monitor.sh +++ b/scripts/ops/deploy-docker-health-monitor.sh @@ -87,7 +87,6 @@ deploy_to_host() { AWOOOI_API_URL=https://awoooi.wooo.work TELEGRAM_BOT_TOKEN=CHANGE_ME 
SRE_GROUP_CHAT_ID=-1003711974679 -TELEGRAM_ALERT_CHAT_ID=-1003711974679 SEND_COOLDOWN_SECONDS=300 SECRETS_TEMPLATE echo ' ⚠️ 請填寫 /etc/awoooi-ops/secrets.env.template 後重命名為 secrets.env' diff --git a/scripts/ops/docker-health-monitor.sh b/scripts/ops/docker-health-monitor.sh index 78bcd1e6..2f1f630d 100755 --- a/scripts/ops/docker-health-monitor.sh +++ b/scripts/ops/docker-health-monitor.sh @@ -25,7 +25,6 @@ fi : "${AWOOOI_API_URL:=https://awoooi.wooo.work}" : "${TELEGRAM_BOT_TOKEN:=}" : "${SRE_GROUP_CHAT_ID:=-1003711974679}" -: "${TELEGRAM_ALERT_CHAT_ID:=${SRE_GROUP_CHAT_ID:-${TELEGRAM_CHAT_ID:-}}}" : "${LOG_FILE:=/var/log/docker-health-monitor.log}" : "${SEND_COOLDOWN_SECONDS:=300}" : "${COOLDOWN_DIR:=/tmp/docker-health-monitor-cooldown}" @@ -87,10 +86,10 @@ matches_pattern() { # ─── Telegram 直發 Fallback ────────────────────────────────────────────────── send_telegram_direct() { local message="$1" - [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_ALERT_CHAT_ID" ]] && return 0 + [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$SRE_GROUP_CHAT_ID" ]] && return 0 curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \ -H "Content-Type: application/json" \ - -d "{\"chat_id\":\"${TELEGRAM_ALERT_CHAT_ID}\",\"text\":\"${message}\",\"parse_mode\":\"HTML\"}" \ + -d "{\"chat_id\":\"${SRE_GROUP_CHAT_ID}\",\"text\":\"${message}\",\"parse_mode\":\"HTML\"}" \ > /dev/null 2>&1 || true } diff --git a/scripts/ops/dr-drill.sh b/scripts/ops/dr-drill.sh index 589a8ca2..3ae873ba 100644 --- a/scripts/ops/dr-drill.sh +++ b/scripts/ops/dr-drill.sh @@ -58,7 +58,7 @@ notify_telegram() { # 只有 API 不可達或 helper 未部署時,才使用 Telegram 直發救命旁路。 notify_awoooi_ops "$status" "$msg" && return 0 - local chat_id="${TELEGRAM_ALERT_CHAT_ID:-${SRE_GROUP_CHAT_ID:--1003711974679}}" + local chat_id="${SRE_GROUP_CHAT_ID:--1003711974679}" if [[ -n "${TELEGRAM_BOT_TOKEN:-}" && -n "$chat_id" ]]; then curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \ -d "chat_id=${chat_id}" \ 
diff --git a/scripts/ops/pg-backup.sh b/scripts/ops/pg-backup.sh index cf3a16c5..d330e9fe 100644 --- a/scripts/ops/pg-backup.sh +++ b/scripts/ops/pg-backup.sh @@ -47,7 +47,7 @@ notify_telegram() { # 只有 API 不可達或 helper 未部署時,才使用 Telegram 直發救命旁路。 notify_awoooi_ops "$status" "$msg" && return 0 - local chat_id="${TELEGRAM_ALERT_CHAT_ID:-${SRE_GROUP_CHAT_ID:--1003711974679}}" + local chat_id="${SRE_GROUP_CHAT_ID:--1003711974679}" if [[ -n "${TELEGRAM_BOT_TOKEN:-}" && -n "$chat_id" ]]; then curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \ -d "chat_id=${chat_id}" \