fix(ops): harden heartbeat and momo alert noise

2026-06-24 19:38:33 +08:00
parent 72cb312aef
commit 2ec7f6f440
7 changed files with 97 additions and 16 deletions
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -71,6 +71,40 @@ INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_TTL_SECONDS = 10 * 60  # 相同失敗摘要
 GROUPED_ALERT_DIGEST_DEDUP_PREFIX = "awoooi:tg_group_digest:"  # {group_key}
 GROUPED_ALERT_DIGEST_DEDUP_TTL_SECONDS = 5 * 60  # 同一告警群組 5 分鐘只推一則 digest

+_HEARTBEAT_WARNING_FINGERPRINT_RULES: tuple[tuple[re.Pattern[str], str], ...] = (
+    # 2026-06-24 Codex + ogt: (pattern, replacement) pairs for heartbeat dedupe.
+    # Each rule keeps the actionable condition and drops volatile probe details:
+    # HTTP status, timeout, latency and count text can vary every 30 minutes
+    # while the operator action stays the same, which caused "heartbeat" noise.
+    (re.compile(r"^(Ollama\s+[^:]+)\s+異常:.*$"), r"\1 異常"),
+    (re.compile(r"^(.+?)\s+服務異常:.*$"), r"\1 服務異常"),
+    (re.compile(r"^(MCP\s+[^:]+):.*$"), r"\1"),
+    (re.compile(r"^(ArgoCD):.*$"), r"\1"),
+    (re.compile(r"^(PostgreSQL):.*$"), r"\1"),
+    (re.compile(r"^(Redis):.*$"), r"\1"),
+    (re.compile(r"^KM\s+向量化率偏低:.*$"), "KM 向量化率偏低"),
+    (re.compile(r"^系統沉默\s+.*$"), "系統沉默（無學習活動）"),
+    (re.compile(r"^待人工審核\s+.*$"), "待人工審核積壓"),
+)
+
+
+def _normalize_heartbeat_warning_for_fingerprint(warning: str) -> str:
+    """Collapse a heartbeat warning to the stable, actionable text used for dedupe."""
+    collapsed = " ".join(str(warning).split())  # squeeze every whitespace run to one space
+    for rule, stable_form in _HEARTBEAT_WARNING_FINGERPRINT_RULES:
+        candidate = rule.sub(stable_form, collapsed)
+        if candidate != collapsed:  # the first rule that rewrites the text wins
+            return candidate
+    return collapsed
+
+
+def _heartbeat_warnings_hash(warnings: list[str]) -> str:
+    """Return a 12-hex-char, order-independent hash of the normalized warnings."""
+    stable = sorted(map(_normalize_heartbeat_warning_for_fingerprint, warnings))
+    # Dedupe key, not a security control: usedforsecurity=False (Python 3.9+)
+    # keeps md5 available on FIPS-restricted builds; the digest is unchanged.
+    return hashlib.md5("|".join(stable).encode(), usedforsecurity=False).hexdigest()[:12]
+
 # 2026-04-01 Claude Code: Long Polling 分散式 Leader Election
 # 防止多 Pod 同時 getUpdates → 409 Conflict 互搶問題
 POLLING_LEADER_KEY = "telegram:polling:leader"
@@ -10118,13 +10152,11 @@ class TelegramGateway:
            #   有 warnings 跟上次相同 → 跳過（hash 對比）
            #   有 warnings 跟上次不同 → 立即推送（新狀況不漏）
            #   warnings 消失 → 只推一次恢復通知，之後回到安靜
-            import hashlib
            WARNINGS_HASH_TTL = 24 * 3600
            healthy_suppressed_key = "heartbeat:healthy_suppressed_last_seen"
            warnings_hash_key = "heartbeat:warnings_hash"

-            warnings_str = "|".join(sorted(report.warnings))
-            warnings_hash = hashlib.md5(warnings_str.encode()).hexdigest()[:12]
+            warnings_hash = _heartbeat_warnings_hash(report.warnings)

            if not report.warnings:
                # 健康狀態：沒有上一輪 warning 時不送 Telegram，避免成功心跳洗版。
--- a/apps/api/tests/test_heartbeat_dedup_p0_4.py
+++ b/apps/api/tests/test_heartbeat_dedup_p0_4.py
@@ -140,9 +140,9 @@ class TestHeartbeatDedup:
        gw, fake_redis = gateway_with_fake_redis
        warnings = ["Pod api-x Failed", "Redis: down"]
        # 預設上次 hash
-        import hashlib
-        warnings_str = "|".join(sorted(warnings))
-        last_hash = hashlib.md5(warnings_str.encode()).hexdigest()[:12]
+        from src.services.telegram_gateway import _heartbeat_warnings_hash
+
+        last_hash = _heartbeat_warnings_hash(warnings)
        fake_redis.preset("heartbeat:warnings_hash", last_hash)

        with patch("src.core.redis_client.get_redis", return_value=fake_redis), \
@@ -157,6 +157,35 @@ class TestHeartbeatDedup:
            gw.send_to_group.assert_not_called()
            gw.send_notification.assert_not_called()

+    @pytest.mark.asyncio
+    async def test_warnings_with_same_actionable_condition_are_skipped(
+        self,
+        gateway_with_fake_redis,
+        sre_group_configured,
+    ):
+        """The same actionable warning is not re-pushed even when HTTP / timeout / latency details change."""
+        gw, fake_redis = gateway_with_fake_redis
+        from src.services.telegram_gateway import _heartbeat_warnings_hash
+        # Preset heartbeat:warnings_hash with the fingerprint of an earlier probe (HTTP 502, 124ms).
+        fake_redis.preset(
+            "heartbeat:warnings_hash",
+            _heartbeat_warnings_hash(["Ollama 111 異常: ❌ HTTP 502 124ms"]),
+        )
+        # The mocked report carries the same Ollama warning with different probe details (HTTP 504, 236ms).
+        with patch("src.core.redis_client.get_redis", return_value=fake_redis), \
+             patch("src.services.heartbeat_report_service.HeartbeatReportService") as MockSvc, \
+             patch("src.services.heartbeat_report_service.report_to_telegram_html",
+                   return_value="<b>warnings</b>"):
+            MockSvc.return_value.collect = AsyncMock(
+                return_value=_make_report(["Ollama 111 異常: ❌ HTTP 504 236ms"])
+            )
+
+            result = await gw.send_heartbeat()
+            # Expected: the call reports success but neither send path is invoked.
+            assert result is True
+            gw.send_to_group.assert_not_called()
+            gw.send_notification.assert_not_called()
+
    @pytest.mark.asyncio
    async def test_warnings_changed_pushes(
        self,
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,21 @@
+## 2026-06-24｜重啟後告警噪音 hardening
+
+**背景**：重啟恢復後，MOMO Pro 5 分鐘 502 舊告警與 AWOOOI 30 分鐘成功心跳都會干擾判斷。正確策略不是關掉告警，而是把「可處置的新異常」與「同一狀態重複回報 / 成功心跳」分開。
+
+**完成**：
+- `telegram_gateway.send_heartbeat()` 新增 heartbeat warning 穩定指紋。`Ollama 111 異常`、AI service、MCP、ArgoCD、DB/Redis、KM vectorization、系統沉默與 pending 積壓等 warning 會用可處置條件去重，不再因 HTTP status、timeout、latency、計數文字每 30 分鐘浮動就被當成新事件。
+- 原始 Telegram 內容仍保留完整 warning；只有 Redis dedupe hash 使用穩定化後的指紋，因此真新類型 warning、warning 消失後的恢復通知仍會送出。
+- `MoWoooWorkDown` Alertmanager 規則從舊 `component=momo-app` 改為 `component=momo-pro-system`，`auto_repair=false`，描述與 runbook 改成先檢查 `https://mo.wooo.work/health`、188 local `127.0.0.1:5003/health`、`momo-pro-system` / `momo-db` / `momo-scheduler` 與資料新鮮度，不再暗示看到 502 就重啟容器。
+
+**重啟恢復判定口徑**：
+- 成功心跳：不進 Telegram 洗版，只留 Redis/log/metrics。
+- 同一 warning：24 小時內以穩定指紋降噪。
+- 新 warning 類型：立即推送。
+- warning 清空：只推一次恢復通知。
+- MOMO public 502：先做 public/local/container/data freshness 分層診斷；不得只因外部 502 判定容器不存在或直接重啟。
+
+**邊界**：本輪只改 repo 程式 / Prometheus 規則 / 文件，尚未宣稱 production 已部署；沒有 SSH、沒有重啟服務、沒有 reload Prometheus / Alertmanager、沒有改 Nginx / firewall、沒有讀 secret、沒有 force push。
+
 ## 2026-06-24｜AWOOOI current-main dev base readback

 **背景**：Mac Mini / MacBook Pro 的 AWOOOI `awoooi-dev` 工作區原本都停在 `gitea/dev=25889d4b8edc`，而最新 `gitea/main=9bc6392770bc`。`gitea/dev` 是 `main` 的祖先、落後 `3708` commits，但 `.gitea/workflows/cd-dev.yaml` 會在 `dev` push 時觸發 dev deploy、K8s secret patch 與 `kubectl apply`，因此不能未批准直接快轉。
--- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md
+++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md
@@ -1,6 +1,6 @@
 # AWOOOI 全棧冷啟動與主機重啟 SOP

-> Version: v1.32
+> Version: v1.33
 > Last updated: 2026-06-24 Asia/Taipei
 > Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path.

@@ -10,7 +10,7 @@

 本節是每次接手、開機、關機、重啟後的第一個判定錨點。若日期不是今天，必須先重跑 live check，再更新本節與 `docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md`。

-2026-06-24 11:35 live readback supersedes the earlier 11:19 wording:
+2026-06-24 18:32 notification-noise hardening supersedes the earlier 11:35 wording where it discusses heartbeat / MOMO alert behavior. The service and data readiness gates below remain unchanged until a fresh live cold-start scorecard says otherwise:

 ```text
 Repo-side reboot SOP / Plan B / automation contracts: COMPLETE, 100%.
@@ -20,7 +20,7 @@ Runtime release state: API/Web/Worker are ready; image remains a84a5a0b because
 MOMO state: mo.wooo.work health is healthy on version V10.639; current-month daily_sales_snapshot and realtime_sales_monthly match, but both stop at 2026-06-17. MOMO_DAILY_FRESHNESS is 7 days, which is a hard blocker because business data is not current.
 Google Drive state: momo scheduler token ownership is fixed for Docker userns, container-side Drive listing works, but folder 當日業績匯入 currently has no matching 即時業績_當日 Excel source file. Archive latest matching file is 2026-06-18T01:30:39Z and was already imported by job 56.
 Backup / monitoring state: backup-status core blockers are 0, last aggregate is 2026-06-24 02:28:39, 188 MinIO is healthy, Velero BackupStorageLocation default is Available, one-off backup reboot-recovery-202606240456 completed, backup-health textfile reports Velero freshness green, and VeleroBackupNotRun / PostgreSQLDown / RedisDown / disk-pressure alerts resolved.
-Notification-noise state: healthy AWOOOI heartbeat is suppressed; MOMO Pro monitor now uses https://mo.wooo.work/health as primary truth and no longer checks the 188 root path; docker-health-monitor keeps 5-minute repair cadence but has a separate 30-minute Telegram fallback cooldown; Bitan public-content check keeps failure alerting with same-fingerprint cooldown and one recovery notice.
+Notification-noise state: healthy AWOOOI heartbeat is suppressed; heartbeat warning dedupe uses stable actionable fingerprints so HTTP status / timeout / latency drift does not create a new Telegram event every 30 minutes; MOMO Pro monitor uses https://mo.wooo.work/health as primary truth and no longer checks the 188 root path; MoWoooWorkDown now labels component=momo-pro-system and requires public/local/container/data-freshness triage instead of blind restart; docker-health-monitor keeps 5-minute repair cadence but has a separate 30-minute Telegram fallback cooldown; Bitan public-content check keeps failure alerting with same-fingerprint cooldown and one recovery notice.
 Allowed declaration: core hosts, routes, K3s, backup/exporter surfaces are recovered; MOMO data pipeline is blocked waiting for a newer source file or owner-provided source evidence.
 Forbidden declaration: full-stack green, MOMO data current, DR complete, or runtime/security acceptance. Credential escrow evidence is still missing and must not be forged.
 ```
--- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md
+++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md
@@ -148,7 +148,7 @@ Next: <single next action>
 | P1-014 | DONE | 100 | Publish credential escrow owner request package | 2026-06-13 13:10 live report confirms `SCRIPT_MISSING_COUNT=0`, `OFFSITE_CONFIGURED=1`, `RCLONE_CONFIGURED=1`, `ESCROW_MISSING_COUNT=5`, `PASS=8 WARN=5 BLOCKED=0`. New owner request package defines allowed evidence-id types, forbidden secret values, safe dry-run flow, write flow, and closeout gates. | Dispatch to the credential owners without collecting secret values; keep marker write gated until owner gives real non-secret evidence IDs. | `docs/security/CREDENTIAL-ESCROW-EVIDENCE-OWNER-REQUEST.md` and snapshot exist and validate. |
 | P1-013 | DONE_FOR_SERVICE_READINESS | 100 | Remediate `km-vectorize` CronJob health debt | The retained `km-vectorize-29689620` failed Job is now classified as stale evidence, not an active blocker, because later official `km-vectorize` Jobs completed successfully. 2026-06-18 13:43 cold-start reads `FAILED_JOBS=1`, `STALE_FAILED_JOBS=1`, `ACTIVE_FAILED_JOBS=0`, `BAD_PODS=0`, and returns `PASS=84 WARN=0 BLOCKED=0`. | Keep retained failed Job as evidence unless an explicit maintenance window authorizes cleanup. Reassert ArgoCD app health only with a fresh ArgoCD app readback, not from the cold-start scorecard alone. | Service readiness no longer warns on stale failed Job evidence; active failed Job detection remains guarded. |
 | P1-015 | DONE | 100 | Restore 188 MinIO / Velero backup freshness and DB exporters | 2026-06-24 06:35 resolved real backup / exporter red lights: 188 PostgreSQL exporter and Redis exporter now expose `pg_up=1` / `redis_up=1`; 188 MinIO health is live; 120 Velero BSL is `Available`; one-off backup `reboot-recovery-202606240456` completed; 110 backup-health textfile reports latest Velero backup fresh. 110 disk pressure was reduced from 92% to 73% by Docker image/build-cache cleanup only. | Reconcile MinIO `userns_mode: host` override into formal source-of-truth or data ownership fix; keep Docker volume prune forbidden without explicit owner approval. | `VeleroBackupNotRun`、`PostgreSQLDown`、`RedisDown`、110 disk-pressure alerts are resolved, and SOP includes restore helpers. |
-| P1-016 | DONE | 100 | Control repeated Telegram notification noise without hiding real alerts | 2026-06-24 confirmed MOMO Pro 5-minute spam came from a legacy 110 script checking `http://192.168.0.188/health`; live script now uses `https://mo.wooo.work/health` as primary truth and manual readback returned `OK: public health 200; no alert`. Generic docker-health monitor keeps 5-minute repair checks but adds a separate 30-minute direct Telegram fallback cooldown. Bitan public-content cleanliness keeps failure notification but suppresses the same failure fingerprint for 6 hours and emits one recovery notice. | Fold remaining cross-product direct Telegram egress into the unified notification gateway over time; do not disable real warning/failure/recovery signals. | Healthy heartbeat is quiet, MOMO public health success produces no alert, repeated same-failure direct fallback paths are cooled, and real failure/recovery notifications remain enabled. |
+| P1-016 | DONE | 100 | Control repeated Telegram notification noise without hiding real alerts | 2026-06-24 confirmed MOMO Pro 5-minute spam came from a legacy 110 script checking `http://192.168.0.188/health`; live script now uses `https://mo.wooo.work/health` as primary truth. Heartbeat warning dedupe now hashes stable actionable fingerprints so HTTP status / timeout / latency drift does not create a new Telegram event every 30 minutes. `MoWoooWorkDown` now labels `component=momo-pro-system`, disables blind auto-repair, and requires public/local/container/data-freshness triage. Generic docker-health monitor keeps 5-minute repair checks but adds a separate 30-minute direct Telegram fallback cooldown. Bitan public-content cleanliness keeps failure notification with same-fingerprint cooldown and one recovery notice. | Fold remaining cross-product direct Telegram egress into the unified notification gateway over time; do not disable real warning/failure/recovery signals. Production deployment/readback must confirm the code and Prometheus rules are live before declaring runtime closure. | Healthy heartbeat is quiet, same actionable heartbeat warning is deduped, MOMO public health success produces no alert, repeated same-failure direct fallback paths are cooled, and real failure/recovery/new-warning notifications remain enabled. |

 ---

--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -945,13 +945,14 @@ groups:
        labels:
          severity: critical
          layer: external
-          component: momo-app
+          component: momo-pro-system
          host: "188"
          team: ops
-          auto_repair: "true"
+          auto_repair: "false"
        annotations:
          summary: "外部網站 mo.wooo.work 離線"
-          description: "mo.wooo.work 探測失敗超過 3 分鐘，容器 momo-app (188) 可能需要重啟"
+          description: "mo.wooo.work public route 探測失敗超過 3 分鐘；先檢查 https://mo.wooo.work/health、188 local http://127.0.0.1:5003/health 與容器 momo-pro-system，不可只因 502 盲目重啟。"
+          runbook: "先比對 public health、188 local app health、momo-pro-system / momo-db / momo-scheduler 狀態與最新 import/data freshness；若 public 502 但 local healthy，優先查 188 Nginx / upstream / TLS，不直接重啟容器。"

      - alert: TsenyangWebsiteDown
        expr: probe_success{job="blackbox-http", instance="https://www.tsenyang.com"} == 0
--- a/ops/monitoring/alerts.yml
+++ b/ops/monitoring/alerts.yml
@@ -818,13 +818,14 @@ groups:
        labels:
          severity: critical
          layer: external
-          component: momo-app
+          component: momo-pro-system
          host: "188"
          team: ops
-          auto_repair: "true"
+          auto_repair: "false"
        annotations:
          summary: "外部網站 mo.wooo.work 離線"
-          description: "mo.wooo.work 探測失敗超過 3 分鐘，容器 momo-app (188) 可能需要重啟"
+          description: "mo.wooo.work public route 探測失敗超過 3 分鐘；先檢查 https://mo.wooo.work/health、188 local http://127.0.0.1:5003/health 與容器 momo-pro-system，不可只因 502 盲目重啟。"
+          runbook: "先比對 public health、188 local app health、momo-pro-system / momo-db / momo-scheduler 狀態與最新 import/data freshness；若 public 502 但 local healthy，優先查 188 Nginx / upstream / TLS，不直接重啟容器。"

      - alert: TsenyangWebsiteDown
        expr: probe_success{job="blackbox-http", instance="https://www.tsenyang.com"} == 0