# 2026-06-24 Codex + ogt: Telegram heartbeat dedupe must track the
# actionable condition, not volatile probe details. HTTP status, timeout,
# latency and count text can vary every 30 minutes while the operator
# action stays exactly the same, which caused repeated "heartbeat" noise.
#
# Each entry is (anchored pattern, replacement template). Rules are tried in
# order and the first one that matches wins, so keep specific rules ahead of
# generic ones.
_HEARTBEAT_WARNING_FINGERPRINT_RULES: tuple[tuple[re.Pattern[str], str], ...] = (
    (re.compile(r"^(Ollama\s+[^:]+)\s+異常:.*$"), r"\1 異常"),
    (re.compile(r"^(.+?)\s+服務異常:.*$"), r"\1 服務異常"),
    (re.compile(r"^(MCP\s+[^:]+):.*$"), r"\1"),
    (re.compile(r"^(ArgoCD):.*$"), r"\1"),
    (re.compile(r"^(PostgreSQL):.*$"), r"\1"),
    (re.compile(r"^(Redis):.*$"), r"\1"),
    (re.compile(r"^KM\s+向量化率偏低:.*$"), "KM 向量化率偏低"),
    (re.compile(r"^系統沉默\s+.*$"), "系統沉默(無學習活動)"),
    (re.compile(r"^待人工審核\s+.*$"), "待人工審核積壓"),
)


def _normalize_heartbeat_warning_for_fingerprint(warning: str) -> str:
    """Return the stable actionable identity used for heartbeat dedupe.

    Whitespace runs are collapsed to single spaces first, then the first
    matching rule in ``_HEARTBEAT_WARNING_FINGERPRINT_RULES`` rewrites the
    warning to its volatile-detail-free form.

    Args:
        warning: Raw heartbeat warning text (coerced with ``str()``).

    Returns:
        The rewritten fingerprint, or the whitespace-collapsed warning
        unchanged when no rule matches.
    """
    normalized = " ".join(str(warning).split())
    for pattern, replacement in _HEARTBEAT_WARNING_FINGERPRINT_RULES:
        # Every pattern is anchored with ^...$, so a match always covers the
        # whole warning and expanding the template yields the fingerprint.
        matched = pattern.match(normalized)
        if matched is not None:
            return matched.expand(replacement)
    return normalized


def _heartbeat_warnings_hash(warnings: list[str]) -> str:
    """Hash heartbeat warnings after stripping volatile probe details.

    Args:
        warnings: Raw heartbeat warning strings; order is irrelevant.

    Returns:
        The first 12 hex characters of the MD5 digest of the sorted, unique,
        ``|``-joined fingerprints.
    """
    # Collapse to a set: several raw warnings can share one fingerprint, and
    # a change in how many of them fired is not a new actionable condition.
    fingerprints = sorted(
        {_normalize_heartbeat_warning_for_fingerprint(item) for item in warnings}
    )
    # MD5 is only a short dedupe key here, not a security primitive; saying
    # so keeps the call usable on FIPS-restricted OpenSSL builds.
    digest = hashlib.md5("|".join(fingerprints).encode(), usedforsecurity=False)
    return digest.hexdigest()[:12]
@pytest.mark.asyncio
async def test_warnings_with_same_actionable_condition_are_skipped(
    self,
    gateway_with_fake_redis,
    sre_group_configured,
):
    """Skip the push when only volatile probe details of a warning changed.

    The stored hash comes from an Ollama warning reporting HTTP 502 / 124ms;
    the new report carries the same warning with HTTP 504 / 236ms. Both share
    one actionable fingerprint, so no Telegram message may be sent.
    """
    from src.services.telegram_gateway import _heartbeat_warnings_hash

    gateway, redis_stub = gateway_with_fake_redis

    # Seed Redis with the hash of the previous cycle's warning.
    previous_hash = _heartbeat_warnings_hash(["Ollama 111 異常: ❌ HTTP 502 124ms"])
    redis_stub.preset("heartbeat:warnings_hash", previous_hash)

    with patch("src.core.redis_client.get_redis", return_value=redis_stub):
        with patch(
            "src.services.heartbeat_report_service.HeartbeatReportService"
        ) as report_service_cls:
            with patch(
                "src.services.heartbeat_report_service.report_to_telegram_html",
                return_value="warnings",
            ):
                # Same warning identity, different HTTP status and latency.
                current_report = _make_report(["Ollama 111 異常: ❌ HTTP 504 236ms"])
                report_service_cls.return_value.collect = AsyncMock(
                    return_value=current_report
                )

                result = await gateway.send_heartbeat()

    assert result is True
    gateway.send_to_group.assert_not_called()
    gateway.send_notification.assert_not_called()
`gitea/main=9bc6392770bc`。`gitea/dev` 是 `main` 的祖先、落後 `3708` commits,但 `.gitea/workflows/cd-dev.yaml` 會在 `dev` push 時觸發 dev deploy、K8s secret patch 與 `kubectl apply`,因此不能未批准直接快轉。 diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index 1cba1e62..5ce19278 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -1,6 +1,6 @@ # AWOOOI 全棧冷啟動與主機重啟 SOP -> Version: v1.32 +> Version: v1.33 > Last updated: 2026-06-24 Asia/Taipei > Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path. @@ -10,7 +10,7 @@ 本節是每次接手、開機、關機、重啟後的第一個判定錨點。若日期不是今天,必須先重跑 live check,再更新本節與 `docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md`。 -2026-06-24 11:35 live readback supersedes the earlier 11:19 wording: +2026-06-24 18:32 notification-noise hardening supersedes the earlier 11:35 wording where it discusses heartbeat / MOMO alert behavior. The service and data readiness gates below remain unchanged until a fresh live cold-start scorecard says otherwise: ```text Repo-side reboot SOP / Plan B / automation contracts: COMPLETE, 100%. @@ -20,7 +20,7 @@ Runtime release state: API/Web/Worker are ready; image remains a84a5a0b because MOMO state: mo.wooo.work health is healthy on version V10.639; current-month daily_sales_snapshot and realtime_sales_monthly match, but both stop at 2026-06-17. MOMO_DAILY_FRESHNESS is 7 days, which is a hard blocker because business data is not current. Google Drive state: momo scheduler token ownership is fixed for Docker userns, container-side Drive listing works, but folder 當日業績匯入 currently has no matching 即時業績_當日 Excel source file. Archive latest matching file is 2026-06-18T01:30:39Z and was already imported by job 56. 
Backup / monitoring state: backup-status core blockers are 0, last aggregate is 2026-06-24 02:28:39, 188 MinIO is healthy, Velero BackupStorageLocation default is Available, one-off backup reboot-recovery-202606240456 completed, backup-health textfile reports Velero freshness green, and VeleroBackupNotRun / PostgreSQLDown / RedisDown / disk-pressure alerts resolved. -Notification-noise state: healthy AWOOOI heartbeat is suppressed; MOMO Pro monitor now uses https://mo.wooo.work/health as primary truth and no longer checks the 188 root path; docker-health-monitor keeps 5-minute repair cadence but has a separate 30-minute Telegram fallback cooldown; Bitan public-content check keeps failure alerting with same-fingerprint cooldown and one recovery notice. +Notification-noise state: healthy AWOOOI heartbeat is suppressed; heartbeat warning dedupe uses stable actionable fingerprints so HTTP status / timeout / latency drift does not create a new Telegram event every 30 minutes; MOMO Pro monitor uses https://mo.wooo.work/health as primary truth and no longer checks the 188 root path; MoWoooWorkDown now labels component=momo-pro-system and requires public/local/container/data-freshness triage instead of blind restart; docker-health-monitor keeps 5-minute repair cadence but has a separate 30-minute Telegram fallback cooldown; Bitan public-content check keeps failure alerting with same-fingerprint cooldown and one recovery notice. Allowed declaration: core hosts, routes, K3s, backup/exporter surfaces are recovered; MOMO data pipeline is blocked waiting for a newer source file or owner-provided source evidence. Forbidden declaration: full-stack green, MOMO data current, DR complete, or runtime/security acceptance. Credential escrow evidence is still missing and must not be forged. 
``` diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md index c91f722f..509f7e12 100644 --- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md +++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md @@ -148,7 +148,7 @@ Next: | P1-014 | DONE | 100 | Publish credential escrow owner request package | 2026-06-13 13:10 live report confirms `SCRIPT_MISSING_COUNT=0`, `OFFSITE_CONFIGURED=1`, `RCLONE_CONFIGURED=1`, `ESCROW_MISSING_COUNT=5`, `PASS=8 WARN=5 BLOCKED=0`. New owner request package defines allowed evidence-id types, forbidden secret values, safe dry-run flow, write flow, and closeout gates. | Dispatch to the credential owners without collecting secret values; keep marker write gated until owner gives real non-secret evidence IDs. | `docs/security/CREDENTIAL-ESCROW-EVIDENCE-OWNER-REQUEST.md` and snapshot exist and validate. | | P1-013 | DONE_FOR_SERVICE_READINESS | 100 | Remediate `km-vectorize` CronJob health debt | The retained `km-vectorize-29689620` failed Job is now classified as stale evidence, not an active blocker, because later official `km-vectorize` Jobs completed successfully. 2026-06-18 13:43 cold-start reads `FAILED_JOBS=1`, `STALE_FAILED_JOBS=1`, `ACTIVE_FAILED_JOBS=0`, `BAD_PODS=0`, and returns `PASS=84 WARN=0 BLOCKED=0`. | Keep retained failed Job as evidence unless an explicit maintenance window authorizes cleanup. Reassert ArgoCD app health only with a fresh ArgoCD app readback, not from the cold-start scorecard alone. | Service readiness no longer warns on stale failed Job evidence; active failed Job detection remains guarded. 
| | P1-015 | DONE | 100 | Restore 188 MinIO / Velero backup freshness and DB exporters | 2026-06-24 06:35 resolved real backup / exporter red lights: 188 PostgreSQL exporter and Redis exporter now expose `pg_up=1` / `redis_up=1`; 188 MinIO health is live; 120 Velero BSL is `Available`; one-off backup `reboot-recovery-202606240456` completed; 110 backup-health textfile reports latest Velero backup fresh. 110 disk pressure was reduced from 92% to 73% by Docker image/build-cache cleanup only. | Reconcile MinIO `userns_mode: host` override into formal source-of-truth or data ownership fix; keep Docker volume prune forbidden without explicit owner approval. | `VeleroBackupNotRun`、`PostgreSQLDown`、`RedisDown`、110 disk-pressure alerts are resolved, and SOP includes restore helpers. | -| P1-016 | DONE | 100 | Control repeated Telegram notification noise without hiding real alerts | 2026-06-24 confirmed MOMO Pro 5-minute spam came from a legacy 110 script checking `http://192.168.0.188/health`; live script now uses `https://mo.wooo.work/health` as primary truth and manual readback returned `OK: public health 200; no alert`. Generic docker-health monitor keeps 5-minute repair checks but adds a separate 30-minute direct Telegram fallback cooldown. Bitan public-content cleanliness keeps failure notification but suppresses the same failure fingerprint for 6 hours and emits one recovery notice. | Fold remaining cross-product direct Telegram egress into the unified notification gateway over time; do not disable real warning/failure/recovery signals. | Healthy heartbeat is quiet, MOMO public health success produces no alert, repeated same-failure direct fallback paths are cooled, and real failure/recovery notifications remain enabled. 
| +| P1-016 | DONE | 100 | Control repeated Telegram notification noise without hiding real alerts | 2026-06-24 confirmed MOMO Pro 5-minute spam came from a legacy 110 script checking `http://192.168.0.188/health`; live script now uses `https://mo.wooo.work/health` as primary truth. Heartbeat warning dedupe now hashes stable actionable fingerprints so HTTP status / timeout / latency drift does not create a new Telegram event every 30 minutes. `MoWoooWorkDown` now labels `component=momo-pro-system`, disables blind auto-repair, and requires public/local/container/data-freshness triage. Generic docker-health monitor keeps 5-minute repair checks but adds a separate 30-minute direct Telegram fallback cooldown. Bitan public-content cleanliness keeps failure notification with same-fingerprint cooldown and one recovery notice. | Fold remaining cross-product direct Telegram egress into the unified notification gateway over time; do not disable real warning/failure/recovery signals. Production deployment/readback must confirm the code and Prometheus rules are live before declaring runtime closure. | Healthy heartbeat is quiet, same actionable heartbeat warning is deduped, MOMO public health success produces no alert, repeated same-failure direct fallback paths are cooled, and real failure/recovery/new-warning notifications remain enabled. 
| --- diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 59895c58..416674d4 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -945,13 +945,14 @@ groups: labels: severity: critical layer: external - component: momo-app + component: momo-pro-system host: "188" team: ops - auto_repair: "true" + auto_repair: "false" annotations: summary: "外部網站 mo.wooo.work 離線" - description: "mo.wooo.work 探測失敗超過 3 分鐘,容器 momo-app (188) 可能需要重啟" + description: "mo.wooo.work public route 探測失敗超過 3 分鐘;先檢查 https://mo.wooo.work/health、188 local http://127.0.0.1:5003/health 與容器 momo-pro-system,不可只因 502 盲目重啟。" + runbook: "先比對 public health、188 local app health、momo-pro-system / momo-db / momo-scheduler 狀態與最新 import/data freshness;若 public 502 但 local healthy,優先查 188 Nginx / upstream / TLS,不直接重啟容器。" - alert: TsenyangWebsiteDown expr: probe_success{job="blackbox-http", instance="https://www.tsenyang.com"} == 0 diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index 8a004f98..34303250 100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -818,13 +818,14 @@ groups: labels: severity: critical layer: external - component: momo-app + component: momo-pro-system host: "188" team: ops - auto_repair: "true" + auto_repair: "false" annotations: summary: "外部網站 mo.wooo.work 離線" - description: "mo.wooo.work 探測失敗超過 3 分鐘,容器 momo-app (188) 可能需要重啟" + description: "mo.wooo.work public route 探測失敗超過 3 分鐘;先檢查 https://mo.wooo.work/health、188 local http://127.0.0.1:5003/health 與容器 momo-pro-system,不可只因 502 盲目重啟。" + runbook: "先比對 public health、188 local app health、momo-pro-system / momo-db / momo-scheduler 狀態與最新 import/data freshness;若 public 502 但 local healthy,優先查 188 Nginx / upstream / TLS,不直接重啟容器。" - alert: TsenyangWebsiteDown expr: probe_success{job="blackbox-http", instance="https://www.tsenyang.com"} == 0