fix(ops): harden heartbeat and momo alert noise
Some checks failed
Code Review / ai-code-review (push) Successful in 14s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 31s
CD Pipeline / tests (push) Successful in 1m59s
CD Pipeline / build-and-deploy (push) Successful in 7m36s
CD Pipeline / post-deploy-checks (push) Failing after 43s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
Some checks failed
Code Review / ai-code-review (push) Successful in 14s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 31s
CD Pipeline / tests (push) Successful in 1m59s
CD Pipeline / build-and-deploy (push) Successful in 7m36s
CD Pipeline / post-deploy-checks (push) Failing after 43s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
This commit is contained in:
@@ -71,6 +71,40 @@ INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_TTL_SECONDS = 10 * 60 # 相同失敗摘要
|
||||
GROUPED_ALERT_DIGEST_DEDUP_PREFIX = "awoooi:tg_group_digest:" # {group_key}
|
||||
GROUPED_ALERT_DIGEST_DEDUP_TTL_SECONDS = 5 * 60 # 同一告警群組 5 分鐘只推一則 digest
|
||||
|
||||
_HEARTBEAT_WARNING_FINGERPRINT_RULES: tuple[tuple[re.Pattern[str], str], ...] = (
    # 2026-06-24 Codex + ogt: Telegram heartbeat dedupe must track the
    # actionable condition, not volatile probe details. HTTP status, timeout,
    # latency and count text can vary every 30 minutes while the operator
    # action stays exactly the same, which caused repeated "heartbeat" noise.
    #
    # Each entry is (pattern, replacement). Rules are tried in order and the
    # first one whose substitution changes the warning text wins, so the
    # order of entries is significant.
    (re.compile(r"^(Ollama\s+[^:]+)\s+異常:.*$"), r"\1 異常"),
    (re.compile(r"^(.+?)\s+服務異常:.*$"), r"\1 服務異常"),
    (re.compile(r"^(MCP\s+[^:]+):.*$"), r"\1"),
    (re.compile(r"^(ArgoCD):.*$"), r"\1"),
    (re.compile(r"^(PostgreSQL):.*$"), r"\1"),
    (re.compile(r"^(Redis):.*$"), r"\1"),
    (re.compile(r"^KM\s+向量化率偏低:.*$"), "KM 向量化率偏低"),
    (re.compile(r"^系統沉默\s+.*$"), "系統沉默(無學習活動)"),
    (re.compile(r"^待人工審核\s+.*$"), "待人工審核積壓"),
)


def _normalize_heartbeat_warning_for_fingerprint(warning: str) -> str:
    """Return the stable actionable identity used for heartbeat dedupe.

    Whitespace runs (including newlines) are collapsed to single spaces,
    then the first rule in ``_HEARTBEAT_WARNING_FINGERPRINT_RULES`` whose
    substitution changes the text supplies the result.

    Args:
        warning: Raw heartbeat warning text; it is passed through ``str()``
            before processing.

    Returns:
        The rewritten identity, or the whitespace-normalized warning when no
        rule rewrites it.
    """
    normalized = " ".join(str(warning).split())
    for pattern, replacement in _HEARTBEAT_WARNING_FINGERPRINT_RULES:
        replaced = pattern.sub(replacement, normalized)
        if replaced != normalized:
            return replaced
    return normalized


def _heartbeat_warnings_hash(warnings: list[str]) -> str:
    """Hash heartbeat warnings after stripping volatile probe details.

    Args:
        warnings: Raw heartbeat warning strings, in any order.

    Returns:
        The first 12 hex characters of the MD5 digest of the sorted,
        de-duplicated, ``|``-joined fingerprints. An empty list hashes the
        empty string.
    """
    # Hash the *set* of fingerprints: several raw warnings can collapse to
    # the same actionable identity (e.g. "Redis: down" and "Redis: timeout"),
    # and how many of them a probe emitted is itself a volatile detail that
    # must not change the dedupe hash.
    fingerprints = {
        _normalize_heartbeat_warning_for_fingerprint(item) for item in warnings
    }
    warnings_str = "|".join(sorted(fingerprints))
    # MD5 serves only as a short, non-cryptographic change detector here.
    return hashlib.md5(warnings_str.encode()).hexdigest()[:12]
|
||||
|
||||
# 2026-04-01 Claude Code: Long Polling 分散式 Leader Election
|
||||
# 防止多 Pod 同時 getUpdates → 409 Conflict 互搶問題
|
||||
POLLING_LEADER_KEY = "telegram:polling:leader"
|
||||
@@ -10118,13 +10152,11 @@ class TelegramGateway:
|
||||
# 有 warnings 跟上次相同 → 跳過(hash 對比)
|
||||
# 有 warnings 跟上次不同 → 立即推送(新狀況不漏)
|
||||
# warnings 消失 → 只推一次恢復通知,之後回到安靜
|
||||
import hashlib
|
||||
WARNINGS_HASH_TTL = 24 * 3600
|
||||
healthy_suppressed_key = "heartbeat:healthy_suppressed_last_seen"
|
||||
warnings_hash_key = "heartbeat:warnings_hash"
|
||||
|
||||
warnings_str = "|".join(sorted(report.warnings))
|
||||
warnings_hash = hashlib.md5(warnings_str.encode()).hexdigest()[:12]
|
||||
warnings_hash = _heartbeat_warnings_hash(report.warnings)
|
||||
|
||||
if not report.warnings:
|
||||
# 健康狀態:沒有上一輪 warning 時不送 Telegram,避免成功心跳洗版。
|
||||
|
||||
@@ -140,9 +140,9 @@ class TestHeartbeatDedup:
|
||||
gw, fake_redis = gateway_with_fake_redis
|
||||
warnings = ["Pod api-x Failed", "Redis: down"]
|
||||
# 預設上次 hash
|
||||
import hashlib
|
||||
warnings_str = "|".join(sorted(warnings))
|
||||
last_hash = hashlib.md5(warnings_str.encode()).hexdigest()[:12]
|
||||
from src.services.telegram_gateway import _heartbeat_warnings_hash
|
||||
|
||||
last_hash = _heartbeat_warnings_hash(warnings)
|
||||
fake_redis.preset("heartbeat:warnings_hash", last_hash)
|
||||
|
||||
with patch("src.core.redis_client.get_redis", return_value=fake_redis), \
|
||||
@@ -157,6 +157,35 @@ class TestHeartbeatDedup:
|
||||
gw.send_to_group.assert_not_called()
|
||||
gw.send_notification.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
async def test_warnings_with_same_actionable_condition_are_skipped(
    self,
    gateway_with_fake_redis,
    sre_group_configured,
):
    """The same actionable warning is not re-sent when only HTTP status / timeout / latency changes."""
    from src.services.telegram_gateway import _heartbeat_warnings_hash

    gateway, redis_stub = gateway_with_fake_redis

    # Seed Redis with the hash of the previous round's warning (HTTP 502, 124ms).
    previous_hash = _heartbeat_warnings_hash(["Ollama 111 異常: ❌ HTTP 502 124ms"])
    redis_stub.preset("heartbeat:warnings_hash", previous_hash)

    with patch("src.core.redis_client.get_redis", return_value=redis_stub), \
            patch("src.services.heartbeat_report_service.HeartbeatReportService") as report_service_cls, \
            patch("src.services.heartbeat_report_service.report_to_telegram_html",
                  return_value="<b>warnings</b>"):
        # This round reports the same condition with different probe details.
        report_service_cls.return_value.collect = AsyncMock(
            return_value=_make_report(["Ollama 111 異常: ❌ HTTP 504 236ms"])
        )
        result = await gateway.send_heartbeat()

    # The heartbeat call succeeds, but nothing is pushed to Telegram.
    assert result is True
    gateway.send_to_group.assert_not_called()
    gateway.send_notification.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_warnings_changed_pushes(
|
||||
self,
|
||||
|
||||
@@ -1,3 +1,21 @@
|
||||
## 2026-06-24|重啟後告警噪音 hardening
|
||||
|
||||
**背景**:重啟恢復後,MOMO Pro 5 分鐘 502 舊告警與 AWOOOI 30 分鐘成功心跳都會干擾判斷。正確策略不是關掉告警,而是把「可處置的新異常」與「同一狀態重複回報 / 成功心跳」分開。
|
||||
|
||||
**完成**:
|
||||
- `telegram_gateway.send_heartbeat()` 新增 heartbeat warning 穩定指紋。`Ollama 111 異常`、AI service、MCP、ArgoCD、DB/Redis、KM vectorization、系統沉默與 pending 積壓等 warning 會用可處置條件去重,不再因 HTTP status、timeout、latency、計數文字每 30 分鐘浮動就被當成新事件。
|
||||
- 原始 Telegram 內容仍保留完整 warning;只有 Redis dedupe hash 使用穩定化後的指紋,因此真新類型 warning、warning 消失後的恢復通知仍會送出。
|
||||
- `MoWoooWorkDown` Alertmanager 規則從舊 `component=momo-app` 改為 `component=momo-pro-system`,`auto_repair=false`,描述與 runbook 改成先檢查 `https://mo.wooo.work/health`、188 local `127.0.0.1:5003/health`、`momo-pro-system` / `momo-db` / `momo-scheduler` 與資料新鮮度,不再暗示看到 502 就重啟容器。
|
||||
|
||||
**重啟恢復判定口徑**:
|
||||
- 成功心跳:不進 Telegram 洗版,只留 Redis/log/metrics。
|
||||
- 同一 warning:24 小時內以穩定指紋降噪。
|
||||
- 新 warning 類型:立即推送。
|
||||
- warning 清空:只推一次恢復通知。
|
||||
- MOMO public 502:先做 public/local/container/data freshness 分層診斷;不得只因外部 502 判定容器不存在或直接重啟。
|
||||
|
||||
**邊界**:本輪只改 repo 程式 / Prometheus 規則 / 文件,尚未宣稱 production 已部署;沒有 SSH、沒有重啟服務、沒有 reload Prometheus / Alertmanager、沒有改 Nginx / firewall、沒有讀 secret、沒有 force push。
|
||||
|
||||
## 2026-06-24|AWOOOI current-main dev base readback
|
||||
|
||||
**背景**:Mac Mini / MacBook Pro 的 AWOOOI `awoooi-dev` 工作區原本都停在 `gitea/dev=25889d4b8edc`,而最新 `gitea/main=9bc6392770bc`。`gitea/dev` 是 `main` 的祖先、落後 `3708` commits,但 `.gitea/workflows/cd-dev.yaml` 會在 `dev` push 時觸發 dev deploy、K8s secret patch 與 `kubectl apply`,因此不能未批准直接快轉。
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# AWOOOI 全棧冷啟動與主機重啟 SOP
|
||||
|
||||
> Version: v1.32
|
||||
> Version: v1.33
|
||||
> Last updated: 2026-06-24 Asia/Taipei
|
||||
> Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path.
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
本節是每次接手、開機、關機、重啟後的第一個判定錨點。若日期不是今天,必須先重跑 live check,再更新本節與 `docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md`。
|
||||
|
||||
2026-06-24 11:35 live readback supersedes the earlier 11:19 wording:
|
||||
2026-06-24 18:32 notification-noise hardening supersedes the earlier 11:35 wording where it discusses heartbeat / MOMO alert behavior. The service and data readiness gates below remain unchanged until a fresh live cold-start scorecard says otherwise:
|
||||
|
||||
```text
|
||||
Repo-side reboot SOP / Plan B / automation contracts: COMPLETE, 100%.
|
||||
@@ -20,7 +20,7 @@ Runtime release state: API/Web/Worker are ready; image remains a84a5a0b because
|
||||
MOMO state: mo.wooo.work health is healthy on version V10.639; current-month daily_sales_snapshot and realtime_sales_monthly match, but both stop at 2026-06-17. MOMO_DAILY_FRESHNESS is 7 days, which is a hard blocker because business data is not current.
|
||||
Google Drive state: momo scheduler token ownership is fixed for Docker userns, container-side Drive listing works, but folder 當日業績匯入 currently has no matching 即時業績_當日 Excel source file. Archive latest matching file is 2026-06-18T01:30:39Z and was already imported by job 56.
|
||||
Backup / monitoring state: backup-status core blockers are 0, last aggregate is 2026-06-24 02:28:39, 188 MinIO is healthy, Velero BackupStorageLocation default is Available, one-off backup reboot-recovery-202606240456 completed, backup-health textfile reports Velero freshness green, and VeleroBackupNotRun / PostgreSQLDown / RedisDown / disk-pressure alerts resolved.
|
||||
Notification-noise state: healthy AWOOOI heartbeat is suppressed; MOMO Pro monitor now uses https://mo.wooo.work/health as primary truth and no longer checks the 188 root path; docker-health-monitor keeps 5-minute repair cadence but has a separate 30-minute Telegram fallback cooldown; Bitan public-content check keeps failure alerting with same-fingerprint cooldown and one recovery notice.
|
||||
Notification-noise state: healthy AWOOOI heartbeat is suppressed; heartbeat warning dedupe uses stable actionable fingerprints so HTTP status / timeout / latency drift does not create a new Telegram event every 30 minutes; MOMO Pro monitor uses https://mo.wooo.work/health as primary truth and no longer checks the 188 root path; MoWoooWorkDown now labels component=momo-pro-system and requires public/local/container/data-freshness triage instead of blind restart; docker-health-monitor keeps 5-minute repair cadence but has a separate 30-minute Telegram fallback cooldown; Bitan public-content check keeps failure alerting with same-fingerprint cooldown and one recovery notice.
|
||||
Allowed declaration: core hosts, routes, K3s, backup/exporter surfaces are recovered; MOMO data pipeline is blocked waiting for a newer source file or owner-provided source evidence.
|
||||
Forbidden declaration: full-stack green, MOMO data current, DR complete, or runtime/security acceptance. Credential escrow evidence is still missing and must not be forged.
|
||||
```
|
||||
|
||||
@@ -148,7 +148,7 @@ Next: <single next action>
|
||||
| P1-014 | DONE | 100 | Publish credential escrow owner request package | 2026-06-13 13:10 live report confirms `SCRIPT_MISSING_COUNT=0`, `OFFSITE_CONFIGURED=1`, `RCLONE_CONFIGURED=1`, `ESCROW_MISSING_COUNT=5`, `PASS=8 WARN=5 BLOCKED=0`. New owner request package defines allowed evidence-id types, forbidden secret values, safe dry-run flow, write flow, and closeout gates. | Dispatch to the credential owners without collecting secret values; keep marker write gated until owner gives real non-secret evidence IDs. | `docs/security/CREDENTIAL-ESCROW-EVIDENCE-OWNER-REQUEST.md` and snapshot exist and validate. |
|
||||
| P1-013 | DONE_FOR_SERVICE_READINESS | 100 | Remediate `km-vectorize` CronJob health debt | The retained `km-vectorize-29689620` failed Job is now classified as stale evidence, not an active blocker, because later official `km-vectorize` Jobs completed successfully. 2026-06-18 13:43 cold-start reads `FAILED_JOBS=1`, `STALE_FAILED_JOBS=1`, `ACTIVE_FAILED_JOBS=0`, `BAD_PODS=0`, and returns `PASS=84 WARN=0 BLOCKED=0`. | Keep retained failed Job as evidence unless an explicit maintenance window authorizes cleanup. Reassert ArgoCD app health only with a fresh ArgoCD app readback, not from the cold-start scorecard alone. | Service readiness no longer warns on stale failed Job evidence; active failed Job detection remains guarded. |
|
||||
| P1-015 | DONE | 100 | Restore 188 MinIO / Velero backup freshness and DB exporters | 2026-06-24 06:35 resolved real backup / exporter red lights: 188 PostgreSQL exporter and Redis exporter now expose `pg_up=1` / `redis_up=1`; 188 MinIO health is live; 120 Velero BSL is `Available`; one-off backup `reboot-recovery-202606240456` completed; 110 backup-health textfile reports latest Velero backup fresh. 110 disk pressure was reduced from 92% to 73% by Docker image/build-cache cleanup only. | Reconcile MinIO `userns_mode: host` override into formal source-of-truth or data ownership fix; keep Docker volume prune forbidden without explicit owner approval. | `VeleroBackupNotRun`、`PostgreSQLDown`、`RedisDown`、110 disk-pressure alerts are resolved, and SOP includes restore helpers. |
|
||||
| P1-016 | DONE | 100 | Control repeated Telegram notification noise without hiding real alerts | 2026-06-24 confirmed MOMO Pro 5-minute spam came from a legacy 110 script checking `http://192.168.0.188/health`; live script now uses `https://mo.wooo.work/health` as primary truth and manual readback returned `OK: public health 200; no alert`. Generic docker-health monitor keeps 5-minute repair checks but adds a separate 30-minute direct Telegram fallback cooldown. Bitan public-content cleanliness keeps failure notification but suppresses the same failure fingerprint for 6 hours and emits one recovery notice. | Fold remaining cross-product direct Telegram egress into the unified notification gateway over time; do not disable real warning/failure/recovery signals. | Healthy heartbeat is quiet, MOMO public health success produces no alert, repeated same-failure direct fallback paths are cooled, and real failure/recovery notifications remain enabled. |
|
||||
| P1-016 | DONE | 100 | Control repeated Telegram notification noise without hiding real alerts | 2026-06-24 confirmed MOMO Pro 5-minute spam came from a legacy 110 script checking `http://192.168.0.188/health`; live script now uses `https://mo.wooo.work/health` as primary truth. Heartbeat warning dedupe now hashes stable actionable fingerprints so HTTP status / timeout / latency drift does not create a new Telegram event every 30 minutes. `MoWoooWorkDown` now labels `component=momo-pro-system`, disables blind auto-repair, and requires public/local/container/data-freshness triage. Generic docker-health monitor keeps 5-minute repair checks but adds a separate 30-minute direct Telegram fallback cooldown. Bitan public-content cleanliness keeps failure notification with same-fingerprint cooldown and one recovery notice. | Fold remaining cross-product direct Telegram egress into the unified notification gateway over time; do not disable real warning/failure/recovery signals. Production deployment/readback must confirm the code and Prometheus rules are live before declaring runtime closure. | Healthy heartbeat is quiet, same actionable heartbeat warning is deduped, MOMO public health success produces no alert, repeated same-failure direct fallback paths are cooled, and real failure/recovery/new-warning notifications remain enabled. |
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -945,13 +945,14 @@ groups:
|
||||
labels:
|
||||
severity: critical
|
||||
layer: external
|
||||
component: momo-app
|
||||
component: momo-pro-system
|
||||
host: "188"
|
||||
team: ops
|
||||
auto_repair: "true"
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "外部網站 mo.wooo.work 離線"
|
||||
description: "mo.wooo.work 探測失敗超過 3 分鐘,容器 momo-app (188) 可能需要重啟"
|
||||
description: "mo.wooo.work public route 探測失敗超過 3 分鐘;先檢查 https://mo.wooo.work/health、188 local http://127.0.0.1:5003/health 與容器 momo-pro-system,不可只因 502 盲目重啟。"
|
||||
runbook: "先比對 public health、188 local app health、momo-pro-system / momo-db / momo-scheduler 狀態與最新 import/data freshness;若 public 502 但 local healthy,優先查 188 Nginx / upstream / TLS,不直接重啟容器。"
|
||||
|
||||
- alert: TsenyangWebsiteDown
|
||||
expr: probe_success{job="blackbox-http", instance="https://www.tsenyang.com"} == 0
|
||||
|
||||
@@ -818,13 +818,14 @@ groups:
|
||||
labels:
|
||||
severity: critical
|
||||
layer: external
|
||||
component: momo-app
|
||||
component: momo-pro-system
|
||||
host: "188"
|
||||
team: ops
|
||||
auto_repair: "true"
|
||||
auto_repair: "false"
|
||||
annotations:
|
||||
summary: "外部網站 mo.wooo.work 離線"
|
||||
description: "mo.wooo.work 探測失敗超過 3 分鐘,容器 momo-app (188) 可能需要重啟"
|
||||
description: "mo.wooo.work public route 探測失敗超過 3 分鐘;先檢查 https://mo.wooo.work/health、188 local http://127.0.0.1:5003/health 與容器 momo-pro-system,不可只因 502 盲目重啟。"
|
||||
runbook: "先比對 public health、188 local app health、momo-pro-system / momo-db / momo-scheduler 狀態與最新 import/data freshness;若 public 502 但 local healthy,優先查 188 Nginx / upstream / TLS,不直接重啟容器。"
|
||||
|
||||
- alert: TsenyangWebsiteDown
|
||||
expr: probe_success{job="blackbox-http", instance="https://www.tsenyang.com"} == 0
|
||||
|
||||
Reference in New Issue
Block a user