From 7768924fea4b72ffece34cc1354afc096b85bb60 Mon Sep 17 00:00:00 2001
From: OG T
Date: Fri, 10 Apr 2026 11:52:04 +0800
Subject: [PATCH] =?UTF-8?q?fix(flywheel):=20=E8=87=AA=E5=8B=95=E4=BF=AE?=
 =?UTF-8?q?=E5=BE=A9=E5=BE=8C=E7=A7=BB=E9=99=A4=20Telegram=20=E6=8C=89?=
 =?UTF-8?q?=E9=88=95=20+=20=E5=BF=83=E8=B7=B3=E5=91=8A=E8=AD=A6=E6=8E=92?=
 =?UTF-8?q?=E9=99=A4=E9=A3=9B=E8=BC=AA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

問題: 自動修復成功後 Telegram 卡片仍顯示批准/拒絕/靜默按鈕

Fix 1 — Telegram 卡片回饋閉環 (積木化合規):
- telegram_gateway.send_approval_card: 發送後自動存 tg_approval:{id} 到 Redis
- telegram_gateway.mark_auto_repaired(): 新方法 — 移除按鈕 + reply 結果
- _try_auto_repair_background: 改呼叫 gateway.mark_auto_repaired() (Service 層)

Fix 2 — 心跳/看門狗告警排除飛輪:
- constants.py: is_heartbeat_alertname() + HEARTBEAT_ALERT_NAMES
- NoAlertsReceived2Hours 等不觸發 _try_auto_repair_background

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (ignored by git am; not part of the commit message):

- NOTE(review): this patch was rebuilt from a whitespace-flattened copy.
  Line breaks are exact, but the indentation of context/removed lines and
  the spacing inside removed string literals are a best guess. Apply with
  `git am --ignore-whitespace` (or `git apply --ignore-whitespace`) and
  verify the result. The `index` blob ids below predate the review changes.
- webhooks.py: the return value of mark_auto_repaired() is now checked. When
  the card cannot be updated, the former standalone result message is sent
  as a fallback so the outcome is never dropped silently, and the op-log
  action_detail records which path actually ran.
- constants.py: is_heartbeat_alertname() returns False for a non-string
  alertname instead of raising TypeError.
- telegram_gateway.py: the outcome reply sets allow_sending_without_reply so
  it is still delivered when the card message itself was deleted.
- NOTE(review): telegram_gateway.py is assumed to already import get_redis
  and html; this patch adds no import for either. Please confirm.

 apps/api/src/api/v1/webhooks.py           | 49 +++++++++++++-----
 apps/api/src/core/constants.py            | 35 +++++++++++++
 apps/api/src/services/telegram_gateway.py | 77 ++++++++++++++++++++++++++++-
 3 files changed, 147 insertions(+), 14 deletions(-)

diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py
index 63b2ab3d..f5109649 100644
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -30,7 +30,7 @@ from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request,
 from pydantic import BaseModel, Field
 
 from src.core.config import settings
-from src.core.constants import is_cicd_alertname
+from src.core.constants import is_cicd_alertname, is_heartbeat_alertname
 from src.core.logging import get_logger
 from src.core.metrics import record_alert_chain_success
 
@@ -206,26 +206,39 @@ async def _try_auto_repair_background(
             },
         )
 
-        # 通知 Telegram 自動修復結果
+        # 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: after an auto-repair, update the
+        # Telegram approval card through the TelegramGateway service layer (strip the
+        # buttons, reply with the outcome).
         if result:
             try:
                 telegram = get_telegram_gateway()
-                status_icon = "✅" if result.success else "❌"
-                steps_summary = "\n".join(f" • {s}" for s in result.executed_steps[:3]) or "-"
-                await telegram.send_message(
-                    f"{status_icon} *自動修復{'完成' if result.success else '失敗'}*\n"
-                    f"資源: `{target_resource}` ({namespace})\n"
-                    f"告警: {alert_type}\n"
-                    f"耗時: {result.execution_time_ms}ms\n"
-                    f"步驟:\n{steps_summary}"
+                _pb_name = decision.playbook.name if decision.playbook else "unknown"
+                _card_updated = await telegram.mark_auto_repaired(
+                    approval_id=approval_id,
+                    playbook_name=_pb_name,
+                    execution_time_ms=result.execution_time_ms,
+                    success=result.success,
                 )
-                # 記錄 Telegram 推送
+                if not _card_updated:
+                    # Card message id unknown or the update failed: fall back to the
+                    # standalone result message so the outcome is never dropped silently.
+                    status_icon = "✅" if result.success else "❌"
+                    steps_summary = "\n".join(f" • {s}" for s in result.executed_steps[:3]) or "-"
+                    await telegram.send_message(
+                        f"{status_icon} *自動修復{'完成' if result.success else '失敗'}*\n"
+                        f"資源: `{target_resource}` ({namespace})\n"
+                        f"告警: {alert_type}\n"
+                        f"耗時: {result.execution_time_ms}ms\n"
+                        f"步驟:\n{steps_summary}"
+                    )
                 await op_log.append(
                     "TELEGRAM_RESULT_SENT",
                     incident_id=incident_id,
                     approval_id=approval_id,
                     actor="system",
-                    action_detail="auto_repair_result",
+                    action_detail=(
+                        "auto_repair_card_updated" if _card_updated else "auto_repair_result"
+                    ),
                     success=result.success,
                     context={"target_resource": target_resource, "namespace": namespace},
                 )
@@ -1303,7 +1316,17 @@ async def alertmanager_webhook(
         # Sprint 5.1 Q9: auto_repair=false 旗標 → 強制 HITL,不觸發背景任務
         # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
         # ================================================================
-        if _can_auto_repair_by_rule:
+        # 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: heartbeat/watchdog alerts stay out of
+        # the flywheel; NoAlertsReceived2Hours etc. report monitoring state, not a service failure.
+        _is_heartbeat = is_heartbeat_alertname(alertname)
+        if _is_heartbeat:
+            logger.info(
+                "auto_repair_skipped_heartbeat",
+                incident_id=incident_id,
+                alertname=alertname,
+            )
+
+        if _can_auto_repair_by_rule and not _is_heartbeat:
             background_tasks.add_task(
                 _try_auto_repair_background,
                 incident_id=incident_id,
diff --git a/apps/api/src/core/constants.py b/apps/api/src/core/constants.py
index e77d5f5d..43895e6e 100644
--- a/apps/api/src/core/constants.py
+++ b/apps/api/src/core/constants.py
@@ -85,6 +85,41 @@ CICD_ALERT_SUFFIXES = (
 # CI/CD 告警關鍵字 (不區分大小寫)
 CICD_ALERT_KEYWORDS = ("CI/CD", "cicd")
 
+# =============================================================================
+# Heartbeat/Watchdog Alert Detection (2026-04-10 Claude Sonnet 4.6 Asia/Taipei)
+# Heartbeat/watchdog alerts never trigger the auto-repair flywheel: they report
+# the state of the monitoring system itself, not a service failure.
+# =============================================================================
+HEARTBEAT_ALERT_NAMES = frozenset({
+    "Watchdog",
+    "DeadMansSwitch",
+    "NoAlertsReceived",
+    "NoAlertsReceived2Hours",
+    "AlertmanagerDown",
+    "PrometheusNotConnectedToAlertmanager",
+})
+
+HEARTBEAT_ALERT_KEYWORDS = ("NoAlertsReceived", "Watchdog", "DeadMansSwitch", "Heartbeat")
+
+
+def is_heartbeat_alertname(alertname: str) -> bool:
+    """
+    Return True when ``alertname`` is a heartbeat/watchdog alert.
+
+    A name matches when it is listed in HEARTBEAT_ALERT_NAMES or contains any
+    HEARTBEAT_ALERT_KEYWORDS entry as a case-sensitive substring. Such alerts
+    describe the health of the monitoring system itself rather than a service
+    failure, so they must stay out of the auto-repair flywheel (no playbook
+    action exists that could repair them).
+
+    Anything that is not a ``str`` (e.g. a missing label passed as None) is
+    reported as "not a heartbeat" instead of raising TypeError.
+    """
+    return isinstance(alertname, str) and (
+        alertname in HEARTBEAT_ALERT_NAMES
+        or any(kw in alertname for kw in HEARTBEAT_ALERT_KEYWORDS)
+    )
+
 
 def is_cicd_alertname(alertname: str) -> bool:
     """
diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py
index f997978a..7c9ad268 100644
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -1450,12 +1450,21 @@ class TelegramGateway:
 
         result = await self._send_request("sendMessage", payload)
 
+        _msg_id = result.get("result", {}).get("message_id")
         logger.info(
             "telegram_approval_card_sent",
             approval_id=approval_id,
-            message_id=result.get("result", {}).get("message_id"),
+            message_id=_msg_id,
         )
 
+        # 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: keep the card's message_id so the
+        # card can be updated after auto-repair. Key: tg_approval:{approval_id}, TTL 24h.
+        if _msg_id:
+            try:
+                await get_redis().setex(f"tg_approval:{approval_id}", 86400, str(_msg_id))
+            except Exception as _e:
+                logger.warning("tg_approval_msg_id_store_failed", approval_id=approval_id, error=str(_e))
+
         # 2026-04-03 ogt: 發到 SRE 群組並觸發 AI 雙向討論 (Triumvirate ADR-053)
         # 2026-04-05 ogt: 升級為完整 v7.0 格式,與個人 chat 一致
         # 非同步執行,失敗不影響告警主流程
@@ -2503,6 +2512,72 @@ class TelegramGateway:
             # 文字更新失敗不影響整體流程,按鈕已移除
             logger.warning("telegram_update_text_failed", message_id=message_id, error=str(e))
 
+    async def mark_auto_repaired(
+        self,
+        approval_id: str,
+        playbook_name: str,
+        execution_time_ms: int,
+        success: bool = True,
+    ) -> bool:
+        """
+        Update the Telegram approval card once an auto-repair has finished.
+
+        1. Remove the inline buttons from the card whose message id is stored
+           in Redis under ``tg_approval:{approval_id}``.
+        2. Reply to that card with the repair outcome.
+
+        Args:
+            approval_id: Approval id used to look up the card's message id.
+            playbook_name: Playbook name shown (HTML-escaped) in the reply.
+            execution_time_ms: Repair duration in milliseconds shown in the reply.
+            success: Selects the success or the failure wording of the reply.
+
+        Returns:
+            True once the outcome reply has been sent; False when no message id
+            is stored or an error was caught (errors are logged, not re-raised).
+
+        2026-04-10 Claude Sonnet 4.6 Asia/Taipei (ADR-068 closed loop)
+        """
+        try:
+            stored = await get_redis().get(f"tg_approval:{approval_id}")
+            if not stored:
+                logger.warning("mark_auto_repaired_no_msg_id", approval_id=approval_id)
+                return False
+
+            message_id = int(stored)
+
+            # Strip the buttons; a Telegram error here must not block the reply.
+            try:
+                await self._send_request("editMessageReplyMarkup", {
+                    "chat_id": self.chat_id,
+                    "message_id": message_id,
+                    "reply_markup": {"inline_keyboard": []},
+                })
+            except TelegramGatewayError as e:
+                logger.warning("mark_auto_repaired_remove_buttons_failed", message_id=message_id, error=str(e))
+
+            # Reply to the original card with the outcome.
+            _status = "✅ 已自動修復" if success else "❌ 自動修復失敗"
+            await self._send_request("sendMessage", {
+                "chat_id": self.chat_id,
+                "text": (
+                    f"{_status}\n"
+                    f"Playbook: {html.escape(playbook_name)}\n"
+                    f"耗時: {execution_time_ms}ms"
+                ),
+                "parse_mode": "HTML",
+                # Still deliver the outcome when the card itself was deleted.
+                "reply_parameters": {
+                    "message_id": message_id,
+                    "allow_sending_without_reply": True,
+                },
+            })
+            return True
+
+        except Exception as e:
+            logger.warning("mark_auto_repaired_failed", approval_id=approval_id, error=str(e))
+            return False
+
     async def append_incident_update(
         self,
         incident_id: str,