fix(flywheel): 自動修復後移除 Telegram 按鈕 + 心跳告警排除飛輪
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 6m56s

問題: 自動修復成功後 Telegram 卡片仍顯示批准/拒絕/靜默按鈕

Fix 1 — Telegram 卡片回饋閉環 (積木化合規):
- telegram_gateway.send_approval_card: 發送後自動存 tg_approval:{id} 到 Redis
- telegram_gateway.mark_auto_repaired(): 新方法 — 移除按鈕 + reply 結果
- _try_auto_repair_background: 改呼叫 gateway.mark_auto_repaired() (Service 層)

Fix 2 — 心跳/看門狗告警排除飛輪:
- constants.py: is_heartbeat_alertname() + HEARTBEAT_ALERT_NAMES
- NoAlertsReceived2Hours 等不觸發 _try_auto_repair_background

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-10 11:52:04 +08:00
parent a42e9f6c8f
commit 7768924fea
3 changed files with 109 additions and 15 deletions

View File

@@ -30,7 +30,7 @@ from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request,
from pydantic import BaseModel, Field
from src.core.config import settings
from src.core.constants import is_cicd_alertname
from src.core.constants import is_cicd_alertname, is_heartbeat_alertname
from src.core.logging import get_logger
from src.core.metrics import record_alert_chain_success
@@ -206,26 +206,23 @@ async def _try_auto_repair_background(
},
)
# 通知 Telegram 自動修復結果
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 自動修復後更新 Telegram 卡片
# 透過 TelegramGateway Service 層移除按鈕並回覆結果 (積木化鐵律)
if result:
try:
telegram = get_telegram_gateway()
status_icon = "" if result.success else ""
steps_summary = "\n".join(f"{s}" for s in result.executed_steps[:3]) or "-"
await telegram.send_message(
f"{status_icon} *自動修復{'完成' if result.success else '失敗'}*\n"
f"資源: `{target_resource}` ({namespace})\n"
f"告警: {alert_type}\n"
f"耗時: {result.execution_time_ms}ms\n"
f"步驟:\n{steps_summary}"
_pb_name = decision.playbook.name if decision.playbook else "unknown"
await get_telegram_gateway().mark_auto_repaired(
approval_id=approval_id,
playbook_name=_pb_name,
execution_time_ms=result.execution_time_ms,
success=result.success,
)
# 記錄 Telegram 推送
await op_log.append(
"TELEGRAM_RESULT_SENT",
incident_id=incident_id,
approval_id=approval_id,
actor="system",
action_detail="auto_repair_result",
action_detail="auto_repair_card_updated",
success=result.success,
context={"target_resource": target_resource, "namespace": namespace},
)
@@ -1303,7 +1300,17 @@ async def alertmanager_webhook(
# Sprint 5.1 Q9: auto_repair=false 旗標 → 強制 HITL,不觸發背景任務
# (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
# ================================================================
if _can_auto_repair_by_rule:
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 心跳/看門狗告警不進飛輪
# NoAlertsReceived2Hours 等代表監控系統狀態,不是服務故障
_is_heartbeat = is_heartbeat_alertname(alertname)
if _is_heartbeat:
logger.info(
"auto_repair_skipped_heartbeat",
incident_id=incident_id,
alertname=alertname,
)
if _can_auto_repair_by_rule and not _is_heartbeat:
background_tasks.add_task(
_try_auto_repair_background,
incident_id=incident_id,

View File

@@ -85,6 +85,34 @@ CICD_ALERT_SUFFIXES = (
# CI/CD 告警關鍵字 (不區分大小寫)
CICD_ALERT_KEYWORDS = ("CI/CD", "cicd")
# =============================================================================
# Heartbeat/Watchdog Alert Detection (2026-04-10 Claude Sonnet 4.6 Asia/Taipei)
# Heartbeat/watchdog alerts must not trigger the auto-repair flywheel: they
# describe the state of the monitoring system itself, not a service failure.
# =============================================================================

# Exact alert names that are always classified as heartbeat/watchdog alerts.
HEARTBEAT_ALERT_NAMES = frozenset({
    "Watchdog",
    "DeadMansSwitch",
    "NoAlertsReceived",
    "NoAlertsReceived2Hours",
    "AlertmanagerDown",
    "PrometheusNotConnectedToAlertmanager",
})

# Case-sensitive substrings; an alertname containing any of them is also
# classified as a heartbeat/watchdog alert.
HEARTBEAT_ALERT_KEYWORDS = ("NoAlertsReceived", "Watchdog", "DeadMansSwitch", "Heartbeat")


def is_heartbeat_alertname(alertname: str) -> bool:
    """
    Return True when ``alertname`` identifies a heartbeat/watchdog alert.

    Heartbeat alerts report the health of the monitoring system itself rather
    than a service failure, so they should not enter the auto-repair flywheel
    (there is no matching playbook repair action for them).

    Args:
        alertname: Alert name to classify. A missing (``None``), empty, or
            non-string value is classified as "not a heartbeat" instead of
            raising, so a malformed alert cannot crash the caller.

    Returns:
        True if the name is listed in ``HEARTBEAT_ALERT_NAMES`` or contains
        any of ``HEARTBEAT_ALERT_KEYWORDS`` (case-sensitive); False otherwise.
    """
    # Guard first: ``kw in None`` would raise TypeError in the keyword scan.
    if not isinstance(alertname, str) or not alertname:
        return False
    return (
        alertname in HEARTBEAT_ALERT_NAMES
        or any(kw in alertname for kw in HEARTBEAT_ALERT_KEYWORDS)
    )
def is_cicd_alertname(alertname: str) -> bool:
"""

View File

@@ -1450,12 +1450,21 @@ class TelegramGateway:
result = await self._send_request("sendMessage", payload)
_msg_id = result.get("result", {}).get("message_id")
logger.info(
"telegram_approval_card_sent",
approval_id=approval_id,
message_id=result.get("result", {}).get("message_id"),
message_id=_msg_id,
)
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 儲存 message_id 供自動修復後更新卡片
# key: tg_approval:{approval_id},TTL 24h
if _msg_id:
try:
await get_redis().setex(f"tg_approval:{approval_id}", 86400, str(_msg_id))
except Exception as _e:
logger.warning("tg_approval_msg_id_store_failed", approval_id=approval_id, error=str(_e))
# 2026-04-03 ogt: 發到 SRE 群組並觸發 AI 雙向討論 (Triumvirate ADR-053)
# 2026-04-05 ogt: 升級為完整 v7.0 格式,與個人 chat 一致
# 非同步執行,失敗不影響告警主流程
@@ -2503,6 +2512,56 @@ class TelegramGateway:
# 文字更新失敗不影響整體流程,按鈕已移除
logger.warning("telegram_update_text_failed", message_id=message_id, error=str(e))
async def mark_auto_repaired(
    self,
    approval_id: str,
    playbook_name: str,
    execution_time_ms: int,
    success: bool = True,
) -> bool:
    """
    Update the Telegram approval card once an auto-repair attempt has finished.

    Steps:
      1. Look up the card's message id in Redis under ``tg_approval:{approval_id}``.
      2. Clear the card's inline keyboard (best-effort: a gateway error is
         logged and the flow continues).
      3. Reply to the original message with the repair outcome.

    2026-04-10 Claude Sonnet 4.6 Asia/Taipei (ADR-068 closed loop)

    Args:
        approval_id: Approval identifier used to build the Redis lookup key.
        playbook_name: Playbook name shown in the reply (HTML-escaped).
        execution_time_ms: Repair duration in milliseconds shown in the reply.
        success: Selects the success or failure headline of the reply.

    Returns:
        True when the result reply was sent; False when no message id is
        stored for the approval or when any other error occurred (errors are
        logged, never raised).
    """
    try:
        redis_key = f"tg_approval:{approval_id}"
        cached_id = await get_redis().get(redis_key)
        if cached_id:
            message_id = int(cached_id)
        else:
            logger.warning("mark_auto_repaired_no_msg_id", approval_id=approval_id)
            return False

        # Step 1: strip the inline keyboard. Only gateway errors are tolerated
        # here; anything else falls through to the outer handler.
        try:
            strip_keyboard_payload = {
                "chat_id": self.chat_id,
                "message_id": message_id,
                "reply_markup": {"inline_keyboard": []},
            }
            await self._send_request("editMessageReplyMarkup", strip_keyboard_payload)
        except TelegramGatewayError as remove_err:
            logger.warning(
                "mark_auto_repaired_remove_buttons_failed",
                message_id=message_id,
                error=str(remove_err),
            )

        # Step 2: reply to the original card with the outcome.
        if success:
            headline = "✅ 已自動修復"
        else:
            headline = "❌ 自動修復失敗"
        reply_payload = {
            "chat_id": self.chat_id,
            "text": (
                f"{headline}\n"
                f"Playbook: <code>{html.escape(playbook_name)}</code>\n"
                f"耗時: {execution_time_ms}ms"
            ),
            "parse_mode": "HTML",
            "reply_parameters": {"message_id": message_id},
        }
        await self._send_request("sendMessage", reply_payload)
        return True
    except Exception as err:
        # Best-effort notification: never let a card update break the caller.
        logger.warning("mark_auto_repaired_failed", approval_id=approval_id, error=str(err))
        return False
async def append_incident_update(
self,
incident_id: str,