fix(flywheel): 自動修復後移除 Telegram 按鈕 + 心跳告警排除飛輪
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 6m56s
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 6m56s
問題: 自動修復成功後 Telegram 卡片仍顯示批准/拒絕/靜默按鈕
Fix 1 — Telegram 卡片回饋閉環 (積木化合規):
- telegram_gateway.send_approval_card: 發送後自動存 tg_approval:{id} 到 Redis
- telegram_gateway.mark_auto_repaired(): 新方法 — 移除按鈕 + reply 結果
- _try_auto_repair_background: 改呼叫 gateway.mark_auto_repaired() (Service 層)
Fix 2 — 心跳/看門狗告警排除飛輪:
- constants.py: is_heartbeat_alertname() + HEARTBEAT_ALERT_NAMES
- NoAlertsReceived2Hours 等不觸發 _try_auto_repair_background
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -30,7 +30,7 @@ from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request,
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from src.core.config import settings
|
||||
from src.core.constants import is_cicd_alertname
|
||||
from src.core.constants import is_cicd_alertname, is_heartbeat_alertname
|
||||
from src.core.logging import get_logger
|
||||
from src.core.metrics import record_alert_chain_success
|
||||
|
||||
@@ -206,26 +206,23 @@ async def _try_auto_repair_background(
|
||||
},
|
||||
)
|
||||
|
||||
# 通知 Telegram 自動修復結果
|
||||
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 自動修復後更新 Telegram 卡片
|
||||
# 透過 TelegramGateway Service 層移除按鈕並回覆結果 (積木化鐵律)
|
||||
if result:
|
||||
try:
|
||||
telegram = get_telegram_gateway()
|
||||
status_icon = "✅" if result.success else "❌"
|
||||
steps_summary = "\n".join(f" • {s}" for s in result.executed_steps[:3]) or "-"
|
||||
await telegram.send_message(
|
||||
f"{status_icon} *自動修復{'完成' if result.success else '失敗'}*\n"
|
||||
f"資源: `{target_resource}` ({namespace})\n"
|
||||
f"告警: {alert_type}\n"
|
||||
f"耗時: {result.execution_time_ms}ms\n"
|
||||
f"步驟:\n{steps_summary}"
|
||||
_pb_name = decision.playbook.name if decision.playbook else "unknown"
|
||||
await get_telegram_gateway().mark_auto_repaired(
|
||||
approval_id=approval_id,
|
||||
playbook_name=_pb_name,
|
||||
execution_time_ms=result.execution_time_ms,
|
||||
success=result.success,
|
||||
)
|
||||
# 記錄 Telegram 推送
|
||||
await op_log.append(
|
||||
"TELEGRAM_RESULT_SENT",
|
||||
incident_id=incident_id,
|
||||
approval_id=approval_id,
|
||||
actor="system",
|
||||
action_detail="auto_repair_result",
|
||||
action_detail="auto_repair_card_updated",
|
||||
success=result.success,
|
||||
context={"target_resource": target_resource, "namespace": namespace},
|
||||
)
|
||||
@@ -1303,7 +1300,17 @@ async def alertmanager_webhook(
|
||||
# Sprint 5.1 Q9: auto_repair=false 旗標 → 強制 HITL,不觸發背景任務
|
||||
# (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
|
||||
# ================================================================
|
||||
if _can_auto_repair_by_rule:
|
||||
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 心跳/看門狗告警不進飛輪
|
||||
# NoAlertsReceived2Hours 等代表監控系統狀態,不是服務故障
|
||||
_is_heartbeat = is_heartbeat_alertname(alertname)
|
||||
if _is_heartbeat:
|
||||
logger.info(
|
||||
"auto_repair_skipped_heartbeat",
|
||||
incident_id=incident_id,
|
||||
alertname=alertname,
|
||||
)
|
||||
|
||||
if _can_auto_repair_by_rule and not _is_heartbeat:
|
||||
background_tasks.add_task(
|
||||
_try_auto_repair_background,
|
||||
incident_id=incident_id,
|
||||
|
||||
@@ -85,6 +85,34 @@ CICD_ALERT_SUFFIXES = (
|
||||
# CI/CD 告警關鍵字 (不區分大小寫)
|
||||
CICD_ALERT_KEYWORDS = ("CI/CD", "cicd")
|
||||
|
||||
# =============================================================================
|
||||
# Heartbeat/Watchdog Alert Detection (2026-04-10 Claude Sonnet 4.6 Asia/Taipei)
|
||||
# 心跳/看門狗告警不觸發自動修復飛輪 — 這類告警代表監控系統狀態,不是服務故障
|
||||
# =============================================================================
|
||||
# Exact alert names treated as heartbeat/watchdog signals.
HEARTBEAT_ALERT_NAMES = frozenset({
    "Watchdog",
    "DeadMansSwitch",
    "NoAlertsReceived",
    "NoAlertsReceived2Hours",
    "AlertmanagerDown",
    "PrometheusNotConnectedToAlertmanager",
})

# Substrings that mark an alert name as heartbeat-like (case-sensitive match).
HEARTBEAT_ALERT_KEYWORDS = ("NoAlertsReceived", "Watchdog", "DeadMansSwitch", "Heartbeat")


def is_heartbeat_alertname(alertname: str) -> bool:
    """
    Return True when ``alertname`` denotes a heartbeat/watchdog alert.

    Heartbeat alerts describe the health of the monitoring system itself
    rather than a service failure, so they must not enter the auto-repair
    flywheel (no Playbook repair action exists for them).

    An alert name matches when it is listed in ``HEARTBEAT_ALERT_NAMES`` or
    contains any entry of ``HEARTBEAT_ALERT_KEYWORDS`` as a substring.

    Args:
        alertname: The alert name to classify. ``None``, an empty string or
            any non-string value is treated as "not a heartbeat" instead of
            raising, so a payload without an alert name cannot crash callers.

    Returns:
        True if the name is a heartbeat/watchdog alert, otherwise False.
    """
    # Guard first: the substring test below raises TypeError on non-strings.
    if not isinstance(alertname, str) or not alertname:
        return False

    return (
        alertname in HEARTBEAT_ALERT_NAMES
        or any(kw in alertname for kw in HEARTBEAT_ALERT_KEYWORDS)
    )
|
||||
|
||||
|
||||
def is_cicd_alertname(alertname: str) -> bool:
|
||||
"""
|
||||
|
||||
@@ -1450,12 +1450,21 @@ class TelegramGateway:
|
||||
|
||||
result = await self._send_request("sendMessage", payload)
|
||||
|
||||
_msg_id = result.get("result", {}).get("message_id")
|
||||
logger.info(
|
||||
"telegram_approval_card_sent",
|
||||
approval_id=approval_id,
|
||||
message_id=result.get("result", {}).get("message_id"),
|
||||
message_id=_msg_id,
|
||||
)
|
||||
|
||||
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 儲存 message_id 供自動修復後更新卡片
|
||||
# key: tg_approval:{approval_id},TTL 24h
|
||||
if _msg_id:
|
||||
try:
|
||||
await get_redis().setex(f"tg_approval:{approval_id}", 86400, str(_msg_id))
|
||||
except Exception as _e:
|
||||
logger.warning("tg_approval_msg_id_store_failed", approval_id=approval_id, error=str(_e))
|
||||
|
||||
# 2026-04-03 ogt: 發到 SRE 群組並觸發 AI 雙向討論 (Triumvirate ADR-053)
|
||||
# 2026-04-05 ogt: 升級為完整 v7.0 格式,與個人 chat 一致
|
||||
# 非同步執行,失敗不影響告警主流程
|
||||
@@ -2503,6 +2512,56 @@ class TelegramGateway:
|
||||
# 文字更新失敗不影響整體流程,按鈕已移除
|
||||
logger.warning("telegram_update_text_failed", message_id=message_id, error=str(e))
|
||||
|
||||
async def mark_auto_repaired(
    self,
    approval_id: str,
    playbook_name: str,
    execution_time_ms: int,
    success: bool = True,
) -> bool:
    """
    Update the Telegram approval card after an auto-repair run has finished.

    Steps:
      1. Remove the approve / reject / silence inline buttons from the card.
      2. Reply to the original card with the repair outcome.

    The Telegram message id of the card is looked up in Redis under
    ``tg_approval:{approval_id}``.

    Args:
        approval_id: Approval identifier used to build the Redis key.
        playbook_name: Name of the executed playbook; HTML-escaped before
            being embedded in the reply.
        execution_time_ms: Execution time reported in the reply, in ms.
        success: Whether the auto-repair succeeded; selects the status line.

    Returns:
        True when the result message was sent. False when no message id is
        stored for ``approval_id`` or when any step other than the button
        removal fails. This method logs a warning instead of raising.

    2026-04-10 Claude Sonnet 4.6 Asia/Taipei (ADR-068 closed loop)
    """
    try:
        stored = await get_redis().get(f"tg_approval:{approval_id}")
        if not stored:
            logger.warning("mark_auto_repaired_no_msg_id", approval_id=approval_id)
            return False

        # int() accepts both str and bytes, whichever the Redis client returns.
        message_id = int(stored)

        # Remove the buttons. Best-effort: a failure here must not prevent
        # the result message below from being sent.
        try:
            await self._send_request("editMessageReplyMarkup", {
                "chat_id": self.chat_id,
                "message_id": message_id,
                "reply_markup": {"inline_keyboard": []},
            })
        except TelegramGatewayError as e:
            logger.warning("mark_auto_repaired_remove_buttons_failed", message_id=message_id, error=str(e))

        # Reply to the original card with the outcome.
        _status = "✅ 已自動修復" if success else "❌ 自動修復失敗"
        await self._send_request("sendMessage", {
            "chat_id": self.chat_id,
            "text": (
                f"{_status}\n"
                f"Playbook: <code>{html.escape(playbook_name)}</code>\n"
                f"耗時: {execution_time_ms}ms"
            ),
            "parse_mode": "HTML",
            "reply_parameters": {
                "message_id": message_id,
                # Still deliver the result if the original card is gone
                # (e.g. deleted); otherwise Telegram rejects the request and
                # the operator never sees the repair outcome.
                "allow_sending_without_reply": True,
            },
        })
        return True

    except Exception as e:
        # Service-layer boundary: card updates are best-effort and must not
        # break the auto-repair flow, so log and report failure.
        logger.warning("mark_auto_repaired_failed", approval_id=approval_id, error=str(e))
        return False
|
||||
|
||||
async def append_incident_update(
|
||||
self,
|
||||
incident_id: str,
|
||||
|
||||
Reference in New Issue
Block a user