Files
awoooi/apps/api/src/services/report_generation_service.py
Your Name 88af639651
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 9m46s
fix(report): 修正 approval_records.status 大小寫不一致
DB 以 SQLEnum 儲存 enum name(EXECUTION_FAILED 大寫),
而非 enum value(execution_failed 小寫)。
SQL 加 UPPER(status::text) 確保不論大小寫皆能命中。

驗證:live DB 查詢 success=0, failed=2(之前永遠 0/0)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 09:10:39 +08:00

587 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
自動報告生成服務 (Report Generation Service)
=============================================
ADR-076: 展現價值 — 日度巡檢報告 + 事後檢討 (Postmortem)
建立: 2026-04-14 (台北時區) Claude Haiku 4.5
功能:
1. 日度巡檢報告 — 每日 08:00 台北時間,收集前 24h 關鍵 KPI
2. 事後檢討 (Postmortem) — Incident resolved 且 duration > 10 分鐘自動觸發
設計原則:
- 遵循 leWOOOgo 積木化鐵律
- 不直接存取 Redis透過 Service 層)
- 所有數據從 DB 聚合,不使用假數據
- Graceful Degradation各資料來源失敗獨立處理
- 統帥鐵律:台北時區(+8禁止 UTC
報告流程:
日度巡檢: lifespan 啟動 → _run_daily_report_loop() 無限迴圈
→ 計算距下一個 08:00 台北時間的秒數
→ sleep → 收集數據 → 組裝 → Telegram 推送
Postmortem: Incident resolve 時,由呼叫方 await trigger_postmortem(incident)
"""
from __future__ import annotations
import asyncio
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
import structlog
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# 台北時區 (UTC+8)
_TZ_TAIPEI = timezone(timedelta(hours=8))
# 日度報告觸發時間(台北時間 08:00
DAILY_REPORT_HOUR_TAIPEI = 8
# Postmortem 觸發最低時長(分鐘)
POSTMORTEM_MIN_DURATION_MINUTES = 10
# =============================================================================
# Data Types
# =============================================================================
@dataclass
class DailyKpi:
"""24 小時 KPI 摘要"""
period_start: datetime
period_end: datetime
# 告警
total_alerts: int = 0
auto_resolved: int = 0
human_approved: int = 0
converged_alerts: int = 0
grouped_alerts: int = 0
# 自動修復
auto_repair_success: int = 0
auto_repair_failed: int = 0
# 飛輪
km_new_entries: int = 0
playbook_count: int = 0
# 告警分類分佈
alert_category_breakdown: dict[str, int] = field(default_factory=dict)
@property
def auto_repair_rate(self) -> float:
total = self.auto_repair_success + self.auto_repair_failed
return self.auto_repair_success / total if total > 0 else 0.0
@property
def auto_resolve_rate(self) -> float:
return self.auto_resolved / self.total_alerts if self.total_alerts > 0 else 0.0
@dataclass
class PostmortemData:
"""事後檢討資料"""
incident_id: str
title: str
duration_minutes: float
root_cause: str | None
resolution_action: str | None
ai_provider: str | None
auto_repaired: bool
retry_count: int
created_at: datetime
resolved_at: datetime
# =============================================================================
# ReportGenerationService
# =============================================================================
class ReportGenerationService:
"""
自動報告生成服務
統帥指令 (2026-04-14):
- 日度巡檢報告:每日 08:00 台北時間
- 事後檢討Incident resolved 且 duration > 10 分鐘
- 所有報告推送至 Telegram SRE 群組
"""
async def collect_daily_kpi(self) -> DailyKpi:
"""
收集過去 24 小時 KPI
資料來源: PostgreSQL (incidents, approvals, knowledge_entries)
Graceful Degradation: 每個資料源失敗獨立處理,不中止整體
Returns:
DailyKpi 摘要
"""
now = now_taipei()
period_start = now - timedelta(hours=24)
kpi = DailyKpi(period_start=period_start, period_end=now)
# 並行收集各項 KPI
results = await asyncio.gather(
self._collect_alert_stats(period_start),
self._collect_repair_stats(period_start),
self._collect_km_stats(period_start),
self._collect_playbook_count(),
return_exceptions=True,
)
alert_stats, repair_stats, km_stats, playbook_count = results
if isinstance(alert_stats, dict):
kpi.total_alerts = alert_stats.get("total", 0)
kpi.auto_resolved = alert_stats.get("auto_resolved", 0)
kpi.human_approved = alert_stats.get("human_approved", 0)
kpi.converged_alerts = alert_stats.get("converged", 0)
kpi.alert_category_breakdown = alert_stats.get("categories", {})
else:
logger.warning("daily_kpi_alert_stats_failed", error=str(alert_stats))
if isinstance(repair_stats, dict):
kpi.auto_repair_success = repair_stats.get("success", 0)
kpi.auto_repair_failed = repair_stats.get("failed", 0)
else:
logger.warning("daily_kpi_repair_stats_failed", error=str(repair_stats))
if isinstance(km_stats, int):
kpi.km_new_entries = km_stats
else:
logger.warning("daily_kpi_km_stats_failed", error=str(km_stats))
if isinstance(playbook_count, int):
kpi.playbook_count = playbook_count
else:
logger.warning("daily_kpi_playbook_count_failed", error=str(playbook_count))
return kpi
async def _collect_alert_stats(self, since: datetime) -> dict:
"""收集告警統計incident 表)"""
from sqlalchemy import func, select, text as sa_text
from src.db.base import get_db_context
from src.db.models import IncidentRecord
async with get_db_context() as db:
# 總數
total = await db.scalar(
select(func.count()).select_from(IncidentRecord).where(
IncidentRecord.created_at >= since
)
) or 0
# 自動解決status=resolved無人工簽核
auto_resolved = await db.scalar(
select(func.count()).select_from(IncidentRecord).where(
IncidentRecord.created_at >= since,
IncidentRecord.status == "resolved",
)
) or 0
# 告警分類分佈alert_category 欄位)
categories: dict[str, int] = {}
try:
cat_result = await db.execute(
sa_text(
"SELECT alert_category, COUNT(*) as cnt "
"FROM incidents "
"WHERE created_at >= :since AND alert_category IS NOT NULL "
"GROUP BY alert_category "
"ORDER BY cnt DESC "
"LIMIT 10"
).bindparams(since=since)
)
for row in cat_result:
categories[row[0]] = row[1]
except Exception as _cat_e:
logger.debug("alert_category_breakdown_failed", error=str(_cat_e))
return {
"total": total,
"auto_resolved": auto_resolved,
"human_approved": 0, # TODO: 從 signatures 表統計
"converged": 0, # 已由 DB hit_count 記錄,暫略
"categories": categories,
}
async def _collect_repair_stats(self, since: datetime) -> dict:
"""
收集自動修復統計
2026-04-22 Claude Sonnet 4.6 修復 — incidents.outcome JSON 在執行鏈路中從未被寫入
execution_success導致永遠查詢到 0。改查 approval_records.status 作為 source of truth
approval_execution.py 每次執行後都會寫入 EXECUTION_SUCCESS / EXECUTION_FAILED
"""
from sqlalchemy import text
from src.db.base import get_db_context
async with get_db_context() as db:
row = await db.execute(
text("""
SELECT
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_SUCCESS') AS success,
COUNT(*) FILTER (WHERE UPPER(status::text) = 'EXECUTION_FAILED') AS failed
FROM approval_records
WHERE created_at >= :since
"""),
{"since": since},
)
r = row.one()
return {"success": int(r.success or 0), "failed": int(r.failed or 0)}
async def _collect_km_stats(self, since: datetime) -> int:
"""收集新增 KM 條目數"""
from sqlalchemy import func, select
from src.db.base import get_db_context
from src.db.models import KnowledgeEntryRecord
async with get_db_context() as db:
count = await db.scalar(
select(func.count()).select_from(KnowledgeEntryRecord).where(
KnowledgeEntryRecord.created_at >= since
)
) or 0
return int(count)
async def _collect_playbook_count(self) -> int:
"""
收集活躍 Playbook 數量
2026-04-14 Claude Sonnet 4.6 修復 — Playbook 儲存在 Redis 非 PostgreSQL
改用 playbook_service.list_playbooks() 讀 Redis。
"""
from src.services.playbook_service import get_playbook_service
try:
svc = get_playbook_service()
playbooks, total = await svc.list_playbooks(limit=1000)
return int(total or len(playbooks))
except Exception as e:
logger.warning("daily_kpi_playbook_count_failed", error=str(e))
return 0
def format_daily_report(self, kpi: DailyKpi) -> str:
"""
組裝日度巡檢報告Telegram HTML 格式)
Args:
kpi: DailyKpi 摘要
Returns:
Telegram HTML 格式字串
"""
date_str = kpi.period_end.strftime("%Y-%m-%d")
period_str = f"{kpi.period_start.strftime('%H:%M')} ~ {kpi.period_end.strftime('%H:%M')}"
auto_repair_rate_pct = f"{kpi.auto_repair_rate * 100:.1f}%"
auto_resolve_rate_pct = f"{kpi.auto_resolve_rate * 100:.1f}%"
# 告警分類表
cat_lines = ""
if kpi.alert_category_breakdown:
for cat, cnt in list(kpi.alert_category_breakdown.items())[:6]:
cat_lines += f"\n{cat}: {cnt}"
# 整體健康度評估
if kpi.auto_repair_rate >= 0.8:
health_icon = "💚"
health_label = "優秀"
elif kpi.auto_repair_rate >= 0.5:
health_icon = "🟡"
health_label = "良好"
else:
health_icon = "🔴"
health_label = "需關注"
lines = [
f"<b>📊 AWOOOI 日度巡檢報告</b>",
f"<b>{date_str}</b> | {period_str} 台北時間",
"",
f"<b>{health_icon} 整體健康度: {health_label}</b>",
"",
"<b>🚨 告警統計</b>",
f" 總計: <b>{kpi.total_alerts}</b> 個",
f" 自動解決: {kpi.auto_resolved} 個 ({auto_resolve_rate_pct})",
f" 人工批准: {kpi.human_approved}",
f" 告警收斂: {kpi.converged_alerts}",
]
if cat_lines:
lines += [f"\n<b>📂 分類分佈</b>{cat_lines}"]
lines += [
"",
"<b>🔧 自動修復</b>",
f" 成功: {kpi.auto_repair_success}",
f" 失敗: {kpi.auto_repair_failed}",
f" 成功率: <b>{auto_repair_rate_pct}</b>",
"",
"<b>🧠 知識積累</b>",
f" 新增 KM 條目: {kpi.km_new_entries}",
f" 活躍 Playbook: {kpi.playbook_count}",
"",
f"<i>🤖 AWOOOI AIOps 自動生成 | {kpi.period_end.strftime('%Y-%m-%d %H:%M')} 台北時間</i>",
]
return "\n".join(lines)
def format_postmortem(self, data: PostmortemData) -> str:
"""
組裝事後檢討報告Telegram HTML 格式)
Args:
data: PostmortemData
Returns:
Telegram HTML 格式字串
"""
duration_str = f"{data.duration_minutes:.1f} 分鐘"
auto_str = "✅ 自動修復" if data.auto_repaired else "👤 人工介入"
retry_str = f"(重試 {data.retry_count} 次)" if data.retry_count > 0 else ""
created_str = data.created_at.strftime("%H:%M:%S")
resolved_str = data.resolved_at.strftime("%H:%M:%S")
lines = [
f"<b>📋 事後檢討 (Postmortem)</b>",
f"<b>Incident:</b> {data.incident_id}",
"",
f"<b>⏱ 影響時長:</b> {duration_str}",
f"<b>🕐 發生:</b> {created_str} → <b>解決:</b> {resolved_str}",
f"<b>🔧 處置方式:</b> {auto_str}{retry_str}",
]
if data.root_cause:
lines += [f"\n<b>🔍 根本原因</b>\n{data.root_cause[:300]}"]
if data.resolution_action:
lines += [f"\n<b>⚡ 執行動作</b>\n<code>{data.resolution_action[:200]}</code>"]
if data.ai_provider:
lines += [f"\n<i>AI 決策: {data.ai_provider}</i>"]
lines += [
"",
f"<i>🤖 AWOOOI Postmortem 自動生成 | {now_taipei().strftime('%Y-%m-%d %H:%M')} 台北時間</i>",
]
return "\n".join(lines)
async def send_daily_report(self) -> None:
"""
收集 KPI → 組裝 → 推送 Telegram SRE 群組
Graceful Degradation: 失敗只記錄 log不拋出例外
"""
try:
kpi = await self.collect_daily_kpi()
report_text = self.format_daily_report(kpi)
from src.services.telegram_gateway import get_telegram_gateway
gateway = get_telegram_gateway()
await gateway.send_to_group(report_text, parse_mode="HTML")
logger.info(
"daily_report_sent",
total_alerts=kpi.total_alerts,
auto_repair_rate=f"{kpi.auto_repair_rate:.1%}",
)
except Exception as e:
logger.error("daily_report_failed", error=str(e))
async def trigger_postmortem(
self,
incident_id: str,
title: str,
created_at: datetime,
resolved_at: datetime,
root_cause: str | None = None,
resolution_action: str | None = None,
ai_provider: str | None = None,
auto_repaired: bool = False,
retry_count: int = 0,
) -> None:
"""
觸發事後檢討報告
呼叫方incident_service.resolve_incident() 或 approval_execution.py
觸發條件duration > POSTMORTEM_MIN_DURATION_MINUTES
Args:
incident_id: Incident ID
title: Incident 標題
created_at: 建立時間
resolved_at: 解決時間
root_cause: 根本原因AI 分析結果)
resolution_action: 執行動作
ai_provider: 決策 AI provider
auto_repaired: 是否自動修復
retry_count: 重試次數
"""
duration_minutes = (resolved_at - created_at).total_seconds() / 60
if duration_minutes < POSTMORTEM_MIN_DURATION_MINUTES:
logger.debug(
"postmortem_skipped_short_duration",
incident_id=incident_id,
duration_minutes=duration_minutes,
min_required=POSTMORTEM_MIN_DURATION_MINUTES,
)
return
data = PostmortemData(
incident_id=incident_id,
title=title,
duration_minutes=duration_minutes,
root_cause=root_cause,
resolution_action=resolution_action,
ai_provider=ai_provider,
auto_repaired=auto_repaired,
retry_count=retry_count,
created_at=created_at,
resolved_at=resolved_at,
)
# 技術債修復 (2026-04-14 Claude Sonnet 4.6): 3 次重試 + 指數退避
# 失敗時發送告警到 SRE 群組,避免靜默吞掉錯誤
import asyncio as _asyncio
report_text = self.format_postmortem(data)
from src.services.telegram_gateway import get_telegram_gateway
gateway = get_telegram_gateway()
max_attempts = 3
backoff_seconds = 2.0
last_error: Exception | None = None
for attempt in range(1, max_attempts + 1):
try:
await gateway.send_to_group(report_text, parse_mode="HTML")
logger.info(
"postmortem_sent",
incident_id=incident_id,
duration_minutes=duration_minutes,
attempt=attempt,
)
return
except Exception as e:
last_error = e
logger.warning(
"postmortem_send_retry",
incident_id=incident_id,
attempt=attempt,
max_attempts=max_attempts,
error=str(e),
)
if attempt < max_attempts:
await _asyncio.sleep(backoff_seconds * attempt)
# 3 次全失敗 → 記 error + 嘗試簡化降級通知(防止完全靜默)
logger.error(
"postmortem_failed",
incident_id=incident_id,
error=str(last_error),
attempts=max_attempts,
)
try:
fallback_text = (
f"⚠️ <b>Postmortem 發送失敗 (3 次重試)</b>\n"
f"Incident: <code>{incident_id}</code>\n"
f"Duration: {duration_minutes:.1f} 分鐘\n"
f"Error: {str(last_error)[:200]}"
)
await gateway.send_to_group(fallback_text, parse_mode="HTML")
except Exception as _fe:
logger.error(
"postmortem_fallback_failed",
incident_id=incident_id,
error=str(_fe),
)
# =============================================================================
# 日度報告排程迴圈
# =============================================================================
def _seconds_until_next_report() -> float:
"""
計算距下一個 08:00 台北時間的秒數
Returns:
秒數float
"""
now = now_taipei()
target = now.replace(hour=DAILY_REPORT_HOUR_TAIPEI, minute=0, second=0, microsecond=0)
if now >= target:
# 已過今天的 08:00 → 等到明天
target += timedelta(days=1)
return (target - now).total_seconds()
async def run_daily_report_loop() -> None:
"""
日度巡檢報告無限排程迴圈
每次睡到下一個 08:00 台北時間,然後發送報告。
以 asyncio.create_task() 從 lifespan 啟動。
Graceful Degradation: 任何例外都只記錄 log迴圈繼續
"""
service = ReportGenerationService()
logger.info(
"daily_report_loop_started",
trigger_hour_taipei=DAILY_REPORT_HOUR_TAIPEI,
)
while True:
sleep_seconds = _seconds_until_next_report()
logger.info(
"daily_report_next_in",
sleep_seconds=int(sleep_seconds),
next_at=f"{DAILY_REPORT_HOUR_TAIPEI:02d}:00 台北時間",
)
await asyncio.sleep(sleep_seconds)
# 2026-04-22 Claude Sonnet 4.6: 多 Pod 競速保護 — 只有搶到 Redis SETNX 的 Pod 才發報告
from src.services.ai_advisory_helpers import try_acquire_daily_lock
if not await try_acquire_daily_lock("daily_report"):
logger.info("daily_report_skipped_other_pod")
continue
logger.info("daily_report_triggered")
await service.send_daily_report()
# =============================================================================
# Factory Function
# =============================================================================
_instance: ReportGenerationService | None = None
def get_report_generation_service() -> ReportGenerationService:
"""
取得 ReportGenerationService 單例
Returns:
ReportGenerationService 實例
"""
global _instance
if _instance is None:
_instance = ReportGenerationService()
return _instance