"""
K3s Monitor Service - Phase 21.2 periodic reporting
===================================================

Collects K3s cluster status and pushes a report to Telegram.

Data sources:
- Prometheus (kube-state-metrics :30888)
- kubectl (fallback; NOTE(review): no kubectl call exists in this module)

Follows the leWOOOgo building-block conventions:
- The service layer encapsulates all logic
- TelegramGateway is obtained through the DI accessor

@author Claude Code (Chief Architect)
@version 1.0.0
@date 2026-03-31 (Taipei time)
@see ADR-041-periodic-reporting-architecture.md
"""

import asyncio
import math
from datetime import datetime
from typing import Any, Protocol, runtime_checkable
from zoneinfo import ZoneInfo

import httpx
import structlog

from src.core.config import settings
from src.services.telegram_gateway import K3sStatusMessage, get_telegram_gateway

logger = structlog.get_logger(__name__)

# Taipei time zone
TZ_TAIPEI = ZoneInfo("Asia/Taipei")

# Prometheus endpoint.
#
# 2026-05-05 Codex: do not pin this report job to a K3s worker NodePort.
# Production already injects PROMETHEUS_URL from ConfigMap, currently the
# Docker Prometheus on 110. This keeps reboot recovery independent of 121.
PROMETHEUS_URL = settings.PROMETHEUS_URL.rstrip("/")

# kube-state-metrics queries.
#
# kube_pod_status_phase exposes one 0/1 series per pod *per phase*, so the
# per-phase totals must be summed; count() would return the number of pods
# regardless of which phase they are actually in.
PROM_QUERIES = {
    "node_total": 'count(kube_node_info)',
    "node_ready": 'sum(kube_node_status_condition{condition="Ready",status="true"})',
    "pod_total": 'count(kube_pod_info{namespace="awoooi-prod"})',
    "pod_running": 'sum(kube_pod_status_phase{namespace="awoooi-prod",phase="Running"})',
    "pod_pending": 'sum(kube_pod_status_phase{namespace="awoooi-prod",phase="Pending"})',
    "pod_failed": 'sum(kube_pod_status_phase{namespace="awoooi-prod",phase="Failed"})',
    "pod_restart_48h": 'sum(increase(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[48h]))',
}

# Names of the HPAs shown in the report, in (api, web, worker) order.
_HPA_NAMES = ("awoooi-api", "awoooi-web", "awoooi-worker")


# =============================================================================
# Protocol (Interface)
# =============================================================================


@runtime_checkable
class IK3sMonitorService(Protocol):
    """
    K3s monitor service interface.

    Phase 21.2: Protocol definition used for dependency injection.
    """

    async def collect_cluster_status(self) -> K3sStatusMessage:
        """Collect the cluster status."""
        ...

    async def send_daily_report(self) -> bool:
        """Send the daily report."""
        ...


# =============================================================================
# Implementation
# =============================================================================


class K3sMonitorService:
    """
    K3s monitor service implementation.

    Collects K3s cluster status from Prometheus and pushes a Telegram report.
    """

    def __init__(self) -> None:
        # Created lazily by _ensure_client() and released by close().
        self._http_client: httpx.AsyncClient | None = None

    async def _ensure_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, creating it on first use."""
        if self._http_client is None:
            self._http_client = httpx.AsyncClient(timeout=10.0)
        return self._http_client

    async def _query_prometheus(self, query: str) -> float | None:
        """
        Run an instant query against Prometheus.

        Args:
            query: PromQL expression.

        Returns:
            The value of the first sample in the result, ``0.0`` when the
            query succeeded but produced no sample (or an empty value), or
            ``None`` when the query failed or produced a non-finite value.
            Never raises: every failure is logged and mapped to ``None``.
        """
        try:
            client = await self._ensure_client()
            response = await client.get(
                f"{PROMETHEUS_URL}/api/v1/query",
                params={"query": query},
            )
            response.raise_for_status()
            data: dict[str, Any] = response.json()

            status = data.get("status")
            if status != "success":
                # A non-success payload is a failed query, not a zero reading.
                logger.warning(
                    "prometheus_query_failed",
                    query=query,
                    error=f"unexpected status: {status}",
                )
                return None

            result = data.get("data", {}).get("result", [])
            if not result:
                return 0.0

            raw = result[0].get("value", [None, None])[1]
            value = float(raw) if raw else 0.0
            if not math.isfinite(value):
                # NaN/Inf cannot be converted with int() by the callers.
                logger.warning(
                    "prometheus_query_failed",
                    query=query,
                    error=f"non-finite value: {raw}",
                )
                return None
            return value
        except Exception as e:
            # Deliberate best-effort: the report is still sent with fallbacks.
            logger.warning("prometheus_query_failed", query=query, error=str(e))
            return None

    async def _get_hpa_status(self, name: str) -> str:
        """
        Return the replica status of an HPA as ``"current/max"``.

        Args:
            name: HorizontalPodAutoscaler name in the ``awoooi-prod`` namespace.

        Returns:
            ``"current/max"``, or ``"N/A"`` when either query failed.
        """
        query_current = f'kube_horizontalpodautoscaler_status_current_replicas{{horizontalpodautoscaler="{name}",namespace="awoooi-prod"}}'
        query_max = f'kube_horizontalpodautoscaler_spec_max_replicas{{horizontalpodautoscaler="{name}",namespace="awoooi-prod"}}'

        # The two lookups are independent, so issue them concurrently.
        current, max_replicas = await asyncio.gather(
            self._query_prometheus(query_current),
            self._query_prometheus(query_max),
        )

        if current is None or max_replicas is None:
            return "N/A"
        return f"{int(current)}/{int(max_replicas)}"

    async def _get_backup_status(self) -> tuple[str, str]:
        """
        Return the backup status as ``(etcd, velero)`` display strings.

        NOTE(review): this is a simplified placeholder. Both values are
        derived from the current Taipei time, not from any backup system;
        a real implementation should query the Velero API or inspect the
        actual backup timestamps.
        """
        now = datetime.now(TZ_TAIPEI)

        # Assumes a backup every 6h; shows the current hour as the last etcd
        # backup and 02:00 of the current day as the last Velero backup.
        etcd_last = now.strftime("%m-%d %H:00")
        velero_last = now.strftime("%m-%d 02:00")

        return etcd_last, velero_last

    async def _get_alert_count_48h(self) -> int:
        """
        Return the alert count shown in the report.

        NOTE(review): despite the name, the query counts alerts that are
        firing *right now* in ``awoooi-prod``; it does not look back 48h.

        Returns:
            The number of firing alerts, or ``0`` when the query failed.
        """
        query = 'count(ALERTS{alertstate="firing",namespace="awoooi-prod"}) or vector(0)'
        result = await self._query_prometheus(query)
        return int(result) if result is not None else 0

    async def collect_cluster_status(self) -> K3sStatusMessage:
        """
        Collect the cluster status.

        Gathers node, pod, HPA, backup and alert figures (Prometheus-backed
        where available) and assembles them into a ``K3sStatusMessage``.
        Failed queries fall back to defaults instead of raising.
        """
        now = datetime.now(TZ_TAIPEI)
        report_date = now.strftime("%Y-%m-%d %H:%M")

        # Query every metric concurrently; _query_prometheus never raises,
        # so gather() cannot be aborted by a single failing query.
        values = await asyncio.gather(
            *(self._query_prometheus(query) for query in PROM_QUERIES.values())
        )
        results: dict[str, float | None] = dict(zip(PROM_QUERIES, values))

        # HPA status
        hpa_api, hpa_web, hpa_worker = await asyncio.gather(
            *(self._get_hpa_status(name) for name in _HPA_NAMES)
        )

        # Backup status
        etcd_backup, velero_backup = await self._get_backup_status()

        # Alert count
        alert_count = await self._get_alert_count_48h()

        # Assemble the message.
        # NOTE(review): node counts fall back to 2 when the metric is missing
        # or zero, so a Prometheus outage is reported as "2/2 nodes ready".
        message = K3sStatusMessage(
            report_date=report_date,
            node_total=int(results.get("node_total") or 2),
            node_ready=int(results.get("node_ready") or 2),
            pod_total=int(results.get("pod_total") or 0),
            pod_running=int(results.get("pod_running") or 0),
            pod_pending=int(results.get("pod_pending") or 0),
            pod_failed=int(results.get("pod_failed") or 0),
            hpa_api_replicas=hpa_api,
            hpa_web_replicas=hpa_web,
            hpa_worker_replicas=hpa_worker,
            etcd_backup_last=etcd_backup,
            velero_backup_last=velero_backup,
            alert_count_48h=alert_count,
            pod_restart_48h=int(results.get("pod_restart_48h") or 0),
            k3s_version=getattr(settings, "K3S_VERSION", "v1.34.5+k3s1"),
        )

        logger.info(
            "k3s_status_collected",
            nodes=f"{message.node_ready}/{message.node_total}",
            pods_running=message.pod_running,
            alert_count=message.alert_count_48h,
        )

        return message

    async def send_daily_report(self) -> bool:
        """
        Send the daily report.

        Collects the cluster status and pushes it to Telegram.

        Returns:
            ``True`` when the gateway reports the message as sent, ``False``
            when sending failed or any exception occurred (it is logged,
            not propagated).
        """
        try:
            # Collect status
            status = await self.collect_cluster_status()

            # Obtain the Telegram gateway, initializing it on first use
            gateway = get_telegram_gateway()
            if not gateway._initialized:
                await gateway.initialize()

            # Send the message
            formatted = status.format()
            result = await gateway.send_text(formatted)

            if not result:
                logger.error("k3s_daily_report_failed", date=status.report_date)
                return False

            logger.info("k3s_daily_report_sent", date=status.report_date)
            return True

        except Exception as e:
            logger.error("k3s_daily_report_error", error=str(e))
            return False

    async def close(self) -> None:
        """Release resources: close the HTTP client if one was created."""
        if self._http_client:
            await self._http_client.aclose()
            self._http_client = None


# =============================================================================
# Dependency Injection
# =============================================================================

_k3s_monitor_service: K3sMonitorService | None = None


def get_k3s_monitor_service() -> K3sMonitorService:
    """Return the K3sMonitorService instance (module-level singleton)."""
    global _k3s_monitor_service
    if _k3s_monitor_service is None:
        _k3s_monitor_service = K3sMonitorService()
    return _k3s_monitor_service


# =============================================================================
# CLI Entry Point (for CronJob)
# =============================================================================


async def main() -> int:
    """
    CLI entry point.

    Invoked by the K8s CronJob:
        python -m src.services.k3s_monitor_service

    Returns:
        Process exit code: ``0`` when the report was sent, ``1`` otherwise.
    """
    service = get_k3s_monitor_service()
    try:
        success = await service.send_daily_report()
        return 0 if success else 1
    finally:
        # Always release the HTTP client, even when sending failed.
        await service.close()


if __name__ == "__main__":
    import sys

    exit_code = asyncio.run(main())
    sys.exit(exit_code)