Files
awoooi/apps/api/src/services/k3s_monitor_service.py
Your Name a4e9a04982
Some checks failed
Code Review / ai-code-review (push) Successful in 10s
run-migration / migrate (push) Successful in 7s
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
fix(ops): harden cold-start schedule recovery
2026-05-05 22:17:10 +08:00

281 lines
9.1 KiB
Python

"""
K3s Monitor Service - Phase 21.2 定期報告
==========================================
收集 K3s 叢集狀態並推送 Telegram 報告。
數據來源:
- Prometheus (kube-state-metrics :30888)
- kubectl (fallback)
符合 leWOOOgo 積木化規範:
- Service 層封裝所有邏輯
- 透過 DI 注入 TelegramGateway
@author Claude Code (首席架構師)
@version 1.0.0
@date 2026-03-31 (台北時間)
@see ADR-041-periodic-reporting-architecture.md
"""
import asyncio
from datetime import datetime
from typing import Any, Protocol, runtime_checkable
import httpx
import structlog
from zoneinfo import ZoneInfo
from src.core.config import settings
from src.services.telegram_gateway import K3sStatusMessage, get_telegram_gateway
# Module-level structured logger shared by everything in this file.
logger = structlog.get_logger(__name__)
# Taipei time zone; used for the timestamps that appear in the report.
TZ_TAIPEI = ZoneInfo("Asia/Taipei")
# Prometheus endpoint.
#
# 2026-05-05 Codex: do not pin this report job to a K3s worker NodePort.
# Production already injects PROMETHEUS_URL from ConfigMap, currently the
# Docker Prometheus on 110. This keeps reboot recovery independent of 121.
#
# Trailing slashes are stripped because the query path ("/api/v1/query") is
# appended with its own leading slash.
PROMETHEUS_URL = settings.PROMETHEUS_URL.rstrip("/")
# kube-state-metrics queries, keyed by the report field each one feeds.
PROM_QUERIES: dict[str, str] = {
    # Number of nodes known to the cluster.
    "node_total": "count(kube_node_info)",
    # Nodes whose "Ready" condition is currently true.
    "node_ready": (
        "sum(kube_node_status_condition"
        '{condition="Ready",status="true"})'
    ),
    # Pods in the awoooi-prod namespace.
    "pod_total": 'count(kube_pod_info{namespace="awoooi-prod"})',
    # Pod phase series in awoooi-prod, one entry per phase of interest.
    "pod_running": (
        "count(kube_pod_status_phase"
        '{namespace="awoooi-prod",phase="Running"})'
    ),
    "pod_pending": (
        "count(kube_pod_status_phase"
        '{namespace="awoooi-prod",phase="Pending"})'
    ),
    "pod_failed": (
        "count(kube_pod_status_phase"
        '{namespace="awoooi-prod",phase="Failed"})'
    ),
    # Container restarts in awoooi-prod over the last 48 hours.
    "pod_restart_48h": (
        "sum(increase(kube_pod_container_status_restarts_total"
        '{namespace="awoooi-prod"}[48h]))'
    ),
}
# =============================================================================
# Protocol (Interface)
# =============================================================================
@runtime_checkable
class IK3sMonitorService(Protocol):
    """
    Interface of the K3s monitoring service.

    Phase 21.2: structural Protocol declared so implementations can be
    supplied through dependency injection.
    """

    async def collect_cluster_status(self) -> K3sStatusMessage:
        """Collect the current cluster status as a ``K3sStatusMessage``."""
        ...

    async def send_daily_report(self) -> bool:
        """Send the daily report and return a success flag."""
        ...
# =============================================================================
# Implementation
# =============================================================================
class K3sMonitorService:
"""
K3s 監控服務實作
收集 K3s 叢集狀態並推送 Telegram 報告
"""
def __init__(self):
self._http_client: httpx.AsyncClient | None = None
async def _ensure_client(self) -> httpx.AsyncClient:
"""確保 HTTP Client 存在"""
if self._http_client is None:
self._http_client = httpx.AsyncClient(timeout=10.0)
return self._http_client
async def _query_prometheus(self, query: str) -> float | None:
"""查詢 Prometheus"""
try:
client = await self._ensure_client()
response = await client.get(
f"{PROMETHEUS_URL}/api/v1/query",
params={"query": query},
)
response.raise_for_status()
data = response.json()
if data.get("status") == "success":
result = data.get("data", {}).get("result", [])
if result:
value = result[0].get("value", [None, None])[1]
return float(value) if value else 0.0
return 0.0
except Exception as e:
logger.warning("prometheus_query_failed", query=query, error=str(e))
return None
async def _get_hpa_status(self, name: str) -> str:
"""取得 HPA 副本狀態 (current/max)"""
query = f'kube_horizontalpodautoscaler_status_current_replicas{{horizontalpodautoscaler="{name}",namespace="awoooi-prod"}}'
current = await self._query_prometheus(query)
query_max = f'kube_horizontalpodautoscaler_spec_max_replicas{{horizontalpodautoscaler="{name}",namespace="awoooi-prod"}}'
max_replicas = await self._query_prometheus(query_max)
if current is not None and max_replicas is not None:
return f"{int(current)}/{int(max_replicas)}"
return "N/A"
async def _get_backup_status(self) -> tuple[str, str]:
"""取得備份狀態 (etcd, velero)"""
# 這裡簡化處理,實際應查詢 Velero API 或檢查備份時間
now = datetime.now(TZ_TAIPEI)
# 假設每 6h 備份,顯示最後備份時間
etcd_last = now.strftime("%m-%d %H:00")
velero_last = now.strftime("%m-%d 02:00")
return etcd_last, velero_last
async def _get_alert_count_48h(self) -> int:
"""取得 48h 內告警數量"""
query = 'count(ALERTS{alertstate="firing",namespace="awoooi-prod"}) or vector(0)'
result = await self._query_prometheus(query)
return int(result) if result is not None else 0
async def collect_cluster_status(self) -> K3sStatusMessage:
"""
收集叢集狀態
從 Prometheus 收集 K3s 叢集各項指標
"""
now = datetime.now(TZ_TAIPEI)
report_date = now.strftime("%Y-%m-%d %H:%M")
# 並行查詢所有指標
tasks = {
key: self._query_prometheus(query)
for key, query in PROM_QUERIES.items()
}
results: dict[str, Any] = {}
for key, task in tasks.items():
results[key] = await task
# HPA 狀態
hpa_api = await self._get_hpa_status("awoooi-api")
hpa_web = await self._get_hpa_status("awoooi-web")
hpa_worker = await self._get_hpa_status("awoooi-worker")
# 備份狀態
etcd_backup, velero_backup = await self._get_backup_status()
# 告警數量
alert_count = await self._get_alert_count_48h()
# 組裝訊息
message = K3sStatusMessage(
report_date=report_date,
node_total=int(results.get("node_total") or 2),
node_ready=int(results.get("node_ready") or 2),
pod_total=int(results.get("pod_total") or 0),
pod_running=int(results.get("pod_running") or 0),
pod_pending=int(results.get("pod_pending") or 0),
pod_failed=int(results.get("pod_failed") or 0),
hpa_api_replicas=hpa_api,
hpa_web_replicas=hpa_web,
hpa_worker_replicas=hpa_worker,
etcd_backup_last=etcd_backup,
velero_backup_last=velero_backup,
alert_count_48h=alert_count,
pod_restart_48h=int(results.get("pod_restart_48h") or 0),
k3s_version=settings.K3S_VERSION if hasattr(settings, 'K3S_VERSION') else "v1.34.5+k3s1",
)
logger.info(
"k3s_status_collected",
nodes=f"{message.node_ready}/{message.node_total}",
pods_running=message.pod_running,
alert_count=message.alert_count_48h,
)
return message
async def send_daily_report(self) -> bool:
"""
發送每日報告
收集狀態並推送到 Telegram
"""
try:
# 收集狀態
status = await self.collect_cluster_status()
# 取得 Telegram Gateway
gateway = get_telegram_gateway()
if not gateway._initialized:
await gateway.initialize()
# 發送訊息
formatted = status.format()
result = await gateway.send_text(formatted)
if result:
logger.info("k3s_daily_report_sent", date=status.report_date)
return True
else:
logger.error("k3s_daily_report_failed", date=status.report_date)
return False
except Exception as e:
logger.error("k3s_daily_report_error", error=str(e))
return False
async def close(self):
"""關閉資源"""
if self._http_client:
await self._http_client.aclose()
self._http_client = None
# =============================================================================
# Dependency Injection
# =============================================================================
# Process-wide service instance, created on first request.
_k3s_monitor_service: K3sMonitorService | None = None


def get_k3s_monitor_service() -> K3sMonitorService:
    """Return the shared K3sMonitorService instance, creating it if needed."""
    global _k3s_monitor_service
    instance = _k3s_monitor_service
    if instance is not None:
        return instance
    instance = K3sMonitorService()
    _k3s_monitor_service = instance
    return instance
# =============================================================================
# CLI Entry Point (for CronJob)
# =============================================================================
async def main():
    """
    CLI entry point.

    Intended to be invoked by the K8s CronJob:
        python -m src.services.k3s_monitor_service

    Returns 0 when the daily report was sent and 1 otherwise. The service's
    resources are closed in either case.
    """
    monitor = get_k3s_monitor_service()
    try:
        sent = await monitor.send_daily_report()
    finally:
        await monitor.close()
    return 1 if not sent else 0
if __name__ == "__main__":
    import sys

    # Run the async entry point and use its result as the process exit code.
    sys.exit(asyncio.run(main()))