- 刪除舊版 clawbot.py (已有新版 openclaw.py) - 更新 models/ai.py 類型定義 (ClawBotAnalysisRequest/Response) - 更新 api/v1/ai.py import 與註解 - 更新 Discord username - 更新所有註解與文檔 依據: feedback_openclaw_naming.md (統帥 2026-03-20 正式命名決議) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
502 lines
16 KiB
Python
502 lines
16 KiB
Python
"""
|
||
Four Host Aggregator Service
|
||
============================
|
||
真實 Host Probing - 使用 asyncio TCP/HTTP 探測
|
||
|
||
Hosts:
|
||
- 192.168.0.110: DevOps 金庫 (Harbor, GH Runner)
|
||
- 192.168.0.112: Kali Security (Scanner API)
|
||
- 192.168.0.120: K3s Master (awoooi-prod namespace)
|
||
- 192.168.0.188: AI+Web 中心 (Nginx, PostgreSQL, Redis, Ollama, OpenClaw, SigNoz)
|
||
|
||
Features:
|
||
- asyncio.gather for parallel fetching
|
||
- Real TCP port probing with asyncio.open_connection
|
||
- HTTP health check for services with endpoints
|
||
- Graceful degradation on partial failures
|
||
- Service status comes from real probes only; CPU/memory metrics are simulated placeholders until node_exporter is integrated
|
||
"""
|
||
|
||
import asyncio
|
||
import ssl
|
||
from dataclasses import dataclass, field
|
||
from datetime import UTC, datetime
|
||
from enum import Enum
|
||
from typing import Literal
|
||
|
||
import httpx
|
||
|
||
from src.core.config import settings
|
||
from src.core.logging import get_logger
|
||
|
||
logger = get_logger("awoooi.aggregator")
|
||
|
||
|
||
# =============================================================================
|
||
# Data Models
|
||
# =============================================================================
|
||
|
||
class HostRole(str, Enum):
    """Functional role of a monitored host.

    Mixing in ``str`` makes every member compare equal to its plain string
    value.
    """

    DEVOPS = "devops"
    SECURITY = "security"
    K3S = "k3s"
    AI_WEB = "ai_web"
|
||
|
||
|
||
@dataclass
|
||
class ServiceStatus:
|
||
"""Individual service status"""
|
||
name: str
|
||
status: Literal["up", "down", "degraded"]
|
||
port: int | None = None
|
||
latency_ms: float | None = None
|
||
error: str | None = None
|
||
|
||
|
||
@dataclass
|
||
class BaselineData:
|
||
"""
|
||
Dynamic Baseline 數據
|
||
|
||
基準線計算邏輯:
|
||
- baseline_value: 過去時間窗口的移動平均值
|
||
- std_deviation: 標準差
|
||
- sigma_deviation: 當前值偏離基準線的 Sigma 數
|
||
|
||
目前使用靜態基準線(預留 Prometheus/SigNoz 接口)
|
||
"""
|
||
baseline_value: float
|
||
std_deviation: float
|
||
sigma_deviation: float | None = None
|
||
window_hours: int = 24 # 時間窗口(小時)
|
||
|
||
|
||
@dataclass
class HostMetrics:
    """Resource metrics of a host (intended to come from a node_exporter agent)."""

    # Every field is optional; None means the value is not available.
    cpu_percent: float | None = None
    memory_percent: float | None = None
    disk_percent: float | None = None
    load_avg_1m: float | None = None
    uptime_hours: float | None = None

    # Dynamic-baseline extension: deviation info for CPU and memory.
    cpu_baseline: BaselineData | None = None
    memory_baseline: BaselineData | None = None
|
||
|
||
|
||
@dataclass
|
||
class HostStatus:
|
||
"""Complete host status"""
|
||
ip: str
|
||
name: str
|
||
role: HostRole
|
||
status: Literal["healthy", "degraded", "unhealthy", "unreachable"]
|
||
services: list[ServiceStatus]
|
||
metrics: HostMetrics | None = None
|
||
last_check: datetime = field(default_factory=lambda: datetime.now(UTC))
|
||
error: str | None = None
|
||
|
||
|
||
@dataclass
class AggregatedStatus:
    """Combined status snapshot covering all hosts."""

    timestamp: datetime
    environment: str
    # Always False for real mode.
    mock_mode: bool
    overall_status: Literal["healthy", "degraded", "unhealthy"]
    hosts: list[HostStatus]
    alerts_count: int = 0
    pending_approvals: int = 0
|
||
|
||
|
||
# =============================================================================
|
||
# Dynamic Baseline Engine
|
||
# =============================================================================
|
||
|
||
# Static baseline table (placeholder until historical values can be queried
# from Prometheus/SigNoz).
# Shape: {host_ip: {metric: (baseline_value, std_deviation)}}
_STATIC_BASELINES: dict[str, dict[str, tuple[float, float]]] = {
    # DevOps vault
    "192.168.0.110": dict(cpu=(35.0, 8.0), memory=(55.0, 10.0)),
    # Kali Security
    "192.168.0.112": dict(cpu=(25.0, 5.0), memory=(40.0, 8.0)),
    # K3s Master
    "192.168.0.120": dict(cpu=(45.0, 12.0), memory=(60.0, 15.0)),
    # AI+Web hub
    "192.168.0.188": dict(cpu=(50.0, 10.0), memory=(65.0, 12.0)),
}
|
||
|
||
|
||
def calculate_baseline(
    current_value: float | None,
    host_ip: str,
    metric_type: str,
) -> BaselineData | None:
    """
    Compute how far a metric reading is from its baseline.

    Args:
        current_value: Current metric reading; ``None`` short-circuits to ``None``.
        host_ip: IP of the host, used to look up its static baseline.
        metric_type: ``'cpu'`` or ``'memory'``.

    Returns:
        BaselineData holding the baseline, its standard deviation and the
        sigma deviation (rounded to 2 decimals), or ``None`` when there is no
        current value.
    """
    if current_value is None:
        return None

    # Static lookup for now (to be replaced by a Prometheus query later).
    per_host = _STATIC_BASELINES.get(host_ip)
    if per_host is None:
        # Unknown host: fall back to generic defaults.
        per_host = {"cpu": (40.0, 10.0), "memory": (50.0, 10.0)}
    mean, spread = per_host.get(metric_type, (40.0, 10.0))

    # Sigma deviation; a non-positive spread would make the ratio meaningless.
    deviation = (current_value - mean) / spread if spread > 0 else 0.0

    return BaselineData(
        baseline_value=mean,
        std_deviation=spread,
        sigma_deviation=round(deviation, 2),
        window_hours=24,
    )
|
||
|
||
|
||
def _format_metric_with_baseline(
    label: str,
    value: float | None,
    baseline: BaselineData | None,
) -> str | None:
    """Render one metric plus its baseline as text, or None when data is missing."""
    if value is None or not baseline:
        return None

    sigma = baseline.sigma_deviation
    if sigma is None:
        # sigma_deviation is optional on BaselineData; comparing None with 0
        # would raise TypeError, so report it as not available instead.
        sigma_text = "N/A"
    else:
        if sigma == 0:
            # round() can yield -0.0, which would otherwise render as "+-0.0".
            sigma = abs(sigma)
        sign = "+" if sigma >= 0 else ""
        sigma_text = f"{sign}{sigma}σ"

    return (
        f"{label} {value:.0f}% "
        f"(基準線 {baseline.baseline_value:.0f}%, "
        f"標準差 {baseline.std_deviation:.0f}%, "
        f"偏差 {sigma_text})"
    )


def get_baseline_context_for_llm(metrics: HostMetrics, host_name: str) -> str:
    """
    Build the baseline context sentence that is handed to the LLM.

    Args:
        metrics: Host metrics; only the CPU and memory values and their
            baselines are read.
        host_name: Human-readable host name used as the sentence prefix.

    Returns:
        A single line such as
        "主機 AI+Web 中心: CPU 85% (基準線 50%, 標準差 10%, 偏差 +3.5σ)",
        or an empty string when neither CPU nor memory has both a value and a
        baseline. A baseline without a sigma deviation is rendered as
        "偏差 N/A".
    """
    candidates = (
        _format_metric_with_baseline("CPU", metrics.cpu_percent, metrics.cpu_baseline),
        _format_metric_with_baseline("記憶體", metrics.memory_percent, metrics.memory_baseline),
    )
    parts = [text for text in candidates if text is not None]

    if not parts:
        return ""
    return f"主機 {host_name}: " + ", ".join(parts)
|
||
|
||
|
||
# =============================================================================
|
||
# Real Host Probing
|
||
# =============================================================================
|
||
|
||
async def _tcp_probe(ip: str, port: int, timeout: float = 3.0) -> tuple[bool, float | None, str | None]:
    """
    Probe a TCP port by opening a connection with asyncio.open_connection.

    Ports 443 and 6443 are connected with TLS (certificate and hostname
    verification disabled), so for those ports the handshake is part of the
    probe and of the measured latency.

    Args:
        ip: Host address to connect to.
        port: TCP port to connect to.
        timeout: Seconds allowed for establishing the connection; the same
            bound is applied separately to closing it.

    Returns:
        (is_up, latency_ms, error_message). On success ``error_message`` is
        None; on failure ``latency_ms`` is None. Failures are reported in the
        return value rather than raised.
    """
    # get_running_loop() is the recommended accessor inside a coroutine.
    loop = asyncio.get_running_loop()
    start = loop.time()
    try:
        # For HTTPS ports, create an SSL context (no verification).
        ssl_context = None
        if port in (443, 6443):
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE

        _reader, writer = await asyncio.wait_for(
            asyncio.open_connection(ip, port, ssl=ssl_context),
            timeout=timeout,
        )
    except TimeoutError:
        return False, None, "timeout"
    except ConnectionRefusedError:
        return False, None, "connection refused"
    except Exception as exc:
        # Fall back to the exception class name when it carries no message.
        return False, None, (str(exc) or type(exc).__name__)[:50]

    latency = (loop.time() - start) * 1000

    # The connection succeeded, so the service is up. Closing is best-effort:
    # a slow or failing shutdown (e.g. TLS close) must neither hang the probe
    # nor turn a successful connect into a "down" result.
    writer.close()
    try:
        await asyncio.wait_for(writer.wait_closed(), timeout=timeout)
    except Exception:
        pass

    return True, round(latency, 2), None
|
||
|
||
|
||
async def _http_probe(
    ip: str,
    port: int,
    path: str,
    timeout: float = 5.0,
    https: bool = False
) -> tuple[bool, float | None, str | None]:
    """
    HTTP health-check probe: issue a GET and judge by the status code.

    Args:
        ip: Host address.
        port: TCP port of the HTTP(S) endpoint.
        path: Request path appended verbatim after ``ip:port``.
        timeout: httpx client timeout in seconds.
        https: Use the ``https`` scheme instead of ``http``.

    Returns:
        (is_up, latency_ms, error_message). A status code below 400 counts as
        up. For status codes >= 400 the latency is still reported together
        with an "HTTP <code>" error. Timeouts, connect errors and any other
        exception yield ``(False, None, <reason>)`` instead of raising.
    """
    scheme = "https" if https else "http"
    url = f"{scheme}://{ip}:{port}{path}"

    # get_running_loop() is the recommended accessor inside a coroutine.
    loop = asyncio.get_running_loop()
    start = loop.time()
    try:
        async with httpx.AsyncClient(
            timeout=timeout,
            verify=False  # Skip SSL verification for internal hosts
        ) as client:
            response = await client.get(url)
            # Measured before the client is torn down, so close time is excluded.
            latency = round((loop.time() - start) * 1000, 2)
    except httpx.TimeoutException:
        return False, None, "timeout"
    except httpx.ConnectError:
        return False, None, "connection refused"
    except Exception as exc:
        # Fall back to the exception class name when it carries no message.
        return False, None, (str(exc) or type(exc).__name__)[:50]

    if response.status_code < 400:
        return True, latency, None
    return False, latency, f"HTTP {response.status_code}"
|
||
|
||
|
||
# =============================================================================
|
||
# Host Configuration
|
||
# =============================================================================
|
||
|
||
# Probe plan per host, keyed by IP address.
# Each service entry is a tuple: (name, port, probe_type, path_or_none)
# probe_type: "tcp" | "http" | "https"
HOST_CONFIGS = {
    "192.168.0.110": dict(
        name="DevOps 金庫",
        role=HostRole.DEVOPS,
        services=[
            ("Harbor", 5000, "http", "/api/v2/"),
            ("GH Runner", 3000, "tcp", None),
            ("Docker", 2375, "tcp", None),
        ],
    ),
    "192.168.0.112": dict(
        name="Kali Security",
        role=HostRole.SECURITY,
        services=[
            ("Scanner API", 8080, "http", "/health"),
            # The SSH port stands in as a proxy for this entry.
            ("Nmap", 22, "tcp", None),
        ],
    ),
    "192.168.0.120": dict(
        name="K3s Master",
        role=HostRole.K3S,
        services=[
            ("K3s API", 6443, "https", "/healthz"),
            ("Traefik", 80, "http", "/"),
            ("awoooi-prod", 32335, "tcp", None),
        ],
    ),
    "192.168.0.188": dict(
        name="AI+Web 中心",
        role=HostRole.AI_WEB,
        services=[
            ("Nginx", 443, "https", "/"),
            ("PostgreSQL", 5432, "tcp", None),
            ("Redis", 6380, "tcp", None),
            ("Ollama", 11434, "http", "/api/tags"),
            ("OpenClaw", 8089, "http", "/health"),
            ("SigNoz", 3301, "http", "/api/v1/health"),
        ],
    ),
}
|
||
|
||
|
||
# =============================================================================
|
||
# Main Aggregator
|
||
# =============================================================================
|
||
|
||
class HostAggregator:
|
||
"""
|
||
Four-host status aggregator with real probing
|
||
|
||
Uses asyncio.gather for parallel fetching of all host statuses.
|
||
Performs real TCP/HTTP probes to determine service availability.
|
||
"""
|
||
|
||
@classmethod
|
||
async def _probe_service(
|
||
cls,
|
||
ip: str,
|
||
service_name: str,
|
||
port: int,
|
||
probe_type: str,
|
||
path: str | None
|
||
) -> ServiceStatus:
|
||
"""Probe a single service"""
|
||
if probe_type == "tcp":
|
||
is_up, latency, error = await _tcp_probe(ip, port)
|
||
elif probe_type == "https":
|
||
is_up, latency, error = await _http_probe(ip, port, path or "/", https=True)
|
||
else: # http
|
||
is_up, latency, error = await _http_probe(ip, port, path or "/")
|
||
|
||
if is_up:
|
||
status: Literal["up", "down", "degraded"] = "up"
|
||
# High latency = degraded
|
||
if latency and latency > 1000:
|
||
status = "degraded"
|
||
error = "high latency"
|
||
else:
|
||
status = "down"
|
||
|
||
return ServiceStatus(
|
||
name=service_name,
|
||
status=status,
|
||
port=port,
|
||
latency_ms=latency,
|
||
error=error,
|
||
)
|
||
|
||
@classmethod
|
||
async def _fetch_host(cls, ip: str, config: dict) -> HostStatus:
|
||
"""Fetch status from a single host"""
|
||
services: list[ServiceStatus] = []
|
||
|
||
# Probe all services in parallel
|
||
tasks = [
|
||
cls._probe_service(ip, name, port, probe_type, path)
|
||
for name, port, probe_type, path in config["services"]
|
||
]
|
||
services = await asyncio.gather(*tasks)
|
||
|
||
# Determine overall host status
|
||
down_count = sum(1 for s in services if s.status == "down")
|
||
degraded_count = sum(1 for s in services if s.status == "degraded")
|
||
total = len(services)
|
||
|
||
if down_count == total:
|
||
host_status: Literal["healthy", "degraded", "unhealthy", "unreachable"] = "unreachable"
|
||
elif down_count >= total // 2:
|
||
host_status = "unhealthy"
|
||
elif down_count > 0 or degraded_count > 0:
|
||
host_status = "degraded"
|
||
else:
|
||
host_status = "healthy"
|
||
|
||
# 模擬 Metrics (預留 node_exporter 接口)
|
||
# 根據服務健康狀態模擬 CPU/Memory
|
||
import random
|
||
|
||
# 異常狀態時模擬高負載
|
||
if host_status in ("unhealthy", "unreachable"):
|
||
cpu_pct = random.uniform(75, 95)
|
||
mem_pct = random.uniform(70, 90)
|
||
elif host_status == "degraded":
|
||
cpu_pct = random.uniform(50, 75)
|
||
mem_pct = random.uniform(55, 75)
|
||
else:
|
||
cpu_pct = random.uniform(25, 50)
|
||
mem_pct = random.uniform(40, 60)
|
||
|
||
# 計算基準線偏差
|
||
cpu_baseline = calculate_baseline(cpu_pct, ip, "cpu")
|
||
mem_baseline = calculate_baseline(mem_pct, ip, "memory")
|
||
|
||
metrics = HostMetrics(
|
||
cpu_percent=round(cpu_pct, 1),
|
||
memory_percent=round(mem_pct, 1),
|
||
cpu_baseline=cpu_baseline,
|
||
memory_baseline=mem_baseline,
|
||
)
|
||
|
||
return HostStatus(
|
||
ip=ip,
|
||
name=config["name"],
|
||
role=config["role"],
|
||
status=host_status,
|
||
services=services,
|
||
metrics=metrics,
|
||
)
|
||
|
||
@classmethod
|
||
async def fetch_all(cls) -> AggregatedStatus:
|
||
"""
|
||
Fetch status from all four hosts in parallel
|
||
|
||
Uses asyncio.gather for maximum concurrency.
|
||
Always performs real probing - no mock data.
|
||
"""
|
||
logger.info("aggregator_fetch_start", mode="real_probing")
|
||
|
||
# Fetch all hosts in parallel
|
||
tasks = [
|
||
cls._fetch_host(ip, config)
|
||
for ip, config in HOST_CONFIGS.items()
|
||
]
|
||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||
|
||
# Process results
|
||
hosts: list[HostStatus] = []
|
||
for i, (ip, config) in enumerate(HOST_CONFIGS.items()):
|
||
if isinstance(results[i], Exception):
|
||
logger.error(
|
||
"aggregator_host_error",
|
||
ip=ip,
|
||
error=str(results[i]),
|
||
)
|
||
hosts.append(HostStatus(
|
||
ip=ip,
|
||
name=config["name"],
|
||
role=config["role"],
|
||
status="unreachable",
|
||
services=[],
|
||
error=str(results[i]),
|
||
))
|
||
else:
|
||
hosts.append(results[i])
|
||
|
||
# Determine overall status
|
||
statuses = [h.status for h in hosts]
|
||
unhealthy_count = statuses.count("unhealthy") + statuses.count("unreachable")
|
||
degraded_count = statuses.count("degraded")
|
||
|
||
if unhealthy_count >= 2:
|
||
overall: Literal["healthy", "degraded", "unhealthy"] = "unhealthy"
|
||
elif unhealthy_count >= 1 or degraded_count >= 2:
|
||
overall = "degraded"
|
||
else:
|
||
overall = "healthy"
|
||
|
||
logger.info(
|
||
"aggregator_fetch_complete",
|
||
overall_status=overall,
|
||
host_statuses={h.ip: h.status for h in hosts},
|
||
)
|
||
|
||
return AggregatedStatus(
|
||
timestamp=datetime.now(UTC),
|
||
environment=settings.ENVIRONMENT,
|
||
mock_mode=False, # Always real mode
|
||
overall_status=overall,
|
||
hosts=hosts,
|
||
)
|
||
|
||
@classmethod
|
||
async def fetch_single(cls, ip: str) -> HostStatus | None:
|
||
"""Fetch status from a single host"""
|
||
if ip not in HOST_CONFIGS:
|
||
return None
|
||
|
||
return await cls._fetch_host(ip, HOST_CONFIGS[ip])
|
||
|
||
|
||
# Shared module-level instance for importers.
aggregator = HostAggregator()
|