新增: 1. MONITORING_COMPLETE_STRATEGY.md - 完整監控策略 - 5 主機 × 60+ 服務監控矩陣 - P0/P1/P2 告警規則清單 - AI 自動修復閉環流程 - 安全護欄配置 2. MONITORING_INTEGRATION_ARCHITECTURE.md - 自動整合架構 - 服務註冊表 (Single Source of Truth) - CI/CD 自動驗證監控覆蓋率 - 新服務自動獲得監控 3. ops/monitoring/service-registry.yaml - 服務清單 - K8s 工作負載 (API/Web/Worker/ArgoCD) - Docker 容器 (Ollama/OpenClaw/Redis/Postgres) - 前端頁面 SLO - API 端點 SLO - 告警模板與自動修復動作 4. ops/monitoring/validate_coverage.py - 覆蓋率驗證 - CI 階段執行 - 檢測未監控服務 - 生成覆蓋率報告 設計原則: - 監控即代碼 (Monitoring as Code) - 新服務必須在 registry 註冊才能部署 - 自動發現機制防止遺漏 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
248 lines
8.3 KiB
Python
Executable File
248 lines
8.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
AWOOOI 監控覆蓋率驗證
|
|
====================
|
|
CI/CD 階段執行,確保所有服務都有對應的監控配置
|
|
|
|
用法:
|
|
python ops/monitoring/validate_coverage.py
|
|
|
|
退出碼:
|
|
0 - 所有服務都已註冊
|
|
1 - 發現未監控的服務
|
|
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
import yaml
|
|
from pathlib import Path
|
|
from typing import NamedTuple
|
|
|
|
|
|
class ValidationResult(NamedTuple):
    """Outcome of one monitoring-coverage validation run."""

    # Overall verdict of the run.
    passed: bool
    # Messages describing blocking problems.
    errors: list[str]
    # Messages describing non-blocking findings.
    warnings: list[str]
    # Metric name mapped to a coverage ratio.
    coverage: dict[str, float]
|
|
|
|
|
|
def load_registry() -> dict:
    """Load the service registry stored next to this script.

    Reads ``service-registry.yaml`` from the directory that contains this
    file and parses it with ``yaml.safe_load``.

    Returns:
        The parsed registry. A file that parses to nothing (empty or
        comment-only) yields an empty dict instead of ``None``.

    Raises:
        OSError: If the registry file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    registry_path = Path(__file__).parent / 'service-registry.yaml'
    # Decode explicitly as UTF-8 so parsing does not depend on the locale's
    # default encoding.
    with open(registry_path, encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
|
|
|
|
|
|
def get_k8s_deployments() -> list[dict]:
    """List every Deployment in the cluster via ``kubectl``.

    Runs ``kubectl get deployments -A`` with a jsonpath template that prints
    one ``namespace/name`` pair per line, then parses that output.

    Returns:
        One ``{'namespace': ..., 'name': ...}`` dict per parsed line. The
        list is empty when ``kubectl`` cannot be run, times out, or prints
        nothing usable; a warning is printed in the failure cases.
    """
    command = [
        'kubectl', 'get', 'deployments', '-A',
        '-o', 'jsonpath={range .items[*]}{.metadata.namespace}/{.metadata.name}{\"\\n\"}{end}',
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=30)
    except Exception as e:
        # Deliberate best effort: a missing binary or a timeout must not
        # abort the caller.
        print(f"Warning: Cannot get K8s deployments: {e}")
        return []

    if result.returncode != 0:
        # Surface the failure; otherwise a failed command is
        # indistinguishable from a cluster with no Deployments.
        detail = result.stderr.strip() or f"kubectl exited with status {result.returncode}"
        print(f"Warning: Cannot get K8s deployments: {detail}")

    deployments = []
    for raw_line in result.stdout.splitlines():
        # Split on the first '/' only; lines without one are ignored.
        ns, sep, name = raw_line.strip().partition('/')
        if sep:
            deployments.append({'namespace': ns, 'name': name})
    return deployments
|
|
|
|
|
|
def get_k8s_services() -> list[dict]:
    """List every Service in the cluster via ``kubectl``.

    Runs ``kubectl get services -A`` with a jsonpath template that prints
    one ``namespace/name`` pair per line, then parses that output.

    Returns:
        One ``{'namespace': ..., 'name': ...}`` dict per parsed line. The
        list is empty when ``kubectl`` cannot be run, times out, or prints
        nothing usable; a warning is printed in the failure cases.
    """
    command = [
        'kubectl', 'get', 'services', '-A',
        '-o', 'jsonpath={range .items[*]}{.metadata.namespace}/{.metadata.name}{\"\\n\"}{end}',
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=30)
    except Exception as e:
        # Deliberate best effort: a missing binary or a timeout must not
        # abort the caller.
        print(f"Warning: Cannot get K8s services: {e}")
        return []

    if result.returncode != 0:
        # Surface the failure; otherwise a failed command is
        # indistinguishable from a cluster with no Services.
        detail = result.stderr.strip() or f"kubectl exited with status {result.returncode}"
        print(f"Warning: Cannot get K8s services: {detail}")

    services = []
    for raw_line in result.stdout.splitlines():
        # Split on the first '/' only; lines without one are ignored.
        ns, sep, name = raw_line.strip().partition('/')
        if sep:
            services.append({'namespace': ns, 'name': name})
    return services
|
|
|
|
|
|
def check_docker_containers(host: str) -> list[str]:
    """List the names of running Docker containers on a remote host.

    Runs ``docker ps --format {{.Names}}`` on ``host`` over ``ssh`` with a
    5-second connect timeout and a 10-second overall timeout.

    Args:
        host: Destination handed to ``ssh`` unchanged.

    Returns:
        One container name per non-empty output line. The list is empty when
        ``ssh`` cannot be run, times out, or prints nothing; a warning is
        printed in the failure cases.
    """
    command = ['ssh', '-o', 'ConnectTimeout=5', host, 'docker', 'ps', '--format', '{{.Names}}']
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=10)
    except Exception as e:
        # Deliberate best effort: an unreachable host must not abort the caller.
        print(f"Warning: Cannot check Docker on {host}: {e}")
        return []

    if result.returncode != 0:
        # Surface the failure; otherwise a failed command is
        # indistinguishable from a host that runs no containers.
        detail = result.stderr.strip() or f"ssh exited with status {result.returncode}"
        print(f"Warning: Cannot check Docker on {host}: {detail}")

    stripped = (raw_line.strip() for raw_line in result.stdout.splitlines())
    return [name for name in stripped if name]
|
|
|
|
|
|
def _strip_replica_suffix(container: str) -> str:
    """Drop one trailing ``_<digits>`` or ``-<digits>`` index from a container name."""
    for sep in ('_', '-'):
        head, found, tail = container.rpartition(sep)
        if found and head and tail.isdigit():
            return head
    return container


def validate_registry(registry: dict) -> ValidationResult:
    """Compare the service registry with what is actually running.

    Performs five checks, in order:

    1. Every K8s Deployment outside the ignored namespaces/prefixes must be
       registered with type ``k8s-deployment``; a missing one is an error.
    2. Every Docker container on the hard-coded hosts that is not on the
       ignore list should match a registered ``docker`` service; a missing
       one is a warning.
    3. Fewer than 5 API endpoints flagged ``critical`` is a warning.
    4. Fewer than 3 monitored pages is a warning.
    5. Per-feature coverage ratios are computed over all registered
       services; any ratio below 80% is a warning.

    Args:
        registry: Parsed contents of ``service-registry.yaml``. ``None`` and
            keys whose value is null are treated as empty.

    Returns:
        A ``ValidationResult`` whose ``passed`` flag is true exactly when no
        error was recorded.

    Raises:
        KeyError: If a ``k8s-deployment`` entry lacks ``namespace``/``name``
            or a ``docker`` entry lacks ``name``.
    """
    errors = []
    warnings = []

    # A YAML key with no value parses to None, which ``.get(key, default)``
    # returns as-is; ``or`` falls back to an empty container instead.
    registry = registry or {}
    services = registry.get('services') or []

    # ======================================================================
    # 1. K8s Deployments
    # ======================================================================
    registered_k8s = {
        (s['namespace'], s['name'])
        for s in services
        if s.get('type') == 'k8s-deployment'
    }
    ignored_namespaces = {'kube-system', 'kube-public', 'kube-node-lease', 'local-path-storage'}
    ignored_prefixes = ('coredns', 'metrics-server', 'local-path-provisioner')

    for deploy in get_k8s_deployments():
        ns, name = deploy['namespace'], deploy['name']

        # Skip system namespaces.
        if ns in ignored_namespaces:
            continue

        # Skip system components.
        if name.startswith(ignored_prefixes):
            continue

        if (ns, name) not in registered_k8s:
            errors.append(f"K8s Deployment '{ns}/{name}' 未在 service-registry.yaml 中註冊")

    # ======================================================================
    # 2. Docker containers
    # ======================================================================
    docker_hosts = ['192.168.0.188', '192.168.0.110']
    docker_services = {s['name'] for s in services if s.get('type') == 'docker'}

    ignored_containers = {
        'k3s', 'pause', 'registry', 'nginx', 'traefik',
        # SigNoz container group
        'signoz-alertmanager', 'signoz-query-service', 'signoz-otel-collector-metrics',
        'zookeeper', 'clickhouse',
        # Sentry container group
        'sentry-web', 'sentry-worker', 'sentry-cron', 'sentry-kafka', 'sentry-redis',
        'sentry-postgres', 'sentry-zookeeper', 'sentry-snuba',
    }

    for host in docker_hosts:
        for container in check_docker_containers(host):
            if not container:
                continue

            # Skip known system containers (substring match).
            if any(ignored in container for ignored in ignored_containers):
                continue

            # A container counts as registered when any of these names is:
            # the full name, the name without a trailing replica index
            # ("my-svc-1" -> "my-svc"), or the first '_'/'-' segment.
            candidates = {
                container,
                _strip_replica_suffix(container),
                container.split('_')[0].split('-')[0],
            }
            if candidates.isdisjoint(docker_services):
                warnings.append(f"Docker 容器 '{container}' on {host} 未在 registry 中 (可能需要加入)")

    # ======================================================================
    # 3. API endpoint coverage
    # ======================================================================
    api_endpoints = registry.get('api_endpoints') or []
    critical_endpoints = [e for e in api_endpoints if e.get('critical')]

    if len(critical_endpoints) < 5:
        warnings.append(f"僅定義了 {len(critical_endpoints)} 個關鍵 API 端點,建議至少 5 個")

    # ======================================================================
    # 4. Front-end page coverage
    # ======================================================================
    pages = registry.get('pages') or []
    if len(pages) < 3:
        warnings.append(f"僅定義了 {len(pages)} 個前端頁面監控,建議至少 3 個")

    # ======================================================================
    # 5. Coverage ratios
    # ======================================================================
    # Divide by 1 when nothing is registered so every ratio is 0, not an error.
    total = len(services) or 1

    def share(is_covered) -> float:
        return sum(1 for s in services if is_covered(s)) / total

    coverage = {
        'prometheus': share(lambda s: (s.get('monitoring') or {}).get('prometheus')),
        'sentry': share(lambda s: (s.get('monitoring') or {}).get('sentry')),
        'otel': share(lambda s: (s.get('monitoring') or {}).get('otel')),
        'alerts': share(lambda s: s.get('alerts')),
        'auto_repair': share(lambda s: (s.get('auto_repair') or {}).get('enabled')),
    }

    # Coverage below 80% produces a warning.
    for metric, rate in coverage.items():
        if rate < 0.8:
            warnings.append(f"{metric} 覆蓋率僅 {rate:.0%},建議提升至 80% 以上")

    return ValidationResult(passed=not errors, errors=errors, warnings=warnings, coverage=coverage)
|
|
|
|
|
|
def print_report(result: ValidationResult):
    """Print a human-readable validation report to stdout.

    The report lists each coverage ratio with a status icon (>= 80% ok,
    >= 50% warning, otherwise failing), then the errors and warnings if
    there are any, and finally a pass/fail verdict line.

    Args:
        result: The validation outcome to render.
    """
    heavy_rule = "=" * 60

    # Header
    print("\n" + heavy_rule)
    print("AWOOOI 監控覆蓋率驗證報告")
    print(heavy_rule)

    # Coverage ratios
    print("\n📊 覆蓋率:")
    for metric, rate in result.coverage.items():
        if rate >= 0.8:
            status = "✅"
        elif rate >= 0.5:
            status = "⚠️"
        else:
            status = "❌"
        print(f" {status} {metric}: {rate:.0%}")

    # Errors first, then warnings; an empty section is omitted entirely.
    for label, entries in (("❌ 錯誤", result.errors), ("⚠️ 警告", result.warnings)):
        if not entries:
            continue
        print(f"\n{label} ({len(entries)}):")
        for entry in entries:
            print(f" • {entry}")

    # Verdict
    print("\n" + "-" * 60)
    verdict = (
        "✅ 驗證通過 - 所有關鍵服務都已註冊監控"
        if result.passed
        else "❌ 驗證失敗 - 請更新 ops/monitoring/service-registry.yaml"
    )
    print(verdict)
    print(heavy_rule + "\n")
|
|
|
|
|
|
def main():
    """Run the validation end to end and exit with the matching status.

    Loads the registry, validates it, prints the report, then exits with
    status 0 when validation passed and 1 otherwise.
    """
    result = validate_registry(load_registry())
    print_report(result)

    # Exit status 1 signals a failed validation.
    sys.exit(0 if result.passed else 1)
|
|
|
|
|
|
# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|