#!/usr/bin/env python3
"""
AWOOOI monitoring coverage validation
=====================================

Runs during CI/CD to make sure every service has a matching monitoring
configuration.

Usage:
    python ops/monitoring/validate_coverage.py

Exit codes:
    0 - every service is registered
    1 - an unmonitored service was found
"""

import re
import subprocess
import sys
from pathlib import Path
from typing import Iterable, NamedTuple, Optional

import yaml

# kubectl output template: one "<namespace>/<name>" entry per line.
_KUBECTL_JSONPATH = (
    'jsonpath={range .items[*]}{.metadata.namespace}/{.metadata.name}{"\\n"}{end}'
)

# Namespaces and deployment-name prefixes that are never required to be registered.
_IGNORED_NAMESPACES = frozenset(
    {'kube-system', 'kube-public', 'kube-node-lease', 'local-path-storage'}
)
_IGNORED_DEPLOYMENT_PREFIXES = ('coredns', 'metrics-server', 'local-path-provisioner')

# Hosts inspected for Docker containers when the caller does not supply any.
_DEFAULT_DOCKER_HOSTS = ('192.168.0.188', '192.168.0.110')

# A container is skipped when its name CONTAINS any of these substrings.
_IGNORED_CONTAINERS = frozenset({
    'k3s', 'pause', 'registry', 'nginx', 'traefik',
    # SignOz container group
    'signoz-alertmanager', 'signoz-query-service', 'signoz-otel-collector-metrics',
    'zookeeper', 'clickhouse',
    # Sentry container group
    'sentry-web', 'sentry-worker', 'sentry-cron', 'sentry-kafka',
    'sentry-redis', 'sentry-postgres', 'sentry-zookeeper', 'sentry-snuba',
})

# Trailing replica index such as "_1" or "-1".
_REPLICA_SUFFIX = re.compile(r'[-_]\d+$')

# Coverage below this ratio produces a warning and a non-green report marker.
_COVERAGE_TARGET = 0.8

# Failures that mean "the external command could not be run or read".
_COMMAND_ERRORS = (OSError, subprocess.SubprocessError, UnicodeError)


class ValidationResult(NamedTuple):
    """Outcome of one registry validation run."""

    passed: bool  # True when no errors were recorded
    errors: list[str]  # blocking problems (cause exit code 1)
    warnings: list[str]  # non-blocking findings
    coverage: dict[str, float]  # metric name -> ratio in [0, 1]


def load_registry() -> dict:
    """Load ``service-registry.yaml`` located next to this script.

    Returns:
        The parsed registry, or an empty dict when the file is empty.

    Raises:
        OSError: if the file cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    registry_path = Path(__file__).parent / 'service-registry.yaml'
    # Explicit UTF-8: the default encoding is locale-dependent.
    with open(registry_path, encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        return yaml.safe_load(f) or {}


def _list_k8s_resources(resource: str) -> list[dict]:
    """Return ``{'namespace', 'name'}`` dicts for every *resource* in all namespaces.

    Best-effort: any failure to run kubectl prints a warning and yields ``[]``.
    """
    try:
        result = subprocess.run(
            ['kubectl', 'get', resource, '-A', '-o', _KUBECTL_JSONPATH],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except _COMMAND_ERRORS as e:
        print(f"Warning: Cannot get K8s {resource}: {e}")
        return []

    if result.returncode != 0:
        # Surface the failure instead of silently treating it as "no resources";
        # whatever was written to stdout is still parsed below.
        print(
            f"Warning: Cannot get K8s {resource}: "
            f"kubectl exited with {result.returncode}: {result.stderr.strip()}"
        )

    resources = []
    for line in result.stdout.strip().split('\n'):
        ns, sep, name = line.partition('/')
        if sep:
            resources.append({'namespace': ns, 'name': name})
    return resources


def get_k8s_deployments() -> list[dict]:
    """Return all K8s Deployments as ``{'namespace': ..., 'name': ...}`` dicts."""
    return _list_k8s_resources('deployments')


def get_k8s_services() -> list[dict]:
    """Return all K8s Services as ``{'namespace': ..., 'name': ...}`` dicts."""
    return _list_k8s_resources('services')


def check_docker_containers(host: str) -> list[str]:
    """Return the names of running Docker containers on *host*, queried over ssh.

    Best-effort: any failure to run ssh prints a warning and yields ``[]``.
    """
    try:
        result = subprocess.run(
            ['ssh', '-o', 'ConnectTimeout=5', host,
             'docker', 'ps', '--format', '{{.Names}}'],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except _COMMAND_ERRORS as e:
        print(f"Warning: Cannot check Docker on {host}: {e}")
        return []

    if result.returncode != 0:
        print(
            f"Warning: Cannot check Docker on {host}: "
            f"ssh exited with {result.returncode}: {result.stderr.strip()}"
        )
    return [c for c in result.stdout.strip().split('\n') if c]


def _check_k8s_deployments(services: list[dict], errors: list[str]) -> None:
    """Append an error for every non-system Deployment missing from the registry."""
    registered_k8s = {
        (s['namespace'], s['name'])
        for s in services
        if s.get('type') == 'k8s-deployment'
    }
    for deploy in get_k8s_deployments():
        ns, name = deploy['namespace'], deploy['name']
        # Skip system namespaces and system components.
        if ns in _IGNORED_NAMESPACES or name.startswith(_IGNORED_DEPLOYMENT_PREFIXES):
            continue
        if (ns, name) not in registered_k8s:
            errors.append(f"K8s Deployment '{ns}/{name}' 未在 service-registry.yaml 中註冊")


def _check_docker_hosts(
    services: list[dict], docker_hosts: Iterable[str], warnings: list[str]
) -> None:
    """Append a warning for every unregistered, non-system Docker container."""
    docker_services = {s['name'] for s in services if s.get('type') == 'docker'}
    for host in docker_hosts:
        for container in check_docker_containers(host):
            # Skip known system containers (substring match).
            if any(ignored in container for ignored in _IGNORED_CONTAINERS):
                continue
            # Accept the full name, the name without a replica suffix
            # ("my-app-1" -> "my-app"), or its first "-"/"_" separated token.
            candidates = {
                container,
                _REPLICA_SUFFIX.sub('', container),
                container.split('_')[0].split('-')[0],
            }
            if docker_services.isdisjoint(candidates):
                warnings.append(
                    f"Docker 容器 '{container}' on {host} 未在 registry 中 (可能需要加入)"
                )


def _coverage_rates(services: list[dict]) -> dict[str, float]:
    """Return, per monitoring facility, the fraction of services that enable it."""
    total = len(services) or 1  # avoid dividing by zero on an empty registry

    def section(service: dict, key: str) -> dict:
        # A key that is present but empty in YAML parses to None.
        return service.get(key) or {}

    def rate(enabled) -> float:
        return sum(1 for s in services if enabled(s)) / total

    return {
        'prometheus': rate(lambda s: section(s, 'monitoring').get('prometheus')),
        'sentry': rate(lambda s: section(s, 'monitoring').get('sentry')),
        'otel': rate(lambda s: section(s, 'monitoring').get('otel')),
        'alerts': rate(lambda s: s.get('alerts')),
        'auto_repair': rate(lambda s: section(s, 'auto_repair').get('enabled')),
    }


def validate_registry(
    registry: dict, *, docker_hosts: Optional[Iterable[str]] = None
) -> ValidationResult:
    """Validate that the service registry covers what is actually running.

    Checks, in order: K8s Deployments (unregistered ones are errors), Docker
    containers on each host (unregistered ones are warnings), the number of
    critical API endpoints, the number of monitored pages, and per-facility
    coverage ratios.

    Note: cluster and host discovery is best-effort. If kubectl or ssh cannot
    be run, a warning is printed and that check finds nothing to report.

    Args:
        registry: Parsed contents of ``service-registry.yaml``.
        docker_hosts: Hosts to inspect over ssh. Defaults to the built-in
            host list when omitted.

    Returns:
        A ValidationResult; ``passed`` is True when no errors were recorded.
    """
    registry = registry or {}
    services = registry.get('services') or []
    if docker_hosts is None:
        docker_hosts = _DEFAULT_DOCKER_HOSTS

    errors: list[str] = []
    warnings: list[str] = []

    # 1. K8s Deployments
    _check_k8s_deployments(services, errors)

    # 2. Docker containers
    _check_docker_hosts(services, docker_hosts, warnings)

    # 3. API endpoint coverage
    api_endpoints = registry.get('api_endpoints') or []
    critical_endpoints = [e for e in api_endpoints if e.get('critical')]
    if len(critical_endpoints) < 5:
        warnings.append(f"僅定義了 {len(critical_endpoints)} 個關鍵 API 端點,建議至少 5 個")

    # 4. Front-end page coverage
    pages = registry.get('pages') or []
    if len(pages) < 3:
        warnings.append(f"僅定義了 {len(pages)} 個前端頁面監控,建議至少 3 個")

    # 5. Coverage ratios; anything under the target produces a warning.
    coverage = _coverage_rates(services)
    for metric, rate in coverage.items():
        if rate < _COVERAGE_TARGET:
            warnings.append(f"{metric} 覆蓋率僅 {rate:.0%},建議提升至 80% 以上")

    return ValidationResult(
        passed=not errors, errors=errors, warnings=warnings, coverage=coverage
    )


def print_report(result: ValidationResult):
    """Print the validation report (coverage, errors, warnings, verdict) to stdout."""
    print("\n" + "=" * 60)
    print("AWOOOI 監控覆蓋率驗證報告")
    print("=" * 60)

    # Coverage
    print("\n📊 覆蓋率:")
    for metric, rate in result.coverage.items():
        status = "✅" if rate >= _COVERAGE_TARGET else "⚠️" if rate >= 0.5 else "❌"
        print(f" {status} {metric}: {rate:.0%}")

    # Errors
    if result.errors:
        print(f"\n❌ 錯誤 ({len(result.errors)}):")
        for err in result.errors:
            print(f" • {err}")

    # Warnings
    if result.warnings:
        print(f"\n⚠️ 警告 ({len(result.warnings)}):")
        for warn in result.warnings:
            print(f" • {warn}")

    # Verdict
    print("\n" + "-" * 60)
    if result.passed:
        print("✅ 驗證通過 - 所有關鍵服務都已註冊監控")
    else:
        print("❌ 驗證失敗 - 請更新 ops/monitoring/service-registry.yaml")
    print("=" * 60 + "\n")


def main():
    """Load the registry, validate it, print the report and set the exit code."""
    registry = load_registry()
    result = validate_registry(registry)
    print_report(result)

    # Exit code 1 when validation recorded errors, 0 otherwise.
    sys.exit(0 if result.passed else 1)


if __name__ == '__main__':
    main()