Files
awoooi/ops/monitoring/validate_coverage.py
OG T 40163a51b5 feat(monitoring): 完整監控策略與自動整合架構
新增:
1. MONITORING_COMPLETE_STRATEGY.md - 完整監控策略
   - 5 主機 × 60+ 服務監控矩陣
   - P0/P1/P2 告警規則清單
   - AI 自動修復閉環流程
   - 安全護欄配置

2. MONITORING_INTEGRATION_ARCHITECTURE.md - 自動整合架構
   - 服務註冊表 (Single Source of Truth)
   - CI/CD 自動驗證監控覆蓋率
   - 新服務自動獲得監控

3. ops/monitoring/service-registry.yaml - 服務清單
   - K8s 工作負載 (API/Web/Worker/ArgoCD)
   - Docker 容器 (Ollama/OpenClaw/Redis/Postgres)
   - 前端頁面 SLO
   - API 端點 SLO
   - 告警模板與自動修復動作

4. ops/monitoring/validate_coverage.py - 覆蓋率驗證
   - CI 階段執行
   - 檢測未監控服務
   - 生成覆蓋率報告

設計原則:
- 監控即代碼 (Monitoring as Code)
- 新服務必須在 registry 註冊才能部署
- 自動發現機制防止遺漏

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-29 01:52:08 +08:00

248 lines
8.3 KiB
Python
Executable File

#!/usr/bin/env python3
"""
AWOOOI 監控覆蓋率驗證
====================
CI/CD 階段執行,確保所有服務都有對應的監控配置
用法:
python ops/monitoring/validate_coverage.py
退出碼:
0 - 所有服務都已註冊
1 - 發現未監控的服務
"""
import subprocess
import sys
import yaml
from pathlib import Path
from typing import NamedTuple
class ValidationResult(NamedTuple):
    """Outcome of one monitoring-coverage validation run."""

    # True when the run recorded no blocking errors.
    passed: bool
    # Blocking findings, one human-readable message per entry.
    errors: list[str]
    # Non-blocking findings, one human-readable message per entry.
    warnings: list[str]
    # Metric name -> fraction of registered services that have it.
    coverage: dict[str, float]
def load_registry() -> dict:
    """Load the service registry stored next to this script.

    Returns:
        The parsed contents of ``service-registry.yaml``. An empty file
        (for which ``yaml.safe_load`` yields ``None``) is returned as an
        empty dict so callers can always use ``.get``.

    Raises:
        OSError: If the registry file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    registry_path = Path(__file__).parent / 'service-registry.yaml'
    # Decode explicitly as UTF-8: the default text encoding depends on the
    # runner's locale, which would make non-ASCII content fail to load on
    # some machines.
    with open(registry_path, encoding='utf-8') as f:
        return yaml.safe_load(f) or {}
def get_k8s_deployments() -> list[dict]:
    """List every Deployment in all namespaces by shelling out to ``kubectl``.

    This is best-effort: when ``kubectl`` is missing, times out (30 s) or
    exits with a non-zero status, a warning is printed and an empty list is
    returned instead of raising.

    Returns:
        One ``{'namespace': ..., 'name': ...}`` dict per Deployment.
    """
    command = [
        'kubectl', 'get', 'deployments', '-A',
        # The template prints one "<namespace>/<name>" line per item.
        '-o', 'jsonpath={range .items[*]}{.metadata.namespace}/{.metadata.name}{"\\n"}{end}',
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=30)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: kubectl not installed / not executable.
        # SubprocessError: includes TimeoutExpired.
        # ValueError: includes output that cannot be decoded as text.
        print(f"Warning: Cannot get K8s deployments: {e}")
        return []

    # A failed kubectl call leaves stdout empty; without this check the
    # failure would be indistinguishable from "cluster has no deployments".
    if result.returncode != 0:
        detail = result.stderr.strip() or f"kubectl exited with status {result.returncode}"
        print(f"Warning: Cannot get K8s deployments: {detail}")
        return []

    deployments = []
    for line in result.stdout.splitlines():
        # Split on the first '/' only; lines without one are ignored.
        namespace, sep, name = line.strip().partition('/')
        if sep:
            deployments.append({'namespace': namespace, 'name': name})
    return deployments
def get_k8s_services() -> list[dict]:
    """List every Service in all namespaces by shelling out to ``kubectl``.

    This is best-effort: when ``kubectl`` is missing, times out (30 s) or
    exits with a non-zero status, a warning is printed and an empty list is
    returned instead of raising.

    Returns:
        One ``{'namespace': ..., 'name': ...}`` dict per Service.
    """
    command = [
        'kubectl', 'get', 'services', '-A',
        # The template prints one "<namespace>/<name>" line per item.
        '-o', 'jsonpath={range .items[*]}{.metadata.namespace}/{.metadata.name}{"\\n"}{end}',
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=30)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: kubectl not installed / not executable.
        # SubprocessError: includes TimeoutExpired.
        # ValueError: includes output that cannot be decoded as text.
        print(f"Warning: Cannot get K8s services: {e}")
        return []

    # A failed kubectl call leaves stdout empty; without this check the
    # failure would be indistinguishable from "cluster has no services".
    if result.returncode != 0:
        detail = result.stderr.strip() or f"kubectl exited with status {result.returncode}"
        print(f"Warning: Cannot get K8s services: {detail}")
        return []

    services = []
    for line in result.stdout.splitlines():
        # Split on the first '/' only; lines without one are ignored.
        namespace, sep, name = line.strip().partition('/')
        if sep:
            services.append({'namespace': namespace, 'name': name})
    return services
def check_docker_containers(host: str) -> list[str]:
    """Return the names of the running Docker containers on ``host``.

    Runs ``docker ps --format {{.Names}}`` on the host over SSH (5 s connect
    timeout, 10 s overall). This is best-effort: when the command cannot be
    started, times out or exits with a non-zero status, a warning is printed
    and an empty list is returned instead of raising.

    Args:
        host: SSH destination to query.

    Returns:
        The non-empty container names printed by ``docker ps``.
    """
    command = ['ssh', '-o', 'ConnectTimeout=5', host, 'docker', 'ps', '--format', '{{.Names}}']
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=10)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: ssh not installed / not executable.
        # SubprocessError: includes TimeoutExpired.
        # ValueError: includes output that cannot be decoded as text.
        print(f"Warning: Cannot check Docker on {host}: {e}")
        return []

    # A failed ssh/docker call leaves stdout empty; without this check an
    # unreachable host would look like a host that runs no containers.
    if result.returncode != 0:
        detail = result.stderr.strip() or f"command exited with status {result.returncode}"
        print(f"Warning: Cannot check Docker on {host}: {detail}")
        return []

    names = (line.strip() for line in result.stdout.splitlines())
    return [name for name in names if name]
def _strip_replica_suffix(container: str) -> str:
    """Drop one trailing numeric instance suffix such as ``_1`` or ``-1``."""
    for sep in ('_', '-'):
        stem, found, tail = container.rpartition(sep)
        if found and stem and tail.isdigit():
            return stem
    return container


def _section(entry: dict, key: str) -> dict:
    """Return ``entry[key]`` when it is a dict, otherwise an empty dict."""
    value = entry.get(key)
    return value if isinstance(value, dict) else {}


def _share(services: list, predicate) -> float:
    """Return the fraction of ``services`` accepted by ``predicate``."""
    # An empty registry divides by 1 so every rate is 0.0 rather than an error.
    total = len(services) or 1
    return sum(1 for s in services if predicate(s)) / total


def validate_registry(registry: dict) -> ValidationResult:
    """Check the service registry against what is actually running.

    Five checks are performed:

    1. Every K8s Deployment outside the ignored system namespaces/components
       must be registered with ``type: k8s-deployment`` (otherwise: error).
    2. Every Docker container on the known hosts that is not an ignored
       system container should be registered with ``type: docker``
       (otherwise: warning).
    3. At least 5 API endpoints should be flagged ``critical`` (warning).
    4. At least 3 front-end pages should be defined (warning).
    5. Each coverage metric should reach 80% of registered services (warning).

    Args:
        registry: Parsed ``service-registry.yaml`` content. Missing or null
            ``services`` / ``api_endpoints`` / ``pages`` keys are treated as
            empty lists.

    Returns:
        A ``ValidationResult`` whose ``passed`` flag is True only when no
        error was recorded; warnings never fail the run.
    """
    errors: list[str] = []
    warnings: list[str] = []
    # `or []` also covers a key that is present but null in the YAML.
    services = registry.get('services') or []

    # ======================================================================
    # 1. K8s Deployments
    # ======================================================================
    # .get() keeps an incomplete entry from aborting the run with KeyError;
    # such an entry simply never matches a live deployment.
    registered_k8s = {
        (s.get('namespace'), s.get('name'))
        for s in services
        if s.get('type') == 'k8s-deployment'
    }
    ignored_namespaces = {'kube-system', 'kube-public', 'kube-node-lease', 'local-path-storage'}
    ignored_prefixes = ('coredns', 'metrics-server', 'local-path-provisioner')
    for deploy in get_k8s_deployments():
        ns, name = deploy['namespace'], deploy['name']
        # Skip system namespaces and system components.
        if ns in ignored_namespaces or name.startswith(ignored_prefixes):
            continue
        if (ns, name) not in registered_k8s:
            errors.append(f"K8s Deployment '{ns}/{name}' 未在 service-registry.yaml 中註冊")

    # ======================================================================
    # 2. Docker containers
    # ======================================================================
    docker_hosts = ['192.168.0.188', '192.168.0.110']
    docker_services = {s.get('name') for s in services if s.get('type') == 'docker'}
    ignored_containers = {
        'k3s', 'pause', 'registry', 'nginx', 'traefik',
        # SigNoz container group
        'signoz-alertmanager', 'signoz-query-service', 'signoz-otel-collector-metrics',
        'zookeeper', 'clickhouse',
        # Sentry container group
        'sentry-web', 'sentry-worker', 'sentry-cron', 'sentry-kafka', 'sentry-redis',
        'sentry-postgres', 'sentry-zookeeper', 'sentry-snuba',
    }
    for host in docker_hosts:
        for container in check_docker_containers(host):
            if not container:
                continue
            # Ignored names match as substrings, so one entry covers a group.
            if any(ignored in container for ignored in ignored_containers):
                continue
            # A container counts as registered when the registry lists its
            # full name, its name without a numeric replica suffix
            # ("open-webui-1" -> "open-webui"), or its first '-'/'_' segment.
            # The last form is the historical rule and is kept so nothing
            # that used to match starts producing warnings.
            candidates = {
                container,
                _strip_replica_suffix(container),
                container.split('_')[0].split('-')[0],
            }
            if candidates.isdisjoint(docker_services):
                warnings.append(f"Docker 容器 '{container}' on {host} 未在 registry 中 (可能需要加入)")

    # ======================================================================
    # 3. API endpoint coverage
    # ======================================================================
    api_endpoints = registry.get('api_endpoints') or []
    critical_endpoints = [e for e in api_endpoints if e.get('critical')]
    if len(critical_endpoints) < 5:
        warnings.append(f"僅定義了 {len(critical_endpoints)} 個關鍵 API 端點,建議至少 5 個")

    # ======================================================================
    # 4. Front-end page coverage
    # ======================================================================
    pages = registry.get('pages') or []
    if len(pages) < 3:
        warnings.append(f"僅定義了 {len(pages)} 個前端頁面監控,建議至少 3 個")

    # ======================================================================
    # 5. Coverage rates
    # ======================================================================
    # _section() tolerates a `monitoring:` / `auto_repair:` key that is
    # present but null, which a plain .get(key, {}) would not.
    coverage = {
        'prometheus': _share(services, lambda s: _section(s, 'monitoring').get('prometheus')),
        'sentry': _share(services, lambda s: _section(s, 'monitoring').get('sentry')),
        'otel': _share(services, lambda s: _section(s, 'monitoring').get('otel')),
        'alerts': _share(services, lambda s: s.get('alerts')),
        'auto_repair': _share(services, lambda s: _section(s, 'auto_repair').get('enabled')),
    }
    # Anything below 80% is reported as a warning.
    for metric, rate in coverage.items():
        if rate < 0.8:
            warnings.append(f"{metric} 覆蓋率僅 {rate:.0%},建議提升至 80% 以上")

    return ValidationResult(passed=not errors, errors=errors, warnings=warnings, coverage=coverage)
def print_report(result: ValidationResult):
    """Print a human-readable validation report to stdout.

    The report lists, in order: the coverage rate of every metric, the
    errors (if any), the warnings (if any) and a one-line verdict.

    Args:
        result: The validation outcome to render. Only its ``coverage``,
            ``errors``, ``warnings`` and ``passed`` attributes are read.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("AWOOOI 監控覆蓋率驗證報告")
    print(rule)

    # Coverage: each metric gets its own marker so that a failing metric
    # (< 50%) cannot be mistaken for a healthy one (>= 80%).
    print("\n📊 覆蓋率:")
    for metric, rate in result.coverage.items():
        if rate >= 0.8:
            status = "✅"
        elif rate >= 0.5:
            status = "⚠️"
        else:
            status = "❌"
        print(f" {status} {metric}: {rate:.0%}")

    # Errors
    if result.errors:
        print(f"\n❌ 錯誤 ({len(result.errors)}):")
        for err in result.errors:
            print(err)

    # Warnings
    if result.warnings:
        print(f"\n⚠️ 警告 ({len(result.warnings)}):")
        for warn in result.warnings:
            print(warn)

    # Verdict
    print("\n" + "-" * 60)
    if result.passed:
        print("✅ 驗證通過 - 所有關鍵服務都已註冊監控")
    else:
        print("❌ 驗證失敗 - 請更新 ops/monitoring/service-registry.yaml")
    print(rule + "\n")
def main():
    """Validate the registry, print the report and exit.

    The process exits with status 0 when validation passed and with
    status 1 when at least one error was found.
    """
    outcome = validate_registry(load_registry())
    print_report(outcome)
    sys.exit(0 if outcome.passed else 1)


if __name__ == '__main__':
    main()