Files
awoooi/ops/monitoring/generate_monitoring.py
OG T c7f9c119e7 fix(cd): 補提交 ops/monitoring 腳本
遺漏文件導致 CD Monitoring Coverage 步驟失敗

新增:
- generate_monitoring.py - 監控覆蓋率檢查
- coverage_report.py - 覆蓋率報告
- discover_docker.py - Docker 服務發現
- deploy-exporters.sh - Exporter 部署腳本
- postgres-exporter-queries.yaml - PostgreSQL 查詢配置

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-29 15:45:42 +08:00

325 lines
10 KiB
Python
Executable File

#!/usr/bin/env python3
"""
AWOOOI monitoring config generator
==================================
ADR-037 Wave C.1: generate monitoring configuration from service-registry.yaml.

Features:
    1. Read service-registry.yaml (single source of truth)
    2. Generate Prometheus scrape configs
    3. Generate Blackbox Exporter targets
    4. Validate monitoring coverage

Usage:
    python ops/monitoring/generate_monitoring.py
    python ops/monitoring/generate_monitoring.py --validate-only
    python ops/monitoring/generate_monitoring.py --output-dir /tmp/monitoring

Version: v1.0
Created: 2026-03-29 (Taipei time zone)
Author: Claude Code (Phase 21 ADR-037)
"""
import argparse
import sys
from pathlib import Path
import yaml
# Configuration: all paths are resolved relative to this script's directory.
SCRIPT_DIR = Path(__file__).parent
# Input: the service registry read by load_registry().
REGISTRY_FILE = SCRIPT_DIR / "service-registry.yaml"
# Default destination for generated files (overridable with --output-dir).
OUTPUT_DIR = SCRIPT_DIR / "generated"
def load_registry() -> dict:
    """Load the service registry from ``REGISTRY_FILE``.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``.

    Returns:
        The parsed registry mapping. An empty registry file yields ``{}``
        instead of ``None`` so callers can always use ``.get``.

    Exits:
        Terminates the process with status 1 when the file does not exist
        or when its top-level YAML node is not a mapping.
    """
    if not REGISTRY_FILE.exists():
        print(f"Error: Service registry not found: {REGISTRY_FILE}")
        sys.exit(1)

    # Explicit encoding so parsing does not depend on the platform's
    # locale default.
    with open(REGISTRY_FILE, encoding="utf-8") as f:
        registry = yaml.safe_load(f)

    # safe_load returns None for an empty document.
    if registry is None:
        return {}
    if not isinstance(registry, dict):
        print(
            f"Error: Service registry must be a YAML mapping, "
            f"got {type(registry).__name__}: {REGISTRY_FILE}"
        )
        sys.exit(1)
    return registry
def generate_prometheus_scrape_configs(
    registry: dict,
    exporter_host: str = "192.168.0.188",
) -> list[dict]:
    """Build Prometheus scrape configs from the service registry.

    One job is emitted per registry service that has
    ``monitoring.prometheus`` set to a truthy value and whose ``type`` is
    ``"k8s-deployment"`` or ``"docker"``; services of any other type are
    skipped. Two database exporter jobs (postgres on port 9187, redis on
    port 9121) are always appended.

    Args:
        registry: Parsed service registry. Missing or empty (``None``)
            ``services`` / ``monitoring`` entries are treated as absent.
        exporter_host: Host on which the postgres and redis exporters are
            scraped.

    Returns:
        List of scrape_config dicts, services first (in registry order),
        then the exporter jobs.

    Raises:
        KeyError: If a monitored service has no ``name``.
    """
    scrape_configs = []

    # `or []` / `or {}` rather than a .get default: a YAML key that is present
    # but empty parses to None, which a .get default does not replace.
    for svc in (registry or {}).get("services") or []:
        if not (svc.get("monitoring") or {}).get("prometheus"):
            continue
        svc_type = svc.get("type")
        if svc_type == "k8s-deployment":
            scrape_configs.append(_k8s_scrape_config(svc))
        elif svc_type == "docker":
            scrape_configs.append(_docker_scrape_config(svc))

    # Database exporters are not described in the registry; they get fixed jobs.
    # Each job is built from fresh dicts so yaml.dump emits no anchors/aliases.
    for job_name, service, port in (
        ("postgres-exporter", "postgres", 9187),
        ("redis-exporter", "redis", 9121),
    ):
        scrape_configs.append({
            "job_name": job_name,
            "static_configs": [{
                "targets": [f"{exporter_host}:{port}"],
                "labels": {
                    "service": service,
                    "type": "exporter",
                },
            }],
        })

    return scrape_configs


def _k8s_scrape_config(svc: dict) -> dict:
    """Return a pod-discovery scrape job for one K8s deployment."""
    return {
        "job_name": svc["name"],
        "kubernetes_sd_configs": [{
            "role": "pod",
            "namespaces": {
                "names": [svc.get("namespace", "default")],
            },
        }],
        "relabel_configs": [
            {
                # Keep only pods whose `app` label matches the service name.
                # NOTE(review): the name is used verbatim as a regex - confirm
                # service names never contain regex metacharacters.
                "source_labels": ["__meta_kubernetes_pod_label_app"],
                "regex": svc["name"],
                "action": "keep",
            },
            {
                "source_labels": ["__meta_kubernetes_namespace"],
                "target_label": "namespace",
            },
            {
                "source_labels": ["__meta_kubernetes_pod_name"],
                "target_label": "pod",
            },
        ],
    }


def _docker_scrape_config(svc: dict) -> dict:
    """Return a static-target scrape job for one Docker service."""
    host = svc.get("host", "localhost")
    port = svc.get("port", 8080)
    return {
        "job_name": svc["name"],
        "static_configs": [{
            "targets": [f"{host}:{port}"],
            "labels": {
                "service": svc["name"],
                "type": "docker",
                "owner": svc.get("owner", "unknown"),
                "criticality": svc.get("criticality", "P2"),
            },
        }],
    }
def generate_blackbox_targets(
    registry: dict,
    api_base_url: str = "http://awoooi-api.awoooi-prod.svc.cluster.local:8000",
) -> list[dict]:
    """Build the HTTP probe target list for the Blackbox Exporter.

    Two kinds of targets are produced, in this order:

    1. One per registry service that defines ``health_endpoint`` and whose
       ``health_type`` is ``"http"`` (the default when unset). K8s
       deployments are addressed through their in-cluster DNS name, Docker
       services through ``host:port``; other service types are skipped.
    2. One per ``api_endpoints`` entry flagged ``critical``, rooted at
       ``api_base_url`` and always labelled criticality ``P0``.

    Args:
        registry: Parsed service registry. Missing or empty (``None``)
            ``services`` / ``api_endpoints`` entries are treated as absent.
        api_base_url: Scheme, host and port prepended to each critical
            API endpoint path.

    Returns:
        List of ``{"url": ..., "labels": {...}}`` dicts.

    Raises:
        KeyError: If a probed service has no ``name`` or a critical
            endpoint has no ``path``.
    """
    registry = registry or {}
    targets = []

    # `or []` rather than a .get default: a YAML key that is present but
    # empty parses to None, which a .get default does not replace.
    for svc in registry.get("services") or []:
        health_endpoint = svc.get("health_endpoint")
        if not health_endpoint or svc.get("health_type", "http") != "http":
            continue

        svc_type = svc.get("type")
        port = svc.get("port", 8080)
        if svc_type == "k8s-deployment":
            # In-cluster DNS name: <service>.<namespace>.svc.cluster.local
            host = f"{svc['name']}.{svc.get('namespace', 'default')}.svc.cluster.local"
        elif svc_type == "docker":
            host = svc.get("host", "localhost")
        else:
            continue

        targets.append({
            "url": f"http://{host}:{port}{health_endpoint}",
            "labels": {
                "service": svc["name"],
                "criticality": svc.get("criticality", "P2"),
                "owner": svc.get("owner", "unknown"),
            },
        })

    # API endpoints: only those flagged critical are probed, so their
    # criticality label is unconditionally P0.
    for endpoint in registry.get("api_endpoints") or []:
        if not endpoint.get("critical"):
            continue
        targets.append({
            "url": f"{api_base_url}{endpoint['path']}",
            "labels": {
                "endpoint": endpoint["path"],
                "method": endpoint.get("method", "GET"),
                "criticality": "P0",
            },
        })

    return targets
def validate_coverage(registry: dict) -> dict:
    """Compute the monitoring coverage report for the registry.

    A service counts as monitored when ``monitoring.prometheus`` is truthy.
    In addition, service names are collected for every service that lacks
    a ``health_endpoint``, lacks ``alerts``, or is P0/P1 without
    ``auto_repair.enabled``.

    Args:
        registry: Parsed service registry. Missing or empty (``None``)
            ``services``, ``monitoring`` and ``auto_repair`` entries are
            treated as absent rather than raising.

    Returns:
        Dict with ``total_services``, ``monitored_services``,
        ``coverage_percent`` (rounded to one decimal; ``0.0`` when there
        are no services) and the four ``missing_*`` name lists.

    Raises:
        KeyError: If a service that must be reported has no ``name``.
    """
    report = {
        "total_services": 0,
        "monitored_services": 0,
        "coverage_percent": 0.0,
        "missing_prometheus": [],
        "missing_health_endpoint": [],
        "missing_alerts": [],
        "missing_auto_repair": [],
    }

    # `or []` / `or {}` rather than a .get default: a YAML key that is present
    # but empty parses to None, which a .get default does not replace.
    for svc in (registry or {}).get("services") or []:
        report["total_services"] += 1

        monitoring = svc.get("monitoring") or {}
        if monitoring.get("prometheus"):
            report["monitored_services"] += 1
        else:
            report["missing_prometheus"].append(svc["name"])

        if not svc.get("health_endpoint"):
            report["missing_health_endpoint"].append(svc["name"])

        if not svc.get("alerts"):
            report["missing_alerts"].append(svc["name"])

        # Auto-repair is only required of the two highest criticality tiers.
        auto_repair = svc.get("auto_repair") or {}
        if svc.get("criticality") in ("P0", "P1") and not auto_repair.get("enabled"):
            report["missing_auto_repair"].append(svc["name"])

    if report["total_services"] > 0:
        report["coverage_percent"] = round(
            100 * report["monitored_services"] / report["total_services"], 1
        )

    return report
def write_prometheus_config(scrape_configs: list[dict], output_dir: Path) -> None:
    """Write the scrape configs to ``prometheus-scrape-generated.yaml``.

    The file starts with an "auto-generated, do not edit" comment header,
    followed by a YAML mapping with a single ``scrape_configs`` key.

    Args:
        scrape_configs: Scrape job dicts to serialize.
        output_dir: Destination directory; created (with parents) if it
            does not exist.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "prometheus-scrape-generated.yaml"

    # Explicit UTF-8: allow_unicode=True writes non-ASCII characters
    # unescaped, which would fail under a non-UTF-8 locale default.
    with open(output_file, "w", encoding="utf-8") as f:
        # Written as real YAML comments; putting them in the dumped mapping
        # would turn them into data keys.
        f.write("# Auto-generated by generate_monitoring.py\n")
        f.write("# DO NOT EDIT MANUALLY\n")
        yaml.dump(
            {"scrape_configs": scrape_configs},
            f,
            default_flow_style=False,
            allow_unicode=True,
        )

    print(f"Generated: {output_file}")
def write_blackbox_targets(targets: list[dict], output_dir: Path) -> None:
    """Write the probe targets to ``blackbox-targets-generated.yaml``.

    Args:
        targets: Target dicts to serialize as a top-level YAML list.
        output_dir: Destination directory; created (with parents) if it
            does not exist.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "blackbox-targets-generated.yaml"

    # Explicit UTF-8: allow_unicode=True writes non-ASCII characters
    # unescaped, which would fail under a non-UTF-8 locale default.
    with open(output_file, "w", encoding="utf-8") as f:
        yaml.dump(targets, f, default_flow_style=False, allow_unicode=True)

    print(f"Generated: {output_file}")
def print_coverage_report(report: dict):
    """Print the coverage report to stdout and check the CI threshold.

    Prints the totals, then one section per non-empty ``missing_*`` list,
    and finally a warning when coverage is under 90%.

    Args:
        report: Coverage report containing the totals, ``coverage_percent``
            and the four ``missing_*`` lists.

    Returns:
        ``False`` when ``coverage_percent`` is below 90, ``True`` otherwise.
    """
    rule = "=" * 60

    print("\n" + rule)
    print(" AWOOOI Monitoring Coverage Report")
    print(rule)

    print(f"\n Total Services: {report['total_services']}")
    print(f" Monitored: {report['monitored_services']}")
    print(f" Coverage: {report['coverage_percent']}%")

    # (report key, section heading), in display order.
    sections = (
        ("missing_prometheus", "Missing Prometheus Monitoring"),
        ("missing_health_endpoint", "Missing Health Endpoint"),
        ("missing_alerts", "Missing Alert Rules"),
        ("missing_auto_repair", "P0/P1 Without Auto-Repair"),
    )
    for key, heading in sections:
        names = report[key]
        if not names:
            continue
        print(f"\n {heading} ({len(names)}):")
        for name in names:
            print(f" - {name}")

    print("\n" + rule)

    # For CI/CD: coverage below 90% counts as a failure.
    below_threshold = report["coverage_percent"] < 90
    if below_threshold:
        print("\n WARNING: Coverage below 90% threshold!")
    return not below_threshold
def main():
    """Command-line entry point.

    Loads the registry and prints the coverage report. With
    ``--validate-only`` the process then exits (status 0 when coverage
    meets the threshold, 1 otherwise) without generating anything.
    Otherwise the Prometheus and Blackbox files are written to
    ``--output-dir``, and with ``--ci`` the process exits with status 1
    if coverage was below the threshold.
    """
    parser = argparse.ArgumentParser(description="AWOOOI Monitoring Config Generator")
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate coverage, don't generate",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=OUTPUT_DIR,
        help="Output directory",
    )
    # argparse %-formats help strings, so a literal percent sign must be
    # written as %%; a bare trailing % makes --help raise ValueError.
    parser.add_argument(
        "--ci",
        action="store_true",
        help="CI mode: exit 1 if coverage < 90%%",
    )
    args = parser.parse_args()

    print("Loading service registry...")
    registry = load_registry()

    # Validate coverage first; the report is printed in every mode.
    report = validate_coverage(registry)
    coverage_ok = print_coverage_report(report)

    if args.validate_only:
        sys.exit(0 if coverage_ok else 1)

    # Generate the config files.
    print("\nGenerating monitoring configs...")

    scrape_configs = generate_prometheus_scrape_configs(registry)
    write_prometheus_config(scrape_configs, args.output_dir)

    blackbox_targets = generate_blackbox_targets(registry)
    write_blackbox_targets(blackbox_targets, args.output_dir)

    print(f"\nGenerated {len(scrape_configs)} scrape configs")
    print(f"Generated {len(blackbox_targets)} blackbox targets")

    # In CI mode the files are still written, but low coverage fails the run.
    if args.ci and not coverage_ok:
        sys.exit(1)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()