#!/usr/bin/env python3
"""
AWOOOI monitoring config generator
==================================

ADR-037 Wave C.1: generate monitoring configuration from service-registry.yaml.

Features:
1. Read service-registry.yaml (Single Source of Truth)
2. Generate Prometheus scrape configs
3. Generate Blackbox Exporter targets
4. Validate monitoring coverage

Usage:
    python ops/monitoring/generate_monitoring.py
    python ops/monitoring/generate_monitoring.py --validate-only
    python ops/monitoring/generate_monitoring.py --output-dir /tmp/monitoring

Version: v1.0
Created: 2026-03-29 (Taipei time zone)
Author: Claude Code (Phase 21 ADR-037)
"""

import argparse
import sys
from pathlib import Path

import yaml

# Configuration
SCRIPT_DIR = Path(__file__).parent
REGISTRY_FILE = SCRIPT_DIR / "service-registry.yaml"
OUTPUT_DIR = SCRIPT_DIR / "generated"

# Minimum percentage of services that must have Prometheus monitoring enabled.
COVERAGE_THRESHOLD = 90

# Comment header placed at the top of the generated Prometheus file.
GENERATED_HEADER = (
    "# Auto-generated by generate_monitoring.py\n"
    "# DO NOT EDIT MANUALLY\n"
)

# (report key, human-readable title) for each "missing ..." report section,
# in the order they are printed.
_REPORT_SECTIONS = (
    ("missing_prometheus", "Missing Prometheus Monitoring"),
    ("missing_health_endpoint", "Missing Health Endpoint"),
    ("missing_alerts", "Missing Alert Rules"),
    ("missing_auto_repair", "P0/P1 Without Auto-Repair"),
)


def _services(registry: dict) -> list:
    """Return the registry's service list, treating a missing/null key as empty."""
    return registry.get("services") or []


def load_registry() -> dict:
    """Load the service registry from REGISTRY_FILE.

    Returns:
        The parsed registry mapping.

    Exits with status 1 (after printing an error) when the file does not
    exist or does not contain a YAML mapping (e.g. it is empty).
    """
    if not REGISTRY_FILE.exists():
        print(f"Error: Service registry not found: {REGISTRY_FILE}")
        sys.exit(1)

    # Explicit UTF-8 so parsing does not depend on the platform locale.
    with open(REGISTRY_FILE, encoding="utf-8") as f:
        registry = yaml.safe_load(f)

    # safe_load returns None for an empty document; every consumer below
    # expects a mapping, so fail here with a clear message instead.
    if not isinstance(registry, dict):
        print(f"Error: Service registry is empty or not a mapping: {REGISTRY_FILE}")
        sys.exit(1)
    return registry


def generate_prometheus_scrape_configs(registry: dict) -> list[dict]:
    """Build Prometheus scrape configs for every Prometheus-enabled service.

    Services of type "k8s-deployment" get a pod-role kubernetes_sd_configs
    job; services of type "docker" get a static target. Other types are
    skipped. Two fixed exporter jobs (PostgreSQL, Redis) are always appended.

    Returns:
        List of scrape_config dicts.
    """
    scrape_configs = []

    for svc in _services(registry):
        if not (svc.get("monitoring") or {}).get("prometheus"):
            continue

        svc_type = svc.get("type")
        if svc_type == "k8s-deployment":
            # Kubernetes ServiceMonitor-style discovery: keep only pods whose
            # "app" label matches the service name.
            scrape_configs.append({
                "job_name": svc["name"],
                "kubernetes_sd_configs": [{
                    "role": "pod",
                    "namespaces": {
                        "names": [svc.get("namespace", "default")]
                    }
                }],
                "relabel_configs": [
                    {
                        "source_labels": ["__meta_kubernetes_pod_label_app"],
                        "regex": svc["name"],
                        "action": "keep"
                    },
                    {
                        "source_labels": ["__meta_kubernetes_namespace"],
                        "target_label": "namespace"
                    },
                    {
                        "source_labels": ["__meta_kubernetes_pod_name"],
                        "target_label": "pod"
                    }
                ]
            })
        elif svc_type == "docker":
            # Docker services are scraped directly at host:port.
            host = svc.get("host", "localhost")
            port = svc.get("port", 8080)
            scrape_configs.append({
                "job_name": svc["name"],
                "static_configs": [{
                    "targets": [f"{host}:{port}"],
                    "labels": {
                        "service": svc["name"],
                        "type": "docker",
                        "owner": svc.get("owner", "unknown"),
                        "criticality": svc.get("criticality", "P2")
                    }
                }]
            })

    # Database exporters (special-cased, not driven by the registry)
    # PostgreSQL Exporter
    scrape_configs.append({
        "job_name": "postgres-exporter",
        "static_configs": [{
            "targets": ["192.168.0.188:9187"],
            "labels": {
                "service": "postgres",
                "type": "exporter"
            }
        }]
    })

    # Redis Exporter
    scrape_configs.append({
        "job_name": "redis-exporter",
        "static_configs": [{
            "targets": ["192.168.0.188:9121"],
            "labels": {
                "service": "redis",
                "type": "exporter"
            }
        }]
    })

    return scrape_configs


def generate_blackbox_targets(registry: dict) -> list[dict]:
    """Build Blackbox Exporter HTTP probe targets.

    One target per service that declares an HTTP health endpoint (k8s and
    docker types only), plus one per API endpoint flagged as critical.

    Returns:
        List of target configs ({"url": ..., "labels": {...}}).
    """
    targets = []

    for svc in _services(registry):
        health_endpoint = svc.get("health_endpoint")
        health_type = svc.get("health_type", "http")
        if not health_endpoint or health_type != "http":
            continue

        svc_type = svc.get("type")
        if svc_type == "k8s-deployment":
            # In-cluster Kubernetes DNS name
            url = f"http://{svc['name']}.{svc.get('namespace', 'default')}.svc.cluster.local:{svc.get('port', 8080)}{health_endpoint}"
        elif svc_type == "docker":
            host = svc.get("host", "localhost")
            port = svc.get("port", 8080)
            url = f"http://{host}:{port}{health_endpoint}"
        else:
            continue

        targets.append({
            "url": url,
            "labels": {
                "service": svc["name"],
                "criticality": svc.get("criticality", "P2"),
                "owner": svc.get("owner", "unknown")
            }
        })

    # API endpoints: only those flagged critical are probed, so they are
    # always labelled P0.
    for endpoint in registry.get("api_endpoints") or []:
        if not endpoint.get("critical"):
            continue
        targets.append({
            "url": f"http://awoooi-api.awoooi-prod.svc.cluster.local:8000{endpoint['path']}",
            "labels": {
                "endpoint": endpoint["path"],
                "method": endpoint.get("method", "GET"),
                "criticality": "P0"
            }
        })

    return targets


def validate_coverage(registry: dict) -> dict:
    """Compute the monitoring coverage report for the registry.

    Returns:
        Coverage report dict with service counts, the percentage of services
        with Prometheus monitoring enabled (rounded to one decimal), and the
        names of services missing Prometheus, a health endpoint, alerts, or
        (for P0/P1 services) enabled auto-repair.
    """
    report = {
        "total_services": 0,
        "monitored_services": 0,
        "coverage_percent": 0.0,
        "missing_prometheus": [],
        "missing_health_endpoint": [],
        "missing_alerts": [],
        "missing_auto_repair": [],
    }

    for svc in _services(registry):
        report["total_services"] += 1
        name = svc["name"]

        if (svc.get("monitoring") or {}).get("prometheus"):
            report["monitored_services"] += 1
        else:
            report["missing_prometheus"].append(name)

        if not svc.get("health_endpoint"):
            report["missing_health_endpoint"].append(name)

        if not svc.get("alerts"):
            report["missing_alerts"].append(name)

        auto_repair_enabled = (svc.get("auto_repair") or {}).get("enabled")
        if svc.get("criticality") in ("P0", "P1") and not auto_repair_enabled:
            report["missing_auto_repair"].append(name)

    # Guard against division by zero for an empty registry.
    if report["total_services"] > 0:
        report["coverage_percent"] = round(
            100 * report["monitored_services"] / report["total_services"], 1
        )

    return report


def write_prometheus_config(scrape_configs: list[dict], output_dir: Path):
    """Write the scrape configs to prometheus-scrape-generated.yaml in output_dir.

    The directory is created if needed. The file starts with a
    "do not edit" comment header followed by the YAML document.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "prometheus-scrape-generated.yaml"

    # UTF-8 is required because allow_unicode=True emits non-ASCII text raw.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(GENERATED_HEADER)
        yaml.dump({"scrape_configs": scrape_configs}, f,
                  default_flow_style=False, allow_unicode=True)

    print(f"Generated: {output_file}")


def write_blackbox_targets(targets: list[dict], output_dir: Path):
    """Write the probe targets to blackbox-targets-generated.yaml in output_dir.

    The directory is created if needed.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "blackbox-targets-generated.yaml"

    # UTF-8 is required because allow_unicode=True emits non-ASCII text raw.
    with open(output_file, "w", encoding="utf-8") as f:
        yaml.dump(targets, f, default_flow_style=False, allow_unicode=True)

    print(f"Generated: {output_file}")


def print_coverage_report(report: dict) -> bool:
    """Print the coverage report produced by validate_coverage().

    Returns:
        True when coverage meets COVERAGE_THRESHOLD, False otherwise
        (a warning is printed in that case).
    """
    print("\n" + "=" * 60)
    print(" AWOOOI Monitoring Coverage Report")
    print("=" * 60)

    print(f"\n Total Services: {report['total_services']}")
    print(f" Monitored: {report['monitored_services']}")
    print(f" Coverage: {report['coverage_percent']}%")

    # Each section is printed only when it has at least one entry.
    for key, title in _REPORT_SECTIONS:
        names = report[key]
        if names:
            print(f"\n {title} ({len(names)}):")
            for name in names:
                print(f" - {name}")

    print("\n" + "=" * 60)

    # For CI/CD: coverage below the threshold counts as a failure.
    if report["coverage_percent"] < COVERAGE_THRESHOLD:
        print(f"\n WARNING: Coverage below {COVERAGE_THRESHOLD}% threshold!")
        return False
    return True


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line argument parser."""
    parser = argparse.ArgumentParser(description="AWOOOI Monitoring Config Generator")
    parser.add_argument("--validate-only", action="store_true",
                        help="Only validate coverage, don't generate")
    parser.add_argument("--output-dir", type=Path, default=OUTPUT_DIR,
                        help="Output directory")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as "%%"; a bare "%" makes --help raise ValueError.
    parser.add_argument("--ci", action="store_true",
                        help=f"CI mode: exit 1 if coverage < {COVERAGE_THRESHOLD}%%")
    return parser


def main():
    """Entry point: report coverage, then generate configs unless --validate-only.

    Exit status is 1 when coverage is below the threshold and either
    --validate-only or --ci was given.
    """
    args = build_parser().parse_args()

    print("Loading service registry...")
    registry = load_registry()

    # Validate coverage
    report = validate_coverage(registry)
    coverage_ok = print_coverage_report(report)

    if args.validate_only:
        sys.exit(0 if coverage_ok else 1)

    # Generate configs
    print("\nGenerating monitoring configs...")

    scrape_configs = generate_prometheus_scrape_configs(registry)
    write_prometheus_config(scrape_configs, args.output_dir)

    blackbox_targets = generate_blackbox_targets(registry)
    write_blackbox_targets(blackbox_targets, args.output_dir)

    print(f"\nGenerated {len(scrape_configs)} scrape configs")
    print(f"Generated {len(blackbox_targets)} blackbox targets")

    if args.ci and not coverage_ok:
        sys.exit(1)


if __name__ == "__main__":
    main()