遺漏文件導致 CD Monitoring Coverage 步驟失敗 新增: - generate_monitoring.py - 監控覆蓋率檢查 - coverage_report.py - 覆蓋率報告 - discover_docker.py - Docker 服務發現 - deploy-exporters.sh - Exporter 部署腳本 - postgres-exporter-queries.yaml - PostgreSQL 查詢配置 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
325 lines
10 KiB
Python
Executable File
325 lines
10 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
AWOOOI 監控配置生成器
|
|
======================
|
|
|
|
ADR-037 Wave C.1: 從 service-registry.yaml 自動生成監控配置
|
|
|
|
功能:
|
|
1. 讀取 service-registry.yaml (Single Source of Truth)
|
|
2. 生成 Prometheus scrape 配置
|
|
3. 生成 Blackbox Exporter targets
|
|
4. 驗證監控覆蓋率
|
|
|
|
用法:
|
|
python ops/monitoring/generate_monitoring.py
|
|
python ops/monitoring/generate_monitoring.py --validate-only
|
|
python ops/monitoring/generate_monitoring.py --output-dir /tmp/monitoring
|
|
|
|
版本: v1.0
|
|
建立: 2026-03-29 (台北時區)
|
|
建立者: Claude Code (Phase 21 ADR-037)
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
# Configuration: every path is anchored at the directory containing this script.
SCRIPT_DIR = Path(__file__).parent
# Input: the service registry that all generated configs are derived from.
REGISTRY_FILE = SCRIPT_DIR.joinpath("service-registry.yaml")
# Default destination for generated files (overridable with --output-dir).
OUTPUT_DIR = SCRIPT_DIR.joinpath("generated")
|
|
|
|
|
|
def load_registry() -> dict:
    """Load the service registry from ``REGISTRY_FILE``.

    The file is parsed with ``yaml.safe_load``.

    Returns:
        The parsed registry. An empty YAML document is normalised to ``{}``
        so that callers can use ``registry.get(...)`` unconditionally.

    Exits:
        Prints an error and terminates the process with status 1 when the
        registry file does not exist.
    """
    if not REGISTRY_FILE.exists():
        print(f"Error: Service registry not found: {REGISTRY_FILE}")
        sys.exit(1)

    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # codec, which would mis-decode or reject non-ASCII registry content.
    with open(REGISTRY_FILE, encoding="utf-8") as f:
        # safe_load() returns None for an empty document.
        return yaml.safe_load(f) or {}
|
|
|
|
|
|
def _k8s_scrape_config(svc: dict) -> dict:
    """Build a pod-discovery scrape job for a ``k8s-deployment`` service."""
    name = svc["name"]
    return {
        "job_name": name,
        "kubernetes_sd_configs": [
            {
                "role": "pod",
                "namespaces": {"names": [svc.get("namespace", "default")]},
            }
        ],
        "relabel_configs": [
            # Keep only pods whose ``app`` label matches the service name.
            {
                "source_labels": ["__meta_kubernetes_pod_label_app"],
                "regex": name,
                "action": "keep",
            },
            {
                "source_labels": ["__meta_kubernetes_namespace"],
                "target_label": "namespace",
            },
            {
                "source_labels": ["__meta_kubernetes_pod_name"],
                "target_label": "pod",
            },
        ],
    }


def _docker_scrape_config(svc: dict) -> dict:
    """Build a static ``host:port`` scrape job for a ``docker`` service."""
    host = svc.get("host", "localhost")
    port = svc.get("port", 8080)
    return {
        "job_name": svc["name"],
        "static_configs": [
            {
                "targets": [f"{host}:{port}"],
                "labels": {
                    "service": svc["name"],
                    "type": "docker",
                    "owner": svc.get("owner", "unknown"),
                    "criticality": svc.get("criticality", "P2"),
                },
            }
        ],
    }


def _exporter_scrape_config(job_name: str, service: str, target: str) -> dict:
    """Build a static scrape job for a standalone exporter."""
    return {
        "job_name": job_name,
        "static_configs": [
            {
                "targets": [target],
                "labels": {"service": service, "type": "exporter"},
            }
        ],
    }


def generate_prometheus_scrape_configs(
    registry: dict, exporter_host: str = "192.168.0.188"
) -> list[dict]:
    """Generate Prometheus scrape configs from the service registry.

    A job is emitted for every service whose ``monitoring.prometheus`` value
    is truthy and whose ``type`` is ``"k8s-deployment"`` (pod discovery) or
    ``"docker"`` (static target). Services of any other type are skipped.
    Two database exporter jobs (PostgreSQL on port 9187, Redis on port 9121)
    are always appended at the end.

    Args:
        registry: Parsed service registry; reads the ``services`` list.
        exporter_host: Host on which the database exporters listen.

    Returns:
        List of scrape_config dicts, in registry order followed by the
        exporter jobs.

    Raises:
        KeyError: If an emitted service has no ``name``.
    """
    builders = {
        "k8s-deployment": _k8s_scrape_config,
        "docker": _docker_scrape_config,
    }
    scrape_configs = []

    # ``or`` fallbacks: a YAML key present with a null value yields None,
    # which ``.get(key, default)`` does not replace with the default.
    for svc in registry.get("services") or []:
        if not (svc.get("monitoring") or {}).get("prometheus"):
            continue
        build = builders.get(svc.get("type"))
        if build is not None:
            scrape_configs.append(build(svc))

    # Database exporters are not registry-driven; they are always scraped.
    scrape_configs.append(
        _exporter_scrape_config("postgres-exporter", "postgres", f"{exporter_host}:9187")
    )
    scrape_configs.append(
        _exporter_scrape_config("redis-exporter", "redis", f"{exporter_host}:9121")
    )

    return scrape_configs
|
|
|
|
|
|
def generate_blackbox_targets(
    registry: dict,
    api_base_url: str = "http://awoooi-api.awoooi-prod.svc.cluster.local:8000",
) -> list[dict]:
    """Generate Blackbox Exporter HTTP probe targets from the registry.

    Service targets: every service with a ``health_endpoint`` and a
    ``health_type`` of ``"http"`` (the default when the key is absent) gets
    one target. ``k8s-deployment`` services are addressed through cluster
    DNS (``<name>.<namespace>.svc.cluster.local``); ``docker`` services
    through ``host:port``. Other service types are skipped.

    API targets: every entry in ``api_endpoints`` flagged ``critical`` gets
    one target under ``api_base_url`` and is labelled criticality ``P0``.

    Args:
        registry: Parsed service registry.
        api_base_url: Scheme, host and port prepended to each API path.

    Returns:
        List of ``{"url": ..., "labels": {...}}`` dicts, services first.

    Raises:
        KeyError: If an emitted service lacks ``name`` or a critical API
            endpoint lacks ``path``.
    """
    targets = []

    # ``or []``: a key present with a YAML null yields None, not the default.
    for svc in registry.get("services") or []:
        health_endpoint = svc.get("health_endpoint")
        if not health_endpoint or svc.get("health_type", "http") != "http":
            continue

        svc_type = svc.get("type")
        port = svc.get("port", 8080)
        if svc_type == "k8s-deployment":
            # In-cluster DNS name of the Kubernetes Service.
            namespace = svc.get("namespace", "default")
            url = f"http://{svc['name']}.{namespace}.svc.cluster.local:{port}{health_endpoint}"
        elif svc_type == "docker":
            host = svc.get("host", "localhost")
            url = f"http://{host}:{port}{health_endpoint}"
        else:
            continue

        targets.append({
            "url": url,
            "labels": {
                "service": svc["name"],
                "criticality": svc.get("criticality", "P2"),
                "owner": svc.get("owner", "unknown"),
            },
        })

    # API endpoints: only critical ones are probed.
    for endpoint in registry.get("api_endpoints") or []:
        if not endpoint.get("critical"):
            continue
        path = endpoint["path"]
        targets.append({
            "url": f"{api_base_url}{path}",
            "labels": {
                "endpoint": path,
                "method": endpoint.get("method", "GET"),
                # Only critical endpoints reach this point, so always P0.
                "criticality": "P0",
            },
        })

    return targets
|
|
|
|
|
|
def validate_coverage(registry: dict) -> dict:
    """Compute the monitoring coverage report for the service registry.

    A service counts as monitored when ``monitoring.prometheus`` is truthy.
    Besides the coverage ratio, the report lists services with no health
    endpoint, services with no alerts, and P0/P1 services whose
    ``auto_repair.enabled`` is not truthy.

    Args:
        registry: Parsed service registry; reads the ``services`` list.

    Returns:
        Dict with ``total_services``, ``monitored_services``,
        ``coverage_percent`` (rounded to one decimal, ``0.0`` when there are
        no services) and the ``missing_*`` lists of service names.

    Raises:
        KeyError: If a service that must be listed has no ``name``.
    """
    report = {
        "total_services": 0,
        "monitored_services": 0,
        "coverage_percent": 0.0,
        "missing_prometheus": [],
        "missing_health_endpoint": [],
        "missing_alerts": [],
        "missing_auto_repair": [],
    }

    # ``or`` fallbacks throughout: a YAML key written with a null value
    # (e.g. ``monitoring:``) yields None, which ``.get(key, {})`` does not
    # replace with the default and which would otherwise raise.
    for svc in registry.get("services") or []:
        report["total_services"] += 1

        if (svc.get("monitoring") or {}).get("prometheus"):
            report["monitored_services"] += 1
        else:
            report["missing_prometheus"].append(svc["name"])

        if not svc.get("health_endpoint"):
            report["missing_health_endpoint"].append(svc["name"])

        if not svc.get("alerts"):
            report["missing_alerts"].append(svc["name"])

        # Auto-repair is only checked for the two highest criticality tiers.
        auto_repair_enabled = (svc.get("auto_repair") or {}).get("enabled")
        if svc.get("criticality") in ("P0", "P1") and not auto_repair_enabled:
            report["missing_auto_repair"].append(svc["name"])

    if report["total_services"] > 0:
        report["coverage_percent"] = round(
            100 * report["monitored_services"] / report["total_services"], 1
        )

    return report
|
|
|
|
|
|
def write_prometheus_config(scrape_configs: list[dict], output_dir: Path) -> None:
    """Write the Prometheus scrape configuration file.

    Creates ``output_dir`` if needed and writes
    ``prometheus-scrape-generated.yaml`` containing a two-line
    "auto-generated" comment header followed by the ``scrape_configs``
    document. Prints the path of the generated file.

    Args:
        scrape_configs: Scrape job dicts to emit under ``scrape_configs``.
        output_dir: Directory that receives the generated file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "prometheus-scrape-generated.yaml"

    # UTF-8 is required because allow_unicode=True emits non-ASCII
    # characters verbatim; the locale's default codec may not encode them.
    with open(output_file, "w", encoding="utf-8") as f:
        # Emit the warning header as real YAML comments. A dict with
        # "# ..." keys would be serialised as data, not as comments.
        f.write("# Auto-generated by generate_monitoring.py\n")
        f.write("# DO NOT EDIT MANUALLY\n")
        yaml.dump({"scrape_configs": scrape_configs}, f, default_flow_style=False, allow_unicode=True)

    print(f"Generated: {output_file}")
|
|
|
|
|
|
def write_blackbox_targets(targets: list[dict], output_dir: Path) -> None:
    """Write the Blackbox Exporter targets file.

    Creates ``output_dir`` if needed, dumps ``targets`` as a YAML list to
    ``blackbox-targets-generated.yaml`` and prints the generated path.

    Args:
        targets: Probe target dicts to serialise.
        output_dir: Directory that receives the generated file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "blackbox-targets-generated.yaml"

    # UTF-8 is required because allow_unicode=True emits non-ASCII
    # characters verbatim; the locale's default codec may not encode them.
    with open(output_file, "w", encoding="utf-8") as f:
        yaml.dump(targets, f, default_flow_style=False, allow_unicode=True)

    print(f"Generated: {output_file}")
|
|
|
|
|
|
def print_coverage_report(report: dict, threshold: float = 90) -> bool:
    """Print the coverage report and check it against the threshold.

    Prints the totals, then one section per non-empty ``missing_*`` list,
    and finally a warning when coverage is below ``threshold``.

    Args:
        report: Coverage report with the keys ``total_services``,
            ``monitored_services``, ``coverage_percent`` and the four
            ``missing_*`` lists.
        threshold: Minimum acceptable ``coverage_percent``.

    Returns:
        ``True`` when ``coverage_percent`` is at least ``threshold``,
        ``False`` otherwise (used by CI to decide the exit status).
    """
    print("\n" + "=" * 60)
    print(" AWOOOI Monitoring Coverage Report")
    print("=" * 60)
    print(f"\n Total Services: {report['total_services']}")
    print(f" Monitored: {report['monitored_services']}")
    print(f" Coverage: {report['coverage_percent']}%")

    # (report key, section title) pairs, printed in this order; empty lists
    # produce no section at all.
    sections = (
        ("missing_prometheus", "Missing Prometheus Monitoring"),
        ("missing_health_endpoint", "Missing Health Endpoint"),
        ("missing_alerts", "Missing Alert Rules"),
        ("missing_auto_repair", "P0/P1 Without Auto-Repair"),
    )
    for key, title in sections:
        names = report[key]
        if not names:
            continue
        print(f"\n {title} ({len(names)}):")
        for svc in names:
            print(f" - {svc}")

    print("\n" + "=" * 60)

    # For CI/CD: coverage below the threshold counts as a failure.
    if report["coverage_percent"] < threshold:
        print(f"\n WARNING: Coverage below {threshold:g}% threshold!")
        return False
    return True
|
|
|
|
|
|
def main(argv=None):
    """Command-line entry point: validate coverage, then generate configs.

    Flow:
        1. Load the service registry.
        2. Compute and print the coverage report.
        3. With ``--validate-only``: exit 0 if coverage is acceptable,
           1 otherwise, without generating anything.
        4. Otherwise write the Prometheus scrape config and the Blackbox
           targets into ``--output-dir``.
        5. With ``--ci``: exit 1 after generation if coverage is too low.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="AWOOOI Monitoring Config Generator")
    parser.add_argument("--validate-only", action="store_true", help="Only validate coverage, don't generate")
    parser.add_argument("--output-dir", type=Path, default=OUTPUT_DIR, help="Output directory")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as "%%"; a bare trailing "%" makes --help raise ValueError.
    parser.add_argument("--ci", action="store_true", help="CI mode: exit 1 if coverage < 90%%")
    args = parser.parse_args(argv)

    print("Loading service registry...")
    registry = load_registry()

    # Validate coverage first so the report is printed in every mode.
    report = validate_coverage(registry)
    coverage_ok = print_coverage_report(report)

    if args.validate_only:
        sys.exit(0 if coverage_ok else 1)

    # Generate the configuration files.
    print("\nGenerating monitoring configs...")

    scrape_configs = generate_prometheus_scrape_configs(registry)
    write_prometheus_config(scrape_configs, args.output_dir)

    blackbox_targets = generate_blackbox_targets(registry)
    write_blackbox_targets(blackbox_targets, args.output_dir)

    print(f"\nGenerated {len(scrape_configs)} scrape configs")
    print(f"Generated {len(blackbox_targets)} blackbox targets")

    # In CI mode the files are still generated, but low coverage fails the job.
    if args.ci and not coverage_ok:
        sys.exit(1)


if __name__ == "__main__":
    main()
|