#!/usr/bin/env python3
"""
generate_monitoring.py — monitoring coverage auto-discovery
Phase O-5 Wave C.1 (2026-04-02 ogt)

What it does:
1. Queries the Prometheus targets API for the full scrape status.
2. Provides a helper that lists K8s Services via kubectl
   (``get_k8s_services``; note that ``main()`` does not call it).
3. Emits a coverage report (JSON or human-readable).

Usage:
    python3 scripts/generate_monitoring.py
    python3 scripts/generate_monitoring.py --json
    python3 scripts/generate_monitoring.py --check   # CI mode: exit 1 if coverage < threshold
"""

import argparse
import json
import subprocess
import sys
from datetime import datetime

# ============================================================
# Configuration
# ============================================================

PROMETHEUS_URL = "http://192.168.0.110:9090"
COVERAGE_THRESHOLD = 70  # CI mode: exit 1 when coverage is below this value

# Known service list (job name -> description)
EXPECTED_JOBS = {
    "awoooi-api": "AWOOOI API (K8s)",
    "clawbot": "OpenClaw 188:8088",
    "node-exporter-110": "Node Exporter 110",
    "node-exporter-112": "Node Exporter 112 (Kali)",
    "node-exporter-188": "Node Exporter 188",
    "cadvisor-110": "cAdvisor 110",
    "prometheus": "Prometheus self-scrape",
    "blackbox-http": "Blackbox HTTP probe",
    "blackbox-tcp": "Blackbox TCP probe",
    "github-actions": "GitHub Actions exporter",
}

# Jobs that are allowed to be down (known issues; excluded from the
# "real down" list built in build_report).
KNOWN_DOWN_TARGETS = {
    "federation-k8s": "K8s federation — SigNoz 內部 Prometheus,非外部暴露",
    "kube-state-metrics": "kube-state-metrics NodePort 30180 — 僅 OTEL Collector 內部存取",
    "node-exporter-120": "node-exporter 120 — K8s master 節點防火牆規則",
    "node-exporter-121": "node-exporter 121 — K8s worker 節點防火牆規則",
}

# Health buckets every job entry carries.
_HEALTH_STATES = ("up", "down", "unknown")


def get_prometheus_targets() -> dict:
    """Fetch the ``data`` object of the Prometheus ``/api/v1/targets`` response.

    Returns:
        The ``data`` dict of the API response.

    Exits the process with status 1 (after printing to stderr) when the
    request fails, the body is not valid JSON, or the payload carries no
    ``data`` object.
    """
    # Imported here because this is the only function that needs the
    # third-party package; the pure analysis/report functions stay importable
    # without it.
    import requests

    try:
        resp = requests.get(f"{PROMETHEUS_URL}/api/v1/targets", timeout=10)
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"❌ 無法連接 Prometheus ({PROMETHEUS_URL}): {e}", file=sys.stderr)
        sys.exit(1)

    data = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(data, dict):
        print(f"❌ Prometheus ({PROMETHEUS_URL}) 回應缺少 data 欄位", file=sys.stderr)
        sys.exit(1)
    return data


def get_k8s_services() -> list[dict]:
    """List K8s services across all namespaces via ``kubectl``.

    Best-effort: returns an empty list when kubectl is missing, times out,
    exits non-zero, or prints something that is not valid JSON.
    Not called by ``main()`` in this file.
    """
    try:
        result = subprocess.run(
            ["kubectl", "get", "services", "--all-namespaces", "-o", "json"],
            capture_output=True,
            text=True,
            timeout=15,
        )
        if result.returncode != 0:
            return []
        data = json.loads(result.stdout)
        return data.get("items", [])
    except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError):
        return []


def analyze_targets(targets_data: dict) -> dict:
    """Group active targets by job and health.

    Args:
        targets_data: the ``data`` object of a targets API response; only its
            ``activeTargets`` list is read.

    Returns:
        ``{job: {"up": [...], "down": [...], "unknown": [...]}}`` where each
        list holds the ``instance`` labels of the targets in that state.
        Targets without a ``job`` label are grouped under ``"unknown"``,
        targets without an ``instance`` label are recorded as ``"?"``, and a
        missing or unrecognised ``health`` value lands in the ``"unknown"``
        bucket instead of raising ``KeyError``.
    """
    jobs: dict[str, dict[str, list[str]]] = {}
    for target in targets_data.get("activeTargets") or []:
        labels = target.get("labels") or {}
        job = labels.get("job", "unknown")
        instance = labels.get("instance", "?")
        health = target.get("health", "unknown")
        if health not in _HEALTH_STATES:
            health = "unknown"
        if job not in jobs:
            jobs[job] = {state: [] for state in _HEALTH_STATES}
        jobs[job][health].append(instance)
    return jobs


def build_report(jobs: dict) -> dict:
    """Build the coverage report from the per-job health map.

    Args:
        jobs: mapping as returned by ``analyze_targets``.

    Returns:
        A JSON-serialisable dict with a ``summary`` section, the raw ``jobs``
        map, the expected/known-down tables, the names of jobs that have no
        "up" target and are not listed in ``KNOWN_DOWN_TARGETS``
        (``real_down_jobs``), and the expected jobs that Prometheus does not
        report at all (``missing_expected``).
    """
    total_jobs = len(jobs)
    up_jobs = sum(1 for j in jobs.values() if j["up"] and not j["down"])
    partial_jobs = sum(1 for j in jobs.values() if j["up"] and j["down"])
    down_jobs = sum(1 for j in jobs.values() if not j["up"] and j["down"])

    # Only problems that are not on the known-down list count as "real".
    # Note: a job whose targets are all "unknown" has no "up" target either,
    # so it is included here even though down_jobs does not count it.
    real_down_jobs = [
        job
        for job, data in jobs.items()
        if not data["up"] and job not in KNOWN_DOWN_TARGETS
    ]

    # An expected job counts as covered when it has at least one "up" target.
    expected_covered = sum(1 for j in EXPECTED_JOBS if j in jobs and jobs[j]["up"])
    if EXPECTED_JOBS:
        coverage_pct = round(expected_covered / len(EXPECTED_JOBS) * 100, 1)
    else:
        # Nothing is expected, so nothing can be missing; avoid dividing by 0.
        coverage_pct = 100.0

    return {
        # Naive local time, kept as-is so the report format does not change.
        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "prometheus_url": PROMETHEUS_URL,
        "summary": {
            "total_jobs": total_jobs,
            "up_jobs": up_jobs,
            "partial_jobs": partial_jobs,
            "down_jobs": down_jobs,
            "real_down_jobs": len(real_down_jobs),
            "expected_coverage_pct": coverage_pct,
        },
        "jobs": jobs,
        "expected_jobs": EXPECTED_JOBS,
        "known_down": KNOWN_DOWN_TARGETS,
        "real_down_jobs": real_down_jobs,
        "missing_expected": [j for j in EXPECTED_JOBS if j not in jobs],
    }


def print_human_report(report: dict) -> None:
    """Print the report produced by ``build_report`` in human-readable form.

    Args:
        report: dict as returned by ``build_report``.
    """
    s = report["summary"]
    jobs = report["jobs"]
    rule = "=" * 60

    print(f"\n{rule}")
    print(" AWOOOI 監控覆蓋率報告")
    print(f" 生成時間: {report['generated_at']}")
    print(rule)

    print("\n📊 總覽")
    print(f" Jobs 總數: {s['total_jobs']}")
    print(f" 全部 UP: {s['up_jobs']}")
    print(f" 部分 UP: {s['partial_jobs']}")
    print(f" 全部 DOWN: {s['down_jobs']}")
    print(f" 真實問題 (非已知): {s['real_down_jobs']}")
    print(f" 預期覆蓋率: {s['expected_coverage_pct']}% ({COVERAGE_THRESHOLD}% 門檻)")

    print("\n✅ 預期服務狀態")
    for job, desc in report["expected_jobs"].items():
        if job not in jobs:
            status = "❌ 缺失"
        elif jobs[job]["up"] and not jobs[job]["down"]:
            status = "✅ UP"
        elif jobs[job]["up"]:
            status = f"⚠️ 部分 UP ({len(jobs[job]['up'])} up, {len(jobs[job]['down'])} down)"
        else:
            status = "❌ DOWN"
        print(f" {status:<30} {job:<25} {desc}")

    if report["known_down"]:
        print("\n⚠️ 已知 DOWN (不影響覆蓋率)")
        for job, reason in report["known_down"].items():
            # Only list known-down jobs that Prometheus actually reports.
            if job in jobs:
                print(f" {job:<30} {reason}")

    if report["real_down_jobs"]:
        print("\n🔴 需處理的 DOWN targets")
        for job in report["real_down_jobs"]:
            instances = jobs[job].get("down", [])
            print(f" {job}: {', '.join(instances)}")

    if report["missing_expected"]:
        print("\n🔴 缺少預期服務監控")
        for job in report["missing_expected"]:
            print(f" {job}: {report['expected_jobs'][job]}")

    pct = s["expected_coverage_pct"]
    threshold = COVERAGE_THRESHOLD
    if pct >= threshold and not report["real_down_jobs"]:
        print(f"\n✅ 監控健康: 覆蓋率 {pct}% >= {threshold}%,無真實問題\n")
    elif pct >= threshold:
        print(f"\n⚠️ 覆蓋率達標 ({pct}%),但有 {s['real_down_jobs']} 個真實 DOWN 需處理\n")
    else:
        print(f"\n❌ 覆蓋率不足: {pct}% < {threshold}%\n")


def main() -> None:
    """CLI entry point: fetch targets, build the report, print it.

    With ``--json`` the report is printed as JSON, otherwise in the
    human-readable format. With ``--check`` the process exits with status 1
    when the expected coverage is below ``COVERAGE_THRESHOLD`` or when any
    real (not known-down) job is down.
    """
    parser = argparse.ArgumentParser(description="AWOOOI 監控覆蓋率自動發現")
    parser.add_argument("--json", action="store_true", help="輸出 JSON 格式")
    parser.add_argument(
        "--check",
        action="store_true",
        help=f"CI 模式: 覆蓋率 < {COVERAGE_THRESHOLD}% 則 exit 1",
    )
    args = parser.parse_args()

    targets_data = get_prometheus_targets()
    jobs = analyze_targets(targets_data)
    report = build_report(jobs)

    if args.json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
    else:
        print_human_report(report)

    if args.check:
        pct = report["summary"]["expected_coverage_pct"]
        real_down = report["summary"]["real_down_jobs"]
        if pct < COVERAGE_THRESHOLD or real_down > 0:
            sys.exit(1)


if __name__ == "__main__":
    main()