#!/usr/bin/env python3
"""generate_monitoring.py — monitoring coverage auto-discovery.

Phase O-5 Wave C.1 (2026-04-02 ogt)

What it does:
1. Queries the Prometheus targets API for the full scrape status.
2. Scans K8s Services to find unmonitored services (helper
   ``get_k8s_services``; ``main`` does not call it at the moment).
3. Emits a coverage report (JSON + human-readable).

Usage:
    python3 scripts/generate_monitoring.py
    python3 scripts/generate_monitoring.py --json
    python3 scripts/generate_monitoring.py --check   # CI mode: exit 1 if coverage < threshold
"""

import argparse
import json
import os
import subprocess
import sys
import time
from datetime import datetime
from typing import Callable
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

# ============================================================
# Configuration
# ============================================================

PROMETHEUS_URL = "http://192.168.0.110:9090"
COVERAGE_THRESHOLD = 70  # CI mode: exit 1 when coverage is below this value
DEFAULT_STABILIZATION_ATTEMPTS = 3
DEFAULT_STABILIZATION_SLEEP_SECONDS = 10.0

# Known services (job name -> description)
EXPECTED_JOBS = {
    "awoooi-api": "AWOOOI API (K8s)",
    "clawbot": "OpenClaw 188:8088",
    "node-exporter-110": "Node Exporter 110",
    "node-exporter-112": "Node Exporter 112 (Kali)",
    "node-exporter-188": "Node Exporter 188",
    "cadvisor-110": "cAdvisor 110",
    "prometheus": "Prometheus self-scrape",
    "blackbox-http": "Blackbox HTTP probe",
    "blackbox-tcp": "Blackbox TCP probe",
    "github-actions": "GitHub Actions exporter",
}

# Targets that are allowed to be down (known issues; they are excluded from
# the "real down" list computed in build_report).
KNOWN_DOWN_TARGETS = {
    "federation-k8s": "K8s federation — SigNoz 內部 Prometheus,非外部暴露",
    "kube-state-metrics": "kube-state-metrics NodePort 30180 — 僅 OTEL Collector 內部存取",
    "node-exporter-120": "node-exporter 120 — K8s master 節點防火牆規則",
    "node-exporter-121": "node-exporter 121 — K8s worker 節點防火牆規則",
}


def _int_env(name: str, default: int) -> int:
    """Read an integer from environment variable *name*, clamped to >= 1.

    Returns *default* unchanged when the variable holds a non-integer value.
    """
    try:
        return max(1, int(os.environ.get(name, default)))
    except ValueError:
        return default


def _float_env(name: str, default: float) -> float:
    """Read a float from environment variable *name*, clamped to >= 0.0.

    Returns *default* unchanged when the variable holds a non-numeric value.
    """
    try:
        return max(0.0, float(os.environ.get(name, default)))
    except ValueError:
        return default


def get_prometheus_targets() -> dict:
    """Fetch the ``data`` object of the Prometheus ``/api/v1/targets`` response.

    Any connection, decoding or payload-shape failure is reported on stderr
    and terminates the process with exit status 1.
    """
    try:
        with urlopen(f"{PROMETHEUS_URL}/api/v1/targets", timeout=10) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
        return payload["data"]
    except (
        HTTPError,
        URLError,
        TimeoutError,
        # Low-level socket failures while reading the body (e.g. a reset
        # connection) are plain OSError subclasses, not URLError.
        OSError,
        json.JSONDecodeError,
        UnicodeDecodeError,
        KeyError,
    ) as e:
        print(f"❌ 無法連接 Prometheus ({PROMETHEUS_URL}): {e}", file=sys.stderr)
        sys.exit(1)


def get_k8s_services() -> list[dict]:
    """List K8s services across all namespaces via ``kubectl``.

    Best-effort: returns an empty list when kubectl is missing, times out,
    exits non-zero, or prints something that is not valid JSON.
    """
    try:
        result = subprocess.run(
            ["kubectl", "get", "services", "--all-namespaces", "-o", "json"],
            capture_output=True,
            text=True,
            timeout=15,
        )
        if result.returncode != 0:
            return []
        data = json.loads(result.stdout)
        return data.get("items", [])
    except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError):
        return []


def analyze_targets(targets_data: dict) -> dict:
    """Group active targets by job and health.

    Returns ``{job: {"up": [...], "down": [...], "unknown": [...]}}`` where
    each list holds the ``instance`` labels of the targets in that state.
    Targets without a ``job`` label are grouped under ``"unknown"``; targets
    without an ``instance`` label are recorded as ``"?"``.  A missing or
    unrecognised ``health`` value is counted as ``"unknown"`` rather than
    raising ``KeyError``.
    """
    jobs: dict[str, dict] = {}
    for target in targets_data.get("activeTargets") or []:
        labels = target.get("labels") or {}
        job = labels.get("job", "unknown")
        instance = labels.get("instance", "?")
        buckets = jobs.setdefault(job, {"up": [], "down": [], "unknown": []})
        health = target.get("health", "unknown")
        if health not in buckets:
            health = "unknown"
        buckets[health].append(instance)
    return jobs


def build_report(jobs: dict) -> dict:
    """Build the coverage report from the per-job mapping of analyze_targets.

    Coverage is the percentage of EXPECTED_JOBS that have at least one target
    up.  ``real_down_jobs`` lists jobs with no target up that are not listed
    in KNOWN_DOWN_TARGETS.
    """
    total_jobs = len(jobs)
    up_jobs = sum(1 for j in jobs.values() if j["up"] and not j["down"])
    partial_jobs = sum(1 for j in jobs.values() if j["up"] and j["down"])
    down_jobs = sum(1 for j in jobs.values() if not j["up"] and j["down"])

    # Only problems that are not on the known-down list count as real.
    real_down_jobs = {
        job: data
        for job, data in jobs.items()
        if not data["up"] and job not in KNOWN_DOWN_TARGETS
    }

    expected_covered = sum(1 for j in EXPECTED_JOBS if j in jobs and jobs[j]["up"])
    coverage_pct = round(expected_covered / len(EXPECTED_JOBS) * 100, 1)

    return {
        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "prometheus_url": PROMETHEUS_URL,
        "summary": {
            "total_jobs": total_jobs,
            "up_jobs": up_jobs,
            "partial_jobs": partial_jobs,
            "down_jobs": down_jobs,
            "real_down_jobs": len(real_down_jobs),
            "expected_coverage_pct": coverage_pct,
        },
        "jobs": jobs,
        "expected_jobs": EXPECTED_JOBS,
        "known_down": KNOWN_DOWN_TARGETS,
        "real_down_jobs": list(real_down_jobs.keys()),
        "missing_expected": [j for j in EXPECTED_JOBS if j not in jobs],
    }


def build_report_from_targets(targets_data: dict) -> dict:
    """Build the coverage report from a Prometheus targets API payload."""
    return build_report(analyze_targets(targets_data))


def report_needs_stabilization(report: dict) -> bool:
    """Return True when the report should be re-queried.

    Re-querying avoids a false red caused by momentary scrape state right
    after a deploy.
    """
    return bool(report["real_down_jobs"] or report["missing_expected"])


def stabilization_reason(report: dict) -> str:
    """Describe why *report* is unstable, or ``"stable"`` when it is not."""
    parts: list[str] = []
    if report["real_down_jobs"]:
        parts.append(f"real_down={','.join(report['real_down_jobs'])}")
    if report["missing_expected"]:
        parts.append(f"missing_expected={','.join(report['missing_expected'])}")
    return "; ".join(parts) if parts else "stable"


def build_stabilized_report(
    fetch_targets: Callable[[], dict],
    attempts: int,
    sleep_seconds: float,
    emit_status: bool = True,
) -> dict:
    """Re-query Prometheus targets so the CI gate skips transient values.

    Calls *fetch_targets* up to *attempts* times (at least once), sleeping
    *sleep_seconds* (at least 0) between tries, and returns the first report
    that needs no stabilization, or the last report otherwise.  The returned
    report carries a ``"stabilization"`` entry whose ``status`` is one of
    ``stable`` (clean on the first try), ``cleared`` (clean on a later try)
    or ``failed`` (still unstable on the last try).  Progress messages go to
    stderr unless *emit_status* is false.
    """
    attempts = max(1, attempts)
    sleep_seconds = max(0.0, sleep_seconds)
    report: dict | None = None

    for attempt in range(1, attempts + 1):
        report = build_report_from_targets(fetch_targets())
        needs_retry = report_needs_stabilization(report)
        reason = stabilization_reason(report)

        status = "stable"
        if needs_retry and attempt < attempts:
            status = "retrying"
        elif needs_retry:
            status = "failed"
        elif attempt > 1:
            status = "cleared"

        report["stabilization"] = {
            "attempt": attempt,
            "attempts": attempts,
            "sleep_seconds": sleep_seconds,
            "status": status,
            "reason": reason,
        }

        if not needs_retry or attempt == attempts:
            if emit_status and attempt > 1 and not needs_retry:
                print(
                    "✅ Prometheus target stabilization cleared transient coverage drift",
                    file=sys.stderr,
                )
            return report

        if emit_status:
            print(
                "⏳ Prometheus target stabilization "
                f"{attempt}/{attempts}: {reason}",
                file=sys.stderr,
            )
        time.sleep(sleep_seconds)

    # Not reachable while attempts >= 1; kept as a guard for type narrowing.
    if report is None:
        raise RuntimeError("monitoring report stabilization did not run")
    return report


def print_human_report(report: dict) -> None:
    """Print *report* (as produced by build_report) in human-readable form."""
    s = report["summary"]
    jobs = report["jobs"]

    print(f"\n{'='*60}")
    print(" AWOOOI 監控覆蓋率報告")
    print(f" 生成時間: {report['generated_at']}")
    print(f"{'='*60}")

    print("\n📊 總覽")
    print(f" Jobs 總數: {s['total_jobs']}")
    print(f" 全部 UP: {s['up_jobs']}")
    print(f" 部分 UP: {s['partial_jobs']}")
    print(f" 全部 DOWN: {s['down_jobs']}")
    print(f" 真實問題 (非已知): {s['real_down_jobs']}")
    print(f" 預期覆蓋率: {s['expected_coverage_pct']}% ({COVERAGE_THRESHOLD}% 門檻)")

    print("\n✅ 預期服務狀態")
    for job, desc in report["expected_jobs"].items():
        if job not in jobs:
            status = "❌ 缺失"
        elif jobs[job]["up"] and not jobs[job]["down"]:
            status = "✅ UP"
        elif jobs[job]["up"]:
            status = f"⚠️ 部分 UP ({len(jobs[job]['up'])} up, {len(jobs[job]['down'])} down)"
        else:
            status = "❌ DOWN"
        print(f" {status:<30} {job:<25} {desc}")

    # Known-down jobs are listed only when they are present and have a target down.
    known_down_present = [
        (job, reason)
        for job, reason in report["known_down"].items()
        if job in jobs and jobs[job]["down"]
    ]
    if known_down_present:
        print("\n⚠️ 已知 DOWN (不影響覆蓋率)")
        for job, reason in known_down_present:
            print(f" {job:<30} {reason}")

    if report["real_down_jobs"]:
        print("\n🔴 需處理的 DOWN targets")
        for job in report["real_down_jobs"]:
            instances = jobs[job].get("down", [])
            print(f" {job}: {', '.join(instances)}")

    if report["missing_expected"]:
        print("\n🔴 缺少預期服務監控")
        for job in report["missing_expected"]:
            print(f" {job}: {report['expected_jobs'][job]}")

    # Stabilization details are shown only when more than one attempt was made.
    stabilization = report.get("stabilization")
    if stabilization and stabilization["attempt"] > 1:
        print("\n⏱️ Prometheus target 穩定化")
        print(
            " "
            f"{stabilization['status']} after "
            f"{stabilization['attempt']}/{stabilization['attempts']} attempts"
        )

    pct = s["expected_coverage_pct"]
    threshold = COVERAGE_THRESHOLD
    if pct >= threshold and not report["real_down_jobs"]:
        print(f"\n✅ 監控健康: 覆蓋率 {pct}% >= {threshold}%,無真實問題\n")
    elif pct >= threshold:
        print(f"\n⚠️ 覆蓋率達標 ({pct}%),但有 {s['real_down_jobs']} 個真實 DOWN 需處理\n")
    else:
        print(f"\n❌ 覆蓋率不足: {pct}% < {threshold}%\n")


def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser used by main.

    Defaults for the stabilization options come from the
    ``AWOOOI_MONITORING_TARGET_STABILIZATION_*`` environment variables.
    """
    parser = argparse.ArgumentParser(description="AWOOOI 監控覆蓋率自動發現")
    parser.add_argument("--json", action="store_true", help="輸出 JSON 格式")
    parser.add_argument(
        "--check",
        action="store_true",
        # argparse %-formats help text, so the literal percent sign must be
        # written as "%%"; a bare "%" makes --help raise ValueError.
        help=f"CI 模式: 覆蓋率 < {COVERAGE_THRESHOLD}%% 則 exit 1",
    )
    parser.add_argument(
        "--stabilization-attempts",
        type=int,
        default=_int_env(
            "AWOOOI_MONITORING_TARGET_STABILIZATION_ATTEMPTS",
            DEFAULT_STABILIZATION_ATTEMPTS,
        ),
        help="CI 模式: Prometheus target 狀態重查次數",
    )
    parser.add_argument(
        "--stabilization-sleep-seconds",
        type=float,
        default=_float_env(
            "AWOOOI_MONITORING_TARGET_STABILIZATION_SLEEP_SECONDS",
            DEFAULT_STABILIZATION_SLEEP_SECONDS,
        ),
        help="CI 模式: Prometheus target 重查間隔秒數",
    )
    return parser


def main() -> None:
    """CLI entry point: build the report, print it, and gate CI with --check.

    With ``--check`` the report is stabilized by re-querying, and the process
    exits with status 1 when coverage is below COVERAGE_THRESHOLD or any real
    (not known-down) job is down.
    """
    args = build_arg_parser().parse_args()

    if args.check:
        report = build_stabilized_report(
            get_prometheus_targets,
            attempts=args.stabilization_attempts,
            sleep_seconds=args.stabilization_sleep_seconds,
        )
    else:
        report = build_report_from_targets(get_prometheus_targets())

    if args.json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
    else:
        print_human_report(report)

    if args.check:
        pct = report["summary"]["expected_coverage_pct"]
        real_down = report["summary"]["real_down_jobs"]
        if pct < COVERAGE_THRESHOLD or real_down > 0:
            sys.exit(1)


if __name__ == "__main__":
    main()