344 lines
12 KiB
Python
344 lines
12 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
generate_monitoring.py — 監控覆蓋率自動發現
|
||
Phase O-5 Wave C.1 (2026-04-02 ogt)
|
||
|
||
功能:
|
||
1. 查詢 Prometheus targets API,取得全量 scrape 狀態
|
||
2. 掃描 K8s Services,找出未被監控的服務
|
||
3. 輸出覆蓋率報告 (JSON + 人可讀格式)
|
||
|
||
用法:
|
||
python3 scripts/generate_monitoring.py
|
||
python3 scripts/generate_monitoring.py --json
|
||
python3 scripts/generate_monitoring.py --check # CI mode: exit 1 if coverage < threshold
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import os
|
||
import subprocess
|
||
import sys
|
||
import time
|
||
from datetime import datetime
|
||
from typing import Callable
|
||
from urllib.error import HTTPError, URLError
|
||
from urllib.request import urlopen
|
||
|
||
# ============================================================
|
||
# 設定
|
||
# ============================================================
|
||
|
||
# Base URL of the Prometheus server whose targets API is queried.
PROMETHEUS_URL: str = "http://192.168.0.110:9090"

# CI mode: exit 1 when expected-job coverage (percent) is below this value.
COVERAGE_THRESHOLD: int = 70

# Fallback values for the target re-query loop used in CI mode.
DEFAULT_STABILIZATION_ATTEMPTS: int = 3
DEFAULT_STABILIZATION_SLEEP_SECONDS: float = 10.0
|
||
|
||
# Known services that are expected to be scraped
# (Prometheus job name -> human-readable description).
EXPECTED_JOBS: dict[str, str] = {
    "awoooi-api": "AWOOOI API (K8s)",
    "clawbot": "OpenClaw 188:8088",
    "node-exporter-110": "Node Exporter 110",
    "node-exporter-112": "Node Exporter 112 (Kali)",
    "node-exporter-188": "Node Exporter 188",
    "cadvisor-110": "cAdvisor 110",
    "prometheus": "Prometheus self-scrape",
    "blackbox-http": "Blackbox HTTP probe",
    "blackbox-tcp": "Blackbox TCP probe",
    "github-actions": "GitHub Actions exporter",
}
|
||
|
||
# Targets that are allowed to be down: known issues that are excluded from
# the "real down" count (Prometheus job name -> reason).
KNOWN_DOWN_TARGETS: dict[str, str] = {
    "federation-k8s": "K8s federation — SigNoz 內部 Prometheus,非外部暴露",
    "kube-state-metrics": "kube-state-metrics NodePort 30180 — 僅 OTEL Collector 內部存取",
    "node-exporter-120": "node-exporter 120 — K8s master 節點防火牆規則",
    "node-exporter-121": "node-exporter 121 — K8s worker 節點防火牆規則",
}
|
||
|
||
|
||
def _int_env(name: str, default: int) -> int:
    """Read an integer setting from the environment, clamped to at least 1.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or not a valid integer.

    Returns:
        The parsed value (or ``default`` when the variable is unset), raised
        to 1 if smaller. When the variable is set but cannot be parsed as an
        integer, ``default`` is returned as-is, without clamping.
    """
    raw = os.environ.get(name, default)
    try:
        parsed = int(raw)
    except ValueError:
        return default
    return max(1, parsed)
|
||
|
||
|
||
def _float_env(name: str, default: float) -> float:
    """Read a float setting from the environment, clamped to at least 0.0.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or not a valid float.

    Returns:
        The parsed value (or ``default`` when the variable is unset), raised
        to 0.0 if negative. When the variable is set but cannot be parsed as
        a float, ``default`` is returned as-is, without clamping.
    """
    raw = os.environ.get(name, default)
    try:
        parsed = float(raw)
    except ValueError:
        return default
    return max(0.0, parsed)
|
||
|
||
|
||
def get_prometheus_targets() -> dict:
    """Fetch the ``data`` object from the Prometheus targets API.

    Requests ``{PROMETHEUS_URL}/api/v1/targets`` with a 10-second timeout,
    decodes the body as UTF-8 JSON and returns its ``data`` member.

    Returns:
        The ``data`` member of the decoded response.

    Raises:
        SystemExit: With status 1, after printing a message to stderr, when
            the request fails, the connection breaks while the body is being
            read, the body is not valid UTF-8 JSON, or the payload has no
            ``data`` member.
    """
    # Function-scope import so this block does not depend on a new
    # file-level import.
    from http.client import HTTPException

    try:
        with urlopen(f"{PROMETHEUS_URL}/api/v1/targets", timeout=10) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
        return payload["data"]
    except (
        HTTPError,
        URLError,
        TimeoutError,
        # Errors raised while reading the body are not wrapped in URLError.
        ConnectionError,
        HTTPException,
        json.JSONDecodeError,
        UnicodeDecodeError,
        KeyError,
        # The payload decoded to something that is not a JSON object.
        TypeError,
    ) as e:
        print(f"❌ 無法連接 Prometheus ({PROMETHEUS_URL}): {e}", file=sys.stderr)
        sys.exit(1)
|
||
|
||
|
||
def get_k8s_services() -> list[dict]:
    """List Kubernetes Services in all namespaces (requires ``kubectl``).

    Best-effort lookup: runs ``kubectl get services --all-namespaces -o json``
    with a 15-second timeout.

    Returns:
        The ``items`` list from kubectl's JSON output, or an empty list when
        kubectl is not found, times out, exits non-zero, prints invalid JSON,
        or the output has no ``items`` key.
    """
    command = ["kubectl", "get", "services", "--all-namespaces", "-o", "json"]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=15,
        )
        if completed.returncode == 0:
            services = json.loads(completed.stdout).get("items", [])
        else:
            services = []
    except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError):
        services = []
    return services
|
||
|
||
|
||
def analyze_targets(targets_data: dict) -> dict:
    """Group active Prometheus targets by job and health state.

    Args:
        targets_data: Targets payload; only its ``activeTargets`` list is
            read (a missing key is treated as an empty list). Each target
            must provide ``labels`` and ``health``.

    Returns:
        A mapping of job name to ``{"up": [...], "down": [...],
        "unknown": [...]}``, where each list holds the ``instance`` labels of
        the targets in that state. Targets without a ``job`` label are filed
        under ``"unknown"``; a missing ``instance`` label is recorded as
        ``"?"``.

    Raises:
        KeyError: If a target lacks ``labels`` or ``health``, or reports a
            health value other than ``up``, ``down`` or ``unknown``.
    """
    grouped: dict[str, dict] = {}
    for target in targets_data.get("activeTargets", []):
        labels = target["labels"]
        job_name = labels.get("job", "unknown")
        instance_name = labels.get("instance", "?")
        state = target["health"]

        bucket = grouped.setdefault(job_name, {"up": [], "down": [], "unknown": []})
        bucket[state].append(instance_name)
    return grouped
|
||
|
||
|
||
def build_report(jobs: dict) -> dict:
    """Build the coverage report from per-job target health.

    Args:
        jobs: Mapping of job name to a dict with ``up`` and ``down`` instance
            lists, as produced by ``analyze_targets``.

    Returns:
        A dict with the generation timestamp (local time), the Prometheus
        URL, a ``summary`` of counts, the input ``jobs``, the expected and
        known-down job tables, the names of real-down jobs and the expected
        jobs that are missing from ``jobs``.
    """
    fully_up = 0
    partially_up = 0
    fully_down = 0
    for state in jobs.values():
        if state["up"]:
            if state["down"]:
                partially_up += 1
            else:
                fully_up += 1
        elif state["down"]:
            fully_down += 1

    # A job is a real problem when it has no healthy target and is not on the
    # known-down allowlist.
    unexpected_down = [
        name
        for name, state in jobs.items()
        if not state["up"] and name not in KNOWN_DOWN_TARGETS
    ]

    # Coverage: share of expected jobs that have at least one healthy target.
    covered = [name for name in EXPECTED_JOBS if name in jobs and jobs[name]["up"]]
    coverage = round(len(covered) / len(EXPECTED_JOBS) * 100, 1)

    summary = {
        "total_jobs": len(jobs),
        "up_jobs": fully_up,
        "partial_jobs": partially_up,
        "down_jobs": fully_down,
        "real_down_jobs": len(unexpected_down),
        "expected_coverage_pct": coverage,
    }
    absent = [name for name in EXPECTED_JOBS if name not in jobs]

    return {
        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "prometheus_url": PROMETHEUS_URL,
        "summary": summary,
        "jobs": jobs,
        "expected_jobs": EXPECTED_JOBS,
        "known_down": KNOWN_DOWN_TARGETS,
        "real_down_jobs": unexpected_down,
        "missing_expected": absent,
    }
|
||
|
||
|
||
def build_report_from_targets(targets_data: dict) -> dict:
    """Build the coverage report from a Prometheus targets API payload.

    Groups the payload's targets with ``analyze_targets`` and passes the
    result to ``build_report``.
    """
    jobs_by_health = analyze_targets(targets_data)
    return build_report(jobs_by_health)
|
||
|
||
|
||
def report_needs_stabilization(report: dict) -> bool:
    """Tell whether the report should be re-queried before it is trusted.

    A re-query avoids a false red caused by momentary scrape state right
    after a deploy. It is needed when the report lists any real-down job or
    any missing expected job.
    """
    if report["real_down_jobs"]:
        return True
    return bool(report["missing_expected"])
|
||
|
||
|
||
def stabilization_reason(report: dict) -> str:
    """Summarise why a report is unstable.

    Returns:
        ``"real_down=<jobs>"`` and/or ``"missing_expected=<jobs>"`` (job
        names comma-separated, the two parts joined by ``"; "``), or
        ``"stable"`` when both lists are empty.
    """
    labelled = (
        ("real_down", report["real_down_jobs"]),
        ("missing_expected", report["missing_expected"]),
    )
    fragments = [f"{label}={','.join(names)}" for label, names in labelled if names]
    return "; ".join(fragments) or "stable"
|
||
|
||
|
||
def build_stabilized_report(
    fetch_targets: Callable[[], dict],
    attempts: int,
    sleep_seconds: float,
    emit_status: bool = True,
) -> dict:
    """Re-query Prometheus targets so the CI gate avoids momentary values.

    Builds a report from ``fetch_targets()`` and, while the report still
    needs stabilization, sleeps and queries again, up to ``attempts`` times.

    Args:
        fetch_targets: Zero-argument callable returning a targets payload.
        attempts: Maximum number of queries; values below 1 are treated as 1.
        sleep_seconds: Pause between queries; negative values are treated
            as 0.
        emit_status: When true, progress messages are printed to stderr.

    Returns:
        The last report built, with a ``stabilization`` entry holding the
        attempt number, the attempt limit, the pause, a status and a reason.
        The status is ``"stable"`` (clean on the first query), ``"cleared"``
        (clean on a later query) or ``"failed"`` (still unstable on the last
        query).
    """
    total = max(1, attempts)
    pause = max(0.0, sleep_seconds)

    latest: dict | None = None
    for current in range(1, total + 1):
        latest = build_report_from_targets(fetch_targets())
        unstable = report_needs_stabilization(latest)
        is_last = current == total

        if unstable:
            outcome = "failed" if is_last else "retrying"
        else:
            outcome = "cleared" if current > 1 else "stable"

        reason = stabilization_reason(latest)
        latest["stabilization"] = {
            "attempt": current,
            "attempts": total,
            "sleep_seconds": pause,
            "status": outcome,
            "reason": reason,
        }

        if unstable and not is_last:
            # Still unstable with attempts left: report, wait, query again.
            if emit_status:
                print(
                    "⏳ Prometheus target stabilization "
                    f"{current}/{total}: {reason}",
                    file=sys.stderr,
                )
            time.sleep(pause)
            continue

        if emit_status and current > 1 and not unstable:
            print(
                "✅ Prometheus target stabilization cleared transient coverage drift",
                file=sys.stderr,
            )
        return latest

    # The loop always returns on its final iteration, so this is only a
    # defensive guard.
    if latest is None:
        raise RuntimeError("monitoring report stabilization did not run")
    return latest
|
||
|
||
|
||
def print_human_report(report: dict) -> None:
    """Print the coverage report to stdout in human-readable form.

    Sections, in order: header, overview counts, status of every expected
    job, known-down jobs that are currently down, real-down jobs with their
    down instances, missing expected jobs, stabilization outcome (only when
    more than one attempt was made), and a final verdict against
    ``COVERAGE_THRESHOLD``.

    Args:
        report: Report dict as returned by ``build_report``, optionally with
            a ``stabilization`` entry.
    """
    summary = report["summary"]
    rule = "=" * 60

    def expected_status(job_name: str) -> str:
        """Return the status label shown for one expected job."""
        all_jobs = report["jobs"]
        if job_name not in all_jobs:
            return "❌ 缺失"
        state = all_jobs[job_name]
        if not state["up"]:
            return "❌ DOWN"
        if state["down"]:
            return f"⚠️ 部分 UP ({len(state['up'])} up, {len(state['down'])} down)"
        return "✅ UP"

    # Header
    print(f"\n{rule}")
    print(" AWOOOI 監控覆蓋率報告")
    print(f" 生成時間: {report['generated_at']}")
    print(f"{rule}")

    # Overview
    print("\n📊 總覽")
    print(f" Jobs 總數: {summary['total_jobs']}")
    print(f" 全部 UP: {summary['up_jobs']}")
    print(f" 部分 UP: {summary['partial_jobs']}")
    print(f" 全部 DOWN: {summary['down_jobs']}")
    print(f" 真實問題 (非已知): {summary['real_down_jobs']}")
    print(f" 預期覆蓋率: {summary['expected_coverage_pct']}% ({COVERAGE_THRESHOLD}% 門檻)")

    # Expected jobs
    print("\n✅ 預期服務狀態")
    for job_name, description in report["expected_jobs"].items():
        label = expected_status(job_name)
        print(f" {label:<30} {job_name:<25} {description}")

    # Known-down jobs are listed only when they are actually down right now.
    tolerated = [
        (job_name, why)
        for job_name, why in report["known_down"].items()
        if job_name in report["jobs"] and report["jobs"][job_name]["down"]
    ]
    if tolerated:
        print("\n⚠️ 已知 DOWN (不影響覆蓋率)")
        for job_name, why in tolerated:
            print(f" {job_name:<30} {why}")

    if report["real_down_jobs"]:
        print("\n🔴 需處理的 DOWN targets")
        for job_name in report["real_down_jobs"]:
            down_instances = report["jobs"][job_name].get("down", [])
            print(f" {job_name}: {', '.join(down_instances)}")

    if report["missing_expected"]:
        print("\n🔴 缺少預期服務監控")
        for job_name in report["missing_expected"]:
            print(f" {job_name}: {report['expected_jobs'][job_name]}")

    stab = report.get("stabilization")
    if stab and stab["attempt"] > 1:
        print("\n⏱️ Prometheus target 穩定化")
        print(f" {stab['status']} after {stab['attempt']}/{stab['attempts']} attempts")

    # Final verdict
    pct = summary["expected_coverage_pct"]
    threshold = COVERAGE_THRESHOLD
    meets_threshold = pct >= threshold
    if not meets_threshold:
        print(f"\n❌ 覆蓋率不足: {pct}% < {threshold}%\n")
    elif report["real_down_jobs"]:
        print(f"\n⚠️ 覆蓋率達標 ({pct}%),但有 {summary['real_down_jobs']} 個真實 DOWN 需處理\n")
    else:
        print(f"\n✅ 監控健康: 覆蓋率 {pct}% >= {threshold}%,無真實問題\n")
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point for the monitoring coverage report.

    Flags:
        --json: Print the report as JSON instead of the human-readable form.
        --check: CI mode. Re-queries Prometheus targets until the report is
            stable (or attempts run out), then exits with status 1 when
            coverage is below ``COVERAGE_THRESHOLD`` or any real-down job
            remains.
        --stabilization-attempts / --stabilization-sleep-seconds: Re-query
            count and pause for CI mode. Defaults come from the
            ``AWOOOI_MONITORING_TARGET_STABILIZATION_*`` environment
            variables, falling back to the ``DEFAULT_STABILIZATION_*``
            constants.
    """
    parser = argparse.ArgumentParser(description="AWOOOI 監控覆蓋率自動發現")
    parser.add_argument("--json", action="store_true", help="輸出 JSON 格式")
    parser.add_argument(
        "--check",
        action="store_true",
        # argparse applies %-formatting to help text, so a literal percent
        # sign must be written as "%%"; an unescaped "% " makes help
        # formatting raise ValueError. The text also states the second
        # failure condition that the gate below enforces.
        help=(
            f"CI 模式: 覆蓋率 < {COVERAGE_THRESHOLD}%% "
            "或有真實 DOWN (非已知) 則 exit 1"
        ),
    )
    parser.add_argument(
        "--stabilization-attempts",
        type=int,
        default=_int_env(
            "AWOOOI_MONITORING_TARGET_STABILIZATION_ATTEMPTS",
            DEFAULT_STABILIZATION_ATTEMPTS,
        ),
        help="CI 模式: Prometheus target 狀態重查次數",
    )
    parser.add_argument(
        "--stabilization-sleep-seconds",
        type=float,
        default=_float_env(
            "AWOOOI_MONITORING_TARGET_STABILIZATION_SLEEP_SECONDS",
            DEFAULT_STABILIZATION_SLEEP_SECONDS,
        ),
        help="CI 模式: Prometheus target 重查間隔秒數",
    )
    args = parser.parse_args()

    if args.check:
        # CI mode re-queries so a momentary scrape state does not fail the gate.
        report = build_stabilized_report(
            get_prometheus_targets,
            attempts=args.stabilization_attempts,
            sleep_seconds=args.stabilization_sleep_seconds,
        )
    else:
        report = build_report_from_targets(get_prometheus_targets())

    if args.json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
    else:
        print_human_report(report)

    if args.check:
        pct = report["summary"]["expected_coverage_pct"]
        real_down = report["summary"]["real_down_jobs"]
        if pct < COVERAGE_THRESHOLD or real_down > 0:
            sys.exit(1)
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|