- 查詢 Prometheus targets API 取得全量 scrape 狀態 - 10 個預期服務覆蓋率計算 (門檻 70%) - 已知 DOWN targets 豁免清單 (不影響健康判斷) - --json 機器可讀輸出 / --check CI 模式 (exit 1 if coverage < threshold) - 首次執行: 100% 覆蓋率,無真實問題 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
219 lines
7.4 KiB
Python
219 lines
7.4 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
generate_monitoring.py — 監控覆蓋率自動發現
|
||
Phase O-5 Wave C.1 (2026-04-02 ogt)
|
||
|
||
功能:
|
||
1. 查詢 Prometheus targets API,取得全量 scrape 狀態
|
||
2. 掃描 K8s Services,找出未被監控的服務 (get_k8s_services() 已提供,但 main() 目前尚未呼叫)
|
||
3. 輸出覆蓋率報告 (JSON + 人可讀格式)
|
||
|
||
用法:
|
||
python3 scripts/generate_monitoring.py
|
||
python3 scripts/generate_monitoring.py --json
|
||
python3 scripts/generate_monitoring.py --check # CI mode: exit 1 if coverage < threshold or any non-exempt job has no UP target
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import subprocess
|
||
import sys
|
||
from datetime import datetime
|
||
|
||
import requests
|
||
|
||
# ============================================================
# Configuration
# ============================================================

# Base URL of the Prometheus server whose targets API is queried.
PROMETHEUS_URL = "http://192.168.0.110:9090"
COVERAGE_THRESHOLD = 70  # CI mode: exit 1 when expected coverage is below this percentage

# Expected services (Prometheus job name -> human-readable description).
# Coverage is the share of these jobs that have at least one UP target.
EXPECTED_JOBS = {
    "awoooi-api": "AWOOOI API (K8s)",
    "clawbot": "OpenClaw 188:8088",
    "node-exporter-110": "Node Exporter 110",
    "node-exporter-112": "Node Exporter 112 (Kali)",
    "node-exporter-188": "Node Exporter 188",
    "cadvisor-110": "cAdvisor 110",
    "prometheus": "Prometheus self-scrape",
    "blackbox-http": "Blackbox HTTP probe",
    "blackbox-tcp": "Blackbox TCP probe",
    "github-actions": "GitHub Actions exporter",
}

# Jobs that are allowed to be down (job name -> reason). These are known
# issues: they are excluded from the "real down" list and, not being part of
# EXPECTED_JOBS, do not enter the coverage calculation.
KNOWN_DOWN_TARGETS = {
    "federation-k8s": "K8s federation — SigNoz 內部 Prometheus,非外部暴露",
    "kube-state-metrics": "kube-state-metrics NodePort 30180 — 僅 OTEL Collector 內部存取",
    "node-exporter-120": "node-exporter 120 — K8s master 節點防火牆規則",
    "node-exporter-121": "node-exporter 121 — K8s worker 節點防火牆規則",
}
|
||
|
||
|
||
def get_prometheus_targets() -> dict:
    """Fetch the scrape-target listing from the Prometheus HTTP API.

    Sends ``GET {PROMETHEUS_URL}/api/v1/targets`` with a 10-second timeout
    and returns the ``data`` member of the JSON response body.

    Returns:
        The ``data`` object of the targets response.

    Exits:
        Prints a diagnostic to stderr and terminates the process with
        status 1 when the request fails (connection error, timeout, HTTP
        error status) or when the body is not JSON / has no ``data`` member.
    """
    # Transport-level failures: keep the try body limited to the request.
    try:
        resp = requests.get(f"{PROMETHEUS_URL}/api/v1/targets", timeout=10)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ 無法連接 Prometheus ({PROMETHEUS_URL}): {e}", file=sys.stderr)
        sys.exit(1)

    # Payload-level failures: a 200 response can still carry a non-JSON body
    # (ValueError), a JSON object without "data" (KeyError), or a non-object
    # payload (TypeError). Report them the same way instead of a traceback.
    try:
        return resp.json()["data"]
    except (ValueError, KeyError, TypeError) as e:
        print(
            f"❌ Prometheus 回應格式錯誤 ({PROMETHEUS_URL}): {e!r}",
            file=sys.stderr,
        )
        sys.exit(1)
|
||
|
||
|
||
def get_k8s_services() -> list[dict]:
    """List Kubernetes Service objects across all namespaces via kubectl.

    Runs ``kubectl get services --all-namespaces -o json`` with a 15-second
    timeout and returns the ``items`` array of its JSON output.

    Returns:
        The list of service objects, or an empty list when kubectl is not
        installed, times out, exits non-zero, or prints invalid JSON.
    """
    kubectl_cmd = ["kubectl", "get", "services", "--all-namespaces", "-o", "json"]
    try:
        proc = subprocess.run(
            kubectl_cmd,
            capture_output=True,
            text=True,
            timeout=15,
        )
        # A failed kubectl run is treated like an empty listing.
        payload = json.loads(proc.stdout) if proc.returncode == 0 else {}
    except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError):
        return []
    return payload.get("items", [])
|
||
|
||
|
||
def analyze_targets(targets_data: dict) -> dict:
    """Group active scrape targets by job name and health state.

    Args:
        targets_data: The ``data`` object of a Prometheus targets response.
            Only its ``activeTargets`` list is read; a missing or null
            list is treated as empty.

    Returns:
        A mapping ``job -> {"up": [...], "down": [...], "unknown": [...]}``
        where each list holds the ``instance`` label of the targets in that
        state. Targets without a ``job`` label are grouped under
        ``"unknown"``; targets without an ``instance`` label are recorded
        as ``"?"``.
    """
    active = targets_data.get("activeTargets") or []

    jobs: dict[str, dict] = {}
    for target in active:
        labels = target.get("labels") or {}
        job = labels.get("job", "unknown")
        instance = labels.get("instance", "?")
        health = target.get("health", "unknown")

        buckets = jobs.setdefault(job, {"up": [], "down": [], "unknown": []})
        # A missing or unrecognised health value is filed under "unknown"
        # rather than aborting the whole report with a KeyError.
        if health not in buckets:
            health = "unknown"
        buckets[health].append(instance)

    return jobs
|
||
|
||
|
||
def build_report(jobs: dict) -> dict:
    """Assemble the coverage report from per-job health buckets.

    Args:
        jobs: Mapping ``job -> {"up": [...], "down": [...], "unknown": [...]}``
            as produced by ``analyze_targets``.

    Returns:
        A dict with a timestamp, the Prometheus URL, a ``summary`` of job
        counts and the expected-coverage percentage, the raw ``jobs``
        mapping, the configured expected/known-down tables, the names of
        jobs with no UP target that are not exempted, and the expected jobs
        that Prometheus does not report at all.
    """
    fully_up = 0
    partially_up = 0
    fully_down = 0
    unexpected_down: list[str] = []

    # Single pass: classify every job and collect the non-exempt ones that
    # have no UP target.
    for name, buckets in jobs.items():
        has_up = bool(buckets["up"])
        has_down = bool(buckets["down"])
        if has_up and has_down:
            partially_up += 1
        elif has_up:
            fully_up += 1
        elif has_down:
            fully_down += 1
        if not has_up and name not in KNOWN_DOWN_TARGETS:
            unexpected_down.append(name)

    # An expected job counts as covered when it has at least one UP target.
    covered = [name for name in EXPECTED_JOBS if name in jobs and jobs[name]["up"]]
    coverage_pct = round(len(covered) / len(EXPECTED_JOBS) * 100, 1)

    summary = {
        "total_jobs": len(jobs),
        "up_jobs": fully_up,
        "partial_jobs": partially_up,
        "down_jobs": fully_down,
        "real_down_jobs": len(unexpected_down),
        "expected_coverage_pct": coverage_pct,
    }
    absent = [name for name in EXPECTED_JOBS if name not in jobs]

    return {
        "generated_at": f"{datetime.now():%Y-%m-%d %H:%M:%S}",
        "prometheus_url": PROMETHEUS_URL,
        "summary": summary,
        "jobs": jobs,
        "expected_jobs": EXPECTED_JOBS,
        "known_down": KNOWN_DOWN_TARGETS,
        "real_down_jobs": unexpected_down,
        "missing_expected": absent,
    }
|
||
|
||
|
||
def print_human_report(report: dict) -> None:
    """Print the coverage report to stdout in human-readable form.

    Sections, in order: header, summary counts, status of every expected
    job, known-down jobs currently reported by Prometheus, non-exempt jobs
    with no UP target, expected jobs missing from Prometheus, and a final
    verdict line comparing coverage against ``COVERAGE_THRESHOLD``.

    Args:
        report: The dict produced by ``build_report``.
    """
    summary = report["summary"]
    job_states = report["jobs"]
    rule = "=" * 60

    # Header
    print(f"\n{rule}")
    print(" AWOOOI 監控覆蓋率報告")
    print(f" 生成時間: {report['generated_at']}")
    print(rule)

    # Summary counts
    print("\n📊 總覽")
    print(f" Jobs 總數: {summary['total_jobs']}")
    print(f" 全部 UP: {summary['up_jobs']}")
    print(f" 部分 UP: {summary['partial_jobs']}")
    print(f" 全部 DOWN: {summary['down_jobs']}")
    print(f" 真實問題 (非已知): {summary['real_down_jobs']}")
    print(
        f" 預期覆蓋率: {summary['expected_coverage_pct']}% ({COVERAGE_THRESHOLD}% 門檻)"
    )

    # One status line per expected job
    print("\n✅ 預期服務狀態")
    for name, description in report["expected_jobs"].items():
        if name not in job_states:
            status = "❌ 缺失"
        else:
            buckets = job_states[name]
            if not buckets["up"]:
                status = "❌ DOWN"
            elif buckets["down"]:
                status = (
                    f"⚠️ 部分 UP ({len(buckets['up'])} up, {len(buckets['down'])} down)"
                )
            else:
                status = "✅ UP"
        print(f" {status:<30} {name:<25} {description}")

    # Exempted jobs, listed only when Prometheus actually reports them
    exemptions = report["known_down"]
    if exemptions:
        print("\n⚠️ 已知 DOWN (不影響覆蓋率)")
        for name, reason in exemptions.items():
            if name in job_states:
                print(f" {name:<30} {reason}")

    # Jobs without any UP target that are not exempted
    needs_attention = report["real_down_jobs"]
    if needs_attention:
        print("\n🔴 需處理的 DOWN targets")
        for name in needs_attention:
            down_instances = job_states[name].get("down", [])
            print(f" {name}: {', '.join(down_instances)}")

    # Expected jobs that Prometheus does not know about
    absent = report["missing_expected"]
    if absent:
        print("\n🔴 缺少預期服務監控")
        for name in absent:
            print(f" {name}: {report['expected_jobs'][name]}")

    # Final verdict
    pct = summary["expected_coverage_pct"]
    threshold = COVERAGE_THRESHOLD
    if pct < threshold:
        verdict = f"\n❌ 覆蓋率不足: {pct}% < {threshold}%\n"
    elif needs_attention:
        verdict = (
            f"\n⚠️ 覆蓋率達標 ({pct}%),但有 {summary['real_down_jobs']} 個真實 DOWN 需處理\n"
        )
    else:
        verdict = f"\n✅ 監控健康: 覆蓋率 {pct}% >= {threshold}%,無真實問題\n"
    print(verdict)
|
||
|
||
|
||
def main() -> None:
    """CLI entry point: fetch targets, build the report, print it.

    Flags:
        --json   Print the report as JSON instead of the human-readable form.
        --check  CI mode. After printing, exit with status 1 when expected
                 coverage is below ``COVERAGE_THRESHOLD`` or when at least
                 one non-exempt job has no UP target.
    """
    parser = argparse.ArgumentParser(description="AWOOOI 監控覆蓋率自動發現")
    parser.add_argument("--json", action="store_true", help="輸出 JSON 格式")
    parser.add_argument(
        "--check",
        action="store_true",
        # argparse applies %-formatting to help strings, so a literal percent
        # sign must be written as %%; left unescaped, rendering the help
        # text raises ValueError (unsupported format character).
        help=f"CI 模式: 覆蓋率 < {COVERAGE_THRESHOLD}%% 則 exit 1",
    )
    args = parser.parse_args()

    targets_data = get_prometheus_targets()
    jobs = analyze_targets(targets_data)
    report = build_report(jobs)

    if args.json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
    else:
        print_human_report(report)

    if args.check:
        pct = report["summary"]["expected_coverage_pct"]
        real_down = report["summary"]["real_down_jobs"]
        # Fail the CI run on low coverage or on any non-exempt fully-down job.
        if pct < COVERAGE_THRESHOLD or real_down > 0:
            sys.exit(1)
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|