Files
awoooi/scripts/generate_monitoring.py
OG T 827923b9b9
Some checks failed
E2E Health Check / e2e-health (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
feat(monitoring): Phase O-5 Wave C.1 generate_monitoring.py 自動發現
- 查詢 Prometheus targets API 取得全量 scrape 狀態
- 10 個預期服務覆蓋率計算 (門檻 70%)
- 已知 DOWN targets 豁免清單 (不影響健康判斷)
- --json 機器可讀輸出 / --check CI 模式 (exit 1 if coverage < threshold)
- 首次執行: 100% 覆蓋率,無真實問題

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-02 21:33:28 +08:00

219 lines
7.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
generate_monitoring.py — monitoring coverage auto-discovery
Phase O-5 Wave C.1 (2026-04-02 ogt)

Features:
1. Query the Prometheus targets API for the scrape status of every target
2. Scan K8s Services to find services that are not monitored
   NOTE(review): get_k8s_services() is defined, but main() does not call it;
   confirm whether this step is still planned.
3. Emit a coverage report (JSON or human-readable)

Usage:
python3 scripts/generate_monitoring.py
python3 scripts/generate_monitoring.py --json
python3 scripts/generate_monitoring.py --check # CI mode: exit 1 if coverage < threshold or an unexpected job is down
"""
import argparse
import json
import subprocess
import sys
from datetime import datetime
import requests
# ============================================================
# Configuration
# ============================================================
PROMETHEUS_URL = "http://192.168.0.110:9090"
COVERAGE_THRESHOLD = 70 # CI mode: exit 1 when coverage (percent) is below this value
# Expected jobs (Prometheus job name -> human-readable description).
# Coverage is the share of these jobs that have at least one UP target.
EXPECTED_JOBS = {
"awoooi-api": "AWOOOI API (K8s)",
"clawbot": "OpenClaw 188:8088",
"node-exporter-110": "Node Exporter 110",
"node-exporter-112": "Node Exporter 112 (Kali)",
"node-exporter-188": "Node Exporter 188",
"cadvisor-110": "cAdvisor 110",
"prometheus": "Prometheus self-scrape",
"blackbox-http": "Blackbox HTTP probe",
"blackbox-tcp": "Blackbox TCP probe",
"github-actions": "GitHub Actions exporter",
}
# Jobs that are allowed to be down (known issues, job name -> reason).
# build_report() leaves these out of its "real down" list.
KNOWN_DOWN_TARGETS = {
"federation-k8s": "K8s federation — SigNoz 內部 Prometheus非外部暴露",
"kube-state-metrics": "kube-state-metrics NodePort 30180 — 僅 OTEL Collector 內部存取",
"node-exporter-120": "node-exporter 120 — K8s master 節點防火牆規則",
"node-exporter-121": "node-exporter 121 — K8s worker 節點防火牆規則",
}
def get_prometheus_targets() -> dict:
    """Fetch the current scrape targets from the Prometheus HTTP API.

    Returns:
        The ``data`` object of the ``/api/v1/targets`` response.

    Exits:
        Terminates the process with status 1, after printing a message to
        stderr, when Prometheus cannot be reached, answers with an HTTP
        error status, or returns a body that is not a JSON object with a
        ``data`` object inside.
    """
    url = f"{PROMETHEUS_URL}/api/v1/targets"
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ 無法連接 Prometheus ({PROMETHEUS_URL}): {e}", file=sys.stderr)
        sys.exit(1)

    # A reachable endpoint can still answer with something that is not the
    # Prometheus API (proxy error page, truncated body, ...). Report that the
    # same way as a connection failure instead of leaking a traceback.
    try:
        data = resp.json()["data"]
        if not isinstance(data, dict):
            raise TypeError(f"'data' is {type(data).__name__}, expected object")
    except (ValueError, KeyError, TypeError) as e:
        print(f"❌ Prometheus 回應格式異常 ({url}): {e!r}", file=sys.stderr)
        sys.exit(1)
    return data
def get_k8s_services() -> list[dict]:
    """List Kubernetes services across all namespaces via ``kubectl``.

    Returns:
        The ``items`` array of ``kubectl get services -o json``. An empty
        list is returned when kubectl is not installed, times out (15 s),
        exits non-zero, or prints output that is not valid JSON.
    """
    command = ["kubectl", "get", "services", "--all-namespaces", "-o", "json"]
    try:
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=15,
        )
        if proc.returncode == 0:
            return json.loads(proc.stdout).get("items", [])
    except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError):
        # Best effort: every failure mode falls through to the empty result.
        pass
    return []
def analyze_targets(targets_data: dict) -> dict:
    """Group active Prometheus targets by job and health state.

    Args:
        targets_data: The ``data`` object of the ``/api/v1/targets``
            response. Only its ``activeTargets`` list is read.

    Returns:
        A mapping of job name to ``{"up": [...], "down": [...],
        "unknown": [...]}``, where each list holds the ``instance`` labels
        of the targets in that state. Targets without a ``job`` label are
        grouped under ``"unknown"``; a missing ``instance`` label is
        recorded as ``"?"``.
    """
    jobs: dict[str, dict] = {}
    # ``or []`` also covers an explicit ``"activeTargets": null``.
    for target in targets_data.get("activeTargets") or []:
        labels = target.get("labels") or {}
        job = labels.get("job", "unknown")
        instance = labels.get("instance", "?")
        buckets = jobs.setdefault(job, {"up": [], "down": [], "unknown": []})
        # A missing or unrecognised health value is filed under "unknown"
        # rather than aborting the whole report with a KeyError.
        health = target.get("health", "unknown")
        buckets.get(health, buckets["unknown"]).append(instance)
    return jobs
def build_report(jobs: dict) -> dict:
    """Build the monitoring coverage report from per-job target states.

    Args:
        jobs: Mapping of job name to a dict with ``"up"`` and ``"down"``
            instance lists, as produced by ``analyze_targets``.

    Returns:
        A dict with the generation timestamp, the Prometheus URL, a
        ``summary`` of job counts and the expected-job coverage percentage,
        the ``jobs`` mapping itself, the expected and known-down tables,
        the names of down jobs that are not in the known-down table, and
        the expected jobs that are absent from ``jobs``.
    """
    fully_up = partially_up = fully_down = 0
    for state in jobs.values():
        if state["up"]:
            if state["down"]:
                partially_up += 1
            else:
                fully_up += 1
        elif state["down"]:
            fully_down += 1

    # Jobs with no UP target count as real problems unless explicitly exempted.
    unexpected_down = [
        name
        for name, state in jobs.items()
        if not state["up"] and name not in KNOWN_DOWN_TARGETS
    ]

    covered = [
        name for name in EXPECTED_JOBS if name in jobs and jobs[name]["up"]
    ]
    coverage_pct = round(len(covered) / len(EXPECTED_JOBS) * 100, 1)

    summary = {
        "total_jobs": len(jobs),
        "up_jobs": fully_up,
        "partial_jobs": partially_up,
        "down_jobs": fully_down,
        "real_down_jobs": len(unexpected_down),
        "expected_coverage_pct": coverage_pct,
    }
    absent = [name for name in EXPECTED_JOBS if name not in jobs]

    return {
        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "prometheus_url": PROMETHEUS_URL,
        "summary": summary,
        "jobs": jobs,
        "expected_jobs": EXPECTED_JOBS,
        "known_down": KNOWN_DOWN_TARGETS,
        "real_down_jobs": unexpected_down,
        "missing_expected": absent,
    }
def print_human_report(report: dict) -> None:
    """Print the coverage report to stdout in human-readable form.

    Args:
        report: The dict returned by ``build_report``.
    """

    def describe(job: str) -> str:
        # Status label for one expected job: missing, DOWN, partially UP or UP.
        states = report["jobs"]
        if job not in states:
            return "❌ 缺失"
        state = states[job]
        if not state["up"]:
            return "❌ DOWN"
        if state["down"]:
            return f"⚠️ 部分 UP ({len(state['up'])} up, {len(state['down'])} down)"
        return "✅ UP"

    summary = report["summary"]
    divider = "=" * 60

    print(f"\n{divider}")
    print(" AWOOOI 監控覆蓋率報告")
    print(f" 生成時間: {report['generated_at']}")
    print(divider)

    print("\n📊 總覽")
    print(f" Jobs 總數: {summary['total_jobs']}")
    print(f" 全部 UP: {summary['up_jobs']}")
    print(f" 部分 UP: {summary['partial_jobs']}")
    print(f" 全部 DOWN: {summary['down_jobs']}")
    print(f" 真實問題 (非已知): {summary['real_down_jobs']}")
    print(f" 預期覆蓋率: {summary['expected_coverage_pct']}% ({COVERAGE_THRESHOLD}% 門檻)")

    print("\n✅ 預期服務狀態")
    for job, desc in report["expected_jobs"].items():
        label = describe(job)
        print(f" {label:<30} {job:<25} {desc}")

    if report["known_down"]:
        print("\n⚠️ 已知 DOWN (不影響覆蓋率)")
        for job, reason in report["known_down"].items():
            # Only exempted jobs that Prometheus actually reports are listed.
            if job in report["jobs"]:
                print(f" {job:<30} {reason}")

    if report["real_down_jobs"]:
        print("\n🔴 需處理的 DOWN targets")
        for job in report["real_down_jobs"]:
            down_instances = ", ".join(report["jobs"][job].get("down", []))
            print(f" {job}: {down_instances}")

    if report["missing_expected"]:
        print("\n🔴 缺少預期服務監控")
        for job in report["missing_expected"]:
            print(f" {job}: {report['expected_jobs'][job]}")

    pct = summary["expected_coverage_pct"]
    limit = COVERAGE_THRESHOLD
    if pct >= limit:
        if report["real_down_jobs"]:
            verdict = f"\n⚠️ 覆蓋率達標 ({pct}%),但有 {summary['real_down_jobs']} 個真實 DOWN 需處理\n"
        else:
            verdict = f"\n✅ 監控健康: 覆蓋率 {pct}% >= {limit}%,無真實問題\n"
    else:
        verdict = f"\n❌ 覆蓋率不足: {pct}% < {limit}%\n"
    print(verdict)
def main() -> None:
    """Command-line entry point: fetch targets, build the report, print it.

    Flags:
        --json   Print the report as JSON instead of the human-readable form.
        --check  CI mode: exit with status 1 when expected-job coverage is
                 below COVERAGE_THRESHOLD or any job outside the known-down
                 table has no UP target.
    """
    parser = argparse.ArgumentParser(description="AWOOOI 監控覆蓋率自動發現")
    parser.add_argument("--json", action="store_true", help="輸出 JSON 格式")
    parser.add_argument(
        "--check",
        action="store_true",
        # argparse applies %-formatting to help strings, so a literal percent
        # sign must be written as "%%"; a bare "%" makes --help raise
        # ValueError (unsupported format character).
        help=f"CI 模式: 覆蓋率 < {COVERAGE_THRESHOLD}%% 則 exit 1",
    )
    args = parser.parse_args()

    targets_data = get_prometheus_targets()
    jobs = analyze_targets(targets_data)
    report = build_report(jobs)

    if args.json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
    else:
        print_human_report(report)

    if args.check:
        pct = report["summary"]["expected_coverage_pct"]
        real_down = report["summary"]["real_down_jobs"]
        if pct < COVERAGE_THRESHOLD or real_down > 0:
            sys.exit(1)


if __name__ == "__main__":
    main()