Files
awoooi/scripts/generate_monitoring.py
Your Name 8fa8d690a2
Some checks failed
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 4m7s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
fix(monitoring): stabilize post-deploy target coverage
2026-05-20 12:41:09 +08:00

344 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
generate_monitoring.py — 監控覆蓋率自動發現
Phase O-5 Wave C.1 (2026-04-02 ogt)
功能:
1. 查詢 Prometheus targets API取得全量 scrape 狀態
2. 掃描 K8s Services找出未被監控的服務
3. 輸出覆蓋率報告 (JSON + 人可讀格式)
用法:
python3 scripts/generate_monitoring.py
python3 scripts/generate_monitoring.py --json
python3 scripts/generate_monitoring.py --check # CI mode: exit 1 if coverage < threshold
"""
import argparse
import json
import os
import subprocess
import sys
import time
from datetime import datetime
from typing import Callable
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
# ============================================================
# Configuration
# ============================================================
# Base URL of the Prometheus server whose targets API is queried.
PROMETHEUS_URL = "http://192.168.0.110:9090"
COVERAGE_THRESHOLD = 70 # CI mode: exit 1 when coverage is below this percentage
# Fallback values for the --stabilization-attempts and
# --stabilization-sleep-seconds CLI options when the matching environment
# variables are unset or unparsable.
DEFAULT_STABILIZATION_ATTEMPTS = 3
DEFAULT_STABILIZATION_SLEEP_SECONDS = 10.0
# Expected services (job name → description); coverage is computed over these.
EXPECTED_JOBS = {
    "awoooi-api": "AWOOOI API (K8s)",
    "clawbot": "OpenClaw 188:8088",
    "node-exporter-110": "Node Exporter 110",
    "node-exporter-112": "Node Exporter 112 (Kali)",
    "node-exporter-188": "Node Exporter 188",
    "cadvisor-110": "cAdvisor 110",
    "prometheus": "Prometheus self-scrape",
    "blackbox-http": "Blackbox HTTP probe",
    "blackbox-tcp": "Blackbox TCP probe",
    "github-actions": "GitHub Actions exporter",
}
# Targets that are allowed to be down (known issues). build_report leaves
# these jobs out of the real-down list (job name → reason).
KNOWN_DOWN_TARGETS = {
    "federation-k8s": "K8s federation — SigNoz 內部 Prometheus非外部暴露",
    "kube-state-metrics": "kube-state-metrics NodePort 30180 — 僅 OTEL Collector 內部存取",
    "node-exporter-120": "node-exporter 120 — K8s master 節點防火牆規則",
    "node-exporter-121": "node-exporter 121 — K8s worker 節點防火牆規則",
}
def _int_env(name: str, default: int) -> int:
    """Read an integer setting from the environment, clamped to at least 1.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset; it is also returned
            as-is (without clamping) when the variable cannot be parsed.

    Returns:
        The parsed value, never lower than 1, or ``default`` on a parse error.
    """
    raw_value = os.environ.get(name, default)
    try:
        parsed = int(raw_value)
    except ValueError:
        # Unparsable text falls back to the caller-supplied default.
        return default
    return parsed if parsed > 1 else 1
def _float_env(name: str, default: float) -> float:
    """Read a float setting from the environment, clamped to at least 0.0.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset; it is also returned
            as-is (without clamping) when the variable cannot be parsed.

    Returns:
        The parsed value, never negative, or ``default`` on a parse error.
    """
    raw_value = os.environ.get(name, default)
    try:
        parsed = float(raw_value)
    except ValueError:
        # Unparsable text falls back to the caller-supplied default.
        return default
    return parsed if parsed > 0.0 else 0.0
def get_prometheus_targets() -> dict:
    """Fetch the ``data`` section of the Prometheus targets API response.

    Returns:
        The value stored under ``"data"`` in the JSON payload returned by
        ``{PROMETHEUS_URL}/api/v1/targets``.

    On an HTTP/URL error, timeout, invalid JSON, or a missing ``"data"`` key,
    an error message is written to stderr and the process exits with status 1.
    """
    endpoint = f"{PROMETHEUS_URL}/api/v1/targets"
    try:
        with urlopen(endpoint, timeout=10) as response:
            body = response.read().decode("utf-8")
        return json.loads(body)["data"]
    except (HTTPError, URLError, TimeoutError, json.JSONDecodeError, KeyError) as exc:
        print(f"❌ 無法連接 Prometheus ({PROMETHEUS_URL}): {exc}", file=sys.stderr)
        sys.exit(1)
def get_k8s_services() -> list[dict]:
    """List Kubernetes services across all namespaces via ``kubectl``.

    Returns:
        The ``items`` list from ``kubectl get services --all-namespaces -o json``,
        or an empty list when kubectl is not installed, times out after 15
        seconds, exits non-zero, or prints output that is not valid JSON.
    """
    command = ["kubectl", "get", "services", "--all-namespaces", "-o", "json"]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=15,
        )
        if completed.returncode == 0:
            return json.loads(completed.stdout).get("items", [])
    except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError):
        # Best-effort lookup: any of these failures means "no services known".
        pass
    return []
def analyze_targets(targets_data: dict) -> dict:
    """Group active Prometheus targets by job and health state.

    Args:
        targets_data: The ``data`` object of a Prometheus targets API response;
            only its ``activeTargets`` list is read.

    Returns:
        A mapping ``job -> {"up": [...], "down": [...], "unknown": [...]}``
        where each list holds the ``instance`` labels of the targets in that
        state. Targets without a ``job`` label are grouped under ``"unknown"``
        and targets without an ``instance`` label are recorded as ``"?"``.
    """
    jobs: dict[str, dict] = {}
    # A null or absent "activeTargets" is treated the same as an empty list.
    for target in targets_data.get("activeTargets") or []:
        labels = target.get("labels") or {}
        job = labels.get("job", "unknown")
        instance = labels.get("instance", "?")
        bucket = jobs.setdefault(job, {"up": [], "down": [], "unknown": []})
        health = target.get("health")
        # A missing or unrecognized health value must not crash the report
        # with a KeyError; such a target is counted as "unknown" instead.
        if health not in ("up", "down", "unknown"):
            health = "unknown"
        bucket[health].append(instance)
    return jobs
def build_report(jobs: dict) -> dict:
    """Build the coverage report from per-job target health buckets.

    Args:
        jobs: Mapping ``job -> {"up": [...], "down": [...], ...}`` as produced
            by ``analyze_targets``.

    Returns:
        A dict with the generation timestamp, the Prometheus URL, a ``summary``
        of counters, the raw ``jobs`` mapping, the expected and known-down job
        tables, the names of real-down jobs (no target up and not listed in
        KNOWN_DOWN_TARGETS), and the expected jobs absent from ``jobs``.
    """
    fully_up = 0
    partially_up = 0
    fully_down = 0
    unexpected_outages: list[str] = []

    for name, buckets in jobs.items():
        has_up = bool(buckets["up"])
        has_down = bool(buckets["down"])
        if has_up and has_down:
            partially_up += 1
        elif has_up:
            fully_up += 1
        elif has_down:
            fully_down += 1
        # Only outages that are not on the known-down list count as real problems.
        if not has_up and name not in KNOWN_DOWN_TARGETS:
            unexpected_outages.append(name)

    covered = [name for name in EXPECTED_JOBS if name in jobs and jobs[name]["up"]]
    absent = [name for name in EXPECTED_JOBS if name not in jobs]
    coverage_pct = round(len(covered) / len(EXPECTED_JOBS) * 100, 1)

    summary = {
        "total_jobs": len(jobs),
        "up_jobs": fully_up,
        "partial_jobs": partially_up,
        "down_jobs": fully_down,
        "real_down_jobs": len(unexpected_outages),
        "expected_coverage_pct": coverage_pct,
    }
    return {
        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "prometheus_url": PROMETHEUS_URL,
        "summary": summary,
        "jobs": jobs,
        "expected_jobs": EXPECTED_JOBS,
        "known_down": KNOWN_DOWN_TARGETS,
        "real_down_jobs": unexpected_outages,
        "missing_expected": absent,
    }
def build_report_from_targets(targets_data: dict) -> dict:
    """Build the coverage report from a Prometheus targets API payload.

    Args:
        targets_data: The ``data`` object of the targets API response.

    Returns:
        The report produced by ``build_report`` for the analyzed targets.
    """
    jobs_by_name = analyze_targets(targets_data)
    return build_report(jobs_by_name)
def report_needs_stabilization(report: dict) -> bool:
    """Tell whether the report should be re-queried before it is trusted.

    A report needs another look when it lists any real-down job or any missing
    expected job, so that a momentary post-deploy scrape state does not turn
    the gate red.
    """
    if report["real_down_jobs"]:
        return True
    return bool(report["missing_expected"])
def stabilization_reason(report: dict) -> str:
    """Summarize why a report is unstable as a short ``key=a,b; key=c`` string.

    Returns:
        ``"stable"`` when the report has neither real-down jobs nor missing
        expected jobs; otherwise the non-empty lists joined with ``"; "``.
    """
    sections = (
        ("real_down_jobs", "real_down"),
        ("missing_expected", "missing_expected"),
    )
    fragments: list[str] = []
    for key, label in sections:
        names = report[key]
        if names:
            fragments.append(f"{label}={','.join(names)}")
    return "; ".join(fragments) or "stable"
def build_stabilized_report(
    fetch_targets: Callable[[], dict],
    attempts: int,
    sleep_seconds: float,
    emit_status: bool = True,
) -> dict:
    """Re-query Prometheus targets until the report is stable or attempts run out.

    This keeps the CI gate from failing on a momentary rollout or
    scrape-freshness reading.

    Args:
        fetch_targets: Callable returning a fresh targets API payload.
        attempts: Maximum number of queries; values below 1 are treated as 1.
        sleep_seconds: Pause between queries; negative values are treated as 0.
        emit_status: When true, progress messages are written to stderr.

    Returns:
        The last report built, with a ``"stabilization"`` entry recording the
        attempt number, the attempt limit, the pause, a status (``"stable"``,
        ``"retrying"``, ``"failed"`` or ``"cleared"``) and the reason string.
    """
    max_attempts = max(1, attempts)
    pause = max(0.0, sleep_seconds)
    latest: dict | None = None

    for round_no in range(1, max_attempts + 1):
        latest = build_report_from_targets(fetch_targets())
        unstable = report_needs_stabilization(latest)
        is_last = round_no == max_attempts

        if unstable:
            outcome = "failed" if is_last else "retrying"
        else:
            outcome = "cleared" if round_no > 1 else "stable"

        why = stabilization_reason(latest)
        latest["stabilization"] = {
            "attempt": round_no,
            "attempts": max_attempts,
            "sleep_seconds": pause,
            "status": outcome,
            "reason": why,
        }

        if unstable and not is_last:
            # Still unstable with attempts left: report progress, wait, retry.
            if emit_status:
                print(
                    f"⏳ Prometheus target stabilization {round_no}/{max_attempts}: {why}",
                    file=sys.stderr,
                )
            time.sleep(pause)
            continue

        if emit_status and outcome == "cleared":
            print(
                "✅ Prometheus target stabilization cleared transient coverage drift",
                file=sys.stderr,
            )
        return latest

    # Not reached in practice because max_attempts >= 1 and the final round
    # always returns; kept so the function never falls through without a dict.
    if latest is None:
        raise RuntimeError("monitoring report stabilization did not run")
    return latest
def print_human_report(report: dict) -> None:
    """Print the coverage report to stdout in a human-readable layout.

    Sections, in order: header, summary counters, status of every expected
    job, known-down jobs that currently have down targets, real-down jobs,
    missing expected jobs, the stabilization outcome (only when more than one
    attempt ran), and a final verdict comparing coverage with
    COVERAGE_THRESHOLD.

    Args:
        report: A report dict as produced by ``build_report``.
    """
    summary = report["summary"]
    rule = "=" * 60

    # Header
    print(f"\n{rule}")
    print(" AWOOOI 監控覆蓋率報告")
    print(f" 生成時間: {report['generated_at']}")
    print(rule)

    # Summary counters
    print("\n📊 總覽")
    print(f" Jobs 總數: {summary['total_jobs']}")
    print(f" 全部 UP: {summary['up_jobs']}")
    print(f" 部分 UP: {summary['partial_jobs']}")
    print(f" 全部 DOWN: {summary['down_jobs']}")
    print(f" 真實問題 (非已知): {summary['real_down_jobs']}")
    print(f" 預期覆蓋率: {summary['expected_coverage_pct']}% ({COVERAGE_THRESHOLD}% 門檻)")

    # Status of every expected job
    print("\n✅ 預期服務狀態")
    jobs = report["jobs"]
    for job, desc in report["expected_jobs"].items():
        if job not in jobs:
            status = "❌ 缺失"
        else:
            buckets = jobs[job]
            if not buckets["up"]:
                status = "❌ DOWN"
            elif buckets["down"]:
                status = f"⚠️ 部分 UP ({len(buckets['up'])} up, {len(buckets['down'])} down)"
            else:
                status = "✅ UP"
        print(f" {status:<30} {job:<25} {desc}")

    # Known-down jobs that are present and actually have down targets
    known_down_present = []
    for job, reason in report["known_down"].items():
        if job in jobs and jobs[job]["down"]:
            known_down_present.append((job, reason))
    if known_down_present:
        print("\n⚠️ 已知 DOWN (不影響覆蓋率)")
        for job, reason in known_down_present:
            print(f" {job:<30} {reason}")

    # Real-down jobs with their down instances
    real_down = report["real_down_jobs"]
    if real_down:
        print("\n🔴 需處理的 DOWN targets")
        for job in real_down:
            instances = report["jobs"][job].get("down", [])
            print(f" {job}: {', '.join(instances)}")

    # Expected jobs that Prometheus does not know about
    missing = report["missing_expected"]
    if missing:
        print("\n🔴 缺少預期服務監控")
        for job in missing:
            print(f" {job}: {report['expected_jobs'][job]}")

    # Stabilization outcome is only shown when a retry actually happened
    stabilization = report.get("stabilization")
    if stabilization and stabilization["attempt"] > 1:
        print("\n⏱️ Prometheus target 穩定化")
        print(
            f" {stabilization['status']} after "
            f"{stabilization['attempt']}/{stabilization['attempts']} attempts"
        )

    # Final verdict
    pct = summary["expected_coverage_pct"]
    threshold = COVERAGE_THRESHOLD
    meets_threshold = pct >= threshold
    if not meets_threshold:
        print(f"\n❌ 覆蓋率不足: {pct}% < {threshold}%\n")
    elif report["real_down_jobs"]:
        print(f"\n⚠️ 覆蓋率達標 ({pct}%),但有 {summary['real_down_jobs']} 個真實 DOWN 需處理\n")
    else:
        print(f"\n✅ 監控健康: 覆蓋率 {pct}% >= {threshold}%,無真實問題\n")
def main() -> None:
    """CLI entry point: build the coverage report, print it, and gate CI.

    Flags:
        --json: print the report as JSON instead of the human-readable layout.
        --check: CI mode. The report is built through
            ``build_stabilized_report`` and the process exits with status 1
            when expected coverage is below COVERAGE_THRESHOLD or any
            real-down job remains.
        --stabilization-attempts / --stabilization-sleep-seconds: retry count
            and pause used by --check. Their defaults come from the
            AWOOOI_MONITORING_TARGET_STABILIZATION_ATTEMPTS and
            AWOOOI_MONITORING_TARGET_STABILIZATION_SLEEP_SECONDS environment
            variables, falling back to the module-level defaults.
    """
    parser = argparse.ArgumentParser(description="AWOOOI 監控覆蓋率自動發現")
    parser.add_argument("--json", action="store_true", help="輸出 JSON 格式")
    parser.add_argument(
        "--check",
        action="store_true",
        # argparse expands help text with %-formatting, so a literal percent
        # sign must be written as "%%"; a bare "%" makes --help raise ValueError.
        help=f"CI 模式: 覆蓋率 < {COVERAGE_THRESHOLD}%% 則 exit 1",
    )
    parser.add_argument(
        "--stabilization-attempts",
        type=int,
        default=_int_env(
            "AWOOOI_MONITORING_TARGET_STABILIZATION_ATTEMPTS",
            DEFAULT_STABILIZATION_ATTEMPTS,
        ),
        help="CI 模式: Prometheus target 狀態重查次數",
    )
    parser.add_argument(
        "--stabilization-sleep-seconds",
        type=float,
        default=_float_env(
            "AWOOOI_MONITORING_TARGET_STABILIZATION_SLEEP_SECONDS",
            DEFAULT_STABILIZATION_SLEEP_SECONDS,
        ),
        help="CI 模式: Prometheus target 重查間隔秒數",
    )
    args = parser.parse_args()

    if args.check:
        # CI mode re-queries Prometheus so a transient post-deploy state does
        # not fail the gate.
        report = build_stabilized_report(
            get_prometheus_targets,
            attempts=args.stabilization_attempts,
            sleep_seconds=args.stabilization_sleep_seconds,
        )
    else:
        report = build_report_from_targets(get_prometheus_targets())

    if args.json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
    else:
        print_human_report(report)

    if args.check:
        pct = report["summary"]["expected_coverage_pct"]
        real_down = report["summary"]["real_down_jobs"]
        if pct < COVERAGE_THRESHOLD or real_down > 0:
            sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()