# awoooi/ops/monitoring/coverage_report.py

#!/usr/bin/env python3
"""
AWOOOI 監控覆蓋率報告生成器
============================

ADR-037 Wave D.2: 生成完整的監控覆蓋率報告

功能:
1. 分析 service-registry.yaml 覆蓋率
2. 檢查告警規則完整性
3. 生成 HTML/JSON 報告
4. 支援 CI/CD 整合

用法:
    python ops/monitoring/coverage_report.py
    python ops/monitoring/coverage_report.py --format html --output /tmp/report.html
    python ops/monitoring/coverage_report.py --format json
    python ops/monitoring/coverage_report.py --ci  # CI 模式

版本: v1.0
建立: 2026-03-29 (台北時區)
建立者: Claude Code (Phase 21 ADR-037)
"""

import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo

import yaml

# Configuration: every location is derived from this script's own directory.
SCRIPT_DIR = Path(__file__).parent
REGISTRY_FILE = SCRIPT_DIR.joinpath("service-registry.yaml")
# Alert manifests live in k8s/monitoring, two levels above the script directory.
ALERT_RULES_DIR = SCRIPT_DIR.parent.parent.joinpath("k8s", "monitoring")

# Time zone used for the "generated at" timestamps in every report format.
TZ_TAIPEI = ZoneInfo("Asia/Taipei")


def load_registry() -> dict:
    """Load the service registry from ``REGISTRY_FILE``.

    Returns:
        The parsed YAML document. When the registry file does not exist, a
        skeleton with empty ``services``, ``nodes``, ``api_endpoints`` and
        ``pages`` lists is returned; when the file parses to nothing (for
        example an empty file), an empty dict is returned.
    """
    if not REGISTRY_FILE.exists():
        return {"services": [], "nodes": [], "api_endpoints": [], "pages": []}
    # Decode as UTF-8 explicitly: the default text encoding depends on the
    # platform locale and may mis-decode non-ASCII registry content.
    with open(REGISTRY_FILE, encoding="utf-8") as f:
        return yaml.safe_load(f) or {}


def load_alert_rules() -> list[dict]:
    """Collect the alerting rules declared in the manifests of ``ALERT_RULES_DIR``.

    Every ``*.yaml`` file directly inside the directory is parsed. Each YAML
    document that is a mapping with a ``spec.groups`` entry is scanned, and
    every rule that carries an ``alert`` key is recorded.

    Returns:
        A list of dicts with the keys ``name`` (the rule's ``alert`` value),
        ``severity`` and ``service`` (read from the rule's ``labels``,
        ``"unknown"`` when absent) and ``file`` (the manifest's file name).
        The list is empty when the directory does not exist. Files that
        cannot be read or parsed are skipped with a warning on stderr.
    """
    rules: list[dict] = []
    if not ALERT_RULES_DIR.exists():
        return rules

    # Sorted so the resulting rule order does not depend on directory order.
    for path in sorted(ALERT_RULES_DIR.glob("*.yaml")):
        try:
            with open(path, encoding="utf-8") as handle:
                # A manifest may contain several "---"-separated documents;
                # safe_load() raises on those, so load every document instead.
                documents = list(yaml.safe_load_all(handle))
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            # Best effort: one bad manifest must not abort the report,
            # but the skip should be visible to whoever runs it.
            print(f"Warning: skipping {path.name}: {exc}", file=sys.stderr)
            continue

        for document in documents:
            if not isinstance(document, dict):
                continue
            spec = document.get("spec")
            groups = spec.get("groups") if isinstance(spec, dict) else None
            # "groups" / "rules" / "labels" may be present but null in YAML.
            for group in groups or []:
                if not isinstance(group, dict):
                    continue
                for rule in group.get("rules") or []:
                    if not isinstance(rule, dict) or "alert" not in rule:
                        continue
                    labels = rule.get("labels")
                    if not isinstance(labels, dict):
                        labels = {}
                    rules.append({
                        "name": rule["alert"],
                        "severity": labels.get("severity", "unknown"),
                        "service": labels.get("service", "unknown"),
                        "file": path.name,
                    })

    return rules


def analyze_services(registry: dict) -> dict:
    """Summarise monitoring coverage for the registry's ``services`` entries.

    Args:
        registry: Parsed service registry. Only its ``services`` list is
            read; a missing or null list is treated as empty.

    Returns:
        A dict with:
        ``total`` - number of services;
        ``by_type`` / ``by_criticality`` - counts keyed by the service's
        ``type`` (default ``"unknown"``) and ``criticality`` (default ``"P2"``);
        ``monitoring`` - per backend (prometheus, sentry, otel, langfuse) a
        ``covered`` count and a ``missing`` list of service names;
        ``health_endpoint`` and ``alerts`` - the same covered/missing pair;
        ``auto_repair`` - ``enabled`` count plus ``p0_p1_without``, the names
        of P0/P1 services whose auto-repair is not enabled.
    """
    # "or" rather than a .get() default: a key that is present but null in
    # the YAML comes back as None, which must also count as "nothing here".
    services = registry.get("services") or []

    stats = {
        "total": len(services),
        "by_type": {},
        "by_criticality": {},
        "monitoring": {
            "prometheus": {"covered": 0, "missing": []},
            "sentry": {"covered": 0, "missing": []},
            "otel": {"covered": 0, "missing": []},
            "langfuse": {"covered": 0, "missing": []},
        },
        "health_endpoint": {"covered": 0, "missing": []},
        "alerts": {"covered": 0, "missing": []},
        "auto_repair": {
            "enabled": 0,
            "p0_p1_without": [],
        },
    }

    for svc in services:
        name = svc.get("name", "unknown")
        svc_type = svc.get("type", "unknown")
        criticality = svc.get("criticality", "P2")

        # Tally by service type.
        stats["by_type"][svc_type] = stats["by_type"].get(svc_type, 0) + 1

        # Tally by criticality.
        stats["by_criticality"][criticality] = stats["by_criticality"].get(criticality, 0) + 1

        # Monitoring backends: any truthy value counts as covered.
        monitoring = svc.get("monitoring") or {}
        for key in ("prometheus", "sentry", "otel", "langfuse"):
            if monitoring.get(key):
                stats["monitoring"][key]["covered"] += 1
            else:
                stats["monitoring"][key]["missing"].append(name)

        # Health endpoint.
        if svc.get("health_endpoint"):
            stats["health_endpoint"]["covered"] += 1
        else:
            stats["health_endpoint"]["missing"].append(name)

        # Alert rules declared on the service itself.
        if svc.get("alerts"):
            stats["alerts"]["covered"] += 1
        else:
            stats["alerts"]["missing"].append(name)

        # Auto-repair: only P0/P1 services without it are reported as gaps.
        auto_repair = svc.get("auto_repair") or {}
        if auto_repair.get("enabled"):
            stats["auto_repair"]["enabled"] += 1
        elif criticality in ("P0", "P1"):
            stats["auto_repair"]["p0_p1_without"].append(name)

    return stats


def analyze_nodes(registry: dict) -> dict:
    """Summarise alert coverage for the registry's ``nodes`` entries.

    Args:
        registry: Parsed service registry. Only its ``nodes`` list is read;
            a missing or null list is treated as empty.

    Returns:
        A dict with ``total`` (node count), ``by_role`` (counts keyed by the
        node's ``role``, default ``"unknown"``), ``with_alerts`` (nodes with a
        truthy ``alerts`` value) and ``missing_alerts`` (names of the others).
    """
    # A "nodes:" key with no value parses to None, so fall back with "or".
    nodes = registry.get("nodes") or []

    stats = {
        "total": len(nodes),
        "by_role": {},
        "with_alerts": 0,
        "missing_alerts": [],
    }

    for node in nodes:
        name = node.get("name", "unknown")
        role = node.get("role", "unknown")

        stats["by_role"][role] = stats["by_role"].get(role, 0) + 1

        if node.get("alerts"):
            stats["with_alerts"] += 1
        else:
            stats["missing_alerts"].append(name)

    return stats


def analyze_api_endpoints(registry: dict) -> dict:
    """Summarise SLO coverage for the registry's ``api_endpoints`` entries.

    Args:
        registry: Parsed service registry. Only its ``api_endpoints`` list is
            read; a missing or null list is treated as empty.

    Returns:
        A dict with ``total`` (endpoint count), ``critical`` (endpoints with a
        truthy ``critical`` flag), ``with_slo`` (endpoints with a truthy
        ``slo`` value) and ``missing_slo`` (the ``path`` of every other
        endpoint, ``"unknown"`` when the path is absent).
    """
    # An "api_endpoints:" key with no value parses to None, hence the "or".
    endpoints = registry.get("api_endpoints") or []

    stats = {
        "total": len(endpoints),
        "critical": 0,
        "with_slo": 0,
        "missing_slo": [],
    }

    for ep in endpoints:
        path = ep.get("path", "unknown")

        if ep.get("critical"):
            stats["critical"] += 1

        if ep.get("slo"):
            stats["with_slo"] += 1
        else:
            stats["missing_slo"].append(path)

    return stats


def analyze_alert_rules(registry: dict, alert_rules: list[dict]) -> dict:
    """Cross-check registered services against the services named by alert rules.

    Args:
        registry: Parsed service registry. Only its ``services`` list is read;
            a missing or null list is treated as empty.
        alert_rules: Rule records with at least the keys ``service`` and
            ``severity``.

    Returns:
        A dict with ``total_rules``, ``by_severity`` (rule counts per
        severity), ``services_covered`` (registered services named by at
        least one rule) and ``services_missing`` (sorted names of registered
        services that no rule names).
    """
    # Unnamed services are keyed as "unknown" rather than raising KeyError.
    services = {svc.get("name", "unknown") for svc in registry.get("services") or []}

    # Rules whose service label is "unknown" cannot be attributed to a service.
    covered_by_rules = {rule["service"] for rule in alert_rules if rule["service"] != "unknown"}

    missing = services - covered_by_rules

    stats = {
        "total_rules": len(alert_rules),
        "by_severity": {},
        "services_covered": len(services & covered_by_rules),
        # Sorted (by string form) so the report is stable between runs.
        "services_missing": sorted(missing, key=str),
    }

    for rule in alert_rules:
        severity = rule["severity"]
        stats["by_severity"][severity] = stats["by_severity"].get(severity, 0) + 1

    return stats


def calculate_scores(service_stats: dict, node_stats: dict, api_stats: dict, alert_stats: dict) -> dict:
    """Turn the coverage statistics into percentage scores.

    Args:
        service_stats: Service statistics; ``total``, the covered counts,
            ``by_criticality`` and ``auto_repair`` are read.
        node_stats: Accepted for signature symmetry; not read here.
        api_stats: API statistics; ``total`` and ``with_slo`` are read.
        alert_stats: Accepted for signature symmetry; not read here.

    Returns:
        A dict with the scores ``prometheus``, ``health_endpoint``, ``alerts``,
        ``api_slo`` and ``auto_repair_p0p1`` (each rounded to one decimal)
        plus ``overall``, their weighted sum.
    """
    def ratio(part, whole):
        # Percentage of ``part`` in ``whole``, one decimal place.
        return round(100 * part / whole, 1)

    result = {}

    # Service-level dimensions share the same denominator; with no services
    # at all every one of them scores 0.
    if service_stats["total"] > 0:
        service_total = service_stats["total"]
        result["prometheus"] = ratio(service_stats["monitoring"]["prometheus"]["covered"], service_total)
        result["health_endpoint"] = ratio(service_stats["health_endpoint"]["covered"], service_total)
        result["alerts"] = ratio(service_stats["alerts"]["covered"], service_total)
    else:
        result.update(prometheus=0, health_endpoint=0, alerts=0)

    # API SLO coverage.
    result["api_slo"] = ratio(api_stats["with_slo"], api_stats["total"]) if api_stats["total"] > 0 else 0

    # Auto-repair coverage among P0/P1 services; vacuously 100 when there are none.
    criticality_counts = service_stats["by_criticality"]
    critical_total = criticality_counts.get("P0", 0) + criticality_counts.get("P1", 0)
    unrepaired = len(service_stats["auto_repair"]["p0_p1_without"])
    result["auto_repair_p0p1"] = ratio(critical_total - unrepaired, critical_total) if critical_total > 0 else 100

    # Overall score: weighted sum of the five dimensions.
    weights = {
        "prometheus": 0.3,
        "health_endpoint": 0.2,
        "alerts": 0.2,
        "api_slo": 0.15,
        "auto_repair_p0p1": 0.15,
    }
    result["overall"] = round(sum(result[key] * weight for key, weight in weights.items()), 1)

    return result


def generate_text_report(
    service_stats: dict,
    node_stats: dict,
    api_stats: dict,
    alert_stats: dict,
    scores: dict,
) -> str:
    """Render the coverage report as plain text.

    The report lists the scores, service breakdowns (by type and
    criticality), up to five services missing Prometheus, every P0/P1
    service without auto-repair, node and API totals, alert-rule counts per
    severity, and a final PASS / WARNING / FAIL line derived from
    ``scores["overall"]`` (thresholds 90 and 80).

    Returns:
        The report as a single newline-joined string.
    """
    timestamp = datetime.now(TZ_TAIPEI).strftime("%Y-%m-%d %H:%M:%S")
    banner = "=" * 70
    underline = "  " + "-" * 40

    def heading(title):
        # Blank line, section title, then the dashed underline.
        return ["", f"  {title}", underline]

    def breakdown(caption, counts):
        # Caption followed by one "- key: count" bullet per key, in sorted order.
        return [caption] + [f"    - {key}: {count}" for key, count in sorted(counts.items())]

    out = [
        banner,
        "  AWOOOI Monitoring Coverage Report",
        f"  Generated: {timestamp} (Taipei)",
        banner,
    ]

    # Scores
    out += heading("OVERALL SCORE")
    out += [
        f"  Overall:          {scores['overall']}%",
        f"  Prometheus:       {scores['prometheus']}%",
        f"  Health Endpoint:  {scores['health_endpoint']}%",
        f"  Alert Rules:      {scores['alerts']}%",
        f"  API SLO:          {scores['api_slo']}%",
        f"  Auto-Repair P0/1: {scores['auto_repair_p0p1']}%",
    ]

    # Services
    out += heading("SERVICES")
    out.append(f"  Total: {service_stats['total']}")
    out += breakdown("  By Type:", service_stats["by_type"])
    out += breakdown("  By Criticality:", service_stats["by_criticality"])

    # Gaps: only the first five services without Prometheus are spelled out.
    no_prometheus = service_stats["monitoring"]["prometheus"]["missing"]
    if no_prometheus:
        out.append("")
        out.append(f"  Missing Prometheus ({len(no_prometheus)}):")
        out.extend(f"    - {svc}" for svc in no_prometheus[:5])
        if len(no_prometheus) > 5:
            out.append(f"    ... and {len(no_prometheus) - 5} more")

    unrepaired = service_stats["auto_repair"]["p0_p1_without"]
    if unrepaired:
        out.append("")
        out.append(f"  P0/P1 Without Auto-Repair ({len(unrepaired)}):")
        out.extend(f"    - {svc}" for svc in unrepaired)

    # Nodes
    out += heading("NODES")
    out.append(f"  Total: {node_stats['total']}")
    out.append(f"  With Alerts: {node_stats['with_alerts']}")

    # API endpoints
    out += heading("API ENDPOINTS")
    out.append(f"  Total: {api_stats['total']}")
    out.append(f"  Critical: {api_stats['critical']}")
    out.append(f"  With SLO: {api_stats['with_slo']}")

    # Alert rules
    out += heading("ALERT RULES")
    out.append(f"  Total Rules: {alert_stats['total_rules']}")
    out += breakdown("  By Severity:", alert_stats["by_severity"])

    out += ["", banner]

    # Verdict line used by CI logs.
    overall = scores["overall"]
    if overall >= 90:
        verdict = "  RESULT: PASS (>= 90%)"
    else:
        verdict = "  RESULT: WARNING (80-89%)" if overall >= 80 else "  RESULT: FAIL (< 80%)"
    out.append(verdict)
    out.append(banner)

    return "\n".join(out)


def generate_html_report(
    service_stats: dict,
    node_stats: dict,
    api_stats: dict,
    alert_stats: dict,
    scores: dict,
) -> str:
    """Render the coverage report as a standalone HTML page.

    The page shows the score cards, a per-dimension service coverage table,
    the P0/P1 services without auto-repair (when there are any), alert-rule
    counts for the severities critical / warning / info, API endpoint
    totals, and node counts per role.

    Service names and node roles come from the registry file, so they are
    HTML-escaped before being placed in the markup.

    Returns:
        The complete HTML document as a string.
    """
    # Local import: the name ``html`` is otherwise unused in this module.
    from html import escape

    now = datetime.now(TZ_TAIPEI).strftime("%Y-%m-%d %H:%M:%S")

    def score_class(score):
        # CSS modifier for a percentage: >= 90 good, >= 80 warning, else critical.
        if score >= 90:
            return "good"
        elif score >= 80:
            return "warning"
        return "critical"

    # Fragments are collected and joined once at the end instead of
    # repeatedly extending one string.
    parts = [f"""<!DOCTYPE html>
<html lang="zh-TW">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>AWOOOI Monitoring Coverage Report</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            background: #0f0f0f;
            color: #e0e0e0;
            padding: 2rem;
        }}
        .container {{ max-width: 1200px; margin: 0 auto; }}
        h1 {{
            font-size: 2rem;
            margin-bottom: 0.5rem;
            color: #fff;
        }}
        .timestamp {{ color: #888; margin-bottom: 2rem; }}
        .score-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
            gap: 1rem;
            margin-bottom: 2rem;
        }}
        .score-card {{
            background: #1a1a1a;
            border-radius: 8px;
            padding: 1.5rem;
            text-align: center;
        }}
        .score-card.overall {{
            grid-column: span 2;
            background: linear-gradient(135deg, #1a1a1a, #252525);
        }}
        .score-value {{
            font-size: 2.5rem;
            font-weight: bold;
            margin-bottom: 0.5rem;
        }}
        .score-value.good {{ color: #4ade80; }}
        .score-value.warning {{ color: #fbbf24; }}
        .score-value.critical {{ color: #ef4444; }}
        .score-label {{ color: #888; font-size: 0.875rem; }}
        .section {{
            background: #1a1a1a;
            border-radius: 8px;
            padding: 1.5rem;
            margin-bottom: 1rem;
        }}
        .section h2 {{
            font-size: 1.25rem;
            margin-bottom: 1rem;
            color: #fff;
        }}
        table {{
            width: 100%;
            border-collapse: collapse;
        }}
        th, td {{
            padding: 0.75rem;
            text-align: left;
            border-bottom: 1px solid #333;
        }}
        th {{ color: #888; font-weight: 500; }}
        .badge {{
            display: inline-block;
            padding: 0.25rem 0.5rem;
            border-radius: 4px;
            font-size: 0.75rem;
            font-weight: 500;
        }}
        .badge-good {{ background: rgba(74, 222, 128, 0.2); color: #4ade80; }}
        .badge-warning {{ background: rgba(251, 191, 36, 0.2); color: #fbbf24; }}
        .badge-critical {{ background: rgba(239, 68, 68, 0.2); color: #ef4444; }}
        .missing-list {{
            background: #252525;
            border-radius: 4px;
            padding: 1rem;
            margin-top: 1rem;
        }}
        .missing-list h3 {{
            font-size: 0.875rem;
            color: #ef4444;
            margin-bottom: 0.5rem;
        }}
        .missing-list ul {{
            list-style: none;
            display: flex;
            flex-wrap: wrap;
            gap: 0.5rem;
        }}
        .missing-list li {{
            background: rgba(239, 68, 68, 0.1);
            padding: 0.25rem 0.5rem;
            border-radius: 4px;
            font-size: 0.875rem;
            color: #ef4444;
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>AWOOOI Monitoring Coverage Report</h1>
        <p class="timestamp">Generated: {now} (Taipei) | ADR-037</p>

        <div class="score-grid">
            <div class="score-card overall">
                <div class="score-value {score_class(scores['overall'])}">{scores['overall']}%</div>
                <div class="score-label">Overall Score</div>
            </div>
            <div class="score-card">
                <div class="score-value {score_class(scores['prometheus'])}">{scores['prometheus']}%</div>
                <div class="score-label">Prometheus</div>
            </div>
            <div class="score-card">
                <div class="score-value {score_class(scores['health_endpoint'])}">{scores['health_endpoint']}%</div>
                <div class="score-label">Health Endpoint</div>
            </div>
            <div class="score-card">
                <div class="score-value {score_class(scores['alerts'])}">{scores['alerts']}%</div>
                <div class="score-label">Alert Rules</div>
            </div>
            <div class="score-card">
                <div class="score-value {score_class(scores['api_slo'])}">{scores['api_slo']}%</div>
                <div class="score-label">API SLO</div>
            </div>
            <div class="score-card">
                <div class="score-value {score_class(scores['auto_repair_p0p1'])}">{scores['auto_repair_p0p1']}%</div>
                <div class="score-label">Auto-Repair P0/P1</div>
            </div>
        </div>

        <div class="section">
            <h2>Services ({service_stats['total']})</h2>
            <table>
                <thead>
                    <tr>
                        <th>Dimension</th>
                        <th>Covered</th>
                        <th>Missing</th>
                        <th>Coverage</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>Prometheus</td>
                        <td>{service_stats['monitoring']['prometheus']['covered']}</td>
                        <td>{len(service_stats['monitoring']['prometheus']['missing'])}</td>
                        <td><span class="badge badge-{score_class(scores['prometheus'])}">{scores['prometheus']}%</span></td>
                    </tr>
                    <tr>
                        <td>Health Endpoint</td>
                        <td>{service_stats['health_endpoint']['covered']}</td>
                        <td>{len(service_stats['health_endpoint']['missing'])}</td>
                        <td><span class="badge badge-{score_class(scores['health_endpoint'])}">{scores['health_endpoint']}%</span></td>
                    </tr>
                    <tr>
                        <td>Alert Rules</td>
                        <td>{service_stats['alerts']['covered']}</td>
                        <td>{len(service_stats['alerts']['missing'])}</td>
                        <td><span class="badge badge-{score_class(scores['alerts'])}">{scores['alerts']}%</span></td>
                    </tr>
                </tbody>
            </table>
"""]

    # P0/P1 services without auto-repair (section omitted when there are none).
    if service_stats["auto_repair"]["p0_p1_without"]:
        parts.append("""
            <div class="missing-list">
                <h3>P0/P1 Without Auto-Repair</h3>
                <ul>
""")
        for s in service_stats["auto_repair"]["p0_p1_without"]:
            # Names originate from the registry file: escape before embedding.
            parts.append(f"                    <li>{escape(str(s))}</li>\n")
        parts.append("""
                </ul>
            </div>
""")

    parts.append(f"""
        </div>

        <div class="section">
            <h2>Alert Rules ({alert_stats['total_rules']})</h2>
            <table>
                <thead>
                    <tr>
                        <th>Severity</th>
                        <th>Count</th>
                    </tr>
                </thead>
                <tbody>
""")
    # Only these three severities are tabulated; the literals need no escaping.
    for sev in ["critical", "warning", "info"]:
        count = alert_stats["by_severity"].get(sev, 0)
        badge_class = "critical" if sev == "critical" else ("warning" if sev == "warning" else "good")
        parts.append(f"""
                    <tr>
                        <td><span class="badge badge-{badge_class}">{sev}</span></td>
                        <td>{count}</td>
                    </tr>
""")

    parts.append(f"""
                </tbody>
            </table>
        </div>

        <div class="section">
            <h2>API Endpoints ({api_stats['total']})</h2>
            <table>
                <thead>
                    <tr>
                        <th>Metric</th>
                        <th>Value</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>Critical Endpoints</td>
                        <td>{api_stats['critical']}</td>
                    </tr>
                    <tr>
                        <td>With SLO Defined</td>
                        <td>{api_stats['with_slo']}</td>
                    </tr>
                </tbody>
            </table>
        </div>

        <div class="section">
            <h2>Nodes ({node_stats['total']})</h2>
            <table>
                <thead>
                    <tr>
                        <th>Role</th>
                        <th>Count</th>
                    </tr>
                </thead>
                <tbody>
""")
    for role, count in sorted(node_stats["by_role"].items()):
        # Roles also come from the registry file: escape before embedding.
        parts.append(f"""
                    <tr>
                        <td>{escape(str(role))}</td>
                        <td>{count}</td>
                    </tr>
""")

    parts.append("""
                </tbody>
            </table>
        </div>
    </div>
</body>
</html>
""")
    return "".join(parts)


def main() -> None:
    """Command-line entry point: build the coverage report and emit it.

    Options:
        --format {text,html,json}  Report format (default: text).
        --output / -o PATH         Write the report to PATH instead of stdout.
        --ci                       Exit with status 1 when the overall score
                                   is below 80.
    """
    parser = argparse.ArgumentParser(description="AWOOOI Monitoring Coverage Report")
    parser.add_argument("--format", choices=["text", "html", "json"], default="text")
    parser.add_argument("--output", "-o", type=Path, help="Output file path")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as "%%"; a bare "%" makes --help fail with ValueError.
    parser.add_argument("--ci", action="store_true", help="CI mode: exit 1 if < 80%%")
    args = parser.parse_args()

    # Load inputs.
    registry = load_registry()
    alert_rules = load_alert_rules()

    # Analyse each registry section.
    service_stats = analyze_services(registry)
    node_stats = analyze_nodes(registry)
    api_stats = analyze_api_endpoints(registry)
    alert_stats = analyze_alert_rules(registry, alert_rules)

    # Compute the scores.
    scores = calculate_scores(service_stats, node_stats, api_stats, alert_stats)

    # Render in the requested format.
    if args.format == "json":
        report = json.dumps({
            "generated_at": datetime.now(TZ_TAIPEI).isoformat(),
            "scores": scores,
            "services": service_stats,
            "nodes": node_stats,
            "api_endpoints": api_stats,
            "alert_rules": alert_stats,
        }, indent=2, ensure_ascii=False)
    elif args.format == "html":
        report = generate_html_report(service_stats, node_stats, api_stats, alert_stats, scores)
    else:
        report = generate_text_report(service_stats, node_stats, api_stats, alert_stats, scores)

    # Emit.
    if args.output:
        # Always UTF-8: the HTML declares charset UTF-8 and the JSON is
        # produced with ensure_ascii=False, so the locale default is unsafe.
        args.output.write_text(report, encoding="utf-8")
        print(f"Report saved to: {args.output}")
    else:
        print(report)

    # CI gate.
    if args.ci and scores["overall"] < 80:
        sys.exit(1)


if __name__ == "__main__":
    main()