#!/usr/bin/env python3 """ IwoooS monitoring / alerting / observability repo-only 清冊。 本工具只讀取已提交的 repo 檔案,整理 Prometheus、Alertmanager、 Grafana、SigNoz、Sentry、Langfuse、OTEL、Telegram / notification policy、deploy / reload scripts 與 smoke scripts。它不連 live Prometheus、不 reload Alertmanager、不改 Grafana、不套用 SigNoz rule、 不部署 Sentry、不發 Telegram、不建立 silence、不 SSH、不 kubectl、不讀 secret value。 """ from __future__ import annotations import argparse import hashlib import json import subprocess import sys from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Any TAIPEI = timezone(timedelta(hours=8)) SURFACES: list[dict[str, Any]] = [ { "surface_id": "prometheus_k8s_base_config", "label": "K8s Prometheus base config", "source_path": "k8s/monitoring/prometheus.yml", "expected_scope": "k8s_monitoring_prometheus_base", "config_kind": "prometheus_config", "control_tier": "C1", "current_state": "repo_source_visible_reload_not_authorized", "observability_scope": ["scrape config", "service discovery", "alert rule includes"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 Prometheus owner、live config hash、reload owner、rule diff 與 route smoke 指標。", }, { "surface_id": "prometheus_k8s_additions_config", "label": "K8s Prometheus additions", "source_path": "k8s/monitoring/prometheus-config-additions.yaml", "expected_scope": "k8s_monitoring_prometheus_additions", "config_kind": "prometheus_config", "control_tier": "C1", "current_state": "repo_source_visible_needs_drift_disposition", "observability_scope": ["supplemental scrape", "additional rule paths"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 additions owner、live parity evidence、rollout window 與 rollback owner。", }, { "surface_id": "prometheus_phase_o_config", "label": "Prometheus Phase O config", "source_path": "k8s/monitoring/prometheus-config-phase-o.yaml", "expected_scope": "phase_o_prometheus_config", "config_kind": "prometheus_config", "control_tier": "C1", "current_state": "phase_config_visible_reload_not_authorized", "observability_scope": ["Phase O monitoring", "baseline scrape"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 Phase O owner、stale config disposition、reload owner 與 post-check 指標。", }, { "surface_id": "prometheus_remote_write_signoz", "label": "Prometheus remote write to SigNoz", "source_path": "k8s/monitoring/prometheus-remote-write-signoz.yaml", "expected_scope": "prometheus_remote_write_signoz", "config_kind": "prometheus_remote_write", "control_tier": "C1", "current_state": "data_export_config_visible_gate_closed", "observability_scope": ["remote write", "SigNoz ingestion", "data export boundary"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 data export owner、privacy boundary、SigNoz ingest owner、rollback owner 與 volume guard。", }, { "surface_id": "prometheus_generated_scrape_config", "label": "Generated Prometheus scrape config", "source_path": "ops/monitoring/generated/prometheus-scrape-generated.yaml", "expected_scope": "generated_scrape_targets", "config_kind": "prometheus_generated_config", "control_tier": "C1", "current_state": "generated_source_visible_needs_source_registry_parity", "observability_scope": ["generated targets", "service registry"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 generator owner、service registry parity、live target count 與 stale target disposition。", }, { "surface_id": "prometheus_generated_blackbox_targets", "label": "Generated blackbox targets", "source_path": "ops/monitoring/generated/blackbox-targets-generated.yaml", "expected_scope": "generated_blackbox_targets", "config_kind": "prometheus_generated_config", "control_tier": "C1", "current_state": "generated_blackbox_targets_visible_probe_not_executed", "observability_scope": ["blackbox targets", "public route smoke candidates"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 blackbox owner、target freshness、public route owner 與 probe execution approval。", }, { "surface_id": "monitoring_service_registry", "label": "Monitoring service registry", "source_path": "ops/monitoring/service-registry.yaml", "expected_scope": "monitoring_service_registry", "config_kind": "monitoring_service_registry", "control_tier": "C1", "current_state": "registry_visible_needs_owner_disposition", "observability_scope": ["service registry", "target generation"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 registry owner、產品 owner mapping、retired target disposition 與 generator smoke。", }, { "surface_id": "postgres_exporter_queries", "label": "Postgres exporter query config", "source_path": "ops/monitoring/postgres-exporter-queries.yaml", "expected_scope": "postgres_exporter_queries", "config_kind": "exporter_query_config", "control_tier": "C1", "current_state": "query_config_visible_needs_db_owner_review", "observability_scope": ["PostgreSQL metrics", "custom exporter queries"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 DB owner、query cost boundary、metric name owner 與 rollback owner。", }, { "surface_id": "monitoring_110_compose", "label": "110 monitoring compose", "source_path": "k8s/monitoring/docker-compose-110.yml", "expected_scope": "192.168.0.110_monitoring_stack", "config_kind": "monitoring_runtime_compose", "control_tier": "C1", "current_state": "runtime_compose_visible_live_hash_missing", "observability_scope": ["Prometheus", "Grafana", "Alertmanager", "blackbox", "cadvisor"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 110 live compose hash、restart window、rollback owner、admin secret boundary 與 post-check。", }, { "surface_id": "monitoring_exporters_compose", "label": "188 monitoring exporters compose", "source_path": "ops/monitoring/docker-compose.exporters.yaml", "expected_scope": "192.168.0.188_exporters", "config_kind": "monitoring_runtime_compose", "control_tier": "C1", "current_state": "exporter_compose_visible_live_hash_missing", "observability_scope": ["postgres-exporter", "redis-exporter"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 188 exporter owner、live hash、env source policy、restart window 與 rollback owner。", }, { "surface_id": "alertmanager_receiver_config", "label": "Alertmanager receiver config", "source_path": "ops/alertmanager/alertmanager.yml", "expected_scope": "alertmanager_routes_receivers", "config_kind": "alertmanager_receiver_config", "control_tier": "C1", "current_state": "receiver_route_visible_reload_not_authorized", "observability_scope": ["routes", "receivers", "grouping", "Telegram / webhook receiver boundary"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 receiver owner、route diff、silence policy owner、reload owner 與 failure-only notification proof。", }, { "surface_id": "prometheus_alerts_ops", "label": "Ops Prometheus alerts", "source_path": "ops/monitoring/alerts.yml", "expected_scope": "ops_prometheus_alerts", "config_kind": "prometheus_alert_rules", "control_tier": "C1", "current_state": "alert_rules_visible_reload_not_authorized", "observability_scope": ["infrastructure alerts", "backup alerts", "route alerts"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 alert rule owner、rule diff、receiver mapping、reload owner 與 false-green guard。", }, { "surface_id": "prometheus_alerts_unified", "label": "Unified Prometheus alerts", "source_path": "ops/monitoring/alerts-unified.yml", "expected_scope": "unified_prometheus_alerts", "config_kind": "prometheus_alert_rules", "control_tier": "C1", "current_state": "unified_rules_visible_needs_receiver_parity", "observability_scope": ["unified alerts", "routing labels"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 unified rule owner、label taxonomy owner、receiver parity 與 rollout window。", }, { "surface_id": "prometheus_slo_rules", "label": "Prometheus SLO rules", "source_path": "ops/monitoring/slo-rules.yml", "expected_scope": "prometheus_slo_rules", "config_kind": "prometheus_alert_rules", "control_tier": "C1", "current_state": "slo_rules_visible_reload_not_authorized", "observability_scope": ["SLO burn rate", "availability indicators"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 SLO owner、error budget owner、rule test evidence 與 receiver mapping。", }, { "surface_id": "prometheus_ollama_health_rules", "label": "Ollama health alert rules", "source_path": "ops/monitoring/ollama_health_rules.yaml", "expected_scope": "ollama_health_rules", "config_kind": "prometheus_alert_rules", "control_tier": "C1", "current_state": "ai_runtime_health_rules_visible_reload_not_authorized", "observability_scope": ["Ollama health", "AI provider availability"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 AI provider owner、fallback owner、receiver owner 與 reload owner。", }, { "surface_id": "k8s_k3s_alerts", "label": "K3s alert rules", "source_path": "k8s/monitoring/k3s-alerts.yaml", "expected_scope": "k3s_cluster_alerts", "config_kind": "prometheus_alert_rules", "control_tier": "C1", "current_state": "k3s_rules_visible_apply_not_authorized", "observability_scope": ["K3s workloads", "cluster health"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 K3s owner、rule apply window、ArgoCD / kubectl boundary 與 rollback owner。", }, { "surface_id": "k8s_k3s_supplemental_alerts", "label": "K3s supplemental alert rules", "source_path": "k8s/monitoring/k3s-alerts-supplemental.yaml", "expected_scope": "k3s_supplemental_alerts", "config_kind": "prometheus_alert_rules", "control_tier": "C1", "current_state": "supplemental_rules_visible_apply_not_authorized", "observability_scope": ["K3s supplemental health", "gap alerts"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 supplemental rule owner、overlap disposition、receiver owner 與 apply boundary。", }, { "surface_id": "k8s_database_alerts", "label": "Database alert rules", "source_path": "k8s/monitoring/database-alerts.yaml", "expected_scope": "database_alerts", "config_kind": "prometheus_alert_rules", "control_tier": "C1", "current_state": "database_rules_visible_apply_not_authorized", "observability_scope": ["PostgreSQL", "Redis", "DB availability"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 DB alert owner、threshold owner、receiver mapping 與 live metric evidence。", }, { "surface_id": "k8s_nvidia_alerts", "label": "NVIDIA alert rules", "source_path": "k8s/monitoring/nvidia-alerts.yaml", "expected_scope": "nvidia_gpu_alerts", "config_kind": "prometheus_alert_rules", "control_tier": "C1", "current_state": "gpu_rules_visible_apply_not_authorized", "observability_scope": ["GPU health", "AI workload capacity"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 GPU owner、NVIDIA exporter live evidence、receiver owner 與 rollout boundary。", }, { "surface_id": "k8s_minio_kali_alerts", "label": "MinIO / Kali alert rules", "source_path": "k8s/monitoring/minio-kali-alerts.yaml", "expected_scope": "minio_kali_alerts", "config_kind": "prometheus_alert_rules", "control_tier": "C1", "current_state": "minio_kali_rules_visible_apply_not_authorized", "observability_scope": ["MinIO", "Kali scanner", "read-only scanner status"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 MinIO / Kali owner、active scan boundary、receiver owner 與 rule apply owner。", }, { "surface_id": "k8s_flywheel_alerts", "label": "AI flywheel alert rules", "source_path": "k8s/monitoring/flywheel-alerts.yaml", "expected_scope": "ai_flywheel_alerts", "config_kind": "prometheus_alert_rules", "control_tier": "C1", "current_state": "flywheel_rules_visible_apply_not_authorized", "observability_scope": ["AI flywheel", "learning / runtime health"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 AI flywheel owner、runtime gate boundary、receiver owner 與 false-green guard。", }, { "surface_id": "k8s_alert_chain_monitor", "label": "Alert chain monitor", "source_path": "k8s/monitoring/alert-chain-monitor.yaml", "expected_scope": "alert_chain_monitor", "config_kind": "prometheus_alert_rules", "control_tier": "C1", "current_state": "alert_chain_monitor_visible_apply_not_authorized", "observability_scope": ["Alertmanager chain", "E2E alert visibility"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 alert chain owner、E2E smoke owner、receiver owner 與 post-check 指標。", }, { "surface_id": "app_alert_rules_yaml", "label": "API alert rule engine rules", "source_path": "apps/api/alert_rules.yaml", "expected_scope": "api_alert_rule_engine", "config_kind": "app_alert_rule_contract", "control_tier": "C1", "current_state": "app_rule_contract_visible_runtime_change_not_authorized", "observability_scope": ["alert classification", "approval recommendations", "playbook matching"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 rule owner、AI decision owner、classification drift evidence 與 rollback owner。", }, { "surface_id": "grafana_agent_step_latency_rules", "label": "Grafana agent step latency rules", "source_path": "ops/monitoring/grafana/agent_step_latency_rules.yaml", "expected_scope": "grafana_agent_step_latency_rules", "config_kind": "grafana_alert_rules", "control_tier": "C1", "current_state": "grafana_rules_visible_apply_not_authorized", "observability_scope": ["agent step latency", "Grafana alerting"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 Grafana alert owner、dashboard / rule UID owner、apply owner 與 rollback owner。", }, { "surface_id": "grafana_ai_slo_dashboard", "label": "Grafana AI SLO dashboard", "source_path": "ops/monitoring/grafana/dashboards/ai-slo-dashboard.json", "expected_scope": "grafana_ai_slo_dashboard", "config_kind": "grafana_dashboard", "control_tier": "C1", "current_state": "dashboard_json_visible_apply_not_authorized", "observability_scope": ["AI SLO", "dashboard panels"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 dashboard owner、folder owner、UID conflict check 與 import rollback owner。", }, { "surface_id": "grafana_ollama_failover_dashboard", "label": "Grafana Ollama failover dashboard", "source_path": "ops/monitoring/grafana/dashboards/ollama_failover.json", "expected_scope": "grafana_ollama_failover_dashboard", "config_kind": "grafana_dashboard", "control_tier": "C1", "current_state": "dashboard_json_visible_apply_not_authorized", "observability_scope": ["Ollama failover", "AI provider health"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 dashboard owner、AI provider owner、Grafana import owner 與 rollback ref。", }, { "surface_id": "grafana_ai_monitoring_dashboard", "label": "Grafana AI monitoring dashboard", "source_path": "ops/grafana/dashboards/ai-monitoring.json", "expected_scope": "grafana_ai_monitoring_dashboard", "config_kind": "grafana_dashboard", "control_tier": "C1", "current_state": "dashboard_json_visible_apply_not_authorized", "observability_scope": ["AI monitoring", "runtime health"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 dashboard owner、panel query owner、import window 與 rollback owner。", }, { "surface_id": "grafana_infra_monitoring_dashboard", "label": "Grafana infra monitoring dashboard", "source_path": "ops/grafana/dashboards/infra-monitoring.json", "expected_scope": "grafana_infra_monitoring_dashboard", "config_kind": "grafana_dashboard", "control_tier": "C1", "current_state": "dashboard_json_visible_apply_not_authorized", "observability_scope": ["infrastructure monitoring", "host health"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 infra dashboard owner、host source owner、import owner 與 smoke plan。", }, { "surface_id": "grafana_nvidia_nemotron_dashboard", "label": "Grafana NVIDIA / NemoTron dashboard", "source_path": "ops/grafana/dashboards/nvidia-nemotron.json", "expected_scope": "grafana_nvidia_nemotron_dashboard", "config_kind": "grafana_dashboard", "control_tier": "C1", "current_state": "dashboard_json_visible_apply_not_authorized", "observability_scope": ["NVIDIA", "NemoTron", "GPU AI route"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 GPU / NemoTron owner、dashboard UID owner、import owner 與 rollback owner。", }, { "surface_id": "signoz_otel_collector_config", "label": "SigNoz OTEL collector config", "source_path": "ops/signoz/otel-collector-config-phase-o.yaml", "expected_scope": "signoz_otel_collector_phase_o", "config_kind": "otel_collector_config", "control_tier": "C1", "current_state": "otel_collector_config_visible_reload_not_authorized", "observability_scope": ["OTEL collector", "traces / metrics / logs routing"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 OTEL owner、pipeline diff、secret redaction proof、reload owner 與 data export boundary。", }, { "surface_id": "signoz_alerting_rules", "label": "SigNoz alerting rules", "source_path": "ops/signoz/alerting/rules.yaml", "expected_scope": "signoz_alerting_rules", "config_kind": "signoz_alert_rules", "control_tier": "C1", "current_state": "signoz_rules_visible_apply_not_authorized", "observability_scope": ["SigNoz alert rules", "logs / traces alerting"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 SigNoz rule owner、apply owner、receiver mapping、silence owner 與 rollback owner。", }, { "surface_id": "signoz_log_rules", "label": "SigNoz log alert rules", "source_path": "ops/signoz/alerting/log-rules.md", "expected_scope": "signoz_log_alert_rules", "config_kind": "signoz_log_rules", "control_tier": "C1", "current_state": "log_rule_runbook_visible_apply_not_authorized", "observability_scope": ["log alerting", "manual rule translation"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 log rule owner、manual translation reviewer、apply proof 與 rollback owner。", }, { "surface_id": "sentry_self_hosted_compose", "label": "Sentry self-hosted compose", "source_path": "ops/sentry-self-hosted/docker-compose.yml", "expected_scope": "sentry_self_hosted_runtime", "config_kind": "sentry_runtime_compose", "control_tier": "C1", "current_state": "sentry_compose_visible_live_hash_missing", "observability_scope": ["Sentry self-hosted", "ClickHouse / Postgres / Redis boundary"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 Sentry runtime owner、live hash、admin secret boundary、upgrade window 與 rollback owner。", }, { "surface_id": "sentry_self_hosted_deploy", "label": "Sentry self-hosted deploy script", "source_path": "ops/sentry-self-hosted/deploy.sh", "expected_scope": "sentry_self_hosted_deploy", "config_kind": "sentry_deploy_script", "control_tier": "C1", "current_state": "deploy_script_visible_not_executed", "observability_scope": ["Sentry deploy", "compose pull / up", "migration boundary"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 deploy owner、maintenance window、backup owner、migration rollback owner 與 smoke plan。", }, { "surface_id": "langfuse_compose", "label": "Langfuse compose", "source_path": "infra/langfuse/docker-compose.yml", "expected_scope": "langfuse_observability_runtime", "config_kind": "langfuse_runtime_compose", "control_tier": "C1", "current_state": "langfuse_compose_visible_live_hash_missing", "observability_scope": ["Langfuse", "LLM trace storage", "PostgreSQL dependency"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 Langfuse owner、trace privacy owner、live hash、restart window 與 rollback owner。", }, { "surface_id": "langfuse_readme", "label": "Langfuse deployment README", "source_path": "infra/langfuse/README.md", "expected_scope": "langfuse_observability_runbook", "config_kind": "langfuse_runbook", "control_tier": "C1", "current_state": "runbook_visible_needs_stale_disposition", "observability_scope": ["Langfuse deployment", "trace governance"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 runbook owner、live deployment parity、trace retention owner 與 secret redaction proof。", }, { "surface_id": "service_health_failure_policy_snapshot", "label": "Service health failure notification policy", "source_path": "docs/evaluations/service_health_failure_notification_policy_2026-06-05.json", "expected_scope": "service_health_failure_notification_policy", "config_kind": "notification_policy_snapshot", "control_tier": "C1", "current_state": "failure_only_policy_visible_live_send_gate_closed", "observability_scope": ["failure-only notification", "redaction contract"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 notification owner、receiver owner、live receipt proof、redaction proof 與 retry boundary。", }, { "surface_id": "backup_notification_policy_snapshot", "label": "Backup notification policy", "source_path": "docs/evaluations/backup_notification_policy_2026-06-04.json", "expected_scope": "backup_notification_policy", "config_kind": "notification_policy_snapshot", "control_tier": "C1", "current_state": "backup_notification_policy_visible_live_send_gate_closed", "observability_scope": ["backup failure notification", "restore / offsite alerting"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 backup notification owner、Telegram receipt owner、failure-only proof 與 false-green guard。", }, { "surface_id": "observability_contract_matrix_snapshot", "label": "Observability contract matrix", "source_path": "docs/evaluations/observability_contract_matrix_2026-06-05.json", "expected_scope": "observability_contract_matrix", "config_kind": "observability_contract_snapshot", "control_tier": "C1", "current_state": "contract_matrix_visible_needs_live_parity", "observability_scope": ["observability contract", "source coverage", "runtime evidence"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 contract owner、live parity owner、coverage gap disposition 與 evidence freshness owner。", }, { "surface_id": "telegram_action_required_digest_snapshot", "label": "Telegram action required digest policy", "source_path": "docs/evaluations/ai_agent_telegram_action_required_digest_policy_2026-06-11.json", "expected_scope": "ai_agent_telegram_digest_policy", "config_kind": "telegram_policy_snapshot", "control_tier": "C1", "current_state": "telegram_policy_visible_live_send_gate_closed", "observability_scope": ["Telegram digest", "action required routing"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 Telegram owner、receiver owner、receipt owner、redaction proof 與 retry boundary。", }, { "surface_id": "telegram_receipt_approval_snapshot", "label": "Telegram receipt approval package", "source_path": "docs/evaluations/ai_agent_telegram_receipt_approval_package_2026-06-11.json", "expected_scope": "ai_agent_telegram_receipt_policy", "config_kind": "telegram_policy_snapshot", "control_tier": "C1", "current_state": "receipt_policy_visible_live_receipt_gate_closed", "observability_scope": ["delivery receipt", "ack / retry policy"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 receipt owner、gateway queue owner、ack policy owner 與 no-live-send boundary。", }, { "surface_id": "telegram_gateway_service", "label": "Telegram gateway service", "source_path": "apps/api/src/services/telegram_gateway.py", "expected_scope": "telegram_gateway_runtime_sender", "config_kind": "telegram_runtime_sender", "control_tier": "C1", "current_state": "send_capable_service_visible_gate_closed", "observability_scope": ["Telegram Bot API", "dedup", "delivery queue"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 gateway owner、token injection owner、receipt owner、send approval gate 與 rollback owner。", }, { "surface_id": "notification_manager_service", "label": "Notification manager", "source_path": "apps/api/src/services/notifications/manager.py", "expected_scope": "notification_runtime_routing", "config_kind": "notification_runtime_router", "control_tier": "C1", "current_state": "runtime_router_visible_send_gate_closed", "observability_scope": ["notification channel routing", "Telegram / Discord manager"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 channel owner、routing owner、failure-only policy owner、receipt owner 與 rollback owner。", }, { "surface_id": "notification_matrix_service", "label": "Notification matrix", "source_path": "apps/api/src/services/notification_matrix.py", "expected_scope": "notification_matrix_policy", "config_kind": "notification_policy_code", "control_tier": "C1", "current_state": "policy_code_visible_runtime_gate_closed", "observability_scope": ["notification policy", "recipient / channel mapping"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 matrix owner、recipient owner、channel parity evidence 與 redaction policy。", }, { "surface_id": "alert_chain_metrics_service", "label": "Alert chain metrics service", "source_path": "apps/api/src/services/alert_chain_metrics_service.py", "expected_scope": "alert_chain_metrics", "config_kind": "alert_metrics_service", "control_tier": "C1", "current_state": "metrics_service_visible_needs_runtime_evidence", "observability_scope": ["alert chain metrics", "delivery visibility"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 metrics owner、Prometheus scrape owner、delivery evidence 與 false-green guard。", }, { "surface_id": "converged_alert_recurrence_notifier", "label": "Converged alert recurrence notifier", "source_path": "apps/api/src/services/converged_alert_recurrence_notifier.py", "expected_scope": "recurrence_notification_runtime", "config_kind": "notification_runtime_router", "control_tier": "C1", "current_state": "recurrence_notifier_visible_send_gate_closed", "observability_scope": ["recurrence notification", "dedup", "escalation routing"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 recurrence owner、noise budget owner、receipt owner、silence boundary 與 rollback owner。", }, { "surface_id": "sentry_webhook_service", "label": "Sentry webhook service", "source_path": "apps/api/src/services/sentry_webhook_service.py", "expected_scope": "sentry_webhook_receiver", "config_kind": "webhook_receiver", "control_tier": "C1", "current_state": "webhook_receiver_visible_secret_value_not_collected", "observability_scope": ["Sentry webhook", "signature validation", "triage route"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 webhook owner、secret name owner、signature proof、route smoke 與 replay boundary。", }, { "surface_id": "signoz_client_service", "label": "SigNoz client service", "source_path": "apps/api/src/services/signoz_client.py", "expected_scope": "signoz_api_client", "config_kind": "signoz_observability_client", "control_tier": "C1", "current_state": "client_visible_live_api_read_not_executed", "observability_scope": ["SigNoz API", "log query", "trace query"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 SigNoz client owner、token name owner、rate limit boundary 與 read-only smoke owner。", }, { "surface_id": "sentry_service_client", "label": "Sentry service client", "source_path": "apps/api/src/services/sentry_service.py", "expected_scope": "sentry_api_client", "config_kind": "sentry_observability_client", "control_tier": "C1", "current_state": "client_visible_live_api_read_not_executed", "observability_scope": ["Sentry API", "issue query", "AI triage"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 Sentry client owner、token name owner、read-only evidence 與 privacy boundary。", }, { "surface_id": "langfuse_client_service", "label": "Langfuse client service", "source_path": "apps/api/src/services/langfuse_client.py", "expected_scope": "langfuse_api_client", "config_kind": "langfuse_observability_client", "control_tier": "C1", "current_state": "client_visible_trace_read_write_boundary_needs_owner", "observability_scope": ["Langfuse API", "AI trace", "prompt / output privacy"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 Langfuse owner、trace privacy owner、token name owner、write gate 與 retention owner。", }, { "surface_id": "deploy_alertmanager_config_script", "label": "Alertmanager config deploy script", "source_path": "scripts/ops/deploy-alertmanager-config.sh", "expected_scope": "alertmanager_config_deploy", "config_kind": "reload_capable_script", "control_tier": "C1", "current_state": "reload_capable_script_visible_gate_closed", "observability_scope": ["Alertmanager config copy", "reload"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 deploy owner、maintenance window、rollback ref、receiver smoke 與 failure-only notification proof。", }, { "surface_id": "deploy_prometheus_alerts_script", "label": "Prometheus alerts deploy script", "source_path": "scripts/ops/deploy-alerts.sh", "expected_scope": "prometheus_alerts_deploy", "config_kind": "reload_capable_script", "control_tier": "C1", "current_state": "reload_capable_script_visible_gate_closed", "observability_scope": ["Prometheus alert rule deploy", "reload"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 deploy owner、rule test proof、reload owner、receiver smoke 與 rollback owner。", }, { "surface_id": "prometheus_rule_drift_guard_script", "label": "Prometheus rule drift guard", "source_path": "scripts/ops/prometheus-rule-drift-guard.sh", "expected_scope": "prometheus_rule_drift_guard", "config_kind": "drift_guard_script", "control_tier": "C1", "current_state": "drift_guard_visible_not_executed_by_inventory", "observability_scope": ["rule diff", "drift evidence", "reload guard"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 drift guard owner、live rule export owner、diff disposition 與 blocking policy owner。", }, { "surface_id": "k8s_deploy_prometheus_config_script", "label": "K8s Prometheus config deploy script", "source_path": "k8s/monitoring/deploy-prometheus-config.sh", "expected_scope": "k8s_prometheus_config_deploy", "config_kind": "reload_capable_script", "control_tier": "C1", "current_state": "deploy_script_visible_not_executed", "observability_scope": ["K8s config apply", "Prometheus reload"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 K8s deploy owner、kubectl / ArgoCD boundary、reload owner 與 rollback owner。", }, { "surface_id": "api_apply_prometheus_config_script", "label": "API Prometheus config apply script", "source_path": "apps/api/scripts/apply_prometheus_config.sh", "expected_scope": "api_prometheus_config_apply", "config_kind": "reload_capable_script", "control_tier": "C1", "current_state": "apply_script_visible_not_executed", "observability_scope": ["Prometheus config apply", "API-owned alert deploy"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 API deploy owner、config source owner、reload proof 與 rollback owner。", }, { "surface_id": "monitoring_exporter_deploy_script", "label": "Monitoring exporter deploy script", "source_path": "ops/monitoring/deploy-exporters.sh", "expected_scope": "monitoring_exporter_deploy", "config_kind": "host_deploy_script", "control_tier": "C1", "current_state": "host_deploy_script_visible_gate_closed", "observability_scope": ["exporter deploy", "host service restart"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 host owner、SSH boundary、restart window、rollback owner 與 exporter scrape proof。", }, { "surface_id": "fire_live_alert_script", "label": "Live alert firing script", "source_path": "apps/api/scripts/fire_live_alert.py", "expected_scope": "live_alert_fire_script", "config_kind": "live_alert_sender", "control_tier": "C1", "current_state": "live_alert_sender_visible_gate_closed", "observability_scope": ["live alert", "Alertmanager ingestion", "notification route"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 live alert owner、test window、receiver owner、noise budget 與 rollback / stop condition。", }, { "surface_id": "fire_test_alert_script", "label": "Test alert firing script", "source_path": "apps/api/scripts/fire_test_alert.py", "expected_scope": "test_alert_fire_script", "config_kind": "live_alert_sender", "control_tier": "C1", "current_state": "test_alert_sender_visible_gate_closed", "observability_scope": ["test alert", "notification chain smoke"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 test alert owner、allowed receiver、dedup proof、noisy route guard 與 cleanup owner。", }, { "surface_id": "alert_chain_smoke_script", "label": "Alert chain smoke script", "source_path": "scripts/alert_chain_smoke_test.py", "expected_scope": "alert_chain_smoke", "config_kind": "smoke_script", "control_tier": "C1", "current_state": "smoke_script_visible_not_executed_by_inventory", "observability_scope": ["alert chain E2E", "notification delivery"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 smoke owner、execution window、receiver owner、expected receipt 與 cleanup owner。", }, { "surface_id": "ops_alert_chain_smoke_script", "label": "Ops alert chain smoke script", "source_path": "ops/scripts/alert_chain_smoke_test.py", "expected_scope": "ops_alert_chain_smoke", "config_kind": "smoke_script", "control_tier": "C1", "current_state": "ops_smoke_script_visible_not_executed_by_inventory", "observability_scope": ["ops alert chain", "Alertmanager / Telegram smoke"], "requires_live_evidence": True, "requires_owner_response": True, "next_owner_action": "補 ops smoke owner、execution window、receiver owner、expected receipt 與 rollback owner。", }, ] FALSE_BOUNDARIES = { "runtime_execution_authorized": False, "host_write_authorized": False, "prometheus_reload_authorized": False, "alertmanager_reload_authorized": False, "grafana_dashboard_apply_authorized": False, "signoz_rule_apply_authorized": False, "sentry_deploy_authorized": False, "langfuse_config_change_authorized": False, "otel_collector_reload_authorized": False, "receiver_route_change_authorized": False, "silence_policy_change_authorized": False, "telegram_send_authorized": False, "notification_route_change_authorized": False, "webhook_receiver_change_authorized": False, "remote_write_change_authorized": False, "exporter_deploy_authorized": False, "live_alert_fire_authorized": False, "alert_chain_smoke_authorized": False, "ssh_read_authorized": False, "ssh_write_authorized": False, "kubectl_action_authorized": False, "secret_value_collection_allowed": False, "active_scan_authorized": False, "action_buttons_allowed": False, } WRITE_CAPABLE_KINDS = { "reload_capable_script", "host_deploy_script", "sentry_deploy_script", "telegram_runtime_sender", "notification_runtime_router", "live_alert_sender", } def git_short_sha(root: Path) -> str: try: result = subprocess.run( ["git", "rev-parse", "--short", "HEAD"], cwd=root, check=True, capture_output=True, text=True, ) return result.stdout.strip() except Exception: return "unknown" def file_metadata(root: Path, source_path: str) -> dict[str, Any]: path = root / source_path exists = path.exists() if not exists: return {"source_exists": False, "line_count": 0, "sha256": None} content = path.read_bytes() return { "source_exists": True, "line_count": len(content.decode("utf-8", errors="replace").splitlines()), "sha256": hashlib.sha256(content).hexdigest(), } def build_surface(root: Path, surface: dict[str, Any]) -> dict[str, Any]: metadata = file_metadata(root, surface["source_path"]) return { **surface, **metadata, "owner_response_received": False, "owner_response_accepted": False, "live_evidence_received": False, "reload_owner_accepted": False, "receiver_owner_accepted": False, "route_smoke_accepted": False, "maintenance_window_accepted": False, "rollback_owner_accepted": False, "runtime_gate_open": False, "action_buttons_allowed": False, } def count_kind(surfaces: list[dict[str, Any]], kinds: set[str]) -> int: return sum(1 for surface in surfaces if surface["config_kind"] in kinds) def build_report(root: Path, generated_at: str | None) -> dict[str, Any]: report_time = generated_at or datetime.now(TAIPEI).isoformat(timespec="seconds") surfaces = [build_surface(root, surface) for surface in SURFACES] expected_scopes = sorted({surface["expected_scope"] for surface in surfaces}) write_capable = [surface for surface in surfaces if surface["config_kind"] in WRITE_CAPABLE_KINDS] return { "schema_version": "monitoring_alerting_observability_inventory_v1", "generated_at": report_time, "git_commit": git_short_sha(root), "status": "repo_only_inventory_ready", "source_scope": "committed_repo_files_only", "summary": { "surface_count": len(surfaces), "source_exists_count": sum(1 for surface in surfaces if surface["source_exists"]), "expected_scope_count": len(expected_scopes), "prometheus_config_surface_count": count_kind( surfaces, { "prometheus_config", "prometheus_remote_write", "prometheus_generated_config", "monitoring_service_registry", "exporter_query_config", }, ), "alert_rule_surface_count": count_kind( surfaces, {"prometheus_alert_rules", "app_alert_rule_contract", "grafana_alert_rules"}, ), "alertmanager_receiver_surface_count": count_kind(surfaces, {"alertmanager_receiver_config"}), "grafana_surface_count": count_kind(surfaces, {"grafana_alert_rules", "grafana_dashboard"}), "signoz_surface_count": count_kind( surfaces, {"signoz_alert_rules", "signoz_log_rules", "signoz_observability_client"}, ), "sentry_surface_count": count_kind( surfaces, {"sentry_runtime_compose", "sentry_deploy_script", "webhook_receiver", "sentry_observability_client"}, ), "langfuse_surface_count": count_kind( surfaces, {"langfuse_runtime_compose", "langfuse_runbook", "langfuse_observability_client"}, ), "notification_policy_surface_count": count_kind( surfaces, {"notification_policy_snapshot", "observability_contract_snapshot", "notification_policy_code"}, ), "telegram_surface_count": count_kind( surfaces, {"telegram_policy_snapshot", "telegram_runtime_sender"}, ), "otel_surface_count": count_kind(surfaces, {"otel_collector_config"}), "deploy_or_reload_surface_count": count_kind( surfaces, {"reload_capable_script", "host_deploy_script", "sentry_deploy_script"}, ), "drift_guard_surface_count": count_kind(surfaces, {"drift_guard_script"}), "smoke_surface_count": count_kind(surfaces, {"smoke_script", "live_alert_sender"}), "write_capable_surface_count": len(write_capable), "surfaces_requiring_owner_response_count": sum(1 for surface in surfaces if surface["requires_owner_response"]), "surfaces_requiring_live_evidence_count": sum(1 for surface in surfaces if surface["requires_live_evidence"]), "owner_response_received_count": 0, "owner_response_accepted_count": 0, "live_evidence_received_count": 0, "reload_owner_accepted_count": 0, "receiver_owner_accepted_count": 0, "route_smoke_accepted_count": 0, "maintenance_window_accepted_count": 0, "rollback_owner_accepted_count": 0, "runtime_gate_count": 0, "action_button_count": 0, "coverage_percent_before_inventory": 56, "coverage_percent_after_inventory": 62, }, "execution_boundaries": FALSE_BOUNDARIES, "expected_scopes": expected_scopes, "observability_surfaces": surfaces, "write_capable_surfaces": [ { "surface_id": surface["surface_id"], "label": surface["label"], "config_kind": surface["config_kind"], "expected_scope": surface["expected_scope"], "required_gate": "owner_response_plus_maintenance_window_plus_rollback_owner", } for surface in write_capable ], "next_collection_order": [ "alertmanager_receiver_config", "prometheus_alerts_ops", "prometheus_rule_drift_guard_script", "deploy_alertmanager_config_script", "deploy_prometheus_alerts_script", "monitoring_110_compose", "signoz_otel_collector_config", "sentry_self_hosted_compose", "telegram_gateway_service", "alert_chain_smoke_script", ], "operator_interpretation": [ "這是 monitoring / alerting / observability 的 repo-only 清冊,不是 live monitoring truth。", "source_exists 與 sha256 只代表 repo source 可追溯,不代表 live Prometheus / Alertmanager / Grafana / SigNoz / Sentry 已一致。", "write-capable surface 可改 route、reload、send notification、deploy exporter 或發測試告警;在 owner response、維護窗口、rollback owner 前全部維持 gate closed。", "不得從本清冊啟動 reload、silence、Telegram send、Sentry deploy、SigNoz apply、Prometheus remote_write 變更、SSH 或 kubectl。", ], } def main() -> int: parser = argparse.ArgumentParser(description="IwoooS monitoring / alerting / observability repo-only 清冊") parser.add_argument("--root", default=".", help="repo root") parser.add_argument("--output", help="寫出 JSON 報告") parser.add_argument("--generated-at", help="固定報告時間,供 committed snapshot 使用") args = parser.parse_args() root = Path(args.root).resolve() report = build_report(root, args.generated_at) payload = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) if args.output: output = Path(args.output) output.parent.mkdir(parents=True, exist_ok=True) output.write_text(payload + "\n", encoding="utf-8") else: print(payload) summary = report["summary"] print( "MONITORING_ALERTING_OBSERVABILITY_INVENTORY_OK " f"surfaces={summary['surface_count']} " f"alert_rules={summary['alert_rule_surface_count']} " f"write_capable={summary['write_capable_surface_count']} " f"runtime_gate={summary['runtime_gate_count']}", file=sys.stderr, ) return 0 if __name__ == "__main__": sys.exit(main())