1040 lines
50 KiB
Python
1040 lines
50 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
IwoooS monitoring / alerting / observability repo-only 清冊。
|
|
|
|
本工具只讀取已提交的 repo 檔案,整理 Prometheus、Alertmanager、
|
|
Grafana、SigNoz、Sentry、Langfuse、OTEL、Telegram / notification
|
|
policy、deploy / reload scripts 與 smoke scripts。它不連 live
|
|
Prometheus、不 reload Alertmanager、不改 Grafana、不套用 SigNoz rule、
|
|
不部署 Sentry、不發 Telegram、不建立 silence、不 SSH、不 kubectl、不讀
|
|
secret value。
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime, timedelta, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
# Fixed UTC+08:00 offset, named for Taipei local time. A plain `timezone`
# is a constant offset: it has no DST rules and needs no tz database.
TAIPEI = timezone(offset=timedelta(hours=8))
|
|
|
|
|
|
SURFACES: list[dict[str, Any]] = [
|
|
{
|
|
"surface_id": "prometheus_k8s_base_config",
|
|
"label": "K8s Prometheus base config",
|
|
"source_path": "k8s/monitoring/prometheus.yml",
|
|
"expected_scope": "k8s_monitoring_prometheus_base",
|
|
"config_kind": "prometheus_config",
|
|
"control_tier": "C1",
|
|
"current_state": "repo_source_visible_reload_not_authorized",
|
|
"observability_scope": ["scrape config", "service discovery", "alert rule includes"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 Prometheus owner、live config hash、reload owner、rule diff 與 route smoke 指標。",
|
|
},
|
|
{
|
|
"surface_id": "prometheus_k8s_additions_config",
|
|
"label": "K8s Prometheus additions",
|
|
"source_path": "k8s/monitoring/prometheus-config-additions.yaml",
|
|
"expected_scope": "k8s_monitoring_prometheus_additions",
|
|
"config_kind": "prometheus_config",
|
|
"control_tier": "C1",
|
|
"current_state": "repo_source_visible_needs_drift_disposition",
|
|
"observability_scope": ["supplemental scrape", "additional rule paths"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 additions owner、live parity evidence、rollout window 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "prometheus_phase_o_config",
|
|
"label": "Prometheus Phase O config",
|
|
"source_path": "k8s/monitoring/prometheus-config-phase-o.yaml",
|
|
"expected_scope": "phase_o_prometheus_config",
|
|
"config_kind": "prometheus_config",
|
|
"control_tier": "C1",
|
|
"current_state": "phase_config_visible_reload_not_authorized",
|
|
"observability_scope": ["Phase O monitoring", "baseline scrape"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 Phase O owner、stale config disposition、reload owner 與 post-check 指標。",
|
|
},
|
|
{
|
|
"surface_id": "prometheus_remote_write_signoz",
|
|
"label": "Prometheus remote write to SigNoz",
|
|
"source_path": "k8s/monitoring/prometheus-remote-write-signoz.yaml",
|
|
"expected_scope": "prometheus_remote_write_signoz",
|
|
"config_kind": "prometheus_remote_write",
|
|
"control_tier": "C1",
|
|
"current_state": "data_export_config_visible_gate_closed",
|
|
"observability_scope": ["remote write", "SigNoz ingestion", "data export boundary"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 data export owner、privacy boundary、SigNoz ingest owner、rollback owner 與 volume guard。",
|
|
},
|
|
{
|
|
"surface_id": "prometheus_generated_scrape_config",
|
|
"label": "Generated Prometheus scrape config",
|
|
"source_path": "ops/monitoring/generated/prometheus-scrape-generated.yaml",
|
|
"expected_scope": "generated_scrape_targets",
|
|
"config_kind": "prometheus_generated_config",
|
|
"control_tier": "C1",
|
|
"current_state": "generated_source_visible_needs_source_registry_parity",
|
|
"observability_scope": ["generated targets", "service registry"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 generator owner、service registry parity、live target count 與 stale target disposition。",
|
|
},
|
|
{
|
|
"surface_id": "prometheus_generated_blackbox_targets",
|
|
"label": "Generated blackbox targets",
|
|
"source_path": "ops/monitoring/generated/blackbox-targets-generated.yaml",
|
|
"expected_scope": "generated_blackbox_targets",
|
|
"config_kind": "prometheus_generated_config",
|
|
"control_tier": "C1",
|
|
"current_state": "generated_blackbox_targets_visible_probe_not_executed",
|
|
"observability_scope": ["blackbox targets", "public route smoke candidates"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 blackbox owner、target freshness、public route owner 與 probe execution approval。",
|
|
},
|
|
{
|
|
"surface_id": "monitoring_service_registry",
|
|
"label": "Monitoring service registry",
|
|
"source_path": "ops/monitoring/service-registry.yaml",
|
|
"expected_scope": "monitoring_service_registry",
|
|
"config_kind": "monitoring_service_registry",
|
|
"control_tier": "C1",
|
|
"current_state": "registry_visible_needs_owner_disposition",
|
|
"observability_scope": ["service registry", "target generation"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 registry owner、產品 owner mapping、retired target disposition 與 generator smoke。",
|
|
},
|
|
{
|
|
"surface_id": "postgres_exporter_queries",
|
|
"label": "Postgres exporter query config",
|
|
"source_path": "ops/monitoring/postgres-exporter-queries.yaml",
|
|
"expected_scope": "postgres_exporter_queries",
|
|
"config_kind": "exporter_query_config",
|
|
"control_tier": "C1",
|
|
"current_state": "query_config_visible_needs_db_owner_review",
|
|
"observability_scope": ["PostgreSQL metrics", "custom exporter queries"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 DB owner、query cost boundary、metric name owner 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "monitoring_110_compose",
|
|
"label": "110 monitoring compose",
|
|
"source_path": "k8s/monitoring/docker-compose-110.yml",
|
|
"expected_scope": "192.168.0.110_monitoring_stack",
|
|
"config_kind": "monitoring_runtime_compose",
|
|
"control_tier": "C1",
|
|
"current_state": "runtime_compose_visible_live_hash_missing",
|
|
"observability_scope": ["Prometheus", "Grafana", "Alertmanager", "blackbox", "cadvisor"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 110 live compose hash、restart window、rollback owner、admin secret boundary 與 post-check。",
|
|
},
|
|
{
|
|
"surface_id": "monitoring_exporters_compose",
|
|
"label": "188 monitoring exporters compose",
|
|
"source_path": "ops/monitoring/docker-compose.exporters.yaml",
|
|
"expected_scope": "192.168.0.188_exporters",
|
|
"config_kind": "monitoring_runtime_compose",
|
|
"control_tier": "C1",
|
|
"current_state": "exporter_compose_visible_live_hash_missing",
|
|
"observability_scope": ["postgres-exporter", "redis-exporter"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 188 exporter owner、live hash、env source policy、restart window 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "alertmanager_receiver_config",
|
|
"label": "Alertmanager receiver config",
|
|
"source_path": "ops/alertmanager/alertmanager.yml",
|
|
"expected_scope": "alertmanager_routes_receivers",
|
|
"config_kind": "alertmanager_receiver_config",
|
|
"control_tier": "C1",
|
|
"current_state": "receiver_route_visible_reload_not_authorized",
|
|
"observability_scope": ["routes", "receivers", "grouping", "Telegram / webhook receiver boundary"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 receiver owner、route diff、silence policy owner、reload owner 與 failure-only notification proof。",
|
|
},
|
|
{
|
|
"surface_id": "prometheus_alerts_ops",
|
|
"label": "Ops Prometheus alerts",
|
|
"source_path": "ops/monitoring/alerts.yml",
|
|
"expected_scope": "ops_prometheus_alerts",
|
|
"config_kind": "prometheus_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "alert_rules_visible_reload_not_authorized",
|
|
"observability_scope": ["infrastructure alerts", "backup alerts", "route alerts"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 alert rule owner、rule diff、receiver mapping、reload owner 與 false-green guard。",
|
|
},
|
|
{
|
|
"surface_id": "prometheus_alerts_unified",
|
|
"label": "Unified Prometheus alerts",
|
|
"source_path": "ops/monitoring/alerts-unified.yml",
|
|
"expected_scope": "unified_prometheus_alerts",
|
|
"config_kind": "prometheus_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "unified_rules_visible_needs_receiver_parity",
|
|
"observability_scope": ["unified alerts", "routing labels"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 unified rule owner、label taxonomy owner、receiver parity 與 rollout window。",
|
|
},
|
|
{
|
|
"surface_id": "prometheus_slo_rules",
|
|
"label": "Prometheus SLO rules",
|
|
"source_path": "ops/monitoring/slo-rules.yml",
|
|
"expected_scope": "prometheus_slo_rules",
|
|
"config_kind": "prometheus_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "slo_rules_visible_reload_not_authorized",
|
|
"observability_scope": ["SLO burn rate", "availability indicators"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 SLO owner、error budget owner、rule test evidence 與 receiver mapping。",
|
|
},
|
|
{
|
|
"surface_id": "prometheus_ollama_health_rules",
|
|
"label": "Ollama health alert rules",
|
|
"source_path": "ops/monitoring/ollama_health_rules.yaml",
|
|
"expected_scope": "ollama_health_rules",
|
|
"config_kind": "prometheus_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "ai_runtime_health_rules_visible_reload_not_authorized",
|
|
"observability_scope": ["Ollama health", "AI provider availability"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 AI provider owner、fallback owner、receiver owner 與 reload owner。",
|
|
},
|
|
{
|
|
"surface_id": "k8s_k3s_alerts",
|
|
"label": "K3s alert rules",
|
|
"source_path": "k8s/monitoring/k3s-alerts.yaml",
|
|
"expected_scope": "k3s_cluster_alerts",
|
|
"config_kind": "prometheus_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "k3s_rules_visible_apply_not_authorized",
|
|
"observability_scope": ["K3s workloads", "cluster health"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 K3s owner、rule apply window、ArgoCD / kubectl boundary 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "k8s_k3s_supplemental_alerts",
|
|
"label": "K3s supplemental alert rules",
|
|
"source_path": "k8s/monitoring/k3s-alerts-supplemental.yaml",
|
|
"expected_scope": "k3s_supplemental_alerts",
|
|
"config_kind": "prometheus_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "supplemental_rules_visible_apply_not_authorized",
|
|
"observability_scope": ["K3s supplemental health", "gap alerts"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 supplemental rule owner、overlap disposition、receiver owner 與 apply boundary。",
|
|
},
|
|
{
|
|
"surface_id": "k8s_database_alerts",
|
|
"label": "Database alert rules",
|
|
"source_path": "k8s/monitoring/database-alerts.yaml",
|
|
"expected_scope": "database_alerts",
|
|
"config_kind": "prometheus_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "database_rules_visible_apply_not_authorized",
|
|
"observability_scope": ["PostgreSQL", "Redis", "DB availability"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 DB alert owner、threshold owner、receiver mapping 與 live metric evidence。",
|
|
},
|
|
{
|
|
"surface_id": "k8s_nvidia_alerts",
|
|
"label": "NVIDIA alert rules",
|
|
"source_path": "k8s/monitoring/nvidia-alerts.yaml",
|
|
"expected_scope": "nvidia_gpu_alerts",
|
|
"config_kind": "prometheus_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "gpu_rules_visible_apply_not_authorized",
|
|
"observability_scope": ["GPU health", "AI workload capacity"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 GPU owner、NVIDIA exporter live evidence、receiver owner 與 rollout boundary。",
|
|
},
|
|
{
|
|
"surface_id": "k8s_minio_kali_alerts",
|
|
"label": "MinIO / Kali alert rules",
|
|
"source_path": "k8s/monitoring/minio-kali-alerts.yaml",
|
|
"expected_scope": "minio_kali_alerts",
|
|
"config_kind": "prometheus_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "minio_kali_rules_visible_apply_not_authorized",
|
|
"observability_scope": ["MinIO", "Kali scanner", "read-only scanner status"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 MinIO / Kali owner、active scan boundary、receiver owner 與 rule apply owner。",
|
|
},
|
|
{
|
|
"surface_id": "k8s_flywheel_alerts",
|
|
"label": "AI flywheel alert rules",
|
|
"source_path": "k8s/monitoring/flywheel-alerts.yaml",
|
|
"expected_scope": "ai_flywheel_alerts",
|
|
"config_kind": "prometheus_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "flywheel_rules_visible_apply_not_authorized",
|
|
"observability_scope": ["AI flywheel", "learning / runtime health"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 AI flywheel owner、runtime gate boundary、receiver owner 與 false-green guard。",
|
|
},
|
|
{
|
|
"surface_id": "k8s_alert_chain_monitor",
|
|
"label": "Alert chain monitor",
|
|
"source_path": "k8s/monitoring/alert-chain-monitor.yaml",
|
|
"expected_scope": "alert_chain_monitor",
|
|
"config_kind": "prometheus_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "alert_chain_monitor_visible_apply_not_authorized",
|
|
"observability_scope": ["Alertmanager chain", "E2E alert visibility"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 alert chain owner、E2E smoke owner、receiver owner 與 post-check 指標。",
|
|
},
|
|
{
|
|
"surface_id": "app_alert_rules_yaml",
|
|
"label": "API alert rule engine rules",
|
|
"source_path": "apps/api/alert_rules.yaml",
|
|
"expected_scope": "api_alert_rule_engine",
|
|
"config_kind": "app_alert_rule_contract",
|
|
"control_tier": "C1",
|
|
"current_state": "app_rule_contract_visible_runtime_change_not_authorized",
|
|
"observability_scope": ["alert classification", "approval recommendations", "playbook matching"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 rule owner、AI decision owner、classification drift evidence 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "grafana_agent_step_latency_rules",
|
|
"label": "Grafana agent step latency rules",
|
|
"source_path": "ops/monitoring/grafana/agent_step_latency_rules.yaml",
|
|
"expected_scope": "grafana_agent_step_latency_rules",
|
|
"config_kind": "grafana_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "grafana_rules_visible_apply_not_authorized",
|
|
"observability_scope": ["agent step latency", "Grafana alerting"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 Grafana alert owner、dashboard / rule UID owner、apply owner 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "grafana_ai_slo_dashboard",
|
|
"label": "Grafana AI SLO dashboard",
|
|
"source_path": "ops/monitoring/grafana/dashboards/ai-slo-dashboard.json",
|
|
"expected_scope": "grafana_ai_slo_dashboard",
|
|
"config_kind": "grafana_dashboard",
|
|
"control_tier": "C1",
|
|
"current_state": "dashboard_json_visible_apply_not_authorized",
|
|
"observability_scope": ["AI SLO", "dashboard panels"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 dashboard owner、folder owner、UID conflict check 與 import rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "grafana_ollama_failover_dashboard",
|
|
"label": "Grafana Ollama failover dashboard",
|
|
"source_path": "ops/monitoring/grafana/dashboards/ollama_failover.json",
|
|
"expected_scope": "grafana_ollama_failover_dashboard",
|
|
"config_kind": "grafana_dashboard",
|
|
"control_tier": "C1",
|
|
"current_state": "dashboard_json_visible_apply_not_authorized",
|
|
"observability_scope": ["Ollama failover", "AI provider health"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 dashboard owner、AI provider owner、Grafana import owner 與 rollback ref。",
|
|
},
|
|
{
|
|
"surface_id": "grafana_ai_monitoring_dashboard",
|
|
"label": "Grafana AI monitoring dashboard",
|
|
"source_path": "ops/grafana/dashboards/ai-monitoring.json",
|
|
"expected_scope": "grafana_ai_monitoring_dashboard",
|
|
"config_kind": "grafana_dashboard",
|
|
"control_tier": "C1",
|
|
"current_state": "dashboard_json_visible_apply_not_authorized",
|
|
"observability_scope": ["AI monitoring", "runtime health"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 dashboard owner、panel query owner、import window 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "grafana_infra_monitoring_dashboard",
|
|
"label": "Grafana infra monitoring dashboard",
|
|
"source_path": "ops/grafana/dashboards/infra-monitoring.json",
|
|
"expected_scope": "grafana_infra_monitoring_dashboard",
|
|
"config_kind": "grafana_dashboard",
|
|
"control_tier": "C1",
|
|
"current_state": "dashboard_json_visible_apply_not_authorized",
|
|
"observability_scope": ["infrastructure monitoring", "host health"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 infra dashboard owner、host source owner、import owner 與 smoke plan。",
|
|
},
|
|
{
|
|
"surface_id": "grafana_nvidia_nemotron_dashboard",
|
|
"label": "Grafana NVIDIA / NemoTron dashboard",
|
|
"source_path": "ops/grafana/dashboards/nvidia-nemotron.json",
|
|
"expected_scope": "grafana_nvidia_nemotron_dashboard",
|
|
"config_kind": "grafana_dashboard",
|
|
"control_tier": "C1",
|
|
"current_state": "dashboard_json_visible_apply_not_authorized",
|
|
"observability_scope": ["NVIDIA", "NemoTron", "GPU AI route"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 GPU / NemoTron owner、dashboard UID owner、import owner 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "signoz_otel_collector_config",
|
|
"label": "SigNoz OTEL collector config",
|
|
"source_path": "ops/signoz/otel-collector-config-phase-o.yaml",
|
|
"expected_scope": "signoz_otel_collector_phase_o",
|
|
"config_kind": "otel_collector_config",
|
|
"control_tier": "C1",
|
|
"current_state": "otel_collector_config_visible_reload_not_authorized",
|
|
"observability_scope": ["OTEL collector", "traces / metrics / logs routing"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 OTEL owner、pipeline diff、secret redaction proof、reload owner 與 data export boundary。",
|
|
},
|
|
{
|
|
"surface_id": "signoz_alerting_rules",
|
|
"label": "SigNoz alerting rules",
|
|
"source_path": "ops/signoz/alerting/rules.yaml",
|
|
"expected_scope": "signoz_alerting_rules",
|
|
"config_kind": "signoz_alert_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "signoz_rules_visible_apply_not_authorized",
|
|
"observability_scope": ["SigNoz alert rules", "logs / traces alerting"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 SigNoz rule owner、apply owner、receiver mapping、silence owner 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "signoz_log_rules",
|
|
"label": "SigNoz log alert rules",
|
|
"source_path": "ops/signoz/alerting/log-rules.md",
|
|
"expected_scope": "signoz_log_alert_rules",
|
|
"config_kind": "signoz_log_rules",
|
|
"control_tier": "C1",
|
|
"current_state": "log_rule_runbook_visible_apply_not_authorized",
|
|
"observability_scope": ["log alerting", "manual rule translation"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 log rule owner、manual translation reviewer、apply proof 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "sentry_self_hosted_compose",
|
|
"label": "Sentry self-hosted compose",
|
|
"source_path": "ops/sentry-self-hosted/docker-compose.yml",
|
|
"expected_scope": "sentry_self_hosted_runtime",
|
|
"config_kind": "sentry_runtime_compose",
|
|
"control_tier": "C1",
|
|
"current_state": "sentry_compose_visible_live_hash_missing",
|
|
"observability_scope": ["Sentry self-hosted", "ClickHouse / Postgres / Redis boundary"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 Sentry runtime owner、live hash、admin secret boundary、upgrade window 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "sentry_self_hosted_deploy",
|
|
"label": "Sentry self-hosted deploy script",
|
|
"source_path": "ops/sentry-self-hosted/deploy.sh",
|
|
"expected_scope": "sentry_self_hosted_deploy",
|
|
"config_kind": "sentry_deploy_script",
|
|
"control_tier": "C1",
|
|
"current_state": "deploy_script_visible_not_executed",
|
|
"observability_scope": ["Sentry deploy", "compose pull / up", "migration boundary"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 deploy owner、maintenance window、backup owner、migration rollback owner 與 smoke plan。",
|
|
},
|
|
{
|
|
"surface_id": "langfuse_compose",
|
|
"label": "Langfuse compose",
|
|
"source_path": "infra/langfuse/docker-compose.yml",
|
|
"expected_scope": "langfuse_observability_runtime",
|
|
"config_kind": "langfuse_runtime_compose",
|
|
"control_tier": "C1",
|
|
"current_state": "langfuse_compose_visible_live_hash_missing",
|
|
"observability_scope": ["Langfuse", "LLM trace storage", "PostgreSQL dependency"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 Langfuse owner、trace privacy owner、live hash、restart window 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "langfuse_readme",
|
|
"label": "Langfuse deployment README",
|
|
"source_path": "infra/langfuse/README.md",
|
|
"expected_scope": "langfuse_observability_runbook",
|
|
"config_kind": "langfuse_runbook",
|
|
"control_tier": "C1",
|
|
"current_state": "runbook_visible_needs_stale_disposition",
|
|
"observability_scope": ["Langfuse deployment", "trace governance"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 runbook owner、live deployment parity、trace retention owner 與 secret redaction proof。",
|
|
},
|
|
{
|
|
"surface_id": "service_health_failure_policy_snapshot",
|
|
"label": "Service health failure notification policy",
|
|
"source_path": "docs/evaluations/service_health_failure_notification_policy_2026-06-05.json",
|
|
"expected_scope": "service_health_failure_notification_policy",
|
|
"config_kind": "notification_policy_snapshot",
|
|
"control_tier": "C1",
|
|
"current_state": "failure_only_policy_visible_live_send_gate_closed",
|
|
"observability_scope": ["failure-only notification", "redaction contract"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 notification owner、receiver owner、live receipt proof、redaction proof 與 retry boundary。",
|
|
},
|
|
{
|
|
"surface_id": "backup_notification_policy_snapshot",
|
|
"label": "Backup notification policy",
|
|
"source_path": "docs/evaluations/backup_notification_policy_2026-06-04.json",
|
|
"expected_scope": "backup_notification_policy",
|
|
"config_kind": "notification_policy_snapshot",
|
|
"control_tier": "C1",
|
|
"current_state": "backup_notification_policy_visible_live_send_gate_closed",
|
|
"observability_scope": ["backup failure notification", "restore / offsite alerting"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 backup notification owner、Telegram receipt owner、failure-only proof 與 false-green guard。",
|
|
},
|
|
{
|
|
"surface_id": "observability_contract_matrix_snapshot",
|
|
"label": "Observability contract matrix",
|
|
"source_path": "docs/evaluations/observability_contract_matrix_2026-06-05.json",
|
|
"expected_scope": "observability_contract_matrix",
|
|
"config_kind": "observability_contract_snapshot",
|
|
"control_tier": "C1",
|
|
"current_state": "contract_matrix_visible_needs_live_parity",
|
|
"observability_scope": ["observability contract", "source coverage", "runtime evidence"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 contract owner、live parity owner、coverage gap disposition 與 evidence freshness owner。",
|
|
},
|
|
{
|
|
"surface_id": "telegram_action_required_digest_snapshot",
|
|
"label": "Telegram action required digest policy",
|
|
"source_path": "docs/evaluations/ai_agent_telegram_action_required_digest_policy_2026-06-11.json",
|
|
"expected_scope": "ai_agent_telegram_digest_policy",
|
|
"config_kind": "telegram_policy_snapshot",
|
|
"control_tier": "C1",
|
|
"current_state": "telegram_policy_visible_live_send_gate_closed",
|
|
"observability_scope": ["Telegram digest", "action required routing"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 Telegram owner、receiver owner、receipt owner、redaction proof 與 retry boundary。",
|
|
},
|
|
{
|
|
"surface_id": "telegram_receipt_approval_snapshot",
|
|
"label": "Telegram receipt approval package",
|
|
"source_path": "docs/evaluations/ai_agent_telegram_receipt_approval_package_2026-06-11.json",
|
|
"expected_scope": "ai_agent_telegram_receipt_policy",
|
|
"config_kind": "telegram_policy_snapshot",
|
|
"control_tier": "C1",
|
|
"current_state": "receipt_policy_visible_live_receipt_gate_closed",
|
|
"observability_scope": ["delivery receipt", "ack / retry policy"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 receipt owner、gateway queue owner、ack policy owner 與 no-live-send boundary。",
|
|
},
|
|
{
|
|
"surface_id": "telegram_gateway_service",
|
|
"label": "Telegram gateway service",
|
|
"source_path": "apps/api/src/services/telegram_gateway.py",
|
|
"expected_scope": "telegram_gateway_runtime_sender",
|
|
"config_kind": "telegram_runtime_sender",
|
|
"control_tier": "C1",
|
|
"current_state": "send_capable_service_visible_gate_closed",
|
|
"observability_scope": ["Telegram Bot API", "dedup", "delivery queue"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 gateway owner、token injection owner、receipt owner、send approval gate 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "notification_manager_service",
|
|
"label": "Notification manager",
|
|
"source_path": "apps/api/src/services/notifications/manager.py",
|
|
"expected_scope": "notification_runtime_routing",
|
|
"config_kind": "notification_runtime_router",
|
|
"control_tier": "C1",
|
|
"current_state": "runtime_router_visible_send_gate_closed",
|
|
"observability_scope": ["notification channel routing", "Telegram / Discord manager"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 channel owner、routing owner、failure-only policy owner、receipt owner 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "notification_matrix_service",
|
|
"label": "Notification matrix",
|
|
"source_path": "apps/api/src/services/notification_matrix.py",
|
|
"expected_scope": "notification_matrix_policy",
|
|
"config_kind": "notification_policy_code",
|
|
"control_tier": "C1",
|
|
"current_state": "policy_code_visible_runtime_gate_closed",
|
|
"observability_scope": ["notification policy", "recipient / channel mapping"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 matrix owner、recipient owner、channel parity evidence 與 redaction policy。",
|
|
},
|
|
{
|
|
"surface_id": "alert_chain_metrics_service",
|
|
"label": "Alert chain metrics service",
|
|
"source_path": "apps/api/src/services/alert_chain_metrics_service.py",
|
|
"expected_scope": "alert_chain_metrics",
|
|
"config_kind": "alert_metrics_service",
|
|
"control_tier": "C1",
|
|
"current_state": "metrics_service_visible_needs_runtime_evidence",
|
|
"observability_scope": ["alert chain metrics", "delivery visibility"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 metrics owner、Prometheus scrape owner、delivery evidence 與 false-green guard。",
|
|
},
|
|
{
|
|
"surface_id": "converged_alert_recurrence_notifier",
|
|
"label": "Converged alert recurrence notifier",
|
|
"source_path": "apps/api/src/services/converged_alert_recurrence_notifier.py",
|
|
"expected_scope": "recurrence_notification_runtime",
|
|
"config_kind": "notification_runtime_router",
|
|
"control_tier": "C1",
|
|
"current_state": "recurrence_notifier_visible_send_gate_closed",
|
|
"observability_scope": ["recurrence notification", "dedup", "escalation routing"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 recurrence owner、noise budget owner、receipt owner、silence boundary 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "sentry_webhook_service",
|
|
"label": "Sentry webhook service",
|
|
"source_path": "apps/api/src/services/sentry_webhook_service.py",
|
|
"expected_scope": "sentry_webhook_receiver",
|
|
"config_kind": "webhook_receiver",
|
|
"control_tier": "C1",
|
|
"current_state": "webhook_receiver_visible_secret_value_not_collected",
|
|
"observability_scope": ["Sentry webhook", "signature validation", "triage route"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 webhook owner、secret name owner、signature proof、route smoke 與 replay boundary。",
|
|
},
|
|
{
|
|
"surface_id": "signoz_client_service",
|
|
"label": "SigNoz client service",
|
|
"source_path": "apps/api/src/services/signoz_client.py",
|
|
"expected_scope": "signoz_api_client",
|
|
"config_kind": "signoz_observability_client",
|
|
"control_tier": "C1",
|
|
"current_state": "client_visible_live_api_read_not_executed",
|
|
"observability_scope": ["SigNoz API", "log query", "trace query"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 SigNoz client owner、token name owner、rate limit boundary 與 read-only smoke owner。",
|
|
},
|
|
{
|
|
"surface_id": "sentry_service_client",
|
|
"label": "Sentry service client",
|
|
"source_path": "apps/api/src/services/sentry_service.py",
|
|
"expected_scope": "sentry_api_client",
|
|
"config_kind": "sentry_observability_client",
|
|
"control_tier": "C1",
|
|
"current_state": "client_visible_live_api_read_not_executed",
|
|
"observability_scope": ["Sentry API", "issue query", "AI triage"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 Sentry client owner、token name owner、read-only evidence 與 privacy boundary。",
|
|
},
|
|
{
|
|
"surface_id": "langfuse_client_service",
|
|
"label": "Langfuse client service",
|
|
"source_path": "apps/api/src/services/langfuse_client.py",
|
|
"expected_scope": "langfuse_api_client",
|
|
"config_kind": "langfuse_observability_client",
|
|
"control_tier": "C1",
|
|
"current_state": "client_visible_trace_read_write_boundary_needs_owner",
|
|
"observability_scope": ["Langfuse API", "AI trace", "prompt / output privacy"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 Langfuse owner、trace privacy owner、token name owner、write gate 與 retention owner。",
|
|
},
|
|
{
|
|
"surface_id": "deploy_alertmanager_config_script",
|
|
"label": "Alertmanager config deploy script",
|
|
"source_path": "scripts/ops/deploy-alertmanager-config.sh",
|
|
"expected_scope": "alertmanager_config_deploy",
|
|
"config_kind": "reload_capable_script",
|
|
"control_tier": "C1",
|
|
"current_state": "reload_capable_script_visible_gate_closed",
|
|
"observability_scope": ["Alertmanager config copy", "reload"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 deploy owner、maintenance window、rollback ref、receiver smoke 與 failure-only notification proof。",
|
|
},
|
|
{
|
|
"surface_id": "deploy_prometheus_alerts_script",
|
|
"label": "Prometheus alerts deploy script",
|
|
"source_path": "scripts/ops/deploy-alerts.sh",
|
|
"expected_scope": "prometheus_alerts_deploy",
|
|
"config_kind": "reload_capable_script",
|
|
"control_tier": "C1",
|
|
"current_state": "reload_capable_script_visible_gate_closed",
|
|
"observability_scope": ["Prometheus alert rule deploy", "reload"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 deploy owner、rule test proof、reload owner、receiver smoke 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "prometheus_rule_drift_guard_script",
|
|
"label": "Prometheus rule drift guard",
|
|
"source_path": "scripts/ops/prometheus-rule-drift-guard.sh",
|
|
"expected_scope": "prometheus_rule_drift_guard",
|
|
"config_kind": "drift_guard_script",
|
|
"control_tier": "C1",
|
|
"current_state": "drift_guard_visible_not_executed_by_inventory",
|
|
"observability_scope": ["rule diff", "drift evidence", "reload guard"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 drift guard owner、live rule export owner、diff disposition 與 blocking policy owner。",
|
|
},
|
|
{
|
|
"surface_id": "k8s_deploy_prometheus_config_script",
|
|
"label": "K8s Prometheus config deploy script",
|
|
"source_path": "k8s/monitoring/deploy-prometheus-config.sh",
|
|
"expected_scope": "k8s_prometheus_config_deploy",
|
|
"config_kind": "reload_capable_script",
|
|
"control_tier": "C1",
|
|
"current_state": "deploy_script_visible_not_executed",
|
|
"observability_scope": ["K8s config apply", "Prometheus reload"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 K8s deploy owner、kubectl / ArgoCD boundary、reload owner 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "api_apply_prometheus_config_script",
|
|
"label": "API Prometheus config apply script",
|
|
"source_path": "apps/api/scripts/apply_prometheus_config.sh",
|
|
"expected_scope": "api_prometheus_config_apply",
|
|
"config_kind": "reload_capable_script",
|
|
"control_tier": "C1",
|
|
"current_state": "apply_script_visible_not_executed",
|
|
"observability_scope": ["Prometheus config apply", "API-owned alert deploy"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 API deploy owner、config source owner、reload proof 與 rollback owner。",
|
|
},
|
|
{
|
|
"surface_id": "monitoring_exporter_deploy_script",
|
|
"label": "Monitoring exporter deploy script",
|
|
"source_path": "ops/monitoring/deploy-exporters.sh",
|
|
"expected_scope": "monitoring_exporter_deploy",
|
|
"config_kind": "host_deploy_script",
|
|
"control_tier": "C1",
|
|
"current_state": "host_deploy_script_visible_gate_closed",
|
|
"observability_scope": ["exporter deploy", "host service restart"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 host owner、SSH boundary、restart window、rollback owner 與 exporter scrape proof。",
|
|
},
|
|
{
|
|
"surface_id": "fire_live_alert_script",
|
|
"label": "Live alert firing script",
|
|
"source_path": "apps/api/scripts/fire_live_alert.py",
|
|
"expected_scope": "live_alert_fire_script",
|
|
"config_kind": "live_alert_sender",
|
|
"control_tier": "C1",
|
|
"current_state": "live_alert_sender_visible_gate_closed",
|
|
"observability_scope": ["live alert", "Alertmanager ingestion", "notification route"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 live alert owner、test window、receiver owner、noise budget 與 rollback / stop condition。",
|
|
},
|
|
{
|
|
"surface_id": "fire_test_alert_script",
|
|
"label": "Test alert firing script",
|
|
"source_path": "apps/api/scripts/fire_test_alert.py",
|
|
"expected_scope": "test_alert_fire_script",
|
|
"config_kind": "live_alert_sender",
|
|
"control_tier": "C1",
|
|
"current_state": "test_alert_sender_visible_gate_closed",
|
|
"observability_scope": ["test alert", "notification chain smoke"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 test alert owner、allowed receiver、dedup proof、noisy route guard 與 cleanup owner。",
|
|
},
|
|
{
|
|
"surface_id": "alert_chain_smoke_script",
|
|
"label": "Alert chain smoke script",
|
|
"source_path": "scripts/alert_chain_smoke_test.py",
|
|
"expected_scope": "alert_chain_smoke",
|
|
"config_kind": "smoke_script",
|
|
"control_tier": "C1",
|
|
"current_state": "smoke_script_visible_not_executed_by_inventory",
|
|
"observability_scope": ["alert chain E2E", "notification delivery"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 smoke owner、execution window、receiver owner、expected receipt 與 cleanup owner。",
|
|
},
|
|
{
|
|
"surface_id": "ops_alert_chain_smoke_script",
|
|
"label": "Ops alert chain smoke script",
|
|
"source_path": "ops/scripts/alert_chain_smoke_test.py",
|
|
"expected_scope": "ops_alert_chain_smoke",
|
|
"config_kind": "smoke_script",
|
|
"control_tier": "C1",
|
|
"current_state": "ops_smoke_script_visible_not_executed_by_inventory",
|
|
"observability_scope": ["ops alert chain", "Alertmanager / Telegram smoke"],
|
|
"requires_live_evidence": True,
|
|
"requires_owner_response": True,
|
|
"next_owner_action": "補 ops smoke owner、execution window、receiver owner、expected receipt 與 rollback owner。",
|
|
},
|
|
]
|
|
|
|
|
|
# Execution-boundary flags published verbatim by build_report() under the
# "execution_boundaries" key. Every flag is False: the inventory itself
# authorizes no runtime action of any kind.
FALSE_BOUNDARIES: dict[str, bool] = dict.fromkeys(
    (
        "runtime_execution_authorized",
        "host_write_authorized",
        "prometheus_reload_authorized",
        "alertmanager_reload_authorized",
        "grafana_dashboard_apply_authorized",
        "signoz_rule_apply_authorized",
        "sentry_deploy_authorized",
        "langfuse_config_change_authorized",
        "otel_collector_reload_authorized",
        "receiver_route_change_authorized",
        "silence_policy_change_authorized",
        "telegram_send_authorized",
        "notification_route_change_authorized",
        "webhook_receiver_change_authorized",
        "remote_write_change_authorized",
        "exporter_deploy_authorized",
        "live_alert_fire_authorized",
        "alert_chain_smoke_authorized",
        "ssh_read_authorized",
        "ssh_write_authorized",
        "kubectl_action_authorized",
        "secret_value_collection_allowed",
        "active_scan_authorized",
        "action_buttons_allowed",
    ),
    False,
)
|
|
|
|
|
|
# config_kind values that build_report() treats as write-capable: surfaces of
# these kinds are counted in "write_capable_surface_count" and listed under
# "write_capable_surfaces" together with their required gate.
WRITE_CAPABLE_KINDS: set[str] = {
    # deploy / reload scripts
    "reload_capable_script",
    "host_deploy_script",
    "sentry_deploy_script",
    # runtime notification senders and routers
    "telegram_runtime_sender",
    "notification_runtime_router",
    # alert-firing scripts
    "live_alert_sender",
}
|
|
|
|
|
|
def git_short_sha(root: Path) -> str:
    """Return the abbreviated commit hash of HEAD for the repo at *root*.

    Runs ``git rev-parse --short HEAD`` with *root* as the working directory.
    The lookup is best-effort: ``"unknown"`` is returned when git cannot be
    started (missing binary, unusable working directory), when the command
    exits with a non-zero status, or when it prints nothing.

    Args:
        root: Directory in which to run git.

    Returns:
        The stripped stdout of the git command, or ``"unknown"``.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            cwd=root,
            check=True,
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: git or the working directory is unavailable.
        # SubprocessError: includes CalledProcessError raised by check=True.
        # Anything else is a programming error and is allowed to surface.
        return "unknown"
    return result.stdout.strip() or "unknown"
|
|
|
|
|
|
def file_metadata(root: Path, source_path: str) -> dict[str, Any]:
    """Describe the file at ``root / source_path`` without interpreting it.

    Args:
        root: Base directory that *source_path* is joined onto.
        source_path: Path of the source file, relative to *root*.

    Returns:
        A dict with three keys:

        * ``source_exists`` - True when the path is a regular file.
        * ``line_count`` - number of lines after decoding the bytes as UTF-8
          (undecodable bytes are replaced), or 0 when the file is absent.
        * ``sha256`` - hex digest of the raw bytes, or None when absent.
    """
    path = root / source_path
    # is_file() rather than exists(): a directory (e.g. an empty source_path
    # resolving to root itself) has no bytes to hash and would make
    # read_bytes() raise, so it is reported as a missing source instead.
    if not path.is_file():
        return {"source_exists": False, "line_count": 0, "sha256": None}
    content = path.read_bytes()
    return {
        "source_exists": True,
        "line_count": len(content.decode("utf-8", errors="replace").splitlines()),
        "sha256": hashlib.sha256(content).hexdigest(),
    }
|
|
|
|
|
|
def build_surface(root: Path, surface: dict[str, Any]) -> dict[str, Any]:
    """Expand one static surface definition into a report entry.

    The entry is a copy of *surface*, overlaid with the file metadata of its
    ``source_path`` and then with a fixed set of acceptance / gate flags, all
    of which are False. Later layers win on key collisions, so the flags can
    never be switched on from the static definition.

    Args:
        root: Repo root used to resolve ``surface["source_path"]``.
        surface: Static surface definition; must contain ``source_path``.

    Returns:
        A new dict; *surface* itself is not modified.
    """
    closed_flags = (
        "owner_response_received",
        "owner_response_accepted",
        "live_evidence_received",
        "reload_owner_accepted",
        "receiver_owner_accepted",
        "route_smoke_accepted",
        "maintenance_window_accepted",
        "rollback_owner_accepted",
        "runtime_gate_open",
        "action_buttons_allowed",
    )
    entry = dict(surface)
    entry.update(file_metadata(root, surface["source_path"]))
    entry.update(dict.fromkeys(closed_flags, False))
    return entry
|
|
|
|
|
|
def count_kind(surfaces: list[dict[str, Any]], kinds: set[str]) -> int:
    """Count the surfaces whose ``config_kind`` is one of *kinds*.

    Args:
        surfaces: Surface entries, each carrying a ``config_kind`` key.
        kinds: The ``config_kind`` values to count.

    Returns:
        Number of matching entries.
    """
    matching = [entry for entry in surfaces if entry["config_kind"] in kinds]
    return len(matching)
|
|
|
|
|
|
def build_report(root: Path, generated_at: str | None) -> dict[str, Any]:
    """Assemble the repo-only inventory report.

    Every entry of ``SURFACES`` is expanded with ``build_surface`` and the
    results are summarised per ``config_kind`` group. All acceptance, runtime
    gate and action-button counters in the summary are fixed at 0, and the
    two coverage percentages are fixed constants.

    Args:
        root: Repo root used to resolve source paths and to query git.
        generated_at: Timestamp to embed as-is; when empty or None the
            current Taipei time (second precision, ISO 8601) is used.

    Returns:
        The report as a dict ready for ``json.dumps``.
    """
    if generated_at:
        report_time = generated_at
    else:
        report_time = datetime.now(TAIPEI).isoformat(timespec="seconds")

    surfaces = [build_surface(root, entry) for entry in SURFACES]
    expected_scopes = sorted({entry["expected_scope"] for entry in surfaces})
    write_capable = [entry for entry in surfaces if entry["config_kind"] in WRITE_CAPABLE_KINDS]

    # Summary counter name -> the config_kind values it aggregates.
    kind_groups: dict[str, set[str]] = {
        "prometheus_config_surface_count": {
            "prometheus_config",
            "prometheus_remote_write",
            "prometheus_generated_config",
            "monitoring_service_registry",
            "exporter_query_config",
        },
        "alert_rule_surface_count": {
            "prometheus_alert_rules",
            "app_alert_rule_contract",
            "grafana_alert_rules",
        },
        "alertmanager_receiver_surface_count": {"alertmanager_receiver_config"},
        "grafana_surface_count": {"grafana_alert_rules", "grafana_dashboard"},
        "signoz_surface_count": {
            "signoz_alert_rules",
            "signoz_log_rules",
            "signoz_observability_client",
        },
        "sentry_surface_count": {
            "sentry_runtime_compose",
            "sentry_deploy_script",
            "webhook_receiver",
            "sentry_observability_client",
        },
        "langfuse_surface_count": {
            "langfuse_runtime_compose",
            "langfuse_runbook",
            "langfuse_observability_client",
        },
        "notification_policy_surface_count": {
            "notification_policy_snapshot",
            "observability_contract_snapshot",
            "notification_policy_code",
        },
        "telegram_surface_count": {"telegram_policy_snapshot", "telegram_runtime_sender"},
        "otel_surface_count": {"otel_collector_config"},
        "deploy_or_reload_surface_count": {
            "reload_capable_script",
            "host_deploy_script",
            "sentry_deploy_script",
        },
        "drift_guard_surface_count": {"drift_guard_script"},
        "smoke_surface_count": {"smoke_script", "live_alert_sender"},
    }

    # Counters that stay at zero in this report.
    zero_counters = (
        "owner_response_received_count",
        "owner_response_accepted_count",
        "live_evidence_received_count",
        "reload_owner_accepted_count",
        "receiver_owner_accepted_count",
        "route_smoke_accepted_count",
        "maintenance_window_accepted_count",
        "rollback_owner_accepted_count",
        "runtime_gate_count",
        "action_button_count",
    )

    summary: dict[str, Any] = {
        "surface_count": len(surfaces),
        "source_exists_count": len([entry for entry in surfaces if entry["source_exists"]]),
        "expected_scope_count": len(expected_scopes),
    }
    for counter_name, kinds in kind_groups.items():
        summary[counter_name] = count_kind(surfaces, kinds)
    summary["write_capable_surface_count"] = len(write_capable)
    summary["surfaces_requiring_owner_response_count"] = len(
        [entry for entry in surfaces if entry["requires_owner_response"]]
    )
    summary["surfaces_requiring_live_evidence_count"] = len(
        [entry for entry in surfaces if entry["requires_live_evidence"]]
    )
    summary.update(dict.fromkeys(zero_counters, 0))
    summary["coverage_percent_before_inventory"] = 56
    summary["coverage_percent_after_inventory"] = 62

    # Reduced view of each write-capable surface plus the gate it requires.
    copied_fields = ("surface_id", "label", "config_kind", "expected_scope")
    write_capable_view = []
    for entry in write_capable:
        row = {field: entry[field] for field in copied_fields}
        row["required_gate"] = "owner_response_plus_maintenance_window_plus_rollback_owner"
        write_capable_view.append(row)

    next_collection_order = [
        "alertmanager_receiver_config",
        "prometheus_alerts_ops",
        "prometheus_rule_drift_guard_script",
        "deploy_alertmanager_config_script",
        "deploy_prometheus_alerts_script",
        "monitoring_110_compose",
        "signoz_otel_collector_config",
        "sentry_self_hosted_compose",
        "telegram_gateway_service",
        "alert_chain_smoke_script",
    ]

    operator_interpretation = [
        "這是 monitoring / alerting / observability 的 repo-only 清冊,不是 live monitoring truth。",
        "source_exists 與 sha256 只代表 repo source 可追溯,不代表 live Prometheus / Alertmanager / Grafana / SigNoz / Sentry 已一致。",
        "write-capable surface 可改 route、reload、send notification、deploy exporter 或發測試告警;在 owner response、維護窗口、rollback owner 前全部維持 gate closed。",
        "不得從本清冊啟動 reload、silence、Telegram send、Sentry deploy、SigNoz apply、Prometheus remote_write 變更、SSH 或 kubectl。",
    ]

    return {
        "schema_version": "monitoring_alerting_observability_inventory_v1",
        "generated_at": report_time,
        "git_commit": git_short_sha(root),
        "status": "repo_only_inventory_ready",
        "source_scope": "committed_repo_files_only",
        "summary": summary,
        "execution_boundaries": FALSE_BOUNDARIES,
        "expected_scopes": expected_scopes,
        "observability_surfaces": surfaces,
        "write_capable_surfaces": write_capable_view,
        "next_collection_order": next_collection_order,
        "operator_interpretation": operator_interpretation,
    }
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point: build the report and emit it as JSON.

    The JSON payload goes to the ``--output`` file (parent directories are
    created, a trailing newline is appended) or, without ``--output``, to
    stdout. A one-line status summary is always printed to stderr.

    Returns:
        Process exit code, always 0.
    """
    cli = argparse.ArgumentParser(description="IwoooS monitoring / alerting / observability repo-only 清冊")
    cli.add_argument("--root", default=".", help="repo root")
    cli.add_argument("--output", help="寫出 JSON 報告")
    cli.add_argument("--generated-at", help="固定報告時間,供 committed snapshot 使用")
    options = cli.parse_args()

    repo_root = Path(options.root).resolve()
    report = build_report(repo_root, options.generated_at)
    payload = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)

    if not options.output:
        print(payload)
    else:
        destination = Path(options.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(payload + "\n", encoding="utf-8")

    summary = report["summary"]
    # Status goes to stderr so it never mixes with a JSON payload on stdout.
    status_line = (
        "MONITORING_ALERTING_OBSERVABILITY_INVENTORY_OK "
        f"surfaces={summary['surface_count']} "
        f"alert_rules={summary['alert_rule_surface_count']} "
        f"write_capable={summary['write_capable_surface_count']} "
        f"runtime_gate={summary['runtime_gate_count']}"
    )
    print(status_line, file=sys.stderr)
    return 0
|
|
|
|
|
|
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|