Files
awoooi/scripts/security/monitoring-alerting-observability-inventory.py
Your Name 8a424f0c56
All checks were successful
CD Pipeline / tests (push) Successful in 1m26s
Code Review / ai-code-review (push) Successful in 23s
CD Pipeline / build-and-deploy (push) Successful in 4m52s
CD Pipeline / post-deploy-checks (push) Successful in 1m59s
feat(security): 新增 monitoring alerting 只讀清冊
2026-06-12 00:45:08 +08:00

1040 lines
50 KiB
Python

#!/usr/bin/env python3
"""
IwoooS monitoring / alerting / observability repo-only 清冊。
本工具只讀取已提交的 repo 檔案,整理 Prometheus、Alertmanager、
Grafana、SigNoz、Sentry、Langfuse、OTEL、Telegram / notification
policy、deploy / reload scripts 與 smoke scripts。它不連 live
Prometheus、不 reload Alertmanager、不改 Grafana、不套用 SigNoz rule、
不部署 Sentry、不發 Telegram、不建立 silence、不 SSH、不 kubectl、不讀
secret value。
"""
from __future__ import annotations
import argparse
import hashlib
import json
import subprocess
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any
# Fixed UTC+08:00 offset (a plain datetime.timezone, so no DST transitions),
# bound to the module-level name TAIPEI.
TAIPEI = timezone(timedelta(minutes=8 * 60))
SURFACES: list[dict[str, Any]] = [
{
"surface_id": "prometheus_k8s_base_config",
"label": "K8s Prometheus base config",
"source_path": "k8s/monitoring/prometheus.yml",
"expected_scope": "k8s_monitoring_prometheus_base",
"config_kind": "prometheus_config",
"control_tier": "C1",
"current_state": "repo_source_visible_reload_not_authorized",
"observability_scope": ["scrape config", "service discovery", "alert rule includes"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 Prometheus owner、live config hash、reload owner、rule diff 與 route smoke 指標。",
},
{
"surface_id": "prometheus_k8s_additions_config",
"label": "K8s Prometheus additions",
"source_path": "k8s/monitoring/prometheus-config-additions.yaml",
"expected_scope": "k8s_monitoring_prometheus_additions",
"config_kind": "prometheus_config",
"control_tier": "C1",
"current_state": "repo_source_visible_needs_drift_disposition",
"observability_scope": ["supplemental scrape", "additional rule paths"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 additions owner、live parity evidence、rollout window 與 rollback owner。",
},
{
"surface_id": "prometheus_phase_o_config",
"label": "Prometheus Phase O config",
"source_path": "k8s/monitoring/prometheus-config-phase-o.yaml",
"expected_scope": "phase_o_prometheus_config",
"config_kind": "prometheus_config",
"control_tier": "C1",
"current_state": "phase_config_visible_reload_not_authorized",
"observability_scope": ["Phase O monitoring", "baseline scrape"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 Phase O owner、stale config disposition、reload owner 與 post-check 指標。",
},
{
"surface_id": "prometheus_remote_write_signoz",
"label": "Prometheus remote write to SigNoz",
"source_path": "k8s/monitoring/prometheus-remote-write-signoz.yaml",
"expected_scope": "prometheus_remote_write_signoz",
"config_kind": "prometheus_remote_write",
"control_tier": "C1",
"current_state": "data_export_config_visible_gate_closed",
"observability_scope": ["remote write", "SigNoz ingestion", "data export boundary"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 data export owner、privacy boundary、SigNoz ingest owner、rollback owner 與 volume guard。",
},
{
"surface_id": "prometheus_generated_scrape_config",
"label": "Generated Prometheus scrape config",
"source_path": "ops/monitoring/generated/prometheus-scrape-generated.yaml",
"expected_scope": "generated_scrape_targets",
"config_kind": "prometheus_generated_config",
"control_tier": "C1",
"current_state": "generated_source_visible_needs_source_registry_parity",
"observability_scope": ["generated targets", "service registry"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 generator owner、service registry parity、live target count 與 stale target disposition。",
},
{
"surface_id": "prometheus_generated_blackbox_targets",
"label": "Generated blackbox targets",
"source_path": "ops/monitoring/generated/blackbox-targets-generated.yaml",
"expected_scope": "generated_blackbox_targets",
"config_kind": "prometheus_generated_config",
"control_tier": "C1",
"current_state": "generated_blackbox_targets_visible_probe_not_executed",
"observability_scope": ["blackbox targets", "public route smoke candidates"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 blackbox owner、target freshness、public route owner 與 probe execution approval。",
},
{
"surface_id": "monitoring_service_registry",
"label": "Monitoring service registry",
"source_path": "ops/monitoring/service-registry.yaml",
"expected_scope": "monitoring_service_registry",
"config_kind": "monitoring_service_registry",
"control_tier": "C1",
"current_state": "registry_visible_needs_owner_disposition",
"observability_scope": ["service registry", "target generation"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 registry owner、產品 owner mapping、retired target disposition 與 generator smoke。",
},
{
"surface_id": "postgres_exporter_queries",
"label": "Postgres exporter query config",
"source_path": "ops/monitoring/postgres-exporter-queries.yaml",
"expected_scope": "postgres_exporter_queries",
"config_kind": "exporter_query_config",
"control_tier": "C1",
"current_state": "query_config_visible_needs_db_owner_review",
"observability_scope": ["PostgreSQL metrics", "custom exporter queries"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 DB owner、query cost boundary、metric name owner 與 rollback owner。",
},
{
"surface_id": "monitoring_110_compose",
"label": "110 monitoring compose",
"source_path": "k8s/monitoring/docker-compose-110.yml",
"expected_scope": "192.168.0.110_monitoring_stack",
"config_kind": "monitoring_runtime_compose",
"control_tier": "C1",
"current_state": "runtime_compose_visible_live_hash_missing",
"observability_scope": ["Prometheus", "Grafana", "Alertmanager", "blackbox", "cadvisor"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 110 live compose hash、restart window、rollback owner、admin secret boundary 與 post-check。",
},
{
"surface_id": "monitoring_exporters_compose",
"label": "188 monitoring exporters compose",
"source_path": "ops/monitoring/docker-compose.exporters.yaml",
"expected_scope": "192.168.0.188_exporters",
"config_kind": "monitoring_runtime_compose",
"control_tier": "C1",
"current_state": "exporter_compose_visible_live_hash_missing",
"observability_scope": ["postgres-exporter", "redis-exporter"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 188 exporter owner、live hash、env source policy、restart window 與 rollback owner。",
},
{
"surface_id": "alertmanager_receiver_config",
"label": "Alertmanager receiver config",
"source_path": "ops/alertmanager/alertmanager.yml",
"expected_scope": "alertmanager_routes_receivers",
"config_kind": "alertmanager_receiver_config",
"control_tier": "C1",
"current_state": "receiver_route_visible_reload_not_authorized",
"observability_scope": ["routes", "receivers", "grouping", "Telegram / webhook receiver boundary"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 receiver owner、route diff、silence policy owner、reload owner 與 failure-only notification proof。",
},
{
"surface_id": "prometheus_alerts_ops",
"label": "Ops Prometheus alerts",
"source_path": "ops/monitoring/alerts.yml",
"expected_scope": "ops_prometheus_alerts",
"config_kind": "prometheus_alert_rules",
"control_tier": "C1",
"current_state": "alert_rules_visible_reload_not_authorized",
"observability_scope": ["infrastructure alerts", "backup alerts", "route alerts"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 alert rule owner、rule diff、receiver mapping、reload owner 與 false-green guard。",
},
{
"surface_id": "prometheus_alerts_unified",
"label": "Unified Prometheus alerts",
"source_path": "ops/monitoring/alerts-unified.yml",
"expected_scope": "unified_prometheus_alerts",
"config_kind": "prometheus_alert_rules",
"control_tier": "C1",
"current_state": "unified_rules_visible_needs_receiver_parity",
"observability_scope": ["unified alerts", "routing labels"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 unified rule owner、label taxonomy owner、receiver parity 與 rollout window。",
},
{
"surface_id": "prometheus_slo_rules",
"label": "Prometheus SLO rules",
"source_path": "ops/monitoring/slo-rules.yml",
"expected_scope": "prometheus_slo_rules",
"config_kind": "prometheus_alert_rules",
"control_tier": "C1",
"current_state": "slo_rules_visible_reload_not_authorized",
"observability_scope": ["SLO burn rate", "availability indicators"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 SLO owner、error budget owner、rule test evidence 與 receiver mapping。",
},
{
"surface_id": "prometheus_ollama_health_rules",
"label": "Ollama health alert rules",
"source_path": "ops/monitoring/ollama_health_rules.yaml",
"expected_scope": "ollama_health_rules",
"config_kind": "prometheus_alert_rules",
"control_tier": "C1",
"current_state": "ai_runtime_health_rules_visible_reload_not_authorized",
"observability_scope": ["Ollama health", "AI provider availability"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 AI provider owner、fallback owner、receiver owner 與 reload owner。",
},
{
"surface_id": "k8s_k3s_alerts",
"label": "K3s alert rules",
"source_path": "k8s/monitoring/k3s-alerts.yaml",
"expected_scope": "k3s_cluster_alerts",
"config_kind": "prometheus_alert_rules",
"control_tier": "C1",
"current_state": "k3s_rules_visible_apply_not_authorized",
"observability_scope": ["K3s workloads", "cluster health"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 K3s owner、rule apply window、ArgoCD / kubectl boundary 與 rollback owner。",
},
{
"surface_id": "k8s_k3s_supplemental_alerts",
"label": "K3s supplemental alert rules",
"source_path": "k8s/monitoring/k3s-alerts-supplemental.yaml",
"expected_scope": "k3s_supplemental_alerts",
"config_kind": "prometheus_alert_rules",
"control_tier": "C1",
"current_state": "supplemental_rules_visible_apply_not_authorized",
"observability_scope": ["K3s supplemental health", "gap alerts"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 supplemental rule owner、overlap disposition、receiver owner 與 apply boundary。",
},
{
"surface_id": "k8s_database_alerts",
"label": "Database alert rules",
"source_path": "k8s/monitoring/database-alerts.yaml",
"expected_scope": "database_alerts",
"config_kind": "prometheus_alert_rules",
"control_tier": "C1",
"current_state": "database_rules_visible_apply_not_authorized",
"observability_scope": ["PostgreSQL", "Redis", "DB availability"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 DB alert owner、threshold owner、receiver mapping 與 live metric evidence。",
},
{
"surface_id": "k8s_nvidia_alerts",
"label": "NVIDIA alert rules",
"source_path": "k8s/monitoring/nvidia-alerts.yaml",
"expected_scope": "nvidia_gpu_alerts",
"config_kind": "prometheus_alert_rules",
"control_tier": "C1",
"current_state": "gpu_rules_visible_apply_not_authorized",
"observability_scope": ["GPU health", "AI workload capacity"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 GPU owner、NVIDIA exporter live evidence、receiver owner 與 rollout boundary。",
},
{
"surface_id": "k8s_minio_kali_alerts",
"label": "MinIO / Kali alert rules",
"source_path": "k8s/monitoring/minio-kali-alerts.yaml",
"expected_scope": "minio_kali_alerts",
"config_kind": "prometheus_alert_rules",
"control_tier": "C1",
"current_state": "minio_kali_rules_visible_apply_not_authorized",
"observability_scope": ["MinIO", "Kali scanner", "read-only scanner status"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 MinIO / Kali owner、active scan boundary、receiver owner 與 rule apply owner。",
},
{
"surface_id": "k8s_flywheel_alerts",
"label": "AI flywheel alert rules",
"source_path": "k8s/monitoring/flywheel-alerts.yaml",
"expected_scope": "ai_flywheel_alerts",
"config_kind": "prometheus_alert_rules",
"control_tier": "C1",
"current_state": "flywheel_rules_visible_apply_not_authorized",
"observability_scope": ["AI flywheel", "learning / runtime health"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 AI flywheel owner、runtime gate boundary、receiver owner 與 false-green guard。",
},
{
"surface_id": "k8s_alert_chain_monitor",
"label": "Alert chain monitor",
"source_path": "k8s/monitoring/alert-chain-monitor.yaml",
"expected_scope": "alert_chain_monitor",
"config_kind": "prometheus_alert_rules",
"control_tier": "C1",
"current_state": "alert_chain_monitor_visible_apply_not_authorized",
"observability_scope": ["Alertmanager chain", "E2E alert visibility"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 alert chain owner、E2E smoke owner、receiver owner 與 post-check 指標。",
},
{
"surface_id": "app_alert_rules_yaml",
"label": "API alert rule engine rules",
"source_path": "apps/api/alert_rules.yaml",
"expected_scope": "api_alert_rule_engine",
"config_kind": "app_alert_rule_contract",
"control_tier": "C1",
"current_state": "app_rule_contract_visible_runtime_change_not_authorized",
"observability_scope": ["alert classification", "approval recommendations", "playbook matching"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 rule owner、AI decision owner、classification drift evidence 與 rollback owner。",
},
{
"surface_id": "grafana_agent_step_latency_rules",
"label": "Grafana agent step latency rules",
"source_path": "ops/monitoring/grafana/agent_step_latency_rules.yaml",
"expected_scope": "grafana_agent_step_latency_rules",
"config_kind": "grafana_alert_rules",
"control_tier": "C1",
"current_state": "grafana_rules_visible_apply_not_authorized",
"observability_scope": ["agent step latency", "Grafana alerting"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 Grafana alert owner、dashboard / rule UID owner、apply owner 與 rollback owner。",
},
{
"surface_id": "grafana_ai_slo_dashboard",
"label": "Grafana AI SLO dashboard",
"source_path": "ops/monitoring/grafana/dashboards/ai-slo-dashboard.json",
"expected_scope": "grafana_ai_slo_dashboard",
"config_kind": "grafana_dashboard",
"control_tier": "C1",
"current_state": "dashboard_json_visible_apply_not_authorized",
"observability_scope": ["AI SLO", "dashboard panels"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 dashboard owner、folder owner、UID conflict check 與 import rollback owner。",
},
{
"surface_id": "grafana_ollama_failover_dashboard",
"label": "Grafana Ollama failover dashboard",
"source_path": "ops/monitoring/grafana/dashboards/ollama_failover.json",
"expected_scope": "grafana_ollama_failover_dashboard",
"config_kind": "grafana_dashboard",
"control_tier": "C1",
"current_state": "dashboard_json_visible_apply_not_authorized",
"observability_scope": ["Ollama failover", "AI provider health"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 dashboard owner、AI provider owner、Grafana import owner 與 rollback ref。",
},
{
"surface_id": "grafana_ai_monitoring_dashboard",
"label": "Grafana AI monitoring dashboard",
"source_path": "ops/grafana/dashboards/ai-monitoring.json",
"expected_scope": "grafana_ai_monitoring_dashboard",
"config_kind": "grafana_dashboard",
"control_tier": "C1",
"current_state": "dashboard_json_visible_apply_not_authorized",
"observability_scope": ["AI monitoring", "runtime health"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 dashboard owner、panel query owner、import window 與 rollback owner。",
},
{
"surface_id": "grafana_infra_monitoring_dashboard",
"label": "Grafana infra monitoring dashboard",
"source_path": "ops/grafana/dashboards/infra-monitoring.json",
"expected_scope": "grafana_infra_monitoring_dashboard",
"config_kind": "grafana_dashboard",
"control_tier": "C1",
"current_state": "dashboard_json_visible_apply_not_authorized",
"observability_scope": ["infrastructure monitoring", "host health"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 infra dashboard owner、host source owner、import owner 與 smoke plan。",
},
{
"surface_id": "grafana_nvidia_nemotron_dashboard",
"label": "Grafana NVIDIA / NemoTron dashboard",
"source_path": "ops/grafana/dashboards/nvidia-nemotron.json",
"expected_scope": "grafana_nvidia_nemotron_dashboard",
"config_kind": "grafana_dashboard",
"control_tier": "C1",
"current_state": "dashboard_json_visible_apply_not_authorized",
"observability_scope": ["NVIDIA", "NemoTron", "GPU AI route"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 GPU / NemoTron owner、dashboard UID owner、import owner 與 rollback owner。",
},
{
"surface_id": "signoz_otel_collector_config",
"label": "SigNoz OTEL collector config",
"source_path": "ops/signoz/otel-collector-config-phase-o.yaml",
"expected_scope": "signoz_otel_collector_phase_o",
"config_kind": "otel_collector_config",
"control_tier": "C1",
"current_state": "otel_collector_config_visible_reload_not_authorized",
"observability_scope": ["OTEL collector", "traces / metrics / logs routing"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 OTEL owner、pipeline diff、secret redaction proof、reload owner 與 data export boundary。",
},
{
"surface_id": "signoz_alerting_rules",
"label": "SigNoz alerting rules",
"source_path": "ops/signoz/alerting/rules.yaml",
"expected_scope": "signoz_alerting_rules",
"config_kind": "signoz_alert_rules",
"control_tier": "C1",
"current_state": "signoz_rules_visible_apply_not_authorized",
"observability_scope": ["SigNoz alert rules", "logs / traces alerting"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 SigNoz rule owner、apply owner、receiver mapping、silence owner 與 rollback owner。",
},
{
"surface_id": "signoz_log_rules",
"label": "SigNoz log alert rules",
"source_path": "ops/signoz/alerting/log-rules.md",
"expected_scope": "signoz_log_alert_rules",
"config_kind": "signoz_log_rules",
"control_tier": "C1",
"current_state": "log_rule_runbook_visible_apply_not_authorized",
"observability_scope": ["log alerting", "manual rule translation"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 log rule owner、manual translation reviewer、apply proof 與 rollback owner。",
},
{
"surface_id": "sentry_self_hosted_compose",
"label": "Sentry self-hosted compose",
"source_path": "ops/sentry-self-hosted/docker-compose.yml",
"expected_scope": "sentry_self_hosted_runtime",
"config_kind": "sentry_runtime_compose",
"control_tier": "C1",
"current_state": "sentry_compose_visible_live_hash_missing",
"observability_scope": ["Sentry self-hosted", "ClickHouse / Postgres / Redis boundary"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 Sentry runtime owner、live hash、admin secret boundary、upgrade window 與 rollback owner。",
},
{
"surface_id": "sentry_self_hosted_deploy",
"label": "Sentry self-hosted deploy script",
"source_path": "ops/sentry-self-hosted/deploy.sh",
"expected_scope": "sentry_self_hosted_deploy",
"config_kind": "sentry_deploy_script",
"control_tier": "C1",
"current_state": "deploy_script_visible_not_executed",
"observability_scope": ["Sentry deploy", "compose pull / up", "migration boundary"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 deploy owner、maintenance window、backup owner、migration rollback owner 與 smoke plan。",
},
{
"surface_id": "langfuse_compose",
"label": "Langfuse compose",
"source_path": "infra/langfuse/docker-compose.yml",
"expected_scope": "langfuse_observability_runtime",
"config_kind": "langfuse_runtime_compose",
"control_tier": "C1",
"current_state": "langfuse_compose_visible_live_hash_missing",
"observability_scope": ["Langfuse", "LLM trace storage", "PostgreSQL dependency"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 Langfuse owner、trace privacy owner、live hash、restart window 與 rollback owner。",
},
{
"surface_id": "langfuse_readme",
"label": "Langfuse deployment README",
"source_path": "infra/langfuse/README.md",
"expected_scope": "langfuse_observability_runbook",
"config_kind": "langfuse_runbook",
"control_tier": "C1",
"current_state": "runbook_visible_needs_stale_disposition",
"observability_scope": ["Langfuse deployment", "trace governance"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 runbook owner、live deployment parity、trace retention owner 與 secret redaction proof。",
},
{
"surface_id": "service_health_failure_policy_snapshot",
"label": "Service health failure notification policy",
"source_path": "docs/evaluations/service_health_failure_notification_policy_2026-06-05.json",
"expected_scope": "service_health_failure_notification_policy",
"config_kind": "notification_policy_snapshot",
"control_tier": "C1",
"current_state": "failure_only_policy_visible_live_send_gate_closed",
"observability_scope": ["failure-only notification", "redaction contract"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 notification owner、receiver owner、live receipt proof、redaction proof 與 retry boundary。",
},
{
"surface_id": "backup_notification_policy_snapshot",
"label": "Backup notification policy",
"source_path": "docs/evaluations/backup_notification_policy_2026-06-04.json",
"expected_scope": "backup_notification_policy",
"config_kind": "notification_policy_snapshot",
"control_tier": "C1",
"current_state": "backup_notification_policy_visible_live_send_gate_closed",
"observability_scope": ["backup failure notification", "restore / offsite alerting"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 backup notification owner、Telegram receipt owner、failure-only proof 與 false-green guard。",
},
{
"surface_id": "observability_contract_matrix_snapshot",
"label": "Observability contract matrix",
"source_path": "docs/evaluations/observability_contract_matrix_2026-06-05.json",
"expected_scope": "observability_contract_matrix",
"config_kind": "observability_contract_snapshot",
"control_tier": "C1",
"current_state": "contract_matrix_visible_needs_live_parity",
"observability_scope": ["observability contract", "source coverage", "runtime evidence"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 contract owner、live parity owner、coverage gap disposition 與 evidence freshness owner。",
},
{
"surface_id": "telegram_action_required_digest_snapshot",
"label": "Telegram action required digest policy",
"source_path": "docs/evaluations/ai_agent_telegram_action_required_digest_policy_2026-06-11.json",
"expected_scope": "ai_agent_telegram_digest_policy",
"config_kind": "telegram_policy_snapshot",
"control_tier": "C1",
"current_state": "telegram_policy_visible_live_send_gate_closed",
"observability_scope": ["Telegram digest", "action required routing"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 Telegram owner、receiver owner、receipt owner、redaction proof 與 retry boundary。",
},
{
"surface_id": "telegram_receipt_approval_snapshot",
"label": "Telegram receipt approval package",
"source_path": "docs/evaluations/ai_agent_telegram_receipt_approval_package_2026-06-11.json",
"expected_scope": "ai_agent_telegram_receipt_policy",
"config_kind": "telegram_policy_snapshot",
"control_tier": "C1",
"current_state": "receipt_policy_visible_live_receipt_gate_closed",
"observability_scope": ["delivery receipt", "ack / retry policy"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 receipt owner、gateway queue owner、ack policy owner 與 no-live-send boundary。",
},
{
"surface_id": "telegram_gateway_service",
"label": "Telegram gateway service",
"source_path": "apps/api/src/services/telegram_gateway.py",
"expected_scope": "telegram_gateway_runtime_sender",
"config_kind": "telegram_runtime_sender",
"control_tier": "C1",
"current_state": "send_capable_service_visible_gate_closed",
"observability_scope": ["Telegram Bot API", "dedup", "delivery queue"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 gateway owner、token injection owner、receipt owner、send approval gate 與 rollback owner。",
},
{
"surface_id": "notification_manager_service",
"label": "Notification manager",
"source_path": "apps/api/src/services/notifications/manager.py",
"expected_scope": "notification_runtime_routing",
"config_kind": "notification_runtime_router",
"control_tier": "C1",
"current_state": "runtime_router_visible_send_gate_closed",
"observability_scope": ["notification channel routing", "Telegram / Discord manager"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 channel owner、routing owner、failure-only policy owner、receipt owner 與 rollback owner。",
},
{
"surface_id": "notification_matrix_service",
"label": "Notification matrix",
"source_path": "apps/api/src/services/notification_matrix.py",
"expected_scope": "notification_matrix_policy",
"config_kind": "notification_policy_code",
"control_tier": "C1",
"current_state": "policy_code_visible_runtime_gate_closed",
"observability_scope": ["notification policy", "recipient / channel mapping"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 matrix owner、recipient owner、channel parity evidence 與 redaction policy。",
},
{
"surface_id": "alert_chain_metrics_service",
"label": "Alert chain metrics service",
"source_path": "apps/api/src/services/alert_chain_metrics_service.py",
"expected_scope": "alert_chain_metrics",
"config_kind": "alert_metrics_service",
"control_tier": "C1",
"current_state": "metrics_service_visible_needs_runtime_evidence",
"observability_scope": ["alert chain metrics", "delivery visibility"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 metrics owner、Prometheus scrape owner、delivery evidence 與 false-green guard。",
},
{
"surface_id": "converged_alert_recurrence_notifier",
"label": "Converged alert recurrence notifier",
"source_path": "apps/api/src/services/converged_alert_recurrence_notifier.py",
"expected_scope": "recurrence_notification_runtime",
"config_kind": "notification_runtime_router",
"control_tier": "C1",
"current_state": "recurrence_notifier_visible_send_gate_closed",
"observability_scope": ["recurrence notification", "dedup", "escalation routing"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 recurrence owner、noise budget owner、receipt owner、silence boundary 與 rollback owner。",
},
{
"surface_id": "sentry_webhook_service",
"label": "Sentry webhook service",
"source_path": "apps/api/src/services/sentry_webhook_service.py",
"expected_scope": "sentry_webhook_receiver",
"config_kind": "webhook_receiver",
"control_tier": "C1",
"current_state": "webhook_receiver_visible_secret_value_not_collected",
"observability_scope": ["Sentry webhook", "signature validation", "triage route"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 webhook owner、secret name owner、signature proof、route smoke 與 replay boundary。",
},
{
"surface_id": "signoz_client_service",
"label": "SigNoz client service",
"source_path": "apps/api/src/services/signoz_client.py",
"expected_scope": "signoz_api_client",
"config_kind": "signoz_observability_client",
"control_tier": "C1",
"current_state": "client_visible_live_api_read_not_executed",
"observability_scope": ["SigNoz API", "log query", "trace query"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 SigNoz client owner、token name owner、rate limit boundary 與 read-only smoke owner。",
},
{
"surface_id": "sentry_service_client",
"label": "Sentry service client",
"source_path": "apps/api/src/services/sentry_service.py",
"expected_scope": "sentry_api_client",
"config_kind": "sentry_observability_client",
"control_tier": "C1",
"current_state": "client_visible_live_api_read_not_executed",
"observability_scope": ["Sentry API", "issue query", "AI triage"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 Sentry client owner、token name owner、read-only evidence 與 privacy boundary。",
},
{
"surface_id": "langfuse_client_service",
"label": "Langfuse client service",
"source_path": "apps/api/src/services/langfuse_client.py",
"expected_scope": "langfuse_api_client",
"config_kind": "langfuse_observability_client",
"control_tier": "C1",
"current_state": "client_visible_trace_read_write_boundary_needs_owner",
"observability_scope": ["Langfuse API", "AI trace", "prompt / output privacy"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 Langfuse owner、trace privacy owner、token name owner、write gate 與 retention owner。",
},
{
"surface_id": "deploy_alertmanager_config_script",
"label": "Alertmanager config deploy script",
"source_path": "scripts/ops/deploy-alertmanager-config.sh",
"expected_scope": "alertmanager_config_deploy",
"config_kind": "reload_capable_script",
"control_tier": "C1",
"current_state": "reload_capable_script_visible_gate_closed",
"observability_scope": ["Alertmanager config copy", "reload"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 deploy owner、maintenance window、rollback ref、receiver smoke 與 failure-only notification proof。",
},
{
"surface_id": "deploy_prometheus_alerts_script",
"label": "Prometheus alerts deploy script",
"source_path": "scripts/ops/deploy-alerts.sh",
"expected_scope": "prometheus_alerts_deploy",
"config_kind": "reload_capable_script",
"control_tier": "C1",
"current_state": "reload_capable_script_visible_gate_closed",
"observability_scope": ["Prometheus alert rule deploy", "reload"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 deploy owner、rule test proof、reload owner、receiver smoke 與 rollback owner。",
},
{
"surface_id": "prometheus_rule_drift_guard_script",
"label": "Prometheus rule drift guard",
"source_path": "scripts/ops/prometheus-rule-drift-guard.sh",
"expected_scope": "prometheus_rule_drift_guard",
"config_kind": "drift_guard_script",
"control_tier": "C1",
"current_state": "drift_guard_visible_not_executed_by_inventory",
"observability_scope": ["rule diff", "drift evidence", "reload guard"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 drift guard owner、live rule export owner、diff disposition 與 blocking policy owner。",
},
{
"surface_id": "k8s_deploy_prometheus_config_script",
"label": "K8s Prometheus config deploy script",
"source_path": "k8s/monitoring/deploy-prometheus-config.sh",
"expected_scope": "k8s_prometheus_config_deploy",
"config_kind": "reload_capable_script",
"control_tier": "C1",
"current_state": "deploy_script_visible_not_executed",
"observability_scope": ["K8s config apply", "Prometheus reload"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 K8s deploy owner、kubectl / ArgoCD boundary、reload owner 與 rollback owner。",
},
{
"surface_id": "api_apply_prometheus_config_script",
"label": "API Prometheus config apply script",
"source_path": "apps/api/scripts/apply_prometheus_config.sh",
"expected_scope": "api_prometheus_config_apply",
"config_kind": "reload_capable_script",
"control_tier": "C1",
"current_state": "apply_script_visible_not_executed",
"observability_scope": ["Prometheus config apply", "API-owned alert deploy"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 API deploy owner、config source owner、reload proof 與 rollback owner。",
},
{
"surface_id": "monitoring_exporter_deploy_script",
"label": "Monitoring exporter deploy script",
"source_path": "ops/monitoring/deploy-exporters.sh",
"expected_scope": "monitoring_exporter_deploy",
"config_kind": "host_deploy_script",
"control_tier": "C1",
"current_state": "host_deploy_script_visible_gate_closed",
"observability_scope": ["exporter deploy", "host service restart"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 host owner、SSH boundary、restart window、rollback owner 與 exporter scrape proof。",
},
{
"surface_id": "fire_live_alert_script",
"label": "Live alert firing script",
"source_path": "apps/api/scripts/fire_live_alert.py",
"expected_scope": "live_alert_fire_script",
"config_kind": "live_alert_sender",
"control_tier": "C1",
"current_state": "live_alert_sender_visible_gate_closed",
"observability_scope": ["live alert", "Alertmanager ingestion", "notification route"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 live alert owner、test window、receiver owner、noise budget 與 rollback / stop condition。",
},
{
"surface_id": "fire_test_alert_script",
"label": "Test alert firing script",
"source_path": "apps/api/scripts/fire_test_alert.py",
"expected_scope": "test_alert_fire_script",
"config_kind": "live_alert_sender",
"control_tier": "C1",
"current_state": "test_alert_sender_visible_gate_closed",
"observability_scope": ["test alert", "notification chain smoke"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 test alert owner、allowed receiver、dedup proof、noisy route guard 與 cleanup owner。",
},
{
"surface_id": "alert_chain_smoke_script",
"label": "Alert chain smoke script",
"source_path": "scripts/alert_chain_smoke_test.py",
"expected_scope": "alert_chain_smoke",
"config_kind": "smoke_script",
"control_tier": "C1",
"current_state": "smoke_script_visible_not_executed_by_inventory",
"observability_scope": ["alert chain E2E", "notification delivery"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 smoke owner、execution window、receiver owner、expected receipt 與 cleanup owner。",
},
{
"surface_id": "ops_alert_chain_smoke_script",
"label": "Ops alert chain smoke script",
"source_path": "ops/scripts/alert_chain_smoke_test.py",
"expected_scope": "ops_alert_chain_smoke",
"config_kind": "smoke_script",
"control_tier": "C1",
"current_state": "ops_smoke_script_visible_not_executed_by_inventory",
"observability_scope": ["ops alert chain", "Alertmanager / Telegram smoke"],
"requires_live_evidence": True,
"requires_owner_response": True,
"next_owner_action": "補 ops smoke owner、execution window、receiver owner、expected receipt 與 rollback owner。",
},
]
# Execution-boundary flags that are embedded in the report. Every flag is
# built with the value False, so no entry can be switched on by a typo in a
# single line of a literal.
FALSE_BOUNDARIES = dict.fromkeys(
    (
        "runtime_execution_authorized",
        "host_write_authorized",
        "prometheus_reload_authorized",
        "alertmanager_reload_authorized",
        "grafana_dashboard_apply_authorized",
        "signoz_rule_apply_authorized",
        "sentry_deploy_authorized",
        "langfuse_config_change_authorized",
        "otel_collector_reload_authorized",
        "receiver_route_change_authorized",
        "silence_policy_change_authorized",
        "telegram_send_authorized",
        "notification_route_change_authorized",
        "webhook_receiver_change_authorized",
        "remote_write_change_authorized",
        "exporter_deploy_authorized",
        "live_alert_fire_authorized",
        "alert_chain_smoke_authorized",
        "ssh_read_authorized",
        "ssh_write_authorized",
        "kubectl_action_authorized",
        "secret_value_collection_allowed",
        "active_scan_authorized",
        "action_buttons_allowed",
    ),
    False,
)
# config_kind values that build_report() treats as write-capable: surfaces
# whose config_kind is in this set are counted in
# "write_capable_surface_count" and listed under "write_capable_surfaces".
WRITE_CAPABLE_KINDS = {
    "reload_capable_script",
    "host_deploy_script",
    "sentry_deploy_script",
    "telegram_runtime_sender",
    "notification_runtime_router",
    "live_alert_sender",
}
def git_short_sha(root: Path) -> str:
    """Return the abbreviated HEAD commit hash for the repo at *root*.

    Runs ``git rev-parse --short HEAD`` with *root* as the working directory
    and returns its stripped stdout.

    Returns:
        The short hash, or the literal ``"unknown"`` when the command cannot
        be run or exits non-zero.
    """
    command = ["git", "rev-parse", "--short", "HEAD"]
    try:
        completed = subprocess.run(
            command,
            cwd=root,
            check=True,
            capture_output=True,
            text=True,
        )
    except Exception:
        # Deliberately best-effort: a missing git binary, a missing directory,
        # or a non-repo root must not abort the inventory.
        return "unknown"
    return completed.stdout.strip()
def file_metadata(root: Path, source_path: str) -> dict[str, Any]:
    """Describe the file at ``root / source_path`` without interpreting it.

    Args:
        root: Directory that *source_path* is resolved against.
        source_path: Path of the source file, relative to *root*.

    Returns:
        A dict with three keys:

        * ``source_exists`` - ``True`` only when the path is a regular file.
        * ``line_count`` - number of lines after decoding the bytes as UTF-8
          (undecodable bytes are replaced) and splitting with
          ``str.splitlines()``; ``0`` when the file is absent.
        * ``sha256`` - hex SHA-256 digest of the raw bytes, or ``None`` when
          the file is absent.
    """
    path = root / source_path
    # is_file() rather than exists(): a directory at this path cannot be read
    # as a source file, so it is reported as missing instead of letting
    # read_bytes() raise.
    if not path.is_file():
        return {"source_exists": False, "line_count": 0, "sha256": None}
    content = path.read_bytes()
    return {
        "source_exists": True,
        "line_count": len(content.decode("utf-8", errors="replace").splitlines()),
        "sha256": hashlib.sha256(content).hexdigest(),
    }
def build_surface(root: Path, surface: dict[str, Any]) -> dict[str, Any]:
    """Expand one static surface definition into a full report entry.

    The entry is a new dict holding, in order: the fields of *surface*, the
    file metadata for ``surface["source_path"]`` resolved against *root*, and
    the review / gate flags, all of which are ``False``.
    """
    gate_flags = (
        "owner_response_received",
        "owner_response_accepted",
        "live_evidence_received",
        "reload_owner_accepted",
        "receiver_owner_accepted",
        "route_smoke_accepted",
        "maintenance_window_accepted",
        "rollback_owner_accepted",
        "runtime_gate_open",
        "action_buttons_allowed",
    )
    entry = dict(surface)
    entry.update(file_metadata(root, surface["source_path"]))
    # Applied last so these flags come out False even if the static definition
    # or the metadata carried a key with the same name.
    entry.update(dict.fromkeys(gate_flags, False))
    return entry
def count_kind(surfaces: list[dict[str, Any]], kinds: set[str]) -> int:
    """Count the surfaces whose ``config_kind`` is a member of *kinds*."""
    matching = [entry for entry in surfaces if entry["config_kind"] in kinds]
    return len(matching)
def build_report(root: Path, generated_at: str | None) -> dict[str, Any]:
    """Assemble the complete inventory report as a JSON-serializable dict.

    Args:
        root: Repo root; every surface's ``source_path`` is resolved against
            it and the git commit is read from it.
        generated_at: Timestamp string recorded verbatim as ``generated_at``.
            When falsy, the current time in the ``TAIPEI`` timezone is used
            (ISO 8601, second precision).

    Returns:
        A dict with the schema version, timestamp, git commit, a ``summary``
        of per-kind counts, the execution boundaries, the expanded surfaces,
        the write-capable subset, the collection order and operator notes.
    """
    report_time = generated_at or datetime.now(TAIPEI).isoformat(timespec="seconds")
    surfaces = [build_surface(root, surface) for surface in SURFACES]
    expected_scopes = sorted({surface["expected_scope"] for surface in surfaces})
    write_capable = [surface for surface in surfaces if surface["config_kind"] in WRITE_CAPABLE_KINDS]

    def flagged(flag: str) -> int:
        # Number of surfaces whose boolean field *flag* is truthy.
        return sum(1 for surface in surfaces if surface[flag])

    return {
        "schema_version": "monitoring_alerting_observability_inventory_v1",
        "generated_at": report_time,
        "git_commit": git_short_sha(root),
        "status": "repo_only_inventory_ready",
        "source_scope": "committed_repo_files_only",
        "summary": {
            "surface_count": len(surfaces),
            "source_exists_count": flagged("source_exists"),
            "expected_scope_count": len(expected_scopes),
            "prometheus_config_surface_count": count_kind(
                surfaces,
                {
                    "prometheus_config",
                    "prometheus_remote_write",
                    "prometheus_generated_config",
                    "monitoring_service_registry",
                    "exporter_query_config",
                },
            ),
            "alert_rule_surface_count": count_kind(
                surfaces,
                {"prometheus_alert_rules", "app_alert_rule_contract", "grafana_alert_rules"},
            ),
            "alertmanager_receiver_surface_count": count_kind(surfaces, {"alertmanager_receiver_config"}),
            "grafana_surface_count": count_kind(surfaces, {"grafana_alert_rules", "grafana_dashboard"}),
            "signoz_surface_count": count_kind(
                surfaces,
                {"signoz_alert_rules", "signoz_log_rules", "signoz_observability_client"},
            ),
            "sentry_surface_count": count_kind(
                surfaces,
                {"sentry_runtime_compose", "sentry_deploy_script", "webhook_receiver", "sentry_observability_client"},
            ),
            "langfuse_surface_count": count_kind(
                surfaces,
                {"langfuse_runtime_compose", "langfuse_runbook", "langfuse_observability_client"},
            ),
            "notification_policy_surface_count": count_kind(
                surfaces,
                {"notification_policy_snapshot", "observability_contract_snapshot", "notification_policy_code"},
            ),
            "telegram_surface_count": count_kind(
                surfaces,
                {"telegram_policy_snapshot", "telegram_runtime_sender"},
            ),
            "otel_surface_count": count_kind(surfaces, {"otel_collector_config"}),
            "deploy_or_reload_surface_count": count_kind(
                surfaces,
                {"reload_capable_script", "host_deploy_script", "sentry_deploy_script"},
            ),
            "drift_guard_surface_count": count_kind(surfaces, {"drift_guard_script"}),
            "smoke_surface_count": count_kind(surfaces, {"smoke_script", "live_alert_sender"}),
            "write_capable_surface_count": len(write_capable),
            "surfaces_requiring_owner_response_count": flagged("requires_owner_response"),
            "surfaces_requiring_live_evidence_count": flagged("requires_live_evidence"),
            # The counters below are derived from the per-surface flags rather
            # than hard-coded, so the summary cannot disagree with
            # "observability_surfaces". build_surface() sets every one of
            # these flags to False, so each counter is 0.
            "owner_response_received_count": flagged("owner_response_received"),
            "owner_response_accepted_count": flagged("owner_response_accepted"),
            "live_evidence_received_count": flagged("live_evidence_received"),
            "reload_owner_accepted_count": flagged("reload_owner_accepted"),
            "receiver_owner_accepted_count": flagged("receiver_owner_accepted"),
            "route_smoke_accepted_count": flagged("route_smoke_accepted"),
            "maintenance_window_accepted_count": flagged("maintenance_window_accepted"),
            "rollback_owner_accepted_count": flagged("rollback_owner_accepted"),
            "runtime_gate_count": flagged("runtime_gate_open"),
            "action_button_count": flagged("action_buttons_allowed"),
            # Fixed figures; they are not computed from the surfaces.
            "coverage_percent_before_inventory": 56,
            "coverage_percent_after_inventory": 62,
        },
        # Copy, so mutating a returned report cannot alter the module-level
        # FALSE_BOUNDARIES constant shared by later reports.
        "execution_boundaries": dict(FALSE_BOUNDARIES),
        "expected_scopes": expected_scopes,
        "observability_surfaces": surfaces,
        "write_capable_surfaces": [
            {
                "surface_id": surface["surface_id"],
                "label": surface["label"],
                "config_kind": surface["config_kind"],
                "expected_scope": surface["expected_scope"],
                "required_gate": "owner_response_plus_maintenance_window_plus_rollback_owner",
            }
            for surface in write_capable
        ],
        "next_collection_order": [
            "alertmanager_receiver_config",
            "prometheus_alerts_ops",
            "prometheus_rule_drift_guard_script",
            "deploy_alertmanager_config_script",
            "deploy_prometheus_alerts_script",
            "monitoring_110_compose",
            "signoz_otel_collector_config",
            "sentry_self_hosted_compose",
            "telegram_gateway_service",
            "alert_chain_smoke_script",
        ],
        "operator_interpretation": [
            "這是 monitoring / alerting / observability 的 repo-only 清冊,不是 live monitoring truth。",
            "source_exists 與 sha256 只代表 repo source 可追溯,不代表 live Prometheus / Alertmanager / Grafana / SigNoz / Sentry 已一致。",
            "write-capable surface 可改 route、reload、send notification、deploy exporter 或發測試告警;在 owner response、維護窗口、rollback owner 前全部維持 gate closed。",
            "不得從本清冊啟動 reload、silence、Telegram send、Sentry deploy、SigNoz apply、Prometheus remote_write 變更、SSH 或 kubectl。",
        ],
    }
def main() -> int:
    """CLI entry point: build the inventory report and emit it as JSON.

    The JSON is written to ``--output`` when that option is given (parent
    directories are created as needed) and printed to stdout otherwise. A
    one-line status summary is always printed to stderr.

    Returns:
        Always ``0``.
    """
    cli = argparse.ArgumentParser(description="IwoooS monitoring / alerting / observability repo-only 清冊")
    cli.add_argument("--root", default=".", help="repo root")
    cli.add_argument("--output", help="寫出 JSON 報告")
    cli.add_argument("--generated-at", help="固定報告時間,供 committed snapshot 使用")
    options = cli.parse_args()

    report = build_report(Path(options.root).resolve(), options.generated_at)
    payload = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)

    if not options.output:
        print(payload)
    else:
        target = Path(options.output)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(payload + "\n", encoding="utf-8")

    counts = report["summary"]
    status_fields = [
        "MONITORING_ALERTING_OBSERVABILITY_INVENTORY_OK",
        f"surfaces={counts['surface_count']}",
        f"alert_rules={counts['alert_rule_surface_count']}",
        f"write_capable={counts['write_capable_surface_count']}",
        f"runtime_gate={counts['runtime_gate_count']}",
    ]
    # The status line goes to stderr so it never mixes with JSON on stdout.
    print(" ".join(status_fields), file=sys.stderr)
    return 0
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())