{ "schema_version": "observability_contract_matrix_v1", "generated_at": "2026-06-05T12:24:00+08:00", "program_status": { "overall_completion_percent": 100, "current_priority": "P1", "current_task_id": "P1-003", "next_task_id": "P1-004", "read_only_mode": true }, "source_refs": [ "docs/schemas/observability_contract_matrix_v1.schema.json", "docs/HARD_RULES.md#alertmanager-routing", "ops/alertmanager/alertmanager.yml", "ops/monitoring/alerts.yml", "ops/monitoring/alerts-unified.yml", "k8s/monitoring/prometheus.yml", "k8s/monitoring/alert-chain-monitor.yaml", "ops/grafana/dashboards/ai-monitoring.json", "ops/grafana/dashboards/infra-monitoring.json", "ops/signoz/alerting/rules.yaml", "ops/signoz/alerting/log-rules.md", "ops/signoz/otel-collector-config-phase-o.yaml", "k8s/observability/otel-collector-daemonset.yaml", "docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md", "docs/adr/ADR-053-observability-signoz-unified-architecture.md", "docs/adr/ADR-090-monitoring-blindspot-governance.md", "docs/LOGBOOK.md" ], "rollups": { "total_surfaces": 6, "by_kind": { "prometheus_rules": 1, "alertmanager_route": 1, "signoz_clickhouse": 1, "grafana_dashboard": 1, "sentry_source_link": 1, "otel_event_exporter": 1 }, "by_status": { "action_required": 2, "verified": 4 }, "by_evidence_status": { "committed_manifest": 4, "production_readback_recorded": 2 }, "by_noise_policy_status": { "proposal_only": 2, "preserved": 3, "needs_proposal": 1 }, "surface_ids_requiring_action": [ "grafana_dashboard_inventory", "prometheus_alert_rule_catalog" ], "surface_ids_with_proposal_only_noise_policy": [ "alertmanager_awoooi_route", "prometheus_alert_rule_catalog" ], "noise_reduction_opportunities_total": 5, "approval_required_opportunity_ids": [ "alertmanager_grouping_inhibit_tuning", "prometheus_noise_rule_tuning" ], "classification_gap_ids": [ "grafana_dashboard_owner_status", "prometheus_alert_rule_catalog_seed", "signoz_provider_native_real_alert_gap" ], "read_only_denials_total": 12, "surfaces_requiring_action": [ "grafana_dashboard_inventory", "prometheus_alert_rule_catalog" ], "proposal_only_count": 5 }, "observability_surfaces": [ { "surface_id": "prometheus_alert_rule_catalog", "display_name": "Prometheus 告警規則合約", "kind": "prometheus_rules", "status": "action_required", "risk_level": "critical", "evidence_status": "committed_manifest", "noise_policy_status": "proposal_only", "coverage_contract": "已提交 ops/monitoring/alerts-unified.yml 與 k8s/monitoring/* 規則;本快照只盤點規則、label、runbook 與分類缺口,不 reload Prometheus、不修改 alert rules。", "current_contract": "committed ops/monitoring/alerts-unified.yml 目前含 118 條 alert;LOGBOOK 曾記錄 production Prometheus rule count 142,需以正式 smoke 讀回確認。", "evidence_refs": [ "ops/monitoring/alerts-unified.yml", "ops/monitoring/alerts.yml", "k8s/monitoring/alert-chain-monitor.yaml", "docs/LOGBOOK.md" ], "next_action": "建立 alert_rule_catalog seed 與噪音率觀察 proposal;任何 rule 調整放到 P2-003 人工批准。" }, { "surface_id": "alertmanager_awoooi_route", "display_name": "Alertmanager → AWOOOI API 路由", "kind": "alertmanager_route", "status": "verified", "risk_level": "critical", "evidence_status": "committed_manifest", "noise_policy_status": "proposal_only", "coverage_contract": "Alertmanager receiver 必須指向 AWOOOI API;OpenClaw 只做 AI 分析,不得成為 Alertmanager receiver。", "current_contract": "ops/alertmanager/alertmanager.yml 以 awoooi-webhook 為主路徑,telegram-direct 僅限 alert-chain/API health 緊急旁路;group_by/team/alertname/severity 已存在。", "evidence_refs": [ "docs/HARD_RULES.md#alertmanager-routing", "ops/alertmanager/alertmanager.yml" ], "next_action": "只提出 group_by、inhibit、repeat interval 降噪 proposal;不得直接改 receiver、route 或 silence。" }, { "surface_id": "signoz_clickhouse_ingestion", "display_name": "SigNoz / ClickHouse / Provider Webhook", "kind": "signoz_clickhouse", "status": "verified", "risk_level": "high", "evidence_status": "production_readback_recorded", "noise_policy_status": "preserved", "coverage_contract": "SigNoz webhook、ClickHouse TTL、OTEL prometheus receiver 與 source provider heartbeat 需分開標示;heartbeat 不是 provider-native 真實告警。", "current_contract": "ops/signoz/alerting/rules.yaml、log-rules.md 與 RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK 已描述 webhook / rules;LOGBOOK 記錄 SigNoz webhook 與 source provider heartbeat 多次通過。", "evidence_refs": [ "ops/signoz/alerting/rules.yaml", "ops/signoz/alerting/log-rules.md", "ops/signoz/otel-collector-config-phase-o.yaml", "docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md", "docs/adr/ADR-053-observability-signoz-unified-architecture.md", "docs/LOGBOOK.md" ], "next_action": "保留 provider heartbeat / upstream canary 低噪音;補 provider-native 真實告警與 incident correlation gap 的只讀看板。" }, { "surface_id": "grafana_dashboard_inventory", "display_name": "Grafana Dashboard / Alert Chain 視覺化", "kind": "grafana_dashboard", "status": "action_required", "risk_level": "medium", "evidence_status": "committed_manifest", "noise_policy_status": "needs_proposal", "coverage_contract": "目前只確認 committed dashboard JSON;本快照不呼叫 Grafana API、不匯入 dashboard、不改 datasource。", "current_contract": "ai-monitoring dashboard 包含 Alert Chain 健康與最後成功時間;infra-monitoring dashboard 包含 Prometheus target up/down 與 API request rate。", "evidence_refs": [ "ops/grafana/dashboards/ai-monitoring.json", "ops/grafana/dashboards/infra-monitoring.json" ], "next_action": "補 dashboard owner、datasource parity、正式站可讀性與 alert-chain panel fresh readback;寫入或 import 需另案批准。" }, { "surface_id": "sentry_source_link_canary", "display_name": "Sentry Webhook / Source Link Canary", "kind": "sentry_source_link", "status": "verified", "risk_level": "high", "evidence_status": "production_readback_recorded", "noise_policy_status": "preserved", "coverage_contract": "Sentry webhook 與 source-link canary 用來驗證來源鏈路,不能被誤讀成真實 provider alert 全部已關聯。", "current_contract": "LOGBOOK 記錄 Alertmanager / SigNoz / Sentry webhook 與 Source Link Canary 通過,且 source provider freshness / incident matching 必須分開判斷。", "evidence_refs": [ "docs/adr/ADR-022-sentry-integration-architecture.md", "docs/LOGBOOK.md" ], "next_action": "持續把 heartbeat、upstream canary、direct/candidate/applied source link 分開呈現;不修改 Sentry project webhook。" }, { "surface_id": "otel_event_exporter_bridge", "display_name": "OTEL Collector / Event Exporter", "kind": "otel_event_exporter", "status": "verified", "risk_level": "medium", "evidence_status": "committed_manifest", "noise_policy_status": "preserved", "coverage_contract": "OTEL Collector DaemonSet 與 SigNoz prometheus receiver 只作為可觀測來源;本快照不部署 collector、不重啟 exporter。", "current_contract": "k8s/observability/otel-collector-daemonset.yaml 與 ops/signoz/otel-collector-config-phase-o.yaml 描述 log/metric/trace pipeline;LOGBOOK 記錄 OTEL Collector / Event Exporter post-deploy smoke 通過。", "evidence_refs": [ "k8s/observability/otel-collector-daemonset.yaml", "ops/signoz/otel-collector-config-phase-o.yaml", "docs/LOGBOOK.md" ], "next_action": "把 collector/exporter health 放入 observability readiness;任何 deploy / restart 仍需獨立批准。" } ], "noise_reduction_opportunities": [ { "opportunity_id": "prometheus_noise_rule_tuning", "display_name": "Prometheus 告警噪音調整提案", "status": "approval_required", "proposal_only": true, "impact": "降低 stale provider、低樣本 SLO、重複 resource alert 對 operator 的干擾;不得直接修改 alert rules。", "target_surface_ids": [ "prometheus_alert_rule_catalog" ], "evidence_refs": [ "ops/monitoring/alerts-unified.yml", "docs/adr/ADR-090-monitoring-blindspot-governance.md" ], "next_action": "進 P2-003 建立人工批准包,先收集 24h alert frequency / fingerprint evidence。" }, { "opportunity_id": "alertmanager_grouping_inhibit_tuning", "display_name": "Alertmanager grouping / inhibit 降噪提案", "status": "approval_required", "proposal_only": true, "impact": "針對同 team / alertname / severity 的爆量與 Host/K8s 重複告警做提案,不變更 receiver。", "target_surface_ids": [ "alertmanager_awoooi_route" ], "evidence_refs": [ "ops/alertmanager/alertmanager.yml", "docs/HARD_RULES.md#alertmanager-routing" ], "next_action": "產生 diff proposal 與 rollback plan;未批准前不得 reload Alertmanager。" }, { "opportunity_id": "success_notification_quiet_policy", "display_name": "Provider heartbeat 與真實告警分流", "status": "ready_for_proposal", "proposal_only": true, "impact": "避免把 Sentry / SigNoz heartbeat 誤當真實 provider alert,降低假綠與錯誤升級。", "target_surface_ids": [ "signoz_clickhouse_ingestion", "sentry_source_link_canary" ], "evidence_refs": [ "docs/LOGBOOK.md", "docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md" ], "next_action": "在 UI / API 上維持 heartbeat、upstream canary、direct source link、candidate source link 四種標籤。" }, { "opportunity_id": "grafana_dashboard_owner_freshness", "display_name": "Grafana dashboard owner / freshness 標籤", "status": "ready_for_proposal", "proposal_only": true, "impact": "讓 dashboard 缺 datasource、缺 owner 或 stale panel 不被誤讀成監控缺失已修復。", "target_surface_ids": [ "grafana_dashboard_inventory" ], "evidence_refs": [ "ops/grafana/dashboards/ai-monitoring.json", "ops/grafana/dashboards/infra-monitoring.json" ], "next_action": "只讀補 owner/freshness matrix;不寫 Grafana。" }, { "opportunity_id": "success_notification_quiet_policy", "display_name": "成功不洗版 / 失敗才升級", "status": "preserved", "proposal_only": true, "impact": "沿用備份與 Gitea 的 quiet-success 原則,讓 observability smoke 成功證據走 API/LOGBOOK,失敗才通知。", "target_surface_ids": [ "otel_event_exporter_bridge", "signoz_clickhouse_ingestion" ], "evidence_refs": [ "docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md", "docs/LOGBOOK.md" ], "next_action": "P1-003 僅記錄;未批准前不送 Telegram 測試通知。" } ], "classification_gaps": [ { "gap_id": "prometheus_alert_rule_catalog_seed", "display_name": "Alert rule catalog seed 未正式產品化", "status": "action_required", "severity": "high", "summary": "ADR-090 要求 alert_rule_catalog 能追蹤規則資產、noise_rate 與 superseded_by_rule_id;目前 P1-003 只完成只讀矩陣。", "evidence_refs": [ "docs/adr/ADR-090-monitoring-blindspot-governance.md", "ops/monitoring/alerts-unified.yml" ], "next_action": "P2-003 前先產生 seed proposal 與 migration/rollback 分離批准包。" }, { "gap_id": "signoz_provider_native_real_alert_gap", "display_name": "SigNoz provider-native 真實告警證據缺口", "status": "action_required", "severity": "medium", "summary": "Heartbeat / upstream canary 能證明管道新鮮,但不等於每種 provider-native alert 都已接到 incident correlation。", "evidence_refs": [ "docs/LOGBOOK.md", "docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md" ], "next_action": "只讀列出 provider-native alert coverage;需要 side effect 的 signed canary 另案批准。" }, { "gap_id": "grafana_dashboard_owner_status", "display_name": "Grafana dashboard owner / datasource 狀態未連到治理頁", "status": "action_required", "severity": "medium", "summary": "Committed dashboard JSON 存在,但尚未顯示 datasource freshness、owner、last import 或 panel stale 狀態。", "evidence_refs": [ "ops/grafana/dashboards/ai-monitoring.json", "ops/grafana/dashboards/infra-monitoring.json" ], "next_action": "下一輪只讀補 dashboard readiness,不呼叫 Grafana write API。" } ], "latest_observations": [ { "observation_id": "alertmanager_receiver_guard", "status": "verified", "summary": "HARD_RULES 與 ops/alertmanager/alertmanager.yml 都保留 Alertmanager 指向 AWOOOI API 的邊界;OpenClaw 不得成為 receiver。", "evidence_refs": [ "docs/HARD_RULES.md#alertmanager-routing", "ops/alertmanager/alertmanager.yml" ] }, { "observation_id": "prometheus_rule_source_split", "status": "action_required", "summary": "committed Prometheus 規則分散於 ops/monitoring 與 k8s/monitoring;P1-003 建立 matrix,尚未調整規則或 reload。", "evidence_refs": [ "ops/monitoring/alerts-unified.yml", "k8s/monitoring/alert-chain-monitor.yaml" ] }, { "observation_id": "post_deploy_observability_smoke_history", "status": "verified", "summary": "LOGBOOK 已多次記錄 Alertmanager / SigNoz / Sentry webhook、SigNoz、OTEL Collector、Event Exporter post-deploy smoke 通過。", "evidence_refs": [ "docs/LOGBOOK.md" ] } ], "operator_contract": { "display_mode": "read_only_observability_contract_matrix", "must_not_interpret_as": [ "Prometheus alert rule 修改批准", "Alertmanager receiver / route 修改批准", "Alertmanager 指向 OpenClaw receiver 批准", "Silence 建立或維護窗口批准", "Grafana dashboard 寫入批准", "SigNoz / Sentry webhook 設定修改批准", "Secret 已讀取或可輸出", "Telegram 測試通知批准", "deploy / reload / workflow 觸發批准", "runtime execution 授權" ], "secret_display_policy": "只允許顯示 committed file refs、endpoint role 與 redacted metadata;不得顯示 token、webhook secret 或 authorization header。", "alertmanager_route_policy": "Alertmanager webhook 必須指向 AWOOOI API;OpenClaw 不接收 Alertmanager webhook,只能在 API 持久化與分類後參與只讀分析。", "noise_reduction_policy": "P1-003 僅產生 proposal;P2-003 或任何 route/rule/silence 變更需人工批准。", "notification_policy": "成功 smoke 不即時通知洗版;失敗、action-required 或人工作業才可進通知批准流程。" }, "operation_boundaries": { "read_only_api_allowed": true, "prometheus_rule_write_allowed": false, "prometheus_reload_allowed": false, "alertmanager_route_write_allowed": false, "alertmanager_receiver_change_allowed": false, "alertmanager_to_openclaw_allowed": false, "silence_create_allowed": false, "grafana_dashboard_write_allowed": false, "grafana_api_write_allowed": false, "signoz_query_mutation_allowed": false, "signoz_webhook_change_allowed": false, "sentry_webhook_change_allowed": false, "otel_collector_deploy_allowed": false, "event_exporter_restart_allowed": false, "secret_read_allowed": false, "secret_plaintext_allowed": false, "notification_send_allowed": false, "external_api_call_allowed": false, "live_prometheus_query_allowed": false, "workflow_trigger_allowed": false, "deploy_trigger_allowed": false, "reload_trigger_allowed": false, "runtime_execution_allowed": false }, "approval_boundaries": { "prometheus_rule_change_authorized": false, "prometheus_reload_authorized": false, "alertmanager_route_change_authorized": false, "alertmanager_receiver_change_authorized": false, "alertmanager_to_openclaw_authorized": false, "silence_authorized": false, "grafana_write_authorized": false, "signoz_write_authorized": false, "sentry_write_authorized": false, "otel_deploy_authorized": false, "event_exporter_restart_authorized": false, "notification_send_authorized": false, "external_call_authorized": false, "secret_plaintext_allowed": false, "workflow_trigger_authorized": false, "deploy_reload_authorized": false, "runtime_execution_authorized": false } }