392 lines
17 KiB
JSON
392 lines
17 KiB
JSON
{
|
||
"schema_version": "observability_contract_matrix_v1",
|
||
"generated_at": "2026-06-05T12:24:00+08:00",
|
||
"program_status": {
|
||
"overall_completion_percent": 100,
|
||
"current_priority": "P1",
|
||
"current_task_id": "P1-003",
|
||
"next_task_id": "P1-004",
|
||
"read_only_mode": true
|
||
},
|
||
"source_refs": [
|
||
"docs/schemas/observability_contract_matrix_v1.schema.json",
|
||
"docs/HARD_RULES.md#alertmanager-routing",
|
||
"ops/alertmanager/alertmanager.yml",
|
||
"ops/monitoring/alerts.yml",
|
||
"ops/monitoring/alerts-unified.yml",
|
||
"k8s/monitoring/prometheus.yml",
|
||
"k8s/monitoring/alert-chain-monitor.yaml",
|
||
"ops/grafana/dashboards/ai-monitoring.json",
|
||
"ops/grafana/dashboards/infra-monitoring.json",
|
||
"ops/signoz/alerting/rules.yaml",
|
||
"ops/signoz/alerting/log-rules.md",
|
||
"ops/signoz/otel-collector-config-phase-o.yaml",
|
||
"k8s/observability/otel-collector-daemonset.yaml",
|
||
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md",
|
||
"docs/adr/ADR-053-observability-signoz-unified-architecture.md",
|
||
"docs/adr/ADR-090-monitoring-blindspot-governance.md",
|
||
"docs/LOGBOOK.md"
|
||
],
|
||
"rollups": {
|
||
"total_surfaces": 6,
|
||
"by_kind": {
|
||
"prometheus_rules": 1,
|
||
"alertmanager_route": 1,
|
||
"signoz_clickhouse": 1,
|
||
"grafana_dashboard": 1,
|
||
"sentry_source_link": 1,
|
||
"otel_event_exporter": 1
|
||
},
|
||
"by_status": {
|
||
"action_required": 2,
|
||
"verified": 4
|
||
},
|
||
"by_evidence_status": {
|
||
"committed_manifest": 4,
|
||
"production_readback_recorded": 2
|
||
},
|
||
"by_noise_policy_status": {
|
||
"proposal_only": 2,
|
||
"preserved": 3,
|
||
"needs_proposal": 1
|
||
},
|
||
"surface_ids_requiring_action": [
|
||
"grafana_dashboard_inventory",
|
||
"prometheus_alert_rule_catalog"
|
||
],
|
||
"surface_ids_with_proposal_only_noise_policy": [
|
||
"alertmanager_awoooi_route",
|
||
"prometheus_alert_rule_catalog"
|
||
],
|
||
"noise_reduction_opportunities_total": 5,
|
||
"approval_required_opportunity_ids": [
|
||
"alertmanager_grouping_inhibit_tuning",
|
||
"prometheus_noise_rule_tuning"
|
||
],
|
||
"classification_gap_ids": [
|
||
"grafana_dashboard_owner_status",
|
||
"prometheus_alert_rule_catalog_seed",
|
||
"signoz_provider_native_real_alert_gap"
|
||
],
|
||
"read_only_denials_total": 12,
|
||
"surfaces_requiring_action": [
|
||
"grafana_dashboard_inventory",
|
||
"prometheus_alert_rule_catalog"
|
||
],
|
||
"proposal_only_count": 5
|
||
},
|
||
"observability_surfaces": [
|
||
{
|
||
"surface_id": "prometheus_alert_rule_catalog",
|
||
"display_name": "Prometheus 告警規則合約",
|
||
"kind": "prometheus_rules",
|
||
"status": "action_required",
|
||
"risk_level": "critical",
|
||
"evidence_status": "committed_manifest",
|
||
"noise_policy_status": "proposal_only",
|
||
"coverage_contract": "已提交 ops/monitoring/alerts-unified.yml 與 k8s/monitoring/* 規則;本快照只盤點規則、label、runbook 與分類缺口,不 reload Prometheus、不修改 alert rules。",
|
||
"current_contract": "committed ops/monitoring/alerts-unified.yml 目前含 118 條 alert;LOGBOOK 曾記錄 production Prometheus rule count 142,需以正式 smoke 讀回確認。",
|
||
"evidence_refs": [
|
||
"ops/monitoring/alerts-unified.yml",
|
||
"ops/monitoring/alerts.yml",
|
||
"k8s/monitoring/alert-chain-monitor.yaml",
|
||
"docs/LOGBOOK.md"
|
||
],
|
||
"next_action": "建立 alert_rule_catalog seed 與噪音率觀察 proposal;任何 rule 調整放到 P2-003 人工批准。"
|
||
},
|
||
{
|
||
"surface_id": "alertmanager_awoooi_route",
|
||
"display_name": "Alertmanager → AWOOOI API 路由",
|
||
"kind": "alertmanager_route",
|
||
"status": "verified",
|
||
"risk_level": "critical",
|
||
"evidence_status": "committed_manifest",
|
||
"noise_policy_status": "proposal_only",
|
||
"coverage_contract": "Alertmanager receiver 必須指向 AWOOOI API;OpenClaw 只做 AI 分析,不得成為 Alertmanager receiver。",
|
||
"current_contract": "ops/alertmanager/alertmanager.yml 以 awoooi-webhook 為主路徑,telegram-direct 僅限 alert-chain/API health 緊急旁路;group_by/team/alertname/severity 已存在。",
|
||
"evidence_refs": [
|
||
"docs/HARD_RULES.md#alertmanager-routing",
|
||
"ops/alertmanager/alertmanager.yml"
|
||
],
|
||
"next_action": "只提出 group_by、inhibit、repeat interval 降噪 proposal;不得直接改 receiver、route 或 silence。"
|
||
},
|
||
{
|
||
"surface_id": "signoz_clickhouse_ingestion",
|
||
"display_name": "SigNoz / ClickHouse / Provider Webhook",
|
||
"kind": "signoz_clickhouse",
|
||
"status": "verified",
|
||
"risk_level": "high",
|
||
"evidence_status": "production_readback_recorded",
|
||
"noise_policy_status": "preserved",
|
||
"coverage_contract": "SigNoz webhook、ClickHouse TTL、OTEL prometheus receiver 與 source provider heartbeat 需分開標示;heartbeat 不是 provider-native 真實告警。",
|
||
"current_contract": "ops/signoz/alerting/rules.yaml、log-rules.md 與 RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK 已描述 webhook / rules;LOGBOOK 記錄 SigNoz webhook 與 source provider heartbeat 多次通過。",
|
||
"evidence_refs": [
|
||
"ops/signoz/alerting/rules.yaml",
|
||
"ops/signoz/alerting/log-rules.md",
|
||
"ops/signoz/otel-collector-config-phase-o.yaml",
|
||
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md",
|
||
"docs/adr/ADR-053-observability-signoz-unified-architecture.md",
|
||
"docs/LOGBOOK.md"
|
||
],
|
||
"next_action": "保留 provider heartbeat / upstream canary 低噪音;補 provider-native 真實告警與 incident correlation gap 的只讀看板。"
|
||
},
|
||
{
|
||
"surface_id": "grafana_dashboard_inventory",
|
||
"display_name": "Grafana Dashboard / Alert Chain 視覺化",
|
||
"kind": "grafana_dashboard",
|
||
"status": "action_required",
|
||
"risk_level": "medium",
|
||
"evidence_status": "committed_manifest",
|
||
"noise_policy_status": "needs_proposal",
|
||
"coverage_contract": "目前只確認 committed dashboard JSON;本快照不呼叫 Grafana API、不匯入 dashboard、不改 datasource。",
|
||
"current_contract": "ai-monitoring dashboard 包含 Alert Chain 健康與最後成功時間;infra-monitoring dashboard 包含 Prometheus target up/down 與 API request rate。",
|
||
"evidence_refs": [
|
||
"ops/grafana/dashboards/ai-monitoring.json",
|
||
"ops/grafana/dashboards/infra-monitoring.json"
|
||
],
|
||
"next_action": "補 dashboard owner、datasource parity、正式站可讀性與 alert-chain panel fresh readback;寫入或 import 需另案批准。"
|
||
},
|
||
{
|
||
"surface_id": "sentry_source_link_canary",
|
||
"display_name": "Sentry Webhook / Source Link Canary",
|
||
"kind": "sentry_source_link",
|
||
"status": "verified",
|
||
"risk_level": "high",
|
||
"evidence_status": "production_readback_recorded",
|
||
"noise_policy_status": "preserved",
|
||
"coverage_contract": "Sentry webhook 與 source-link canary 用來驗證來源鏈路,不能被誤讀成真實 provider alert 全部已關聯。",
|
||
"current_contract": "LOGBOOK 記錄 Alertmanager / SigNoz / Sentry webhook 與 Source Link Canary 通過,且 source provider freshness / incident matching 必須分開判斷。",
|
||
"evidence_refs": [
|
||
"docs/adr/ADR-022-sentry-integration-architecture.md",
|
||
"docs/LOGBOOK.md"
|
||
],
|
||
"next_action": "持續把 heartbeat、upstream canary、direct/candidate/applied source link 分開呈現;不修改 Sentry project webhook。"
|
||
},
|
||
{
|
||
"surface_id": "otel_event_exporter_bridge",
|
||
"display_name": "OTEL Collector / Event Exporter",
|
||
"kind": "otel_event_exporter",
|
||
"status": "verified",
|
||
"risk_level": "medium",
|
||
"evidence_status": "committed_manifest",
|
||
"noise_policy_status": "preserved",
|
||
"coverage_contract": "OTEL Collector DaemonSet 與 SigNoz prometheus receiver 只作為可觀測來源;本快照不部署 collector、不重啟 exporter。",
|
||
"current_contract": "k8s/observability/otel-collector-daemonset.yaml 與 ops/signoz/otel-collector-config-phase-o.yaml 描述 log/metric/trace pipeline;LOGBOOK 記錄 OTEL Collector / Event Exporter post-deploy smoke 通過。",
|
||
"evidence_refs": [
|
||
"k8s/observability/otel-collector-daemonset.yaml",
|
||
"ops/signoz/otel-collector-config-phase-o.yaml",
|
||
"docs/LOGBOOK.md"
|
||
],
|
||
"next_action": "把 collector/exporter health 放入 observability readiness;任何 deploy / restart 仍需獨立批准。"
|
||
}
|
||
],
|
||
"noise_reduction_opportunities": [
|
||
{
|
||
"opportunity_id": "prometheus_noise_rule_tuning",
|
||
"display_name": "Prometheus 告警噪音調整提案",
|
||
"status": "approval_required",
|
||
"proposal_only": true,
|
||
"impact": "降低 stale provider、低樣本 SLO、重複 resource alert 對 operator 的干擾;不得直接修改 alert rules。",
|
||
"target_surface_ids": [
|
||
"prometheus_alert_rule_catalog"
|
||
],
|
||
"evidence_refs": [
|
||
"ops/monitoring/alerts-unified.yml",
|
||
"docs/adr/ADR-090-monitoring-blindspot-governance.md"
|
||
],
|
||
"next_action": "進 P2-003 建立人工批准包,先收集 24h alert frequency / fingerprint evidence。"
|
||
},
|
||
{
|
||
"opportunity_id": "alertmanager_grouping_inhibit_tuning",
|
||
"display_name": "Alertmanager grouping / inhibit 降噪提案",
|
||
"status": "approval_required",
|
||
"proposal_only": true,
|
||
"impact": "針對同 team / alertname / severity 的爆量與 Host/K8s 重複告警做提案,不變更 receiver。",
|
||
"target_surface_ids": [
|
||
"alertmanager_awoooi_route"
|
||
],
|
||
"evidence_refs": [
|
||
"ops/alertmanager/alertmanager.yml",
|
||
"docs/HARD_RULES.md#alertmanager-routing"
|
||
],
|
||
"next_action": "產生 diff proposal 與 rollback plan;未批准前不得 reload Alertmanager。"
|
||
},
|
||
{
|
||
"opportunity_id": "success_notification_quiet_policy",
|
||
"display_name": "Provider heartbeat 與真實告警分流",
|
||
"status": "ready_for_proposal",
|
||
"proposal_only": true,
|
||
"impact": "避免把 Sentry / SigNoz heartbeat 誤當真實 provider alert,降低假綠與錯誤升級。",
|
||
"target_surface_ids": [
|
||
"signoz_clickhouse_ingestion",
|
||
"sentry_source_link_canary"
|
||
],
|
||
"evidence_refs": [
|
||
"docs/LOGBOOK.md",
|
||
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md"
|
||
],
|
||
"next_action": "在 UI / API 上維持 heartbeat、upstream canary、direct source link、candidate source link 四種標籤。"
|
||
},
|
||
{
|
||
"opportunity_id": "grafana_dashboard_owner_freshness",
|
||
"display_name": "Grafana dashboard owner / freshness 標籤",
|
||
"status": "ready_for_proposal",
|
||
"proposal_only": true,
|
||
"impact": "讓 dashboard 缺 datasource、缺 owner 或 stale panel 不被誤讀成監控缺失已修復。",
|
||
"target_surface_ids": [
|
||
"grafana_dashboard_inventory"
|
||
],
|
||
"evidence_refs": [
|
||
"ops/grafana/dashboards/ai-monitoring.json",
|
||
"ops/grafana/dashboards/infra-monitoring.json"
|
||
],
|
||
"next_action": "只讀補 owner/freshness matrix;不寫 Grafana。"
|
||
},
|
||
{
|
||
"opportunity_id": "success_notification_quiet_policy",
|
||
"display_name": "成功不洗版 / 失敗才升級",
|
||
"status": "preserved",
|
||
"proposal_only": true,
|
||
"impact": "沿用備份與 Gitea 的 quiet-success 原則,讓 observability smoke 成功證據走 API/LOGBOOK,失敗才通知。",
|
||
"target_surface_ids": [
|
||
"otel_event_exporter_bridge",
|
||
"signoz_clickhouse_ingestion"
|
||
],
|
||
"evidence_refs": [
|
||
"docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md",
|
||
"docs/LOGBOOK.md"
|
||
],
|
||
"next_action": "P1-003 僅記錄;未批准前不送 Telegram 測試通知。"
|
||
}
|
||
],
|
||
"classification_gaps": [
|
||
{
|
||
"gap_id": "prometheus_alert_rule_catalog_seed",
|
||
"display_name": "Alert rule catalog seed 未正式產品化",
|
||
"status": "action_required",
|
||
"severity": "high",
|
||
"summary": "ADR-090 要求 alert_rule_catalog 能追蹤規則資產、noise_rate 與 superseded_by_rule_id;目前 P1-003 只完成只讀矩陣。",
|
||
"evidence_refs": [
|
||
"docs/adr/ADR-090-monitoring-blindspot-governance.md",
|
||
"ops/monitoring/alerts-unified.yml"
|
||
],
|
||
"next_action": "P2-003 前先產生 seed proposal 與 migration/rollback 分離批准包。"
|
||
},
|
||
{
|
||
"gap_id": "signoz_provider_native_real_alert_gap",
|
||
"display_name": "SigNoz provider-native 真實告警證據缺口",
|
||
"status": "action_required",
|
||
"severity": "medium",
|
||
"summary": "Heartbeat / upstream canary 能證明管道新鮮,但不等於每種 provider-native alert 都已接到 incident correlation。",
|
||
"evidence_refs": [
|
||
"docs/LOGBOOK.md",
|
||
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md"
|
||
],
|
||
"next_action": "只讀列出 provider-native alert coverage;需要 side effect 的 signed canary 另案批准。"
|
||
},
|
||
{
|
||
"gap_id": "grafana_dashboard_owner_status",
|
||
"display_name": "Grafana dashboard owner / datasource 狀態未連到治理頁",
|
||
"status": "action_required",
|
||
"severity": "medium",
|
||
"summary": "Committed dashboard JSON 存在,但尚未顯示 datasource freshness、owner、last import 或 panel stale 狀態。",
|
||
"evidence_refs": [
|
||
"ops/grafana/dashboards/ai-monitoring.json",
|
||
"ops/grafana/dashboards/infra-monitoring.json"
|
||
],
|
||
"next_action": "下一輪只讀補 dashboard readiness,不呼叫 Grafana write API。"
|
||
}
|
||
],
|
||
"latest_observations": [
|
||
{
|
||
"observation_id": "alertmanager_receiver_guard",
|
||
"status": "verified",
|
||
"summary": "HARD_RULES 與 ops/alertmanager/alertmanager.yml 都保留 Alertmanager 指向 AWOOOI API 的邊界;OpenClaw 不得成為 receiver。",
|
||
"evidence_refs": [
|
||
"docs/HARD_RULES.md#alertmanager-routing",
|
||
"ops/alertmanager/alertmanager.yml"
|
||
]
|
||
},
|
||
{
|
||
"observation_id": "prometheus_rule_source_split",
|
||
"status": "action_required",
|
||
"summary": "committed Prometheus 規則分散於 ops/monitoring 與 k8s/monitoring;P1-003 建立 matrix,尚未調整規則或 reload。",
|
||
"evidence_refs": [
|
||
"ops/monitoring/alerts-unified.yml",
|
||
"k8s/monitoring/alert-chain-monitor.yaml"
|
||
]
|
||
},
|
||
{
|
||
"observation_id": "post_deploy_observability_smoke_history",
|
||
"status": "verified",
|
||
"summary": "LOGBOOK 已多次記錄 Alertmanager / SigNoz / Sentry webhook、SigNoz、OTEL Collector、Event Exporter post-deploy smoke 通過。",
|
||
"evidence_refs": [
|
||
"docs/LOGBOOK.md"
|
||
]
|
||
}
|
||
],
|
||
"operator_contract": {
|
||
"display_mode": "read_only_observability_contract_matrix",
|
||
"must_not_interpret_as": [
|
||
"Prometheus alert rule 修改批准",
|
||
"Alertmanager receiver / route 修改批准",
|
||
"Alertmanager 指向 OpenClaw receiver 批准",
|
||
"Silence 建立或維護窗口批准",
|
||
"Grafana dashboard 寫入批准",
|
||
"SigNoz / Sentry webhook 設定修改批准",
|
||
"Secret 已讀取或可輸出",
|
||
"Telegram 測試通知批准",
|
||
"deploy / reload / workflow 觸發批准",
|
||
"runtime execution 授權"
|
||
],
|
||
"secret_display_policy": "只允許顯示 committed file refs、endpoint role 與 redacted metadata;不得顯示 token、webhook secret 或 authorization header。",
|
||
"alertmanager_route_policy": "Alertmanager webhook 必須指向 AWOOOI API;OpenClaw 不接收 Alertmanager webhook,只能在 API 持久化與分類後參與只讀分析。",
|
||
"noise_reduction_policy": "P1-003 僅產生 proposal;P2-003 或任何 route/rule/silence 變更需人工批准。",
|
||
"notification_policy": "成功 smoke 不即時通知洗版;失敗、action-required 或人工作業才可進通知批准流程。"
|
||
},
|
||
"operation_boundaries": {
|
||
"read_only_api_allowed": true,
|
||
"prometheus_rule_write_allowed": false,
|
||
"prometheus_reload_allowed": false,
|
||
"alertmanager_route_write_allowed": false,
|
||
"alertmanager_receiver_change_allowed": false,
|
||
"alertmanager_to_openclaw_allowed": false,
|
||
"silence_create_allowed": false,
|
||
"grafana_dashboard_write_allowed": false,
|
||
"grafana_api_write_allowed": false,
|
||
"signoz_query_mutation_allowed": false,
|
||
"signoz_webhook_change_allowed": false,
|
||
"sentry_webhook_change_allowed": false,
|
||
"otel_collector_deploy_allowed": false,
|
||
"event_exporter_restart_allowed": false,
|
||
"secret_read_allowed": false,
|
||
"secret_plaintext_allowed": false,
|
||
"notification_send_allowed": false,
|
||
"external_api_call_allowed": false,
|
||
"live_prometheus_query_allowed": false,
|
||
"workflow_trigger_allowed": false,
|
||
"deploy_trigger_allowed": false,
|
||
"reload_trigger_allowed": false,
|
||
"runtime_execution_allowed": false
|
||
},
|
||
"approval_boundaries": {
|
||
"prometheus_rule_change_authorized": false,
|
||
"prometheus_reload_authorized": false,
|
||
"alertmanager_route_change_authorized": false,
|
||
"alertmanager_receiver_change_authorized": false,
|
||
"alertmanager_to_openclaw_authorized": false,
|
||
"silence_authorized": false,
|
||
"grafana_write_authorized": false,
|
||
"signoz_write_authorized": false,
|
||
"sentry_write_authorized": false,
|
||
"otel_deploy_authorized": false,
|
||
"event_exporter_restart_authorized": false,
|
||
"notification_send_authorized": false,
|
||
"external_call_authorized": false,
|
||
"secret_plaintext_allowed": false,
|
||
"workflow_trigger_authorized": false,
|
||
"deploy_reload_authorized": false,
|
||
"runtime_execution_authorized": false
|
||
}
|
||
}
|