Files
awoooi/docs/evaluations/observability_contract_matrix_2026-06-05.json
Your Name 4944d77093
All checks were successful
CD Pipeline / tests (push) Successful in 1m29s
Code Review / ai-code-review (push) Successful in 16s
CD Pipeline / build-and-deploy (push) Successful in 4m45s
CD Pipeline / post-deploy-checks (push) Successful in 1m31s
feat(governance): 新增監控合約降噪矩陣
2026-06-05 12:44:47 +08:00

392 lines
17 KiB
JSON
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"schema_version": "observability_contract_matrix_v1",
"generated_at": "2026-06-05T12:24:00+08:00",
"program_status": {
"overall_completion_percent": 100,
"current_priority": "P1",
"current_task_id": "P1-003",
"next_task_id": "P1-004",
"read_only_mode": true
},
"source_refs": [
"docs/schemas/observability_contract_matrix_v1.schema.json",
"docs/HARD_RULES.md#alertmanager-routing",
"ops/alertmanager/alertmanager.yml",
"ops/monitoring/alerts.yml",
"ops/monitoring/alerts-unified.yml",
"k8s/monitoring/prometheus.yml",
"k8s/monitoring/alert-chain-monitor.yaml",
"ops/grafana/dashboards/ai-monitoring.json",
"ops/grafana/dashboards/infra-monitoring.json",
"ops/signoz/alerting/rules.yaml",
"ops/signoz/alerting/log-rules.md",
"ops/signoz/otel-collector-config-phase-o.yaml",
"k8s/observability/otel-collector-daemonset.yaml",
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md",
"docs/adr/ADR-053-observability-signoz-unified-architecture.md",
"docs/adr/ADR-090-monitoring-blindspot-governance.md",
"docs/LOGBOOK.md"
],
"rollups": {
"total_surfaces": 6,
"by_kind": {
"prometheus_rules": 1,
"alertmanager_route": 1,
"signoz_clickhouse": 1,
"grafana_dashboard": 1,
"sentry_source_link": 1,
"otel_event_exporter": 1
},
"by_status": {
"action_required": 2,
"verified": 4
},
"by_evidence_status": {
"committed_manifest": 4,
"production_readback_recorded": 2
},
"by_noise_policy_status": {
"proposal_only": 2,
"preserved": 3,
"needs_proposal": 1
},
"surface_ids_requiring_action": [
"grafana_dashboard_inventory",
"prometheus_alert_rule_catalog"
],
"surface_ids_with_proposal_only_noise_policy": [
"alertmanager_awoooi_route",
"prometheus_alert_rule_catalog"
],
"noise_reduction_opportunities_total": 5,
"approval_required_opportunity_ids": [
"alertmanager_grouping_inhibit_tuning",
"prometheus_noise_rule_tuning"
],
"classification_gap_ids": [
"grafana_dashboard_owner_status",
"prometheus_alert_rule_catalog_seed",
"signoz_provider_native_real_alert_gap"
],
"read_only_denials_total": 12,
"surfaces_requiring_action": [
"grafana_dashboard_inventory",
"prometheus_alert_rule_catalog"
],
"proposal_only_count": 5
},
"observability_surfaces": [
{
"surface_id": "prometheus_alert_rule_catalog",
"display_name": "Prometheus 告警規則合約",
"kind": "prometheus_rules",
"status": "action_required",
"risk_level": "critical",
"evidence_status": "committed_manifest",
"noise_policy_status": "proposal_only",
"coverage_contract": "已提交 ops/monitoring/alerts-unified.yml 與 k8s/monitoring/* 規則;本快照只盤點規則、label、runbook 與分類缺口,不 reload Prometheus、不修改 alert rules。",
"current_contract": "committed ops/monitoring/alerts-unified.yml 目前含 118 條 alert;LOGBOOK 曾記錄 production Prometheus rule count 142,需以正式 smoke 讀回確認。",
"evidence_refs": [
"ops/monitoring/alerts-unified.yml",
"ops/monitoring/alerts.yml",
"k8s/monitoring/alert-chain-monitor.yaml",
"docs/LOGBOOK.md"
],
"next_action": "建立 alert_rule_catalog seed 與噪音率觀察 proposal;任何 rule 調整放到 P2-003 人工批准。"
},
{
"surface_id": "alertmanager_awoooi_route",
"display_name": "Alertmanager → AWOOOI API 路由",
"kind": "alertmanager_route",
"status": "verified",
"risk_level": "critical",
"evidence_status": "committed_manifest",
"noise_policy_status": "proposal_only",
"coverage_contract": "Alertmanager receiver 必須指向 AWOOOI API;OpenClaw 只做 AI 分析,不得成為 Alertmanager receiver。",
"current_contract": "ops/alertmanager/alertmanager.yml 以 awoooi-webhook 為主路徑;telegram-direct 僅限 alert-chain/API health 緊急旁路;group_by/team/alertname/severity 已存在。",
"evidence_refs": [
"docs/HARD_RULES.md#alertmanager-routing",
"ops/alertmanager/alertmanager.yml"
],
"next_action": "只提出 group_by、inhibit、repeat interval 降噪 proposal;不得直接改 receiver、route 或 silence。"
},
{
"surface_id": "signoz_clickhouse_ingestion",
"display_name": "SigNoz / ClickHouse / Provider Webhook",
"kind": "signoz_clickhouse",
"status": "verified",
"risk_level": "high",
"evidence_status": "production_readback_recorded",
"noise_policy_status": "preserved",
"coverage_contract": "SigNoz webhook、ClickHouse TTL、OTEL prometheus receiver 與 source provider heartbeat 需分開標示;heartbeat 不是 provider-native 真實告警。",
"current_contract": "ops/signoz/alerting/rules.yaml、log-rules.md 與 RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK 已描述 webhook / rules;LOGBOOK 記錄 SigNoz webhook 與 source provider heartbeat 多次通過。",
"evidence_refs": [
"ops/signoz/alerting/rules.yaml",
"ops/signoz/alerting/log-rules.md",
"ops/signoz/otel-collector-config-phase-o.yaml",
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md",
"docs/adr/ADR-053-observability-signoz-unified-architecture.md",
"docs/LOGBOOK.md"
],
"next_action": "保留 provider heartbeat / upstream canary 低噪音;補 provider-native 真實告警與 incident correlation gap 的只讀看板。"
},
{
"surface_id": "grafana_dashboard_inventory",
"display_name": "Grafana Dashboard / Alert Chain 視覺化",
"kind": "grafana_dashboard",
"status": "action_required",
"risk_level": "medium",
"evidence_status": "committed_manifest",
"noise_policy_status": "needs_proposal",
"coverage_contract": "目前只確認 committed dashboard JSON;本快照不呼叫 Grafana API、不匯入 dashboard、不改 datasource。",
"current_contract": "ai-monitoring dashboard 包含 Alert Chain 健康與最後成功時間;infra-monitoring dashboard 包含 Prometheus target up/down 與 API request rate。",
"evidence_refs": [
"ops/grafana/dashboards/ai-monitoring.json",
"ops/grafana/dashboards/infra-monitoring.json"
],
"next_action": "補 dashboard owner、datasource parity、正式站可讀性與 alert-chain panel fresh readback;寫入或 import 需另案批准。"
},
{
"surface_id": "sentry_source_link_canary",
"display_name": "Sentry Webhook / Source Link Canary",
"kind": "sentry_source_link",
"status": "verified",
"risk_level": "high",
"evidence_status": "production_readback_recorded",
"noise_policy_status": "preserved",
"coverage_contract": "Sentry webhook 與 source-link canary 用來驗證來源鏈路,不能被誤讀成真實 provider alert 全部已關聯。",
"current_contract": "LOGBOOK 記錄 Alertmanager / SigNoz / Sentry webhook 與 Source Link Canary 通過,且 source provider freshness / incident matching 必須分開判斷。",
"evidence_refs": [
"docs/adr/ADR-022-sentry-integration-architecture.md",
"docs/LOGBOOK.md"
],
"next_action": "持續把 heartbeat、upstream canary、direct/candidate/applied source link 分開呈現;不修改 Sentry project webhook。"
},
{
"surface_id": "otel_event_exporter_bridge",
"display_name": "OTEL Collector / Event Exporter",
"kind": "otel_event_exporter",
"status": "verified",
"risk_level": "medium",
"evidence_status": "committed_manifest",
"noise_policy_status": "preserved",
"coverage_contract": "OTEL Collector DaemonSet 與 SigNoz prometheus receiver 只作為可觀測來源;本快照不部署 collector、不重啟 exporter。",
"current_contract": "k8s/observability/otel-collector-daemonset.yaml 與 ops/signoz/otel-collector-config-phase-o.yaml 描述 log/metric/trace pipeline;LOGBOOK 記錄 OTEL Collector / Event Exporter post-deploy smoke 通過。",
"evidence_refs": [
"k8s/observability/otel-collector-daemonset.yaml",
"ops/signoz/otel-collector-config-phase-o.yaml",
"docs/LOGBOOK.md"
],
"next_action": "把 collector/exporter health 放入 observability readiness;任何 deploy / restart 仍需獨立批准。"
}
],
"noise_reduction_opportunities": [
{
"opportunity_id": "prometheus_noise_rule_tuning",
"display_name": "Prometheus 告警噪音調整提案",
"status": "approval_required",
"proposal_only": true,
"impact": "降低 stale provider、低樣本 SLO、重複 resource alert 對 operator 的干擾;不得直接修改 alert rules。",
"target_surface_ids": [
"prometheus_alert_rule_catalog"
],
"evidence_refs": [
"ops/monitoring/alerts-unified.yml",
"docs/adr/ADR-090-monitoring-blindspot-governance.md"
],
"next_action": "進 P2-003 建立人工批准包,先收集 24h alert frequency / fingerprint evidence。"
},
{
"opportunity_id": "alertmanager_grouping_inhibit_tuning",
"display_name": "Alertmanager grouping / inhibit 降噪提案",
"status": "approval_required",
"proposal_only": true,
"impact": "針對同 team / alertname / severity 的爆量與 Host/K8s 重複告警做提案,不變更 receiver。",
"target_surface_ids": [
"alertmanager_awoooi_route"
],
"evidence_refs": [
"ops/alertmanager/alertmanager.yml",
"docs/HARD_RULES.md#alertmanager-routing"
],
"next_action": "產生 diff proposal 與 rollback plan;未批准前不得 reload Alertmanager。"
},
{
"opportunity_id": "provider_heartbeat_real_alert_split",
"display_name": "Provider heartbeat 與真實告警分流",
"status": "ready_for_proposal",
"proposal_only": true,
"impact": "避免把 Sentry / SigNoz heartbeat 誤當真實 provider alert,降低假綠與錯誤升級。",
"target_surface_ids": [
"signoz_clickhouse_ingestion",
"sentry_source_link_canary"
],
"evidence_refs": [
"docs/LOGBOOK.md",
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md"
],
"next_action": "在 UI / API 上維持 heartbeat、upstream canary、direct source link、candidate source link 四種標籤。"
},
{
"opportunity_id": "grafana_dashboard_owner_freshness",
"display_name": "Grafana dashboard owner / freshness 標籤",
"status": "ready_for_proposal",
"proposal_only": true,
"impact": "讓 dashboard 缺 datasource、缺 owner 或 stale panel 不被誤讀成監控缺失已修復。",
"target_surface_ids": [
"grafana_dashboard_inventory"
],
"evidence_refs": [
"ops/grafana/dashboards/ai-monitoring.json",
"ops/grafana/dashboards/infra-monitoring.json"
],
"next_action": "只讀補 owner/freshness matrix,不寫 Grafana。"
},
{
"opportunity_id": "success_notification_quiet_policy",
"display_name": "成功不洗版 / 失敗才升級",
"status": "preserved",
"proposal_only": true,
"impact": "沿用備份與 Gitea 的 quiet-success 原則,讓 observability smoke 成功證據走 API/LOGBOOK,失敗才通知。",
"target_surface_ids": [
"otel_event_exporter_bridge",
"signoz_clickhouse_ingestion"
],
"evidence_refs": [
"docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md",
"docs/LOGBOOK.md"
],
"next_action": "P1-003 僅記錄;未批准前不送 Telegram 測試通知。"
}
],
"classification_gaps": [
{
"gap_id": "prometheus_alert_rule_catalog_seed",
"display_name": "Alert rule catalog seed 未正式產品化",
"status": "action_required",
"severity": "high",
"summary": "ADR-090 要求 alert_rule_catalog 能追蹤規則資產、noise_rate 與 superseded_by_rule_id;目前 P1-003 只完成只讀矩陣。",
"evidence_refs": [
"docs/adr/ADR-090-monitoring-blindspot-governance.md",
"ops/monitoring/alerts-unified.yml"
],
"next_action": "P2-003 前先產生 seed proposal 與 migration/rollback 分離批准包。"
},
{
"gap_id": "signoz_provider_native_real_alert_gap",
"display_name": "SigNoz provider-native 真實告警證據缺口",
"status": "action_required",
"severity": "medium",
"summary": "Heartbeat / upstream canary 能證明管道新鮮,但不等於每種 provider-native alert 都已接到 incident correlation。",
"evidence_refs": [
"docs/LOGBOOK.md",
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md"
],
"next_action": "只讀列出 provider-native alert coverage;需要 side effect 的 signed canary 另案批准。"
},
{
"gap_id": "grafana_dashboard_owner_status",
"display_name": "Grafana dashboard owner / datasource 狀態未連到治理頁",
"status": "action_required",
"severity": "medium",
"summary": "Committed dashboard JSON 存在,但尚未顯示 datasource freshness、owner、last import 或 panel stale 狀態。",
"evidence_refs": [
"ops/grafana/dashboards/ai-monitoring.json",
"ops/grafana/dashboards/infra-monitoring.json"
],
"next_action": "下一輪只讀補 dashboard readiness,不呼叫 Grafana write API。"
}
],
"latest_observations": [
{
"observation_id": "alertmanager_receiver_guard",
"status": "verified",
"summary": "HARD_RULES 與 ops/alertmanager/alertmanager.yml 都保留 Alertmanager 指向 AWOOOI API 的邊界;OpenClaw 不得成為 receiver。",
"evidence_refs": [
"docs/HARD_RULES.md#alertmanager-routing",
"ops/alertmanager/alertmanager.yml"
]
},
{
"observation_id": "prometheus_rule_source_split",
"status": "action_required",
"summary": "committed Prometheus 規則分散於 ops/monitoring 與 k8s/monitoring;P1-003 建立 matrix,尚未調整規則或 reload。",
"evidence_refs": [
"ops/monitoring/alerts-unified.yml",
"k8s/monitoring/alert-chain-monitor.yaml"
]
},
{
"observation_id": "post_deploy_observability_smoke_history",
"status": "verified",
"summary": "LOGBOOK 已多次記錄 Alertmanager / SigNoz / Sentry webhook、SigNoz、OTEL Collector、Event Exporter post-deploy smoke 通過。",
"evidence_refs": [
"docs/LOGBOOK.md"
]
}
],
"operator_contract": {
"display_mode": "read_only_observability_contract_matrix",
"must_not_interpret_as": [
"Prometheus alert rule 修改批准",
"Alertmanager receiver / route 修改批准",
"Alertmanager 指向 OpenClaw receiver 批准",
"Silence 建立或維護窗口批准",
"Grafana dashboard 寫入批准",
"SigNoz / Sentry webhook 設定修改批准",
"Secret 已讀取或可輸出",
"Telegram 測試通知批准",
"deploy / reload / workflow 觸發批准",
"runtime execution 授權"
],
"secret_display_policy": "只允許顯示 committed file refs、endpoint role 與 redacted metadata;不得顯示 token、webhook secret 或 authorization header。",
"alertmanager_route_policy": "Alertmanager webhook 必須指向 AWOOOI API;OpenClaw 不接收 Alertmanager webhook,只能在 API 持久化與分類後參與只讀分析。",
"noise_reduction_policy": "P1-003 僅產生 proposal;P2-003 或任何 route/rule/silence 變更需人工批准。",
"notification_policy": "成功 smoke 不即時通知洗版;失敗、action-required 或人工作業才可進通知批准流程。"
},
"operation_boundaries": {
"read_only_api_allowed": true,
"prometheus_rule_write_allowed": false,
"prometheus_reload_allowed": false,
"alertmanager_route_write_allowed": false,
"alertmanager_receiver_change_allowed": false,
"alertmanager_to_openclaw_allowed": false,
"silence_create_allowed": false,
"grafana_dashboard_write_allowed": false,
"grafana_api_write_allowed": false,
"signoz_query_mutation_allowed": false,
"signoz_webhook_change_allowed": false,
"sentry_webhook_change_allowed": false,
"otel_collector_deploy_allowed": false,
"event_exporter_restart_allowed": false,
"secret_read_allowed": false,
"secret_plaintext_allowed": false,
"notification_send_allowed": false,
"external_api_call_allowed": false,
"live_prometheus_query_allowed": false,
"workflow_trigger_allowed": false,
"deploy_trigger_allowed": false,
"reload_trigger_allowed": false,
"runtime_execution_allowed": false
},
"approval_boundaries": {
"prometheus_rule_change_authorized": false,
"prometheus_reload_authorized": false,
"alertmanager_route_change_authorized": false,
"alertmanager_receiver_change_authorized": false,
"alertmanager_to_openclaw_authorized": false,
"silence_authorized": false,
"grafana_write_authorized": false,
"signoz_write_authorized": false,
"sentry_write_authorized": false,
"otel_deploy_authorized": false,
"event_exporter_restart_authorized": false,
"notification_send_authorized": false,
"external_call_authorized": false,
"secret_plaintext_allowed": false,
"workflow_trigger_authorized": false,
"deploy_reload_authorized": false,
"runtime_execution_authorized": false
}
}