feat(governance): 新增監控合約降噪矩陣
All checks were successful
CD Pipeline / tests (push) Successful in 1m29s
Code Review / ai-code-review (push) Successful in 16s
CD Pipeline / build-and-deploy (push) Successful in 4m45s
CD Pipeline / post-deploy-checks (push) Successful in 1m31s

This commit is contained in:
Your Name
2026-06-05 12:44:47 +08:00
parent 0980ae3e49
commit 4944d77093
17 changed files with 1733 additions and 64 deletions

View File

@@ -65,6 +65,9 @@ from src.services.runtime_surface_inventory import (
from src.services.gitea_workflow_runner_health import (
load_latest_gitea_workflow_runner_health,
)
from src.services.observability_contract_matrix import (
load_latest_observability_contract_matrix,
)
from src.services.package_supply_chain_inventory import (
load_latest_package_supply_chain_inventory,
)
@@ -536,6 +539,34 @@ async def get_gitea_workflow_runner_health() -> dict[str, Any]:
) from exc
@router.get(
    "/observability-contract-matrix",
    response_model=dict[str, Any],
    summary="取得監控合約與降噪機會矩陣",
    description=(
        "讀取最新已提交的 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry "
        "只讀 observability matrix此端點不修改 alert rules、不呼叫 silence API、"
        "不建立 Grafana dashboard、不改 SigNoz / Sentry 設定、不讀 Secret payload、"
        "不送 Telegram 測試通知、不觸發 monitoring deploy。"
    ),
)
async def get_observability_contract_matrix() -> dict[str, Any]:
    """Serve the newest committed observability contract matrix snapshot.

    The synchronous loader is run through ``asyncio.to_thread`` so its file
    access does not block the event loop.

    Raises:
        HTTPException: 404 when the loader raises ``FileNotFoundError``;
            500 when the snapshot fails JSON decoding or validation.
    """
    try:
        matrix = await asyncio.to_thread(load_latest_observability_contract_matrix)
    except FileNotFoundError as exc:
        not_found = HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(exc),
        )
        raise not_found from exc
    except (json.JSONDecodeError, ValueError) as exc:
        # Log the validation detail server-side; the client only gets a
        # generic "snapshot invalid" message.
        logger.error("observability_contract_matrix_invalid", error=str(exc))
        invalid = HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="監控合約與降噪機會矩陣快照無效",
        )
        raise invalid from exc
    return matrix
@router.get(
"/backup-dr-target-inventory",
response_model=dict[str, Any],

View File

@@ -0,0 +1,232 @@
"""
Observability contract and noise-reduction matrix snapshot.
Loads the latest committed, read-only Prometheus / Alertmanager / SigNoz /
Grafana observability contract matrix. This module never mutates alert rules,
routes, receivers, silences, dashboards, webhooks, collectors, secrets,
notifications, workflows, or runtime state.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from src.services.snapshot_paths import default_evaluations_dir
# Default directory searched for snapshots when the caller passes none.
# NOTE(review): the exact location is decided by default_evaluations_dir(),
# which is defined in src.services.snapshot_paths and not visible here.
_DEFAULT_EVALUATIONS_DIR = default_evaluations_dir(Path(__file__))
# Glob pattern used to find snapshot files inside the evaluations directory.
_SNAPSHOT_PATTERN = "observability_contract_matrix_*.json"
# The only schema_version value the loader accepts.
_SCHEMA_VERSION = "observability_contract_matrix_v1"
def load_latest_observability_contract_matrix(
    evaluations_dir: Path | None = None,
) -> dict[str, Any]:
    """Load and validate the newest observability contract matrix snapshot.

    "Newest" means the path that sorts last among the files matching the
    snapshot glob pattern in the evaluations directory.

    Args:
        evaluations_dir: Directory to search; falls back to the module
            default when omitted.

    Returns:
        The parsed snapshot, unchanged, once every validator has passed.

    Raises:
        FileNotFoundError: No file matches the snapshot pattern.
        ValueError: The file is not a JSON object or fails a validator
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    directory = evaluations_dir or _DEFAULT_EVALUATIONS_DIR
    latest = max(directory.glob(_SNAPSHOT_PATTERN), default=None)
    if latest is None:
        raise FileNotFoundError(f"no observability contract matrix snapshots found in {directory}")

    label = str(latest)
    payload = json.loads(latest.read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        raise ValueError(f"{label}: expected JSON object")

    _require_schema(payload, _SCHEMA_VERSION, label)
    # Validators run in a fixed order; the first failure is the one reported.
    validators = (
        _require_read_only_boundaries,
        _require_operation_boundaries,
        _require_rollup_consistency,
        _require_surface_evidence,
        _require_noise_opportunities,
        _require_operator_denials,
        _require_no_plaintext_secret_payload_keys,
    )
    for validate in validators:
        validate(payload, label)
    return payload
def _require_schema(payload: dict[str, Any], expected: str, label: str) -> None:
    """Raise ``ValueError`` unless ``payload['schema_version']`` equals ``expected``."""
    actual = payload.get("schema_version")
    if actual == expected:
        return
    raise ValueError(f"{label}: expected schema_version={expected}, got {actual!r}")
def _require_read_only_boundaries(payload: dict[str, Any], label: str) -> None:
    """Check the snapshot declares read-only mode and grants no approvals.

    Raises:
        ValueError: ``program_status.read_only_mode`` is not exactly ``True``,
            or any entry of ``approval_boundaries`` is not exactly ``False``.
    """
    status_section = payload.get("program_status") or {}
    if status_section.get("read_only_mode") is not True:
        raise ValueError(f"{label}: program_status.read_only_mode must be true")

    approvals = payload.get("approval_boundaries") or {}
    # Identity check on purpose: 0, None or "" do not count as an explicit False.
    granted = [name for name, flag in approvals.items() if flag is not False]
    if granted:
        granted.sort()
        raise ValueError(f"{label}: approval boundaries must remain false: {granted}")
def _require_operation_boundaries(payload: dict[str, Any], label: str) -> None:
    """Check that only the read-only API is permitted by ``operation_boundaries``.

    Raises:
        ValueError: ``read_only_api_allowed`` is not exactly ``True``, or any
            of the blocked flags is missing or not exactly ``False``.
    """
    boundaries = payload.get("operation_boundaries") or {}
    if boundaries.get("read_only_api_allowed") is not True:
        raise ValueError(f"{label}: read_only_api_allowed must be true")

    blocked_flags = (
        "prometheus_rule_write_allowed",
        "prometheus_reload_allowed",
        "alertmanager_route_write_allowed",
        "alertmanager_receiver_change_allowed",
        "alertmanager_to_openclaw_allowed",
        "silence_create_allowed",
        "grafana_dashboard_write_allowed",
        "grafana_api_write_allowed",
        "signoz_query_mutation_allowed",
        "signoz_webhook_change_allowed",
        "sentry_webhook_change_allowed",
        "otel_collector_deploy_allowed",
        "event_exporter_restart_allowed",
        "secret_read_allowed",
        "secret_plaintext_allowed",
        "notification_send_allowed",
        "external_api_call_allowed",
        "live_prometheus_query_allowed",
        "workflow_trigger_allowed",
        "deploy_trigger_allowed",
        "reload_trigger_allowed",
        "runtime_execution_allowed",
    )
    # A flag that is absent counts as a violation: it must be stated as False.
    violations = [name for name in blocked_flags if boundaries.get(name) is not False]
    if violations:
        violations.sort()
        raise ValueError(f"{label}: operation boundaries must remain false: {violations}")
def _require_rollup_consistency(payload: dict[str, Any], label: str) -> None:
    """Check that every ``rollups`` figure matches the detail rows it summarises.

    Compared in order: the surface total, the four ``by_*`` counters, the
    action-required and proposal-only surface id lists, the opportunity total,
    the approval-required opportunity ids, and the classification gap ids.
    Id lists are compared order-insensitively.

    Raises:
        ValueError: A rollup value disagrees with the rows. Rows with a
            missing (``None``) id also end here instead of raising a
            ``TypeError`` while sorting.
    """

    def _ordered(values: Any) -> list[Any]:
        # Plain sorted() raises TypeError on None mixed with str. Keying on
        # (type name, text) keeps string-only lists in their usual order and
        # lets malformed lists be compared.
        return sorted(values, key=lambda item: (type(item).__name__, str(item)))

    surfaces = payload.get("observability_surfaces") or []
    opportunities = payload.get("noise_reduction_opportunities") or []
    gaps = payload.get("classification_gaps") or []
    rollups = payload.get("rollups") or {}

    if rollups.get("total_surfaces") != len(surfaces):
        raise ValueError(f"{label}: rollups.total_surfaces must match observability_surfaces")

    counter_fields = (
        ("by_kind", "kind"),
        ("by_status", "status"),
        ("by_evidence_status", "evidence_status"),
        ("by_noise_policy_status", "noise_policy_status"),
    )
    for rollup_key, field in counter_fields:
        if rollups.get(rollup_key) != _count_by(surfaces, field):
            raise ValueError(f"{label}: rollups.{rollup_key} must match observability_surfaces")

    action_required = _ordered(
        surface.get("surface_id")
        for surface in surfaces
        if surface.get("status") == "action_required"
    )
    if _ordered(rollups.get("surface_ids_requiring_action") or []) != action_required:
        raise ValueError(f"{label}: rollups.surface_ids_requiring_action must match surfaces")

    proposal_only_surfaces = _ordered(
        surface.get("surface_id")
        for surface in surfaces
        if surface.get("noise_policy_status") == "proposal_only"
    )
    if _ordered(rollups.get("surface_ids_with_proposal_only_noise_policy") or []) != proposal_only_surfaces:
        raise ValueError(
            f"{label}: rollups.surface_ids_with_proposal_only_noise_policy must match surfaces"
        )

    approval_required = _ordered(
        opportunity.get("opportunity_id")
        for opportunity in opportunities
        if opportunity.get("status") == "approval_required"
    )
    if rollups.get("noise_reduction_opportunities_total") != len(opportunities):
        raise ValueError(f"{label}: rollups.noise_reduction_opportunities_total must match opportunities")
    if _ordered(rollups.get("approval_required_opportunity_ids") or []) != approval_required:
        raise ValueError(f"{label}: rollups.approval_required_opportunity_ids must match opportunities")

    gap_ids = _ordered(gap.get("gap_id") for gap in gaps)
    if _ordered(rollups.get("classification_gap_ids") or []) != gap_ids:
        raise ValueError(f"{label}: rollups.classification_gap_ids must match classification_gaps")
def _require_surface_evidence(payload: dict[str, Any], label: str) -> None:
    """Check every surface carries a contract, evidence refs and a next action.

    Raises:
        ValueError: At least one surface has an empty or missing
            ``coverage_contract``, ``evidence_refs`` or ``next_action``. The
            message lists the offending ``surface_id`` values, with ``None``
            standing for a surface that has no id.
    """
    surfaces = payload.get("observability_surfaces") or []
    incomplete = [
        surface.get("surface_id")
        for surface in surfaces
        if not (
            surface.get("coverage_contract")
            and surface.get("evidence_refs")
            and surface.get("next_action")
        )
    ]
    if incomplete:
        # An incomplete surface may also lack its id. Plain sorted() would
        # raise TypeError on None mixed with str and hide the validation
        # error, so sort on (type name, text) instead.
        incomplete.sort(key=lambda surface_id: (type(surface_id).__name__, str(surface_id)))
        raise ValueError(f"{label}: observability_surfaces must include contract, evidence, next_action: {incomplete}")
def _require_noise_opportunities(payload: dict[str, Any], label: str) -> None:
    """Check noise-reduction opportunities are proposals and the core set exists.

    Raises:
        ValueError: An opportunity has ``proposal_only`` other than exactly
            ``True`` (the message lists its id, ``None`` when the id is
            absent), or one of the three required opportunity ids is missing.
    """
    opportunities = payload.get("noise_reduction_opportunities") or []
    non_proposal = [
        opportunity.get("opportunity_id")
        for opportunity in opportunities
        if opportunity.get("proposal_only") is not True
    ]
    if non_proposal:
        # Plain sorted() raises TypeError on None mixed with str, which would
        # replace the validation error; sort on (type name, text) instead.
        non_proposal.sort(key=lambda opportunity_id: (type(opportunity_id).__name__, str(opportunity_id)))
        raise ValueError(f"{label}: noise opportunities must stay proposal_only: {non_proposal}")

    required_ids = {
        "prometheus_noise_rule_tuning",
        "alertmanager_grouping_inhibit_tuning",
        "success_notification_quiet_policy",
    }
    present = {opportunity.get("opportunity_id") for opportunity in opportunities}
    if not required_ids.issubset(present):
        raise ValueError(f"{label}: missing required noise-reduction opportunities")
def _require_operator_denials(payload: dict[str, Any], label: str) -> None:
    """Check the operator contract spells out every required denial.

    Raises:
        ValueError: ``operator_contract.must_not_interpret_as`` lacks one of
            the required denial phrases, or ``alertmanager_route_policy`` does
            not contain both required marker phrases.
    """
    contract = payload.get("operator_contract") or {}

    required_denials = {
        "Prometheus alert rule 修改批准",
        "Alertmanager receiver / route 修改批准",
        "Alertmanager 指向 OpenClaw receiver 批准",
        "Silence 建立或維護窗口批准",
        "Grafana dashboard 寫入批准",
        "SigNoz / Sentry webhook 設定修改批准",
        "Secret 已讀取或可輸出",
        "Telegram 測試通知批准",
        "deploy / reload / workflow 觸發批准",
        "runtime execution 授權",
    }
    declared = set(contract.get("must_not_interpret_as") or [])
    if required_denials - declared:
        raise ValueError(f"{label}: operator_contract.must_not_interpret_as is missing required denials")

    # The route policy text must contain both marker phrases.
    route_policy = str(contract.get("alertmanager_route_policy") or "")
    markers = ("OpenClaw", "不接收 Alertmanager webhook")
    if not all(marker in route_policy for marker in markers):
        raise ValueError(f"{label}: operator_contract.alertmanager_route_policy must block OpenClaw receiver use")
def _require_no_plaintext_secret_payload_keys(value: Any, label: str, path: str = "$") -> None:
    """Walk nested dicts and lists and reject secret-looking dictionary keys.

    A key is rejected when its lower-cased text contains any forbidden
    fragment. Values other than dicts and lists are ignored.

    Args:
        value: The node to inspect.
        label: Prefix for the error message.
        path: JSONPath-like location of ``value``, used in the error message.

    Raises:
        ValueError: A forbidden key is found; the message names its path.
    """
    if isinstance(value, list):
        for position, element in enumerate(value):
            _require_no_plaintext_secret_payload_keys(element, label, f"{path}[{position}]")
        return
    if not isinstance(value, dict):
        return

    forbidden_key_fragments = (
        "secret_value",
        "token_value",
        "authorization_header",
        "private_key",
        "webhook_secret",
        "runner_token",
        "signoz_token",
        "sentry_dsn",
    )
    for key, nested in value.items():
        key_text = str(key).lower()
        for fragment in forbidden_key_fragments:
            if fragment in key_text:
                raise ValueError(f"{label}: forbidden secret payload key at {path}.{key}")
        # Descend only after the key itself has been cleared.
        _require_no_plaintext_secret_payload_keys(nested, label, f"{path}.{key}")
def _count_by(items: list[dict[str, Any]], key: str) -> dict[str, int]:
    """Tally how many items share each value of ``key``.

    Items without ``key`` are counted under ``None``. Result keys appear in
    first-seen order.
    """
    tally: dict[str, int] = {}
    for entry in items:
        bucket = entry.get(key)
        tally.setdefault(bucket, 0)
        tally[bucket] += 1
    return tally

View File

@@ -16,16 +16,16 @@ def test_ai_agent_automation_backlog_snapshot_endpoint_returns_committed_snapsho
assert response.status_code == 200
data = response.json()
assert data["schema_version"] == "ai_agent_automation_backlog_v1"
assert data["program_status"]["overall_completion_percent"] == 78
assert data["program_status"]["overall_completion_percent"] == 83
assert data["program_status"]["read_only_mode"] is True
assert data["program_status"]["current_task_id"] == "P1-002"
assert data["program_status"]["next_task_id"] == "P1-003"
assert data["program_status"]["current_task_id"] == "P1-003"
assert data["program_status"]["next_task_id"] == "P1-004"
assert data["rollups"]["total_items"] == len(data["backlog_items"]) == 23
assert data["rollups"]["by_priority"]["P1"] == 21
assert data["rollups"]["by_status"]["done"] == 18
assert data["rollups"]["by_status"]["done"] == 19
assert data["rollups"]["by_gate_status"]["read_only_allowed"] == 20
assert data["progress_summary"]["overall_percent"] == 78
assert data["progress_summary"]["done_items"] == 18
assert data["progress_summary"]["overall_percent"] == 83
assert data["progress_summary"]["done_items"] == 19
assert data["progress_summary"]["total_items"] == 23
assert data["item_approval_boundary_rollup"]["total_items"] == 23
assert data["item_approval_boundary_rollup"]["items_requiring_explicit_approval"] == [
@@ -51,6 +51,10 @@ def test_ai_agent_automation_backlog_snapshot_endpoint_returns_committed_snapsho
assert p1_002["status"] == "done"
assert p1_002["next_review"] == "P1-003"
assert "gitea_workflow_runner_health_2026-06-05.json" in p1_002["evidence_refs"][0]
p1_003 = next(item for item in data["backlog_items"] if item["item_id"] == "AUTO-P1-003")
assert p1_003["status"] == "done"
assert p1_003["next_review"] == "P1-004"
assert "observability_contract_matrix_2026-06-05.json" in p1_003["evidence_refs"][0]
p1_306 = next(item for item in data["backlog_items"] if item["item_id"] == "AUTO-P1-306")
assert p1_306["approval_boundary"]["mode"] == "read_only_allowed"
assert "runtime_execution" in p1_306["approval_boundary"]["blocked_actions"]

View File

@@ -18,10 +18,10 @@ def test_ai_agent_automation_inventory_snapshot_endpoint_returns_committed_snaps
assert data["schema_version"] == "ai_agent_automation_inventory_snapshot_v1"
assert data["program_status"]["overall_completion_percent"] == 100
assert data["program_status"]["read_only_mode"] is True
assert data["program_status"]["current_task_id"] == "P1-002"
assert data["program_status"]["next_task_id"] == "P1-003"
assert data["task_approval_boundary_rollup"]["total_tasks"] == len(data["tasks"]) == 28
assert data["task_approval_boundary_rollup"]["by_mode"]["read_only_allowed"] == 26
assert data["program_status"]["current_task_id"] == "P1-003"
assert data["program_status"]["next_task_id"] == "P1-004"
assert data["task_approval_boundary_rollup"]["total_tasks"] == len(data["tasks"]) == 29
assert data["task_approval_boundary_rollup"]["by_mode"]["read_only_allowed"] == 27
assert data["task_approval_boundary_rollup"]["tasks_requiring_explicit_approval"] == [
"P0-001",
"P0-004",
@@ -37,6 +37,10 @@ def test_ai_agent_automation_inventory_snapshot_endpoint_returns_committed_snaps
assert p1_002["status"] == "done"
assert p1_002["approval_boundary"]["mode"] == "read_only_allowed"
assert "gitea_workflow_runner_health_2026-06-05.json" in p1_002["output"]
p1_003 = next(task for task in data["tasks"] if task["task_id"] == "P1-003")
assert p1_003["status"] == "done"
assert p1_003["approval_boundary"]["mode"] == "read_only_allowed"
assert "observability_contract_matrix_2026-06-05.json" in p1_003["output"]
assert any(task["task_id"] == "P1-204" for task in data["tasks"])
assert any(task["task_id"] == "P1-205" for task in data["tasks"])
assert any(task["task_id"] == "P1-206" for task in data["tasks"])
@@ -67,3 +71,4 @@ def test_ai_agent_automation_inventory_snapshot_endpoint_returns_committed_snaps
assert any(evidence["evidence_id"] == "backlog_progress_summary_ui" for evidence in data["evidence"])
assert any(evidence["evidence_id"] == "runtime_surface_inventory_api" for evidence in data["evidence"])
assert any(evidence["evidence_id"] == "gitea_workflow_runner_health_api" for evidence in data["evidence"])
assert any(evidence["evidence_id"] == "observability_contract_matrix_api" for evidence in data["evidence"])

View File

@@ -0,0 +1,293 @@
from __future__ import annotations
import json
import pytest
from src.services.observability_contract_matrix import load_latest_observability_contract_matrix
def test_load_latest_observability_contract_matrix_reads_newest_file(tmp_path):
    """With two dated snapshots present, the later-dated file is returned."""
    fixtures = {
        "2026-06-04": _snapshot(generated_at="2026-06-04T00:00:00+08:00", completion=40),
        "2026-06-05": _snapshot(generated_at="2026-06-05T00:00:00+08:00", completion=100),
    }
    for day, body in fixtures.items():
        target = tmp_path / f"observability_contract_matrix_{day}.json"
        target.write_text(json.dumps(body), encoding="utf-8")

    loaded = load_latest_observability_contract_matrix(tmp_path)

    assert loaded["generated_at"] == "2026-06-05T00:00:00+08:00"
    assert loaded["program_status"]["overall_completion_percent"] == 100
    assert loaded["rollups"]["total_surfaces"] == 2
    assert loaded["operation_boundaries"]["alertmanager_to_openclaw_allowed"] is False
def test_observability_contract_matrix_requires_read_only_mode(tmp_path):
    """A snapshot with read_only_mode switched off is rejected."""
    broken = _snapshot()
    broken["program_status"]["read_only_mode"] = False
    snapshot_file = tmp_path / "observability_contract_matrix_2026-06-05.json"
    snapshot_file.write_text(json.dumps(broken), encoding="utf-8")

    with pytest.raises(ValueError, match="read_only_mode"):
        load_latest_observability_contract_matrix(tmp_path)
def test_observability_contract_matrix_blocks_route_and_rule_mutations(tmp_path):
    """Enabling rule-write or OpenClaw-routing flags makes the loader fail."""
    broken = _snapshot()
    broken["operation_boundaries"].update(
        {
            "prometheus_rule_write_allowed": True,
            "alertmanager_to_openclaw_allowed": True,
        }
    )
    snapshot_file = tmp_path / "observability_contract_matrix_2026-06-05.json"
    snapshot_file.write_text(json.dumps(broken), encoding="utf-8")

    with pytest.raises(ValueError, match="operation boundaries"):
        load_latest_observability_contract_matrix(tmp_path)
def test_observability_contract_matrix_requires_rollup_consistency(tmp_path):
    """An emptied action-required rollup no longer matches the surfaces."""
    broken = _snapshot()
    broken["rollups"]["surface_ids_requiring_action"] = []
    snapshot_file = tmp_path / "observability_contract_matrix_2026-06-05.json"
    snapshot_file.write_text(json.dumps(broken), encoding="utf-8")

    with pytest.raises(ValueError, match="surface_ids_requiring_action"):
        load_latest_observability_contract_matrix(tmp_path)
def test_observability_contract_matrix_requires_noise_candidates_to_be_proposal_only(tmp_path):
    """An opportunity whose proposal_only flag is off is rejected."""
    broken = _snapshot()
    first_opportunity = broken["noise_reduction_opportunities"][0]
    first_opportunity["proposal_only"] = False
    snapshot_file = tmp_path / "observability_contract_matrix_2026-06-05.json"
    snapshot_file.write_text(json.dumps(broken), encoding="utf-8")

    with pytest.raises(ValueError, match="proposal_only"):
        load_latest_observability_contract_matrix(tmp_path)
def test_observability_contract_matrix_requires_openclaw_receiver_denial(tmp_path):
    """Dropping the OpenClaw-receiver denial from the contract is rejected."""
    broken = _snapshot()
    denials = broken["operator_contract"]["must_not_interpret_as"]
    denials.remove("Alertmanager 指向 OpenClaw receiver 批准")
    snapshot_file = tmp_path / "observability_contract_matrix_2026-06-05.json"
    snapshot_file.write_text(json.dumps(broken), encoding="utf-8")

    with pytest.raises(ValueError, match="operator_contract"):
        load_latest_observability_contract_matrix(tmp_path)
def test_observability_contract_matrix_rejects_secret_payload_keys(tmp_path):
    """A secret-looking key is rejected even when its value is redacted."""
    broken = _snapshot()
    first_observation = broken["latest_observations"][0]
    first_observation["webhook_secret"] = "redacted"
    snapshot_file = tmp_path / "observability_contract_matrix_2026-06-05.json"
    snapshot_file.write_text(json.dumps(broken), encoding="utf-8")

    with pytest.raises(ValueError, match="forbidden secret payload key"):
        load_latest_observability_contract_matrix(tmp_path)
def test_observability_contract_matrix_fails_when_missing(tmp_path):
    """An empty evaluations directory yields FileNotFoundError."""
    with pytest.raises(FileNotFoundError):
        load_latest_observability_contract_matrix(evaluations_dir=tmp_path)
def _snapshot(
    *,
    generated_at: str = "2026-06-05T00:00:00+08:00",
    completion: int = 100,
) -> dict:
    """Build a minimal snapshot that passes every loader validation.

    Args:
        generated_at: Value stored under ``generated_at``.
        completion: Value stored under
            ``program_status.overall_completion_percent``.

    Returns:
        A fresh, independently mutable snapshot dict with two surfaces,
        three opportunities and one classification gap.
    """
    surfaces = [
        _surface(
            surface_id="prometheus_alert_rule_catalog",
            display_name="Prometheus 告警規則合約",
            kind="prometheus_rules",
            status="action_required",
            noise_policy_status="proposal_only",
        ),
        _surface(
            surface_id="alertmanager_awoooi_route",
            display_name="Alertmanager → AWOOOI API 路由",
            kind="alertmanager_route",
            status="verified",
            noise_policy_status="proposal_only",
        ),
    ]
    opportunities = [
        _opportunity(opportunity_id, status)
        for opportunity_id, status in (
            ("prometheus_noise_rule_tuning", "approval_required"),
            ("alertmanager_grouping_inhibit_tuning", "approval_required"),
            ("success_notification_quiet_policy", "preserved"),
        )
    ]
    gaps = [
        {
            "gap_id": "prometheus_alert_rule_catalog_seed",
            "display_name": "Alert rule catalog seed 未正式產品化",
            "status": "action_required",
            "severity": "high",
            "summary": "只讀矩陣已建立,尚未產生 catalog seed。",
            "evidence_refs": ["docs/adr/ADR-090-monitoring-blindspot-governance.md"],
            "next_action": "先產 proposal不改 rule。",
        }
    ]

    program_status = {
        "overall_completion_percent": completion,
        "current_priority": "P1",
        "current_task_id": "P1-003",
        "next_task_id": "P1-004",
        "read_only_mode": True,
    }
    rollups = {
        "total_surfaces": len(surfaces),
        "by_kind": _count_by(surfaces, "kind"),
        "by_status": _count_by(surfaces, "status"),
        "by_evidence_status": _count_by(surfaces, "evidence_status"),
        "by_noise_policy_status": _count_by(surfaces, "noise_policy_status"),
        "surface_ids_requiring_action": ["prometheus_alert_rule_catalog"],
        "surface_ids_with_proposal_only_noise_policy": [
            "alertmanager_awoooi_route",
            "prometheus_alert_rule_catalog",
        ],
        "noise_reduction_opportunities_total": len(opportunities),
        "approval_required_opportunity_ids": [
            "alertmanager_grouping_inhibit_tuning",
            "prometheus_noise_rule_tuning",
        ],
        "classification_gap_ids": ["prometheus_alert_rule_catalog_seed"],
        "read_only_denials_total": 12,
    }
    latest_observations = [
        {
            "observation_id": "alertmanager_receiver_guard",
            "status": "verified",
            "summary": "Alertmanager 不得指向 OpenClaw。",
            "evidence_refs": ["docs/HARD_RULES.md#alertmanager-routing"],
        }
    ]
    operator_contract = {
        "display_mode": "read_only_observability_contract_matrix",
        "must_not_interpret_as": [
            "Prometheus alert rule 修改批准",
            "Alertmanager receiver / route 修改批准",
            "Alertmanager 指向 OpenClaw receiver 批准",
            "Silence 建立或維護窗口批准",
            "Grafana dashboard 寫入批准",
            "SigNoz / Sentry webhook 設定修改批准",
            "Secret 已讀取或可輸出",
            "Telegram 測試通知批准",
            "deploy / reload / workflow 觸發批准",
            "runtime execution 授權",
        ],
        "secret_display_policy": "只顯示 redacted metadata。",
        "alertmanager_route_policy": "OpenClaw 不接收 Alertmanager webhookreceiver 維持 AWOOOI API。",
        "noise_reduction_policy": "只產生 proposal。",
        "notification_policy": "成功不洗版。",
    }

    # Every mutating operation is switched off; only the read-only API is on.
    blocked_operations = (
        "prometheus_rule_write_allowed",
        "prometheus_reload_allowed",
        "alertmanager_route_write_allowed",
        "alertmanager_receiver_change_allowed",
        "alertmanager_to_openclaw_allowed",
        "silence_create_allowed",
        "grafana_dashboard_write_allowed",
        "grafana_api_write_allowed",
        "signoz_query_mutation_allowed",
        "signoz_webhook_change_allowed",
        "sentry_webhook_change_allowed",
        "otel_collector_deploy_allowed",
        "event_exporter_restart_allowed",
        "secret_read_allowed",
        "secret_plaintext_allowed",
        "notification_send_allowed",
        "external_api_call_allowed",
        "live_prometheus_query_allowed",
        "workflow_trigger_allowed",
        "deploy_trigger_allowed",
        "reload_trigger_allowed",
        "runtime_execution_allowed",
    )
    operation_boundaries = {"read_only_api_allowed": True}
    operation_boundaries.update(dict.fromkeys(blocked_operations, False))

    unapproved_actions = (
        "prometheus_rule_change_authorized",
        "prometheus_reload_authorized",
        "alertmanager_route_change_authorized",
        "alertmanager_receiver_change_authorized",
        "alertmanager_to_openclaw_authorized",
        "silence_authorized",
        "grafana_write_authorized",
        "signoz_write_authorized",
        "sentry_write_authorized",
        "otel_deploy_authorized",
        "event_exporter_restart_authorized",
        "notification_send_authorized",
        "external_call_authorized",
        "secret_plaintext_allowed",
        "workflow_trigger_authorized",
        "deploy_reload_authorized",
        "runtime_execution_authorized",
    )
    approval_boundaries = dict.fromkeys(unapproved_actions, False)

    return {
        "schema_version": "observability_contract_matrix_v1",
        "generated_at": generated_at,
        "program_status": program_status,
        "source_refs": ["docs/schemas/observability_contract_matrix_v1.schema.json"],
        "rollups": rollups,
        "observability_surfaces": surfaces,
        "noise_reduction_opportunities": opportunities,
        "classification_gaps": gaps,
        "latest_observations": latest_observations,
        "operator_contract": operator_contract,
        "operation_boundaries": operation_boundaries,
        "approval_boundaries": approval_boundaries,
    }
def _surface(
    surface_id: str,
    display_name: str,
    kind: str,
    status: str,
    noise_policy_status: str,
) -> dict:
    """Build one observability surface row with fixed evidence fields."""
    return dict(
        surface_id=surface_id,
        display_name=display_name,
        kind=kind,
        status=status,
        risk_level="critical",
        evidence_status="committed_manifest",
        noise_policy_status=noise_policy_status,
        coverage_contract="只讀 committed evidence。",
        current_contract="不得改 live 設定。",
        evidence_refs=["docs/HARD_RULES.md"],
        next_action="只產 proposal。",
    )
def _opportunity(opportunity_id: str, status: str) -> dict:
    """Build one proposal-only noise-reduction opportunity row.

    The id doubles as the display name.
    """
    return dict(
        opportunity_id=opportunity_id,
        display_name=opportunity_id,
        status=status,
        proposal_only=True,
        impact="降噪提案。",
        evidence_refs=["docs/HARD_RULES.md"],
        next_action="人工批准前不執行。",
    )
def _count_by(items: list[dict], key: str) -> dict[str, int]:
    """Tally items by ``item[key]``; every item must contain ``key``."""
    tally: dict[str, int] = {}
    for entry in items:
        bucket = entry[key]
        tally.setdefault(bucket, 0)
        tally[bucket] += 1
    return tally

View File

@@ -0,0 +1,53 @@
from __future__ import annotations
from fastapi import FastAPI
from fastapi.testclient import TestClient
from src.api.v1.agents import router
def test_observability_contract_matrix_endpoint_returns_committed_snapshot():
    """The endpoint serves the committed snapshot with all boundaries closed."""
    app = FastAPI()
    app.include_router(router, prefix="/api/v1")
    response = TestClient(app).get("/api/v1/agents/observability-contract-matrix")
    assert response.status_code == 200

    data = response.json()
    assert data["schema_version"] == "observability_contract_matrix_v1"

    program_status = data["program_status"]
    assert program_status["overall_completion_percent"] == 100
    assert program_status["current_task_id"] == "P1-003"
    assert program_status["next_task_id"] == "P1-004"
    assert program_status["read_only_mode"] is True

    rollups = data["rollups"]
    assert rollups["total_surfaces"] == len(data["observability_surfaces"]) == 6
    assert rollups["noise_reduction_opportunities_total"] == 5
    assert rollups["surface_ids_requiring_action"] == [
        "grafana_dashboard_inventory",
        "prometheus_alert_rule_catalog",
    ]
    assert rollups["approval_required_opportunity_ids"] == [
        "alertmanager_grouping_inhibit_tuning",
        "prometheus_noise_rule_tuning",
    ]

    operations = data["operation_boundaries"]
    assert operations["read_only_api_allowed"] is True
    for flag in (
        "prometheus_rule_write_allowed",
        "alertmanager_route_write_allowed",
        "alertmanager_to_openclaw_allowed",
        "silence_create_allowed",
        "grafana_dashboard_write_allowed",
        "notification_send_allowed",
        "deploy_trigger_allowed",
    ):
        assert operations[flag] is False

    approvals = data["approval_boundaries"]
    for flag in (
        "prometheus_rule_change_authorized",
        "alertmanager_to_openclaw_authorized",
        "deploy_reload_authorized",
    ):
        assert approvals[flag] is False

    alertmanager = next(
        row for row in data["observability_surfaces"] if row["surface_id"] == "alertmanager_awoooi_route"
    )
    assert alertmanager["status"] == "verified"
    assert alertmanager["noise_policy_status"] == "proposal_only"
    assert "OpenClaw 只做 AI 分析" in alertmanager["coverage_contract"]

    operator_contract = data["operator_contract"]
    assert "Alertmanager 指向 OpenClaw receiver 批准" in operator_contract["must_not_interpret_as"]
    assert "不接收 Alertmanager webhook" in operator_contract["alertmanager_route_policy"]

    for opportunity in data["noise_reduction_opportunities"]:
        assert opportunity["proposal_only"] is True

View File

@@ -3259,6 +3259,54 @@
"not_applicable": "不適用",
"actionable_only_no_success_noise": "需處置才通知,成功不洗版"
}
},
"observability": {
"title": "監控合約與降噪機會",
"source": "{generated} · {current} → {next}",
"noiseTitle": "降噪 proposal",
"classificationTitle": "分類缺口",
"contractTitle": "不可誤讀合約",
"metrics": {
"surfaces": "監控面",
"actions": "需處置",
"proposals": "降噪提案",
"classificationGaps": "分類缺口",
"approvalRequired": "需批准"
},
"map": {
"coverage": "合約覆蓋",
"coverageDetail": "Prometheus / Alertmanager / Grafana / SigNoz / Sentry / taxonomy。",
"noise": "降噪路徑",
"noiseDetail": "只產生 proposal不改 receiver 或 silence。",
"classification": "批准邊界",
"classificationDetail": "降噪候選先進批准包不直接改規則、receiver 或分類器。",
"safeBoundary": "安全邊界",
"safeBoundaryDetail": "alert rule、silence、通知、dashboard、deploy 入口皆為 0。"
},
"labels": {
"evidence": "證據",
"noise": "降噪"
},
"values": {
"prometheus_rules": "Prometheus 規則",
"alertmanager_route": "Alertmanager 路由",
"grafana_dashboard": "Grafana Dashboard",
"signoz_clickhouse": "SigNoz / ClickHouse",
"sentry_source_link": "Sentry Source Link",
"otel_event_exporter": "OTEL / Event Exporter",
"verified": "已驗證",
"action_required": "需處置",
"blocked": "阻擋",
"committed_manifest": "已提交 manifest",
"production_readback_recorded": "正式讀回已記錄",
"proposal_only": "僅提案",
"preserved": "已保留",
"needs_proposal": "待提案",
"approval_required": "需批准",
"ready_for_proposal": "提案可審",
"deferred": "延後",
"proposal_required": "需提案"
}
}
}
},

View File

@@ -3259,6 +3259,54 @@
"not_applicable": "不適用",
"actionable_only_no_success_noise": "需處置才通知,成功不洗版"
}
},
"observability": {
"title": "監控合約與降噪機會",
"source": "{generated} · {current} → {next}",
"noiseTitle": "降噪 proposal",
"classificationTitle": "分類缺口",
"contractTitle": "不可誤讀合約",
"metrics": {
"surfaces": "監控面",
"actions": "需處置",
"proposals": "降噪提案",
"classificationGaps": "分類缺口",
"approvalRequired": "需批准"
},
"map": {
"coverage": "合約覆蓋",
"coverageDetail": "Prometheus / Alertmanager / Grafana / SigNoz / Sentry / taxonomy。",
"noise": "降噪路徑",
"noiseDetail": "只產生 proposal不改 receiver 或 silence。",
"classification": "批准邊界",
"classificationDetail": "降噪候選先進批准包不直接改規則、receiver 或分類器。",
"safeBoundary": "安全邊界",
"safeBoundaryDetail": "alert rule、silence、通知、dashboard、deploy 入口皆為 0。"
},
"labels": {
"evidence": "證據",
"noise": "降噪"
},
"values": {
"prometheus_rules": "Prometheus 規則",
"alertmanager_route": "Alertmanager 路由",
"grafana_dashboard": "Grafana Dashboard",
"signoz_clickhouse": "SigNoz / ClickHouse",
"sentry_source_link": "Sentry Source Link",
"otel_event_exporter": "OTEL / Event Exporter",
"verified": "已驗證",
"action_required": "需處置",
"blocked": "阻擋",
"committed_manifest": "已提交 manifest",
"production_readback_recorded": "正式讀回已記錄",
"proposal_only": "僅提案",
"preserved": "已保留",
"needs_proposal": "待提案",
"approval_required": "需批准",
"ready_for_proposal": "提案可審",
"deferred": "延後",
"proposal_required": "需提案"
}
}
}
},

View File

@@ -39,6 +39,7 @@ import {
type BackupDrTargetInventorySnapshot,
type BackupNotificationPolicySnapshot,
type GiteaWorkflowRunnerHealthSnapshot,
type ObservabilityContractMatrixSnapshot,
type OffsiteEscrowReadinessStatusSnapshot,
type RuntimeSurfaceInventorySnapshot,
} from '@/lib/api-client'
@@ -303,6 +304,7 @@ export function AutomationInventoryTab() {
const [offsiteEscrow, setOffsiteEscrow] = useState<OffsiteEscrowReadinessStatusSnapshot | null>(null)
const [runtimeSurface, setRuntimeSurface] = useState<RuntimeSurfaceInventorySnapshot | null>(null)
const [giteaHealth, setGiteaHealth] = useState<GiteaWorkflowRunnerHealthSnapshot | null>(null)
const [observabilityMatrix, setObservabilityMatrix] = useState<ObservabilityContractMatrixSnapshot | null>(null)
const [loading, setLoading] = useState(true)
const [error, setError] = useState(false)
@@ -317,6 +319,7 @@ export function AutomationInventoryTab() {
apiClient.getOffsiteEscrowReadinessStatus(),
apiClient.getRuntimeSurfaceInventory(),
apiClient.getGiteaWorkflowRunnerHealth(),
apiClient.getObservabilityContractMatrix(),
] as const
Promise.allSettled(requests)
@@ -330,6 +333,7 @@ export function AutomationInventoryTab() {
offsiteEscrowResult,
runtimeSurfaceResult,
giteaHealthResult,
observabilityMatrixResult,
] = results
setSnapshot(inventoryResult.status === 'fulfilled' ? inventoryResult.value : null)
@@ -340,6 +344,7 @@ export function AutomationInventoryTab() {
setOffsiteEscrow(offsiteEscrowResult.status === 'fulfilled' ? offsiteEscrowResult.value : null)
setRuntimeSurface(runtimeSurfaceResult.status === 'fulfilled' ? runtimeSurfaceResult.value : null)
setGiteaHealth(giteaHealthResult.status === 'fulfilled' ? giteaHealthResult.value : null)
setObservabilityMatrix(observabilityMatrixResult.status === 'fulfilled' ? observabilityMatrixResult.value : null)
setError([
inventoryResult,
backlogResult,
@@ -348,6 +353,7 @@ export function AutomationInventoryTab() {
policyResult,
offsiteEscrowResult,
giteaHealthResult,
observabilityMatrixResult,
].some(result => result.status === 'rejected'))
})
.catch(() => setError(true))
@@ -461,6 +467,28 @@ export function AutomationInventoryTab() {
})
}, [giteaHealth])
// Observability surfaces in display order: action_required, then blocked, then
// verified, with any other status last; ties are broken by surface_id.
const visibleObservabilitySurfaces = useMemo(() => {
  if (!observabilityMatrix) return []
  const statusRank = { action_required: 0, blocked: 1, verified: 2 } as Record<string, number>
  const rankOf = (status: string) => statusRank[status] ?? 3
  // Sort a copy so the snapshot held in state is never mutated.
  return observabilityMatrix.observability_surfaces.slice().sort((first, second) => {
    const firstRank = rankOf(first.status)
    const secondRank = rankOf(second.status)
    return firstRank === secondRank
      ? first.surface_id.localeCompare(second.surface_id)
      : firstRank - secondRank
  })
}, [observabilityMatrix])
// Noise-reduction opportunities in display order: approval_required, then
// ready_for_proposal, preserved, deferred; ties are broken by opportunity_id.
const visibleNoiseOpportunities = useMemo(() => {
  if (!observabilityMatrix) return []
  const priority = { approval_required: 0, ready_for_proposal: 1, preserved: 2, deferred: 3 } as Record<string, number>
  // `status` is an open string in the snapshot type, so a status missing from the
  // map must rank strictly after every known one. A fixed fallback of 3 would tie
  // with `deferred` and interleave unknown statuses with deferred items.
  const unknownRank = Object.keys(priority).length
  // Sort a copy so the snapshot held in state is never mutated.
  return [...observabilityMatrix.noise_reduction_opportunities].sort((a, b) => {
    const left = priority[a.status] ?? unknownRank
    const right = priority[b.status] ?? unknownRank
    if (left !== right) return left - right
    return a.opportunity_id.localeCompare(b.opportunity_id)
  })
}, [observabilityMatrix])
if (loading) {
return (
<div style={{ padding: 20, display: 'grid', gridTemplateColumns: 'repeat(4, minmax(0, 1fr))', gap: 12 }} className="automation-inventory-kpi-grid">
@@ -474,7 +502,7 @@ export function AutomationInventoryTab() {
)
}
if (error || !snapshot || !backlog || !backupTargets || !backupReadiness || !backupPolicy || !offsiteEscrow || !giteaHealth) {
if (error || !snapshot || !backlog || !backupTargets || !backupReadiness || !backupPolicy || !offsiteEscrow || !giteaHealth || !observabilityMatrix) {
return (
<div style={{ padding: 20 }}>
<GlassCard variant="subtle" padding="lg">
@@ -527,6 +555,10 @@ export function AutomationInventoryTab() {
const runtimeBoundComponents = runtimeSurface?.rollups.source_components_with_runtime_binding ?? 0
const giteaRunnerActions = giteaHealth.rollups.workflow_ids_requiring_runner_attestation.length
const giteaQuietPolicies = giteaHealth.rollups.notification_contracts_quiet_success_count
const observabilityActions = observabilityMatrix.rollups.surface_ids_requiring_action.length
const observabilityProposalCount = observabilityMatrix.rollups.noise_reduction_opportunities_total
const observabilityClassificationGaps = observabilityMatrix.rollups.classification_gap_ids.length
const observabilityApprovalRequired = observabilityMatrix.rollups.approval_required_opportunity_ids.length
const backlogProgressPercent = backlog.progress_summary.overall_percent
const explicitApprovalItemCount = backlog.item_approval_boundary_rollup.items_requiring_explicit_approval.length
const taskBoundaryCount = snapshot.task_approval_boundary_rollup.total_tasks
@@ -643,6 +675,14 @@ export function AutomationInventoryTab() {
}
}
// Resolve a raw snapshot value (status, kind, evidence status, ...) to its
// translated label under `observability.values.*`; if the translator throws,
// show the raw value instead of breaking the render.
const observabilityValueLabel = (value: string) => {
  const messageKey = `observability.values.${value}`
  try {
    return t(messageKey as never)
  } catch {
    return value
  }
}
return (
<div className="automation-inventory-tab-root" style={{ padding: 20, display: 'flex', flexDirection: 'column', gap: 16, minWidth: 0 }}>
<GlassCard variant="subtle" padding="md">
@@ -1450,6 +1490,169 @@ export function AutomationInventoryTab() {
</div>
</GlassCard>
<GlassCard variant="subtle" padding="md">
<div style={{ display: 'flex', flexDirection: 'column', gap: 13, minWidth: 0 }}>
<div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', gap: 12, flexWrap: 'wrap' }}>
<div style={{ display: 'flex', alignItems: 'center', gap: 7, minWidth: 0 }}>
<BellRing size={14} style={{ color: '#d97757' }} />
<span style={{ fontFamily: 'Syne, sans-serif', fontSize: 13, fontWeight: 700, color: '#141413' }}>
{t('observability.title')}
</span>
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f' }}>
{t('observability.source', {
generated: formatDateTime(observabilityMatrix.generated_at),
current: observabilityMatrix.program_status.current_task_id,
next: observabilityMatrix.program_status.next_task_id,
})}
</div>
</div>
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(5, minmax(0, 1fr))', gap: 12 }} className="automation-inventory-observability-kpi-grid">
<MetricCard label={t('observability.metrics.surfaces')} value={observabilityMatrix.rollups.total_surfaces} icon={<Database size={16} />} />
<MetricCard label={t('observability.metrics.actions')} value={observabilityActions} tone={observabilityActions > 0 ? 'warn' : 'ok'} icon={<AlertTriangle size={16} />} />
<MetricCard label={t('observability.metrics.proposals')} value={observabilityProposalCount} tone="warn" icon={<BellOff size={16} />} />
<MetricCard label={t('observability.metrics.classificationGaps')} value={observabilityClassificationGaps} tone="warn" icon={<Target size={16} />} />
<MetricCard label={t('observability.metrics.approvalRequired')} value={observabilityApprovalRequired} tone={observabilityApprovalRequired > 0 ? 'warn' : 'ok'} icon={<ShieldCheck size={16} />} />
</div>
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(4, minmax(0, 1fr))', gap: 10 }} className="automation-inventory-observability-map-grid">
<SummaryTile
label={t('observability.map.coverage')}
value={`${observabilityMatrix.rollups.total_surfaces}`}
detail={t('observability.map.coverageDetail')}
tone="ok"
icon={<Layers3 size={16} />}
/>
<SummaryTile
label={t('observability.map.noise')}
value={`${observabilityProposalCount}`}
detail={t('observability.map.noiseDetail')}
tone="warn"
icon={<BellOff size={16} />}
/>
<SummaryTile
label={t('observability.map.classification')}
value={`${observabilityApprovalRequired}`}
detail={t('observability.map.classificationDetail')}
tone="warn"
icon={<Target size={16} />}
/>
<SummaryTile
label={t('observability.map.safeBoundary')}
value="0"
detail={t('observability.map.safeBoundaryDetail')}
tone="ok"
icon={<ShieldCheck size={16} />}
/>
</div>
<div style={{ display: 'grid', gridTemplateColumns: 'minmax(0, 1.35fr) minmax(0, 0.65fr)', gap: 12 }} className="automation-inventory-observability-grid">
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(2, minmax(0, 1fr))', gap: 10 }} className="automation-inventory-observability-surface-grid">
{visibleObservabilitySurfaces.map(surface => (
<div key={surface.surface_id} style={{ padding: 11, border: '0.5px solid #e0ddd4', borderRadius: 7, background: '#fff', minWidth: 0 }}>
<div style={{ display: 'flex', flexDirection: 'column', gap: 8, minWidth: 0 }}>
<div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', gap: 8, minWidth: 0 }}>
<span style={{
fontFamily: 'Syne, sans-serif',
fontSize: 12,
fontWeight: 700,
color: '#141413',
overflow: 'hidden',
textOverflow: 'ellipsis',
whiteSpace: 'nowrap',
}}>
{surface.display_name}
</span>
<Chip value={observabilityValueLabel(surface.status)} muted={surface.status === 'verified'} />
</div>
<div style={{ display: 'flex', flexWrap: 'wrap', gap: 6, minWidth: 0 }}>
<Chip value={observabilityValueLabel(surface.kind)} />
<Chip value={`${t('observability.labels.evidence')}: ${observabilityValueLabel(surface.evidence_status)}`} muted={surface.evidence_status === 'committed_manifest' || surface.evidence_status === 'production_readback_recorded'} />
<Chip value={`${t('observability.labels.noise')}: ${observabilityValueLabel(surface.noise_policy_status)}`} muted />
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45, overflowWrap: 'anywhere' }}>
{surface.coverage_contract}
</div>
{surface.current_contract ? (
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45, overflowWrap: 'anywhere' }}>
{surface.current_contract}
</div>
) : null}
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#141413', lineHeight: 1.45, overflowWrap: 'anywhere' }}>
{surface.next_action}
</div>
<div style={{ display: 'flex', flexWrap: 'wrap', gap: 6, minWidth: 0 }}>
<Chip value={surface.evidence_refs[0] ?? t('backupEvidence.noEvidence')} muted />
</div>
</div>
</div>
))}
</div>
<div style={{ display: 'flex', flexDirection: 'column', gap: 12, minWidth: 0 }}>
<div style={{ padding: 12, border: '0.5px solid #e0ddd4', borderRadius: 7, background: '#fff', display: 'flex', flexDirection: 'column', gap: 10, minWidth: 0 }}>
<span style={{ fontFamily: 'Syne, sans-serif', fontSize: 13, fontWeight: 700, color: '#141413' }}>
{t('observability.noiseTitle')}
</span>
{visibleNoiseOpportunities.slice(0, 5).map(opportunity => (
<div key={opportunity.opportunity_id} style={{ display: 'flex', flexDirection: 'column', gap: 6, minWidth: 0 }}>
<div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', gap: 8 }}>
<span style={{ fontFamily: "'DM Mono', monospace", fontSize: 11, fontWeight: 700, color: '#141413', overflow: 'hidden', textOverflow: 'ellipsis', whiteSpace: 'nowrap' }}>
{opportunity.display_name}
</span>
<Chip value={observabilityValueLabel(opportunity.status)} muted />
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45, overflowWrap: 'anywhere' }}>
{opportunity.impact}
</div>
</div>
))}
</div>
<div style={{ padding: 12, border: '0.5px solid #e0ddd4', borderRadius: 7, background: '#fff', display: 'flex', flexDirection: 'column', gap: 10, minWidth: 0 }}>
<span style={{ fontFamily: 'Syne, sans-serif', fontSize: 13, fontWeight: 700, color: '#141413' }}>
{t('observability.classificationTitle')}
</span>
{observabilityMatrix.classification_gaps.map(gap => (
<div key={gap.gap_id} style={{ display: 'flex', flexDirection: 'column', gap: 6, minWidth: 0 }}>
<div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', gap: 8 }}>
<span style={{ fontFamily: "'DM Mono', monospace", fontSize: 11, fontWeight: 700, color: '#141413', overflow: 'hidden', textOverflow: 'ellipsis', whiteSpace: 'nowrap' }}>
{gap.display_name}
</span>
<Chip value={observabilityValueLabel(gap.status)} />
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45, overflowWrap: 'anywhere' }}>
{gap.summary}
</div>
</div>
))}
</div>
<div style={{ padding: 12, border: '0.5px solid #e0ddd4', borderRadius: 7, background: '#fff', display: 'flex', flexDirection: 'column', gap: 10, minWidth: 0 }}>
<span style={{ fontFamily: 'Syne, sans-serif', fontSize: 13, fontWeight: 700, color: '#141413' }}>
{t('observability.contractTitle')}
</span>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.5, overflowWrap: 'anywhere' }}>
{observabilityMatrix.operator_contract.alertmanager_route_policy}
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.5, overflowWrap: 'anywhere' }}>
{observabilityMatrix.operator_contract.noise_reduction_policy}
</div>
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.5, overflowWrap: 'anywhere' }}>
{observabilityMatrix.operator_contract.notification_policy}
</div>
<div style={{ display: 'flex', flexWrap: 'wrap', gap: 6, minWidth: 0 }}>
{observabilityMatrix.operator_contract.must_not_interpret_as.slice(0, 6).map(item => (
<Chip key={item} value={item} />
))}
</div>
</div>
</div>
</div>
</div>
</GlassCard>
<div style={{ display: 'grid', gridTemplateColumns: 'minmax(0, 1.2fr) minmax(0, 0.8fr)', gap: 12 }} className="automation-inventory-bottom-grid">
<GlassCard variant="subtle" padding="md">
<div style={{ display: 'flex', flexDirection: 'column', gap: 13, minWidth: 0 }}>
@@ -1564,6 +1767,10 @@ export function AutomationInventoryTab() {
.automation-inventory-gitea-map-grid,
.automation-inventory-gitea-grid,
.automation-inventory-gitea-workflow-grid,
.automation-inventory-observability-kpi-grid,
.automation-inventory-observability-map-grid,
.automation-inventory-observability-grid,
.automation-inventory-observability-surface-grid,
.automation-inventory-bottom-grid,
.automation-inventory-task-grid {
grid-template-columns: 1fr !important;

View File

@@ -272,6 +272,11 @@ export const apiClient = {
return handleResponse<GiteaWorkflowRunnerHealthSnapshot>(res)
},
/** GET the observability contract matrix snapshot from `/agents/observability-contract-matrix`. */
async getObservabilityContractMatrix() {
  const response = await fetch(`${API_BASE_URL}/agents/observability-contract-matrix`)
  return handleResponse<ObservabilityContractMatrixSnapshot>(response)
},
async getBackupDrTargetInventory() {
const res = await fetch(`${API_BASE_URL}/agents/backup-dr-target-inventory`)
return handleResponse<BackupDrTargetInventorySnapshot>(res)
@@ -946,6 +951,80 @@ export interface GiteaWorkflowRunnerHealthSnapshot {
approval_boundaries: Record<string, false>
}
/**
 * Response shape of `apiClient.getObservabilityContractMatrix()`: the
 * observability contract / noise-reduction matrix snapshot rendered by the
 * governance automation-inventory tab.
 */
export interface ObservabilityContractMatrixSnapshot {
// Literal schema tag; this client type only matches the v1 snapshot.
schema_version: 'observability_contract_matrix_v1'
generated_at: string
program_status: {
overall_completion_percent: number
current_priority: 'P0' | 'P1' | 'P2' | 'P3'
current_task_id: string
next_task_id: string
// Typed as the literal `true`: a snapshot with read_only_mode=false does not fit this type.
read_only_mode: true
}
source_refs: string[]
// Aggregate counters and id lists; the tab reads its KPI numbers from here.
rollups: {
total_surfaces: number
by_kind: Record<string, number>
by_status: Record<string, number>
by_evidence_status: Record<string, number>
by_noise_policy_status: Record<string, number>
surface_ids_requiring_action: string[]
surface_ids_with_proposal_only_noise_policy: string[]
noise_reduction_opportunities_total: number
approval_required_opportunity_ids: string[]
classification_gap_ids: string[]
read_only_denials_total: number
}
observability_surfaces: Array<{
surface_id: string
display_name: string
kind: string
// Closed set; the tab sorts surfaces action_required -> blocked -> verified.
status: 'verified' | 'action_required' | 'blocked'
risk_level: 'low' | 'medium' | 'high' | 'critical'
evidence_status: string
noise_policy_status: string
coverage_contract: string
// Optional: the tab renders this row only when it is present.
current_contract?: string
evidence_refs: string[]
next_action: string
}>
noise_reduction_opportunities: Array<{
opportunity_id: string
display_name: string
// Open string (not a closed union), so consumers must tolerate unknown values.
status: string
// Typed as the literal `true`: every opportunity is a proposal only.
proposal_only: true
impact: string
target_surface_ids?: string[]
evidence_refs: string[]
next_action: string
}>
classification_gaps: Array<{
gap_id: string
display_name: string
status: string
severity: 'low' | 'medium' | 'high' | 'critical'
summary: string
evidence_refs: string[]
next_action: string
}>
latest_observations: Array<{
observation_id: string
status: string
summary: string
evidence_refs: string[]
}>
// Human-readable policy statements shown verbatim in the "contract" card.
operator_contract: {
display_mode: 'read_only_observability_contract_matrix'
must_not_interpret_as: string[]
secret_display_policy: string
alertmanager_route_policy: string
noise_reduction_policy: string
notification_policy: string
}
operation_boundaries: Record<string, boolean>
// Every value is typed as the literal `false`.
approval_boundaries: Record<string, false>
}
export interface BackupDrTargetInventorySnapshot {
schema_version: 'backup_dr_target_inventory_v1'
generated_at: string

View File

@@ -1,3 +1,38 @@
## 2026-06-05｜P1-003 監控合約與降噪矩陣本地完成
**背景**:接續 P1-002 Gitea workflow / runner health contract 與決策摘要正式驗證,依工作清單推進 `P1-003`。本段只建立 Prometheus / Alertmanager / SigNoz / Grafana / Sentry / OTEL 的 committed observability matrix、只讀 API 與治理頁顯示,不修改 alert rules、不 reload Prometheus、不改 Alertmanager receiver / route、不建立 silence、不寫 Grafana、不改 SigNoz / Sentry webhook、不發 Telegram 測試通知。
**本輪完成**
- 新增 `observability_contract_matrix_v1` schema 與 `docs/evaluations/observability_contract_matrix_2026-06-05.json`。
- 新增 `GET /api/v1/agents/observability-contract-matrix` 與 service guard，強制驗證 read-only mode、operation / approval boundaries、rollup consistency、降噪候選只能 proposal、Alertmanager 不得指向 OpenClaw、不得出現 secret payload key。
- 治理頁 `/zh-TW/governance?tab=automation-inventory` 新增「監控合約與降噪機會」區塊,顯示監控面、需處置、降噪候選、需批准候選、分類缺口與不可誤讀合約。
- 同步 automation backlog / inventory snapshot：current `P1-003`、next `P1-004`、backlog overall `83%`、P1 `90%`、done `19/23`、inventory tasks `29`。
**目前數字**
- Observability surfaces：`6`。
- 需處置 surfaces：`2`（`grafana_dashboard_inventory`、`prometheus_alert_rule_catalog`）。
- 降噪候選：`5`。
- 需人工批准的降噪候選：`2`（Prometheus rule tuning、Alertmanager grouping / inhibit tuning）。
- Classification gaps：`3`。
- Read-only denials：`12`。
**本地驗證**
- JSON parse 通過：`observability_contract_matrix_2026-06-05.json`、`observability_contract_matrix_v1.schema.json`、automation backlog / inventory snapshots。
- 目標測試通過：observability contract matrix service / API、automation inventory / backlog snapshot API、Gitea workflow runner health service / API 共 `19 passed`。
- Python py_compile 通過：`apps/api/src/services/observability_contract_matrix.py`、`apps/api/src/api/v1/agents.py`。
- zh-TW / en i18n key 差異 `0`；web typecheck 通過；Next production build 通過。
- source-control-owner-response guard、security-mirror-progress guard、`git diff --check` 通過。
- 本地 API readback 回 `observability_contract_matrix_v1`、current `P1-003`、next `P1-004`、surfaces `6`、noise opportunities `5`、approval-required opportunities `2`；backlog 回 overall `83%`、done `19/23`。
**邊界**
- Prometheus alert rule 修改、Prometheus reload、Alertmanager route / receiver 修改、Alertmanager 指向 OpenClaw、silence 建立、Grafana 寫入、SigNoz / Sentry webhook 修改、OTEL / Event Exporter deploy 或 restart、Secret payload read、Telegram 測試通知、external API / live query、workflow / deploy / reload / runtime execution 全部仍未批准。
- 成功 smoke 不即時通知洗版；失敗、action-required 或人工作業才可進通知批准流程。
**下一步**
1. Commit 並推 `gitea main`
2. 等 deploy marker 後執行 production API / Browser smoke。
3. P1-004：盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。
## 2026-06-05｜AI Agent 自動化盤點決策摘要正式上線
**背景**:接續 P1-002 Gitea workflow / runner health contract 與文字換行正式驗證,治理頁 `/zh-TW/governance?tab=automation-inventory` 已能呈現完整資料,但首屏資訊密度偏高,使用者難以快速判讀「目前狀態、拖累因素、下一步」。本段只優化既有資訊呈現,不移除原明細、不新增 API、不改 workflow / runner / secret / runtime 行為。

View File

@@ -10,10 +10,10 @@
|---|---:|---|---|
| Agent 市場治理 | 72% | 進行中 | `agent_market_governance_snapshot_v1`、API、UI 分頁、每週觀察流程 |
| Nemotron 實際整合應用 | 30% | 完整回放前仍被關卡擋下 | `blocked_needs_evidence`,下一關是 `refresh_source_evidence_then_5_record_smoke_only` |
| 工具 / 服務 / 套件 AI 自動化 | 78% | P0 已完成P1 套件 / 供應鏈主線已完成;備份 / DR 主線已完成到異地 / escrow 準備度顯示任務批准邊界、進度彙總、P1-001 執行面只讀矩陣P1-002 Gitea 工作流程 / runner 健康合約已完成,下一主線是 P1-003 監控合約與降噪機會 | 狀態分類、盤點 schema、權限矩陣、靜態盤點種子、只讀 API、UI 骨架、驗證、自動化待辦 schema / 快照 / API / 分組 UI、Backup / DR 目標盤點、準備度矩陣、備份通知政策、Backup / DR 證據 UI、復原演練批准包模板、異地 / escrow 準備度狀態、任務批准邊界、確定性進度彙總、Python 套件 / 供應鏈只讀基線、JS pnpm/npm 只讀基線、Docker build surface 只讀基線、CVE / license / drift 嚴重度政策、定期依賴漂移與外部資料來源檢查設計、依賴升級批准包模板、runtime_surface_inventory_v1 schema / snapshot / API / UI、gitea_workflow_runner_health_v1 schema / snapshot / API / UI 已完成 |
| 工具 / 服務 / 套件 AI 自動化 | 83% | P0 已完成P1 套件 / 供應鏈主線已完成;備份 / DR 主線已完成到異地 / escrow 準備度顯示任務批准邊界、進度彙總、P1-001 執行面只讀矩陣P1-002 Gitea 工作流程 / runner 健康合約與 P1-003 監控合約 / 降噪矩陣已完成,下一主線是 P1-004 AI Router / provider route 盤點 | 狀態分類、盤點 schema、權限矩陣、靜態盤點種子、只讀 API、UI 骨架、驗證、自動化待辦 schema / 快照 / API / 分組 UI、Backup / DR 目標盤點、準備度矩陣、備份通知政策、Backup / DR 證據 UI、復原演練批准包模板、異地 / escrow 準備度狀態、任務批准邊界、確定性進度彙總、Python 套件 / 供應鏈只讀基線、JS pnpm/npm 只讀基線、Docker build surface 只讀基線、CVE / license / drift 嚴重度政策、定期依賴漂移與外部資料來源檢查設計、依賴升級批准包模板、runtime_surface_inventory_v1 schema / snapshot / API / UI、gitea_workflow_runner_health_v1 schema / snapshot / API / UI、observability_contract_matrix_v1 schema / snapshot / API / UI 已完成 |
| 本工作清單與分析報告 | 100% | 已完成 | 本 MD 文件 |
AI Agent 自動化工作包目前完成度:**78%**。本工作清單文件本身完成度:**100%**。
AI Agent 自動化工作包目前完成度:**83%**。本工作清單文件本身完成度:**100%**。
完成度計算模型:
@@ -868,7 +868,7 @@ UI
|---|---|---:|---|---|---|---|
| P1-001 | 完成 | 100 | OpenClaw | 盤點 API / Web / Worker / K8s runtime surface | `runtime_surface_inventory_v1` / `GET /api/v1/agents/runtime-surface-inventory` / 執行面只讀矩陣 | 只讀;不得查 Secret payload、不得 rollout / restart / scale / delete |
| P1-002 | 完成 | 100 | Hermes | 盤點 Gitea 工作流程與 runner 健康合約 | `gitea_workflow_runner_health_v1` / `GET /api/v1/agents/gitea-workflow-runner-health` / Gitea 健康合約 UI | 只讀;不修改 workflow、不重啟 runner、不停止 container、不讀 Secret、不發通知 |
| P1-003 | 待辦 | 0 | Hermes | 盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約 | 可觀測性矩陣 | 只讀 |
| P1-003 | 完成 | 100 | Hermes | 盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約 | `observability_contract_matrix_v1` / `GET /api/v1/agents/observability-contract-matrix` / 監控合約與降噪 UI | 只讀;不修改 alert rules、不改 receiver/route、不建立 silence、不寫 Grafana、不發通知 |
| P1-004 | 待辦 | 0 | OpenClaw | 盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑 | 推理路由矩陣 | 不切 provider |
| P1-005 | 待辦 | 0 | OpenClaw | 偵測服務健康缺口與過期端點 | 需處置清單 | 不重啟 |
| P1-006 | 待辦 | 0 | Hermes | 在 UI 顯示 service health 證據卡 | 狀態卡 | 瀏覽器驗證 |
@@ -1084,10 +1084,25 @@ UI
下一步P1-003 盤點監控合約與降噪機會。
```
本次同步:
```text
進度：83%。
目前優先級：P1。
目前任務：P1-003 盤點監控合約與降噪機會。
狀態變更：待辦 -> 完成。
證據：observability_contract_matrix_v1 schema / snapshot；GET /api/v1/agents/observability-contract-matrix；治理頁監控合約與降噪機會區塊；automation backlog 83%；inventory tasks 29。
目前數字：observability surfaces 6；需處置 2；降噪候選 5；需人工批准的降噪候選 2；classification gaps 3；backlog done 19/23；overall 83%；P1 90%；WS3 監控自動化 75%。
驗證：JSON parse 通過；observability contract matrix service / API、automation inventory / backlog snapshot API、Gitea workflow runner health service / API 目標測試 `19 passed`；Python py_compile 通過；zh-TW / en i18n key 差異 `0`；web typecheck 通過；Next production build 通過；source-control-owner-response guard、security-mirror-progress guard、git diff --check 通過；本地 API readback 回 `observability_contract_matrix_v1`、current `P1-003`、next `P1-004`、surfaces `6`、noise opportunities `5`、approval-required opportunities `2`。
正式驗證:尚未推版;待本地驗證完成後推 `gitea main` 並補 production API / browser smoke。
阻擋Prometheus alert rule 修改、Prometheus reload、Alertmanager route / receiver 修改、Alertmanager 指向 OpenClaw、silence 建立、Grafana 寫入、SigNoz / Sentry webhook 修改、OTEL/Event Exporter deploy 或 restart、Secret payload read、Telegram 測試通知、external API/live query、workflow/deploy/reload/runtime execution 仍全部禁止。
下一步P1-004 盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。
```
## 13. 立即執行順序
1. P1-003:盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約與降噪機會
2. P1-004盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑
1. P1-004:盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑
2. P1-005偵測服務健康缺口與過期端點
3. P2 / P3 必須等 P1 服務、監控與 provider runtime surface 可見且關卡穩定後再做。
## 14. 目前風險

View File

@@ -1,12 +1,12 @@
{
"schema_version": "ai_agent_automation_backlog_v1",
"generated_at": "2026-06-05T10:56:16+08:00",
"generated_at": "2026-06-05T12:34:00+08:00",
"source_inventory_snapshot_ref": "docs/evaluations/ai_agent_automation_inventory_snapshot_2026-06-04_static_seed.json",
"program_status": {
"overall_completion_percent": 78,
"overall_completion_percent": 83,
"current_priority": "P1",
"current_task_id": "P1-002",
"next_task_id": "P1-003",
"current_task_id": "P1-003",
"next_task_id": "P1-004",
"read_only_mode": true
},
"rollups": {
@@ -17,8 +17,8 @@
"P3": 1
},
"by_status": {
"done": 18,
"planned": 5
"done": 19,
"planned": 4
},
"by_gate_status": {
"read_only_allowed": 20,
@@ -318,26 +318,31 @@
{
"item_id": "AUTO-P1-003",
"priority": "P1",
"status": "planned",
"status": "done",
"workstream_id": "WS3",
"source_asset_id": "prometheus_alertmanager",
"source_signal_kind": "health_gap",
"title": "盤點監控合約與降噪機會",
"owner_agent": "hermes",
"recommended_action": "建立 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse 的只讀 observability matrix。",
"recommended_action": "建立 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry 的只讀 observability matrix;降噪與分類缺口只產生 proposal不修改 alert rules。",
"action_class": "observe",
"gate_status": "read_only_allowed",
"risk_level": "high",
"evidence_refs": [
"k8s/monitoring/prometheus.yml",
"ops/monitoring/"
"docs/evaluations/observability_contract_matrix_2026-06-05.json",
"GET /api/v1/agents/observability-contract-matrix",
"k8s/monitoring/",
"ops/alertmanager/alertmanager.yml",
"ops/monitoring/",
"apps/api/src/constants/alert_types.py"
],
"acceptance_criteria": [
"不修改 alert rules",
"降噪只產生 proposal",
"標出 stale、缺 evidence、過度通知與 classification gap"
"不修改 alert rules、不呼叫 silence API、不送測試通知",
"列出 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry 的只讀合約",
"降噪只產生 proposal標出 stale、缺 evidence、過度通知與 classification gap",
"API / UI 僅顯示 committed snapshot 與不可誤讀合約"
],
"next_review": "P1-003",
"next_review": "P1-004",
"approval_boundary": {
"mode": "read_only_allowed",
"display_summary": "只允許只讀盤點、顯示與批准包準備;不得直接執行寫入、部署、通知或外部呼叫。",
@@ -1170,16 +1175,16 @@
]
},
"progress_summary": {
"overall_percent": 78,
"done_items": 18,
"planned_items": 5,
"overall_percent": 83,
"done_items": 19,
"planned_items": 4,
"total_items": 23,
"formula": "round(done_items / total_items * 100),只有 status=done 計入完成planned/in_progress/blocked/deferred/rejected 不計入。",
"by_priority": [
{
"priority": "P1",
"completion_percent": 86,
"done_items": 18,
"completion_percent": 90,
"done_items": 19,
"total_items": 21
},
{
@@ -1207,10 +1212,10 @@
{
"workstream_id": "WS3",
"display_name": "監控自動化",
"completion_percent": 50,
"done_items": 2,
"completion_percent": 75,
"done_items": 3,
"total_items": 4,
"next_task_id": "P1-003"
"next_task_id": "P1-004"
},
{
"workstream_id": "WS4",
@@ -1250,7 +1255,7 @@
"completion_percent": 100,
"done_items": 2,
"total_items": 2,
"next_task_id": "P1-003"
"next_task_id": "P1-004"
}
]
}

View File

@@ -1,11 +1,11 @@
{
"schema_version": "ai_agent_automation_inventory_snapshot_v1",
"generated_at": "2026-06-05T10:56:16+08:00",
"generated_at": "2026-06-05T12:34:00+08:00",
"program_status": {
"overall_completion_percent": 100,
"current_priority": "P1",
"current_task_id": "P1-002",
"next_task_id": "P1-003",
"current_task_id": "P1-003",
"next_task_id": "P1-004",
"read_only_mode": true
},
"status_taxonomy": {
@@ -287,44 +287,50 @@
"domain_id": "observability",
"display_name": "Prometheus / Alertmanager",
"asset_type": "observability_tool",
"status": "planned",
"status": "done",
"gate_status": "read_only_allowed",
"owner_agent": "hermes",
"risk_level": "high",
"evidence_refs": [
"k8s/monitoring/prometheus.yml",
"docs/evaluations/observability_contract_matrix_2026-06-05.json",
"GET /api/v1/agents/observability-contract-matrix",
"k8s/monitoring/",
"ops/alertmanager/alertmanager.yml",
"ops/monitoring/"
],
"next_action": "P1-003 盤點告警合約與降噪機會。"
"next_action": "P1-003 已完成只讀監控合約與降噪機會矩陣P1-004 盤點 AI Router / provider route。"
},
{
"asset_id": "signoz_clickhouse",
"domain_id": "observability",
"display_name": "SigNoz / ClickHouse",
"asset_type": "observability_tool",
"status": "planned",
"status": "done",
"gate_status": "read_only_allowed",
"owner_agent": "hermes",
"risk_level": "medium",
"evidence_refs": [
"docs/LOGBOOK.md"
"docs/evaluations/observability_contract_matrix_2026-06-05.json",
"apps/api/src/services/signoz_client.py",
"ops/signoz"
],
"next_action": "P1-003 補 trace / metrics / log 可見性盤點。"
"next_action": "P1-003 補 trace / metrics / log 可見性只讀合約live readback 仍需後續人工批准範圍。"
},
{
"asset_id": "sentry",
"domain_id": "tools",
"display_name": "Sentry",
"asset_type": "external_service",
"status": "planned",
"status": "done",
"gate_status": "read_only_allowed",
"owner_agent": "hermes",
"risk_level": "medium",
"evidence_refs": [
"scripts/backup/backup-sentry.sh",
"apps/web/src/instrumentation.ts"
"docs/evaluations/observability_contract_matrix_2026-06-05.json",
"apps/web/src/instrumentation.ts",
"scripts/backup/backup-sentry.sh"
],
"next_action": "P1-003 盤點錯誤監控與備份狀態。"
"next_action": "P1-003 已補 Sentry error monitoring 合約;不讀 DSN secret、不送事件。"
},
{
"asset_id": "telegram_chain",
@@ -472,9 +478,9 @@
{
"workstream_id": "WS3",
"display_name": "監控自動化",
"completion_percent": 50,
"completion_percent": 75,
"status": "in_progress",
"next_task_id": "P1-003"
"next_task_id": "P1-004"
},
{
"workstream_id": "WS4",
@@ -509,7 +515,7 @@
"display_name": "產品 UI",
"completion_percent": 94,
"status": "in_progress",
"next_task_id": "P1-003"
"next_task_id": "P1-004"
}
],
"tasks": [
@@ -834,6 +840,38 @@
]
}
},
{
"task_id": "P1-003",
"priority": "P1",
"status": "done",
"completion_percent": 100,
"owner_agent": "hermes",
"title": "盤點監控合約與降噪機會",
"output": "docs/evaluations/observability_contract_matrix_2026-06-05.json + GET /api/v1/agents/observability-contract-matrix",
"gate_status": "read_only_allowed",
"next_action": "完成 committed observability contract matrix下一步 P1-004 盤點 AI Router / provider route。",
"approval_boundary": {
"mode": "read_only_allowed",
"display_summary": "只允許只讀盤點、顯示與批准包準備;不得直接執行寫入、部署、通知或外部呼叫。",
"allowed_actions": [
"讀取 committed snapshot",
"整理只讀證據",
"顯示治理 UI"
],
"blocked_actions": [
"production_write",
"runtime_execution",
"destructive_operation",
"secret_plaintext_collection",
"unapproved_deploy",
"unapproved_external_call"
],
"requires_operator_approval_for": [
"任何非只讀操作",
"任何部署、排程、通知或外部呼叫變更"
]
}
},
{
"task_id": "P1-301",
"priority": "P1",
@@ -1729,6 +1767,13 @@
"kind": "api",
"ref": "GET /api/v1/agents/gitea-workflow-runner-health",
"result": "只讀 API 回傳 gitea_workflow_runner_health_v1不修改 workflow、不重啟 runner、不停止 container、不讀 Secret、不送通知。"
},
{
"evidence_id": "observability_contract_matrix_api",
"title": "監控合約與降噪機會只讀 API",
"source_ref": "GET /api/v1/agents/observability-contract-matrix",
"status": "done",
"summary": "只讀呈現 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry 合約、降噪 proposal 與分類缺口;不修改 alert rules、不送通知、不讀 Secret。"
}
],
"approval_boundaries": {
@@ -1739,10 +1784,10 @@
"destructive_operation_allowed": false
},
"task_approval_boundary_rollup": {
"total_tasks": 28,
"total_tasks": 29,
"by_mode": {
"ready_for_operator_review": 1,
"read_only_allowed": 26,
"read_only_allowed": 27,
"approval_required": 1
},
"tasks_requiring_explicit_approval": [
@@ -1760,6 +1805,13 @@
"P0-008",
"P1-001",
"P1-002",
"P1-003",
"P1-301",
"P1-302",
"P1-303",
"P1-304",
"P1-305",
"P1-306",
"P1-101",
"P1-102",
"P1-103",
@@ -1771,13 +1823,7 @@
"P1-203",
"P1-204",
"P1-205",
"P1-206",
"P1-301",
"P1-302",
"P1-303",
"P1-304",
"P1-305",
"P1-306"
"P1-206"
]
}
}

View File

@@ -0,0 +1,391 @@
{
"schema_version": "observability_contract_matrix_v1",
"generated_at": "2026-06-05T12:24:00+08:00",
"program_status": {
"overall_completion_percent": 100,
"current_priority": "P1",
"current_task_id": "P1-003",
"next_task_id": "P1-004",
"read_only_mode": true
},
"source_refs": [
"docs/schemas/observability_contract_matrix_v1.schema.json",
"docs/HARD_RULES.md#alertmanager-routing",
"ops/alertmanager/alertmanager.yml",
"ops/monitoring/alerts.yml",
"ops/monitoring/alerts-unified.yml",
"k8s/monitoring/prometheus.yml",
"k8s/monitoring/alert-chain-monitor.yaml",
"ops/grafana/dashboards/ai-monitoring.json",
"ops/grafana/dashboards/infra-monitoring.json",
"ops/signoz/alerting/rules.yaml",
"ops/signoz/alerting/log-rules.md",
"ops/signoz/otel-collector-config-phase-o.yaml",
"k8s/observability/otel-collector-daemonset.yaml",
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md",
"docs/adr/ADR-053-observability-signoz-unified-architecture.md",
"docs/adr/ADR-090-monitoring-blindspot-governance.md",
"docs/LOGBOOK.md"
],
"rollups": {
"total_surfaces": 6,
"by_kind": {
"prometheus_rules": 1,
"alertmanager_route": 1,
"signoz_clickhouse": 1,
"grafana_dashboard": 1,
"sentry_source_link": 1,
"otel_event_exporter": 1
},
"by_status": {
"action_required": 2,
"verified": 4
},
"by_evidence_status": {
"committed_manifest": 4,
"production_readback_recorded": 2
},
"by_noise_policy_status": {
"proposal_only": 2,
"preserved": 3,
"needs_proposal": 1
},
"surface_ids_requiring_action": [
"grafana_dashboard_inventory",
"prometheus_alert_rule_catalog"
],
"surface_ids_with_proposal_only_noise_policy": [
"alertmanager_awoooi_route",
"prometheus_alert_rule_catalog"
],
"noise_reduction_opportunities_total": 5,
"approval_required_opportunity_ids": [
"alertmanager_grouping_inhibit_tuning",
"prometheus_noise_rule_tuning"
],
"classification_gap_ids": [
"grafana_dashboard_owner_status",
"prometheus_alert_rule_catalog_seed",
"signoz_provider_native_real_alert_gap"
],
"read_only_denials_total": 12,
"surfaces_requiring_action": [
"grafana_dashboard_inventory",
"prometheus_alert_rule_catalog"
],
"proposal_only_count": 5
},
"observability_surfaces": [
{
"surface_id": "prometheus_alert_rule_catalog",
"display_name": "Prometheus 告警規則合約",
"kind": "prometheus_rules",
"status": "action_required",
"risk_level": "critical",
"evidence_status": "committed_manifest",
"noise_policy_status": "proposal_only",
"coverage_contract": "已提交 ops/monitoring/alerts-unified.yml 與 k8s/monitoring/* 規則；本快照只盤點規則、label、runbook 與分類缺口,不 reload Prometheus、不修改 alert rules。",
"current_contract": "committed ops/monitoring/alerts-unified.yml 目前含 118 條 alert；LOGBOOK 曾記錄 production Prometheus rule count 142；需以正式 smoke 讀回確認。",
"evidence_refs": [
"ops/monitoring/alerts-unified.yml",
"ops/monitoring/alerts.yml",
"k8s/monitoring/alert-chain-monitor.yaml",
"docs/LOGBOOK.md"
],
"next_action": "建立 alert_rule_catalog seed 與噪音率觀察 proposal；任何 rule 調整放到 P2-003 人工批准。"
},
{
"surface_id": "alertmanager_awoooi_route",
"display_name": "Alertmanager → AWOOOI API 路由",
"kind": "alertmanager_route",
"status": "verified",
"risk_level": "critical",
"evidence_status": "committed_manifest",
"noise_policy_status": "proposal_only",
"coverage_contract": "Alertmanager receiver 必須指向 AWOOOI API；OpenClaw 只做 AI 分析,不得成為 Alertmanager receiver。",
"current_contract": "ops/alertmanager/alertmanager.yml 以 awoooi-webhook 為主路徑；telegram-direct 僅限 alert-chain/API health 緊急旁路；group_by/team/alertname/severity 已存在。",
"evidence_refs": [
"docs/HARD_RULES.md#alertmanager-routing",
"ops/alertmanager/alertmanager.yml"
],
"next_action": "只提出 group_by、inhibit、repeat interval 降噪 proposal；不得直接改 receiver、route 或 silence。"
},
{
"surface_id": "signoz_clickhouse_ingestion",
"display_name": "SigNoz / ClickHouse / Provider Webhook",
"kind": "signoz_clickhouse",
"status": "verified",
"risk_level": "high",
"evidence_status": "production_readback_recorded",
"noise_policy_status": "preserved",
"coverage_contract": "SigNoz webhook、ClickHouse TTL、OTEL prometheus receiver 與 source provider heartbeat 需分開標示；heartbeat 不是 provider-native 真實告警。",
"current_contract": "ops/signoz/alerting/rules.yaml、log-rules.md 與 RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK 已描述 webhook / rules；LOGBOOK 記錄 SigNoz webhook 與 source provider heartbeat 多次通過。",
"evidence_refs": [
"ops/signoz/alerting/rules.yaml",
"ops/signoz/alerting/log-rules.md",
"ops/signoz/otel-collector-config-phase-o.yaml",
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md",
"docs/adr/ADR-053-observability-signoz-unified-architecture.md",
"docs/LOGBOOK.md"
],
"next_action": "保留 provider heartbeat / upstream canary 低噪音;補 provider-native 真實告警與 incident correlation gap 的只讀看板。"
},
{
"surface_id": "grafana_dashboard_inventory",
"display_name": "Grafana Dashboard / Alert Chain 視覺化",
"kind": "grafana_dashboard",
"status": "action_required",
"risk_level": "medium",
"evidence_status": "committed_manifest",
"noise_policy_status": "needs_proposal",
"coverage_contract": "目前只確認 committed dashboard JSON；本快照不呼叫 Grafana API、不匯入 dashboard、不改 datasource。",
"current_contract": "ai-monitoring dashboard 包含 Alert Chain 健康與最後成功時間；infra-monitoring dashboard 包含 Prometheus target up/down 與 API request rate。",
"evidence_refs": [
"ops/grafana/dashboards/ai-monitoring.json",
"ops/grafana/dashboards/infra-monitoring.json"
],
"next_action": "補 dashboard owner、datasource parity、正式站可讀性與 alert-chain panel fresh readback；寫入或 import 需另案批准。"
},
{
"surface_id": "sentry_source_link_canary",
"display_name": "Sentry Webhook / Source Link Canary",
"kind": "sentry_source_link",
"status": "verified",
"risk_level": "high",
"evidence_status": "production_readback_recorded",
"noise_policy_status": "preserved",
"coverage_contract": "Sentry webhook 與 source-link canary 用來驗證來源鏈路,不能被誤讀成真實 provider alert 全部已關聯。",
"current_contract": "LOGBOOK 記錄 Alertmanager / SigNoz / Sentry webhook 與 Source Link Canary 通過,且 source provider freshness / incident matching 必須分開判斷。",
"evidence_refs": [
"docs/adr/ADR-022-sentry-integration-architecture.md",
"docs/LOGBOOK.md"
],
"next_action": "持續把 heartbeat、upstream canary、direct/candidate/applied source link 分開呈現;不修改 Sentry project webhook。"
},
{
"surface_id": "otel_event_exporter_bridge",
"display_name": "OTEL Collector / Event Exporter",
"kind": "otel_event_exporter",
"status": "verified",
"risk_level": "medium",
"evidence_status": "committed_manifest",
"noise_policy_status": "preserved",
"coverage_contract": "OTEL Collector DaemonSet 與 SigNoz prometheus receiver 只作為可觀測來源;本快照不部署 collector、不重啟 exporter。",
"current_contract": "k8s/observability/otel-collector-daemonset.yaml 與 ops/signoz/otel-collector-config-phase-o.yaml 描述 log/metric/trace pipeline；LOGBOOK 記錄 OTEL Collector / Event Exporter post-deploy smoke 通過。",
"evidence_refs": [
"k8s/observability/otel-collector-daemonset.yaml",
"ops/signoz/otel-collector-config-phase-o.yaml",
"docs/LOGBOOK.md"
],
"next_action": "把 collector/exporter health 放入 observability readiness；任何 deploy / restart 仍需獨立批准。"
}
],
"noise_reduction_opportunities": [
{
"opportunity_id": "prometheus_noise_rule_tuning",
"display_name": "Prometheus 告警噪音調整提案",
"status": "approval_required",
"proposal_only": true,
"impact": "降低 stale provider、低樣本 SLO、重複 resource alert 對 operator 的干擾;不得直接修改 alert rules。",
"target_surface_ids": [
"prometheus_alert_rule_catalog"
],
"evidence_refs": [
"ops/monitoring/alerts-unified.yml",
"docs/adr/ADR-090-monitoring-blindspot-governance.md"
],
"next_action": "進 P2-003 建立人工批准包,先收集 24h alert frequency / fingerprint evidence。"
},
{
"opportunity_id": "alertmanager_grouping_inhibit_tuning",
"display_name": "Alertmanager grouping / inhibit 降噪提案",
"status": "approval_required",
"proposal_only": true,
"impact": "針對同 team / alertname / severity 的爆量與 Host/K8s 重複告警做提案,不變更 receiver。",
"target_surface_ids": [
"alertmanager_awoooi_route"
],
"evidence_refs": [
"ops/alertmanager/alertmanager.yml",
"docs/HARD_RULES.md#alertmanager-routing"
],
"next_action": "產生 diff proposal 與 rollback plan；未批准前不得 reload Alertmanager。"
},
{
"opportunity_id": "provider_heartbeat_real_alert_split",
"display_name": "Provider heartbeat 與真實告警分流",
"status": "ready_for_proposal",
"proposal_only": true,
"impact": "避免把 Sentry / SigNoz heartbeat 誤當真實 provider alert，降低假綠與錯誤升級。",
"target_surface_ids": [
"signoz_clickhouse_ingestion",
"sentry_source_link_canary"
],
"evidence_refs": [
"docs/LOGBOOK.md",
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md"
],
"next_action": "在 UI / API 上維持 heartbeat、upstream canary、direct source link、candidate source link 四種標籤。"
},
{
"opportunity_id": "grafana_dashboard_owner_freshness",
"display_name": "Grafana dashboard owner / freshness 標籤",
"status": "ready_for_proposal",
"proposal_only": true,
"impact": "讓 dashboard 缺 datasource、缺 owner 或 stale panel 不被誤讀成監控缺失已修復。",
"target_surface_ids": [
"grafana_dashboard_inventory"
],
"evidence_refs": [
"ops/grafana/dashboards/ai-monitoring.json",
"ops/grafana/dashboards/infra-monitoring.json"
],
"next_action": "只讀補 owner/freshness matrix；不寫 Grafana。"
},
{
"opportunity_id": "success_notification_quiet_policy",
"display_name": "成功不洗版 / 失敗才升級",
"status": "preserved",
"proposal_only": true,
"impact": "沿用備份與 Gitea 的 quiet-success 原則,讓 observability smoke 成功證據走 API/LOGBOOK；失敗才通知。",
"target_surface_ids": [
"otel_event_exporter_bridge",
"signoz_clickhouse_ingestion"
],
"evidence_refs": [
"docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md",
"docs/LOGBOOK.md"
],
"next_action": "P1-003 僅記錄;未批准前不送 Telegram 測試通知。"
}
],
"classification_gaps": [
{
"gap_id": "prometheus_alert_rule_catalog_seed",
"display_name": "Alert rule catalog seed 未正式產品化",
"status": "action_required",
"severity": "high",
"summary": "ADR-090 要求 alert_rule_catalog 能追蹤規則資產、noise_rate 與 superseded_by_rule_id；目前 P1-003 只完成只讀矩陣。",
"evidence_refs": [
"docs/adr/ADR-090-monitoring-blindspot-governance.md",
"ops/monitoring/alerts-unified.yml"
],
"next_action": "P2-003 前先產生 seed proposal 與 migration/rollback 分離批准包。"
},
{
"gap_id": "signoz_provider_native_real_alert_gap",
"display_name": "SigNoz provider-native 真實告警證據缺口",
"status": "action_required",
"severity": "medium",
"summary": "Heartbeat / upstream canary 能證明管道新鮮,但不等於每種 provider-native alert 都已接到 incident correlation。",
"evidence_refs": [
"docs/LOGBOOK.md",
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md"
],
"next_action": "只讀列出 provider-native alert coverage；需要 side effect 的 signed canary 另案批准。"
},
{
"gap_id": "grafana_dashboard_owner_status",
"display_name": "Grafana dashboard owner / datasource 狀態未連到治理頁",
"status": "action_required",
"severity": "medium",
"summary": "Committed dashboard JSON 存在,但尚未顯示 datasource freshness、owner、last import 或 panel stale 狀態。",
"evidence_refs": [
"ops/grafana/dashboards/ai-monitoring.json",
"ops/grafana/dashboards/infra-monitoring.json"
],
"next_action": "下一輪只讀補 dashboard readiness；不呼叫 Grafana write API。"
}
],
"latest_observations": [
{
"observation_id": "alertmanager_receiver_guard",
"status": "verified",
"summary": "HARD_RULES 與 ops/alertmanager/alertmanager.yml 都保留 Alertmanager 指向 AWOOOI API 的邊界；OpenClaw 不得成為 receiver。",
"evidence_refs": [
"docs/HARD_RULES.md#alertmanager-routing",
"ops/alertmanager/alertmanager.yml"
]
},
{
"observation_id": "prometheus_rule_source_split",
"status": "action_required",
"summary": "committed Prometheus 規則分散於 ops/monitoring 與 k8s/monitoring；P1-003 建立 matrix，尚未調整規則或 reload。",
"evidence_refs": [
"ops/monitoring/alerts-unified.yml",
"k8s/monitoring/alert-chain-monitor.yaml"
]
},
{
"observation_id": "post_deploy_observability_smoke_history",
"status": "verified",
"summary": "LOGBOOK 已多次記錄 Alertmanager / SigNoz / Sentry webhook、SigNoz、OTEL Collector、Event Exporter post-deploy smoke 通過。",
"evidence_refs": [
"docs/LOGBOOK.md"
]
}
],
"operator_contract": {
"display_mode": "read_only_observability_contract_matrix",
"must_not_interpret_as": [
"Prometheus alert rule 修改批准",
"Alertmanager receiver / route 修改批准",
"Alertmanager 指向 OpenClaw receiver 批准",
"Silence 建立或維護窗口批准",
"Grafana dashboard 寫入批准",
"SigNoz / Sentry webhook 設定修改批准",
"Secret 已讀取或可輸出",
"Telegram 測試通知批准",
"deploy / reload / workflow 觸發批准",
"runtime execution 授權"
],
"secret_display_policy": "只允許顯示 committed file refs、endpoint role 與 redacted metadata；不得顯示 token、webhook secret 或 authorization header。",
"alertmanager_route_policy": "Alertmanager webhook 必須指向 AWOOOI API；OpenClaw 不接收 Alertmanager webhook，只能在 API 持久化與分類後參與只讀分析。",
"noise_reduction_policy": "P1-003 僅產生 proposal；P2-003 或任何 route/rule/silence 變更需人工批准。",
"notification_policy": "成功 smoke 不即時通知洗版；失敗、action-required 或人工作業才可進通知批准流程。"
},
"operation_boundaries": {
"read_only_api_allowed": true,
"prometheus_rule_write_allowed": false,
"prometheus_reload_allowed": false,
"alertmanager_route_write_allowed": false,
"alertmanager_receiver_change_allowed": false,
"alertmanager_to_openclaw_allowed": false,
"silence_create_allowed": false,
"grafana_dashboard_write_allowed": false,
"grafana_api_write_allowed": false,
"signoz_query_mutation_allowed": false,
"signoz_webhook_change_allowed": false,
"sentry_webhook_change_allowed": false,
"otel_collector_deploy_allowed": false,
"event_exporter_restart_allowed": false,
"secret_read_allowed": false,
"secret_plaintext_allowed": false,
"notification_send_allowed": false,
"external_api_call_allowed": false,
"live_prometheus_query_allowed": false,
"workflow_trigger_allowed": false,
"deploy_trigger_allowed": false,
"reload_trigger_allowed": false,
"runtime_execution_allowed": false
},
"approval_boundaries": {
"prometheus_rule_change_authorized": false,
"prometheus_reload_authorized": false,
"alertmanager_route_change_authorized": false,
"alertmanager_receiver_change_authorized": false,
"alertmanager_to_openclaw_authorized": false,
"silence_authorized": false,
"grafana_write_authorized": false,
"signoz_write_authorized": false,
"sentry_write_authorized": false,
"otel_deploy_authorized": false,
"event_exporter_restart_authorized": false,
"notification_send_authorized": false,
"external_call_authorized": false,
"secret_plaintext_allowed": false,
"workflow_trigger_authorized": false,
"deploy_reload_authorized": false,
"runtime_execution_authorized": false
}
}

View File

@@ -0,0 +1,159 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "urn:awoooi:observability-contract-matrix-v1",
"title": "AWOOOI 監控合約與降噪機會矩陣 v1",
"description": "以 repo 內 committed Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry evidence 建立只讀 observability matrix；不修改 alert rules、不發通知、不打 silence API、不部署 exporter、不觸發 workflow。",
"type": "object",
"required": [
"schema_version",
"generated_at",
"program_status",
"source_refs",
"rollups",
"observability_surfaces",
"noise_reduction_opportunities",
"classification_gaps",
"latest_observations",
"operator_contract",
"operation_boundaries",
"approval_boundaries"
],
"properties": {
"schema_version": {
"type": "string",
"const": "observability_contract_matrix_v1"
},
"generated_at": {
"type": "string",
"minLength": 1
},
"program_status": {
"type": "object",
"required": [
"overall_completion_percent",
"current_priority",
"current_task_id",
"next_task_id",
"read_only_mode"
],
"properties": {
"overall_completion_percent": {
"type": "integer",
"minimum": 0,
"maximum": 100
},
"current_priority": {
"type": "string",
"enum": [
"P0",
"P1",
"P2",
"P3"
]
},
"current_task_id": {
"type": "string",
"minLength": 1
},
"next_task_id": {
"type": "string",
"minLength": 1
},
"read_only_mode": {
"type": "boolean",
"const": true
}
},
"additionalProperties": false
},
"source_refs": {
"type": "array",
"minItems": 1,
"items": {
"type": "string",
"minLength": 1
}
},
"rollups": {
"type": "object",
"additionalProperties": true
},
"observability_surfaces": {
"type": "array",
"minItems": 1,
"items": {
"type": "object",
"required": [
"surface_id",
"display_name",
"kind",
"status",
"risk_level",
"evidence_status",
"noise_policy_status",
"coverage_contract",
"evidence_refs",
"next_action"
],
"additionalProperties": true
}
},
"noise_reduction_opportunities": {
"type": "array",
"items": {
"type": "object",
"required": [
"opportunity_id",
"display_name",
"status",
"proposal_only",
"impact",
"evidence_refs",
"next_action"
],
"additionalProperties": true
}
},
"classification_gaps": {
"type": "array",
"items": {
"type": "object",
"required": [
"gap_id",
"display_name",
"status",
"severity",
"summary",
"evidence_refs",
"next_action"
],
"additionalProperties": true
}
},
"latest_observations": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": true
}
},
"operator_contract": {
"type": "object",
"additionalProperties": true
},
"operation_boundaries": {
"type": "object",
"additionalProperties": {
"type": "boolean"
}
},
"approval_boundaries": {
"type": "object",
"additionalProperties": {
"type": "boolean",
"const": false
}
}
},
"additionalProperties": false
}

View File

@@ -3516,3 +3516,21 @@ Phase 6 完成後
1. P1-003：盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約與降噪機會。
**裁決:** P1-002 只完成 read-only committed workflow / runner health contract。不得把 `ubuntu-latest` owner attestation 缺口、runner watchdog 草案、stale-job dry-run guard 或 notification contract 解讀成 workflow 修改、runner restart / stop、container stop、runner label change、runner registration、Secret payload collection、Telegram 測試通知、schedule enable、Gitea write、deploy / migration trigger 或任何 runtime execution 授權。
### 2026-06-05 下午 (台北) — P1-003 監控合約與降噪矩陣本地完成
**觸發**:統帥批准繼續,要求依 `docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md` 的優先順序推進,並同步完成度、工作狀態與正式環境推版。
**已推進:**
- P1-003：新增 `observability_contract_matrix_v1` schema 與 `docs/evaluations/observability_contract_matrix_2026-06-05.json`,以 committed Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry / OTEL evidence 建立只讀監控合約矩陣。
- P1-003：新增 `GET /api/v1/agents/observability-contract-matrix` 只讀 API 與 service guard；強制拒絕把 snapshot 誤讀成 alert rule 修改、Prometheus reload、Alertmanager receiver / route 修改、Alertmanager 指向 OpenClaw、silence 建立、Grafana 寫入、SigNoz / Sentry webhook 修改、Secret payload、Telegram 測試通知、deploy / reload / workflow 觸發或 runtime execution 授權。
- P1-003：治理頁 `/zh-TW/governance?tab=automation-inventory` 新增監控合約與降噪機會區塊;顯示監控面、需處置、降噪候選、需批准候選、分類缺口與不可誤讀合約,不新增任何執行按鈕。
- 目前數字：observability surfaces `6`;需處置 surfaces `2`;降噪候選 `5`;需人工批准的降噪候選 `2`；classification gaps `3`；read-only denials `12`；automation backlog done `19/23`、overall `83%`、P1 `90%`、WS3 `75%`；inventory tasks `29`
- 本地驗證：JSON parse 通過；observability contract matrix service / API、automation inventory / backlog snapshot API、Gitea workflow runner health service / API 目標測試 `19 passed`；Python py_compile 通過；zh-TW / en i18n key 差異 `0`；web typecheck 通過；Next production build 通過；source-control-owner-response guard、security-mirror-progress guard、`git diff --check` 通過;本地 API readback 回 `observability_contract_matrix_v1`、current `P1-003`、next `P1-004`、surfaces `6`、noise opportunities `5`、approval-required opportunities `2`
**下一步:**
1. Commit 並推 `gitea main`
2. 等 deploy marker 後補 production API / Browser smoke。
3. P1-004盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。
**裁決:** P1-003 只完成 read-only observability contract matrix 與降噪候選顯示。不得把 Prometheus rule count、Alertmanager grouping、SigNoz / Sentry heartbeat、Grafana dashboard JSON、OTEL/Event Exporter evidence 或 classification gap 解讀成 alert rule 變更、receiver/route 變更、OpenClaw receiver 授權、silence、dashboard import、webhook 修改、secret 讀取、通知發送、deploy/reload/workflow 或 runtime execution 批准。成功 smoke 不即時通知洗版;失敗與需處置才進批准流程。