feat(governance): 新增監控合約降噪矩陣
This commit is contained in:
@@ -65,6 +65,9 @@ from src.services.runtime_surface_inventory import (
|
||||
from src.services.gitea_workflow_runner_health import (
|
||||
load_latest_gitea_workflow_runner_health,
|
||||
)
|
||||
from src.services.observability_contract_matrix import (
|
||||
load_latest_observability_contract_matrix,
|
||||
)
|
||||
from src.services.package_supply_chain_inventory import (
|
||||
load_latest_package_supply_chain_inventory,
|
||||
)
|
||||
@@ -536,6 +539,34 @@ async def get_gitea_workflow_runner_health() -> dict[str, Any]:
|
||||
) from exc
|
||||
|
||||
|
||||
@router.get(
    "/observability-contract-matrix",
    response_model=dict[str, Any],
    summary="取得監控合約與降噪機會矩陣",
    description=(
        "讀取最新已提交的 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry "
        "只讀 observability matrix;此端點不修改 alert rules、不呼叫 silence API、"
        "不建立 Grafana dashboard、不改 SigNoz / Sentry 設定、不讀 Secret payload、"
        "不送 Telegram 測試通知、不觸發 monitoring deploy。"
    ),
)
async def get_observability_contract_matrix() -> dict[str, Any]:
    """Return the latest read-only observability contract matrix.

    The snapshot loader is executed through ``asyncio.to_thread`` and its
    result is returned unchanged.

    Raises:
        HTTPException: 404 when the loader raises ``FileNotFoundError``
            (the exception text becomes the response detail); 500 when the
            loader raises ``json.JSONDecodeError`` or ``ValueError``.
    """
    try:
        return await asyncio.to_thread(load_latest_observability_contract_matrix)
    except FileNotFoundError as exc:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(exc),
        ) from exc
    # json.JSONDecodeError is a ValueError subclass; both are named explicitly.
    except (json.JSONDecodeError, ValueError) as exc:
        # The exception text is logged; the client only receives a fixed message.
        logger.error("observability_contract_matrix_invalid", error=str(exc))
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="監控合約與降噪機會矩陣快照無效",
        ) from exc
|
||||
|
||||
|
||||
@router.get(
|
||||
"/backup-dr-target-inventory",
|
||||
response_model=dict[str, Any],
|
||||
|
||||
232
apps/api/src/services/observability_contract_matrix.py
Normal file
232
apps/api/src/services/observability_contract_matrix.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Observability contract and noise-reduction matrix snapshot.
|
||||
|
||||
Loads the latest committed, read-only Prometheus / Alertmanager / SigNoz /
|
||||
Grafana observability contract matrix. This module never mutates alert rules,
|
||||
routes, receivers, silences, dashboards, webhooks, collectors, secrets,
|
||||
notifications, workflows, or runtime state.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from src.services.snapshot_paths import default_evaluations_dir
|
||||
|
||||
_DEFAULT_EVALUATIONS_DIR = default_evaluations_dir(Path(__file__))
|
||||
_SNAPSHOT_PATTERN = "observability_contract_matrix_*.json"
|
||||
_SCHEMA_VERSION = "observability_contract_matrix_v1"
|
||||
|
||||
|
||||
def load_latest_observability_contract_matrix(
    evaluations_dir: Path | None = None,
) -> dict[str, Any]:
    """Load and validate the newest observability contract matrix snapshot.

    Files matching the snapshot pattern are compared as paths and the greatest
    one is treated as the newest. The parsed payload is passed through every
    validator before it is returned unchanged.

    Args:
        evaluations_dir: Directory to search; the module default is used when
            omitted.

    Returns:
        The parsed snapshot as a dictionary.

    Raises:
        FileNotFoundError: No snapshot file matches the pattern.
        ValueError: The JSON is not an object or a validator rejects it
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    search_root = evaluations_dir or _DEFAULT_EVALUATIONS_DIR
    newest = max(search_root.glob(_SNAPSHOT_PATTERN), default=None)
    if newest is None:
        raise FileNotFoundError(f"no observability contract matrix snapshots found in {search_root}")

    snapshot = json.loads(newest.read_text(encoding="utf-8"))
    if not isinstance(snapshot, dict):
        raise ValueError(f"{newest}: expected JSON object")

    source = str(newest)
    _require_schema(snapshot, _SCHEMA_VERSION, source)
    # Validators run in a fixed order; the first failure aborts the load.
    for validate in (
        _require_read_only_boundaries,
        _require_operation_boundaries,
        _require_rollup_consistency,
        _require_surface_evidence,
        _require_noise_opportunities,
        _require_operator_denials,
        _require_no_plaintext_secret_payload_keys,
    ):
        validate(snapshot, source)
    return snapshot
|
||||
|
||||
|
||||
def _require_schema(payload: dict[str, Any], expected: str, label: str) -> None:
    """Reject a payload whose ``schema_version`` differs from ``expected``.

    Args:
        payload: Parsed snapshot.
        expected: Required ``schema_version`` value.
        label: Prefix used in the error message.

    Raises:
        ValueError: The version is missing or different.
    """
    found = payload.get("schema_version")
    if found == expected:
        return
    raise ValueError(f"{label}: expected schema_version={expected}, got {found!r}")
|
||||
|
||||
|
||||
def _require_read_only_boundaries(payload: dict[str, Any], label: str) -> None:
    """Ensure the snapshot declares read-only mode and grants no approvals.

    Args:
        payload: Parsed snapshot.
        label: Prefix used in error messages.

    Raises:
        ValueError: ``program_status.read_only_mode`` is not exactly ``True``,
            any ``approval_boundaries`` value is not exactly ``False``, or
            either section is present but is not a JSON object.
    """
    program_status = payload.get("program_status") or {}
    # A truthy non-dict (list, string, number) has no .get(); report it as a
    # validation failure instead of letting AttributeError escape.
    if not isinstance(program_status, dict):
        raise ValueError(f"{label}: program_status must be an object")
    if program_status.get("read_only_mode") is not True:
        raise ValueError(f"{label}: program_status.read_only_mode must be true")

    approval_boundaries = payload.get("approval_boundaries") or {}
    if not isinstance(approval_boundaries, dict):
        raise ValueError(f"{label}: approval_boundaries must be an object")
    # Identity check: only a literal False counts as "not approved".
    allowed = sorted(key for key, value in approval_boundaries.items() if value is not False)
    if allowed:
        raise ValueError(f"{label}: approval boundaries must remain false: {allowed}")
|
||||
|
||||
|
||||
def _require_operation_boundaries(payload: dict[str, Any], label: str) -> None:
    """Ensure only the read-only API is allowed and every mutating flag is off.

    Args:
        payload: Parsed snapshot.
        label: Prefix used in error messages.

    Raises:
        ValueError: ``operation_boundaries`` is present but not a JSON object,
            ``read_only_api_allowed`` is not exactly ``True``, or any blocked
            flag is missing or not exactly ``False``.
    """
    boundaries = payload.get("operation_boundaries") or {}
    # A truthy non-dict has no .get(); report it as a validation failure
    # instead of letting AttributeError escape.
    if not isinstance(boundaries, dict):
        raise ValueError(f"{label}: operation_boundaries must be an object")
    if boundaries.get("read_only_api_allowed") is not True:
        raise ValueError(f"{label}: read_only_api_allowed must be true")

    blocked_flags = (
        "prometheus_rule_write_allowed",
        "prometheus_reload_allowed",
        "alertmanager_route_write_allowed",
        "alertmanager_receiver_change_allowed",
        "alertmanager_to_openclaw_allowed",
        "silence_create_allowed",
        "grafana_dashboard_write_allowed",
        "grafana_api_write_allowed",
        "signoz_query_mutation_allowed",
        "signoz_webhook_change_allowed",
        "sentry_webhook_change_allowed",
        "otel_collector_deploy_allowed",
        "event_exporter_restart_allowed",
        "secret_read_allowed",
        "secret_plaintext_allowed",
        "notification_send_allowed",
        "external_api_call_allowed",
        "live_prometheus_query_allowed",
        "workflow_trigger_allowed",
        "deploy_trigger_allowed",
        "reload_trigger_allowed",
        "runtime_execution_allowed",
    )
    # A missing flag yields None, which is also "not False": flags must be
    # present and explicitly false.
    allowed = sorted(flag for flag in blocked_flags if boundaries.get(flag) is not False)
    if allowed:
        raise ValueError(f"{label}: operation boundaries must remain false: {allowed}")
|
||||
|
||||
|
||||
def _require_rollup_consistency(payload: dict[str, Any], label: str) -> None:
    """Verify that ``rollups`` agrees with the detail lists in the snapshot.

    Totals, per-field counts and id lists in ``rollups`` are recomputed from
    ``observability_surfaces``, ``noise_reduction_opportunities`` and
    ``classification_gaps`` and compared with the declared values. Id lists
    are compared order-insensitively.

    Args:
        payload: Parsed snapshot.
        label: Prefix used in error messages.

    Raises:
        ValueError: Any declared rollup differs from the recomputed value.
    """
    surfaces = payload.get("observability_surfaces") or []
    opportunities = payload.get("noise_reduction_opportunities") or []
    gaps = payload.get("classification_gaps") or []
    rollups = payload.get("rollups") or {}

    def ids_match(rollup_key: str, expected_ids: list[Any]) -> bool:
        # key=str keeps the sort total when an id is missing (None): a plain
        # sorted() would raise TypeError comparing None with str, turning an
        # invalid snapshot into a crash instead of a ValueError.
        declared = rollups.get(rollup_key) or []
        return sorted(declared, key=str) == sorted(expected_ids, key=str)

    if rollups.get("total_surfaces") != len(surfaces):
        raise ValueError(f"{label}: rollups.total_surfaces must match observability_surfaces")
    for field in ("kind", "status", "evidence_status", "noise_policy_status"):
        if rollups.get(f"by_{field}") != _count_by(surfaces, field):
            raise ValueError(f"{label}: rollups.by_{field} must match observability_surfaces")

    action_required = [
        surface.get("surface_id")
        for surface in surfaces
        if surface.get("status") == "action_required"
    ]
    if not ids_match("surface_ids_requiring_action", action_required):
        raise ValueError(f"{label}: rollups.surface_ids_requiring_action must match surfaces")

    proposal_only_surfaces = [
        surface.get("surface_id")
        for surface in surfaces
        if surface.get("noise_policy_status") == "proposal_only"
    ]
    if not ids_match("surface_ids_with_proposal_only_noise_policy", proposal_only_surfaces):
        raise ValueError(
            f"{label}: rollups.surface_ids_with_proposal_only_noise_policy must match surfaces"
        )

    approval_required = [
        opportunity.get("opportunity_id")
        for opportunity in opportunities
        if opportunity.get("status") == "approval_required"
    ]
    if rollups.get("noise_reduction_opportunities_total") != len(opportunities):
        raise ValueError(f"{label}: rollups.noise_reduction_opportunities_total must match opportunities")
    if not ids_match("approval_required_opportunity_ids", approval_required):
        raise ValueError(f"{label}: rollups.approval_required_opportunity_ids must match opportunities")

    if not ids_match("classification_gap_ids", [gap.get("gap_id") for gap in gaps]):
        raise ValueError(f"{label}: rollups.classification_gap_ids must match classification_gaps")
|
||||
|
||||
|
||||
def _require_surface_evidence(payload: dict[str, Any], label: str) -> None:
    """Require every surface to carry a contract, evidence and a next action.

    Args:
        payload: Parsed snapshot.
        label: Prefix used in the error message.

    Raises:
        ValueError: At least one entry of ``observability_surfaces`` has an
            empty or missing ``coverage_contract``, ``evidence_refs`` or
            ``next_action``; the offending ``surface_id`` values are listed.
    """
    surfaces = payload.get("observability_surfaces") or []
    incomplete = [
        surface.get("surface_id")
        for surface in surfaces
        if not (
            surface.get("coverage_contract")
            and surface.get("evidence_refs")
            and surface.get("next_action")
        )
    ]
    if incomplete:
        # key=str: an incomplete surface may also lack its id (None); a plain
        # sorted() would raise TypeError and hide the real validation error.
        missing = sorted(incomplete, key=str)
        raise ValueError(f"{label}: observability_surfaces must include contract, evidence, next_action: {missing}")
|
||||
|
||||
|
||||
def _require_noise_opportunities(payload: dict[str, Any], label: str) -> None:
    """Require all noise-reduction opportunities to be proposal-only.

    Also requires the three mandatory opportunity ids to be present.

    Args:
        payload: Parsed snapshot.
        label: Prefix used in error messages.

    Raises:
        ValueError: An opportunity has ``proposal_only`` other than ``True``,
            or a mandatory opportunity id is absent.
    """
    opportunities = payload.get("noise_reduction_opportunities") or []
    # key=str: an offending entry may also lack its id (None); a plain
    # sorted() would raise TypeError and hide the real validation error.
    non_proposal = sorted(
        (
            opportunity.get("opportunity_id")
            for opportunity in opportunities
            if opportunity.get("proposal_only") is not True
        ),
        key=str,
    )
    if non_proposal:
        raise ValueError(f"{label}: noise opportunities must stay proposal_only: {non_proposal}")

    required_ids = {
        "prometheus_noise_rule_tuning",
        "alertmanager_grouping_inhibit_tuning",
        "success_notification_quiet_policy",
    }
    present = {opportunity.get("opportunity_id") for opportunity in opportunities}
    if not required_ids.issubset(present):
        raise ValueError(f"{label}: missing required noise-reduction opportunities")
|
||||
|
||||
|
||||
def _require_operator_denials(payload: dict[str, Any], label: str) -> None:
    """Check the operator contract's denials and Alertmanager route policy.

    Args:
        payload: Parsed snapshot.
        label: Prefix used in error messages.

    Raises:
        ValueError: ``operator_contract.must_not_interpret_as`` lacks one of
            the mandatory denial strings, or ``alertmanager_route_policy``
            does not contain both required phrases.
    """
    operator_contract = payload.get("operator_contract") or {}
    declared_denials = set(operator_contract.get("must_not_interpret_as") or [])
    mandatory_denials = {
        "Prometheus alert rule 修改批准",
        "Alertmanager receiver / route 修改批准",
        "Alertmanager 指向 OpenClaw receiver 批准",
        "Silence 建立或維護窗口批准",
        "Grafana dashboard 寫入批准",
        "SigNoz / Sentry webhook 設定修改批准",
        "Secret 已讀取或可輸出",
        "Telegram 測試通知批准",
        "deploy / reload / workflow 觸發批准",
        "runtime execution 授權",
    }
    # Any mandatory denial left over after removing the declared ones is missing.
    if mandatory_denials - declared_denials:
        raise ValueError(f"{label}: operator_contract.must_not_interpret_as is missing required denials")

    policy_text = str(operator_contract.get("alertmanager_route_policy") or "")
    mentions_openclaw = "OpenClaw" in policy_text
    refuses_webhook = "不接收 Alertmanager webhook" in policy_text
    if not (mentions_openclaw and refuses_webhook):
        raise ValueError(f"{label}: operator_contract.alertmanager_route_policy must block OpenClaw receiver use")
|
||||
|
||||
|
||||
def _require_no_plaintext_secret_payload_keys(value: Any, label: str, path: str = "$") -> None:
    """Recursively reject dictionary keys that look like secret payload fields.

    Dictionaries and lists are walked depth-first. A key is rejected when its
    lower-cased text contains any forbidden fragment. Only keys are inspected,
    never values.

    Args:
        value: Node to inspect (any JSON value).
        label: Prefix used in the error message.
        path: JSONPath-like location of ``value``; ``"$"`` is the root.

    Raises:
        ValueError: A forbidden key is found; the message contains its path.
    """
    if isinstance(value, list):
        for position, element in enumerate(value):
            _require_no_plaintext_secret_payload_keys(element, label, f"{path}[{position}]")
        return
    if not isinstance(value, dict):
        # Scalars carry no keys and need no inspection.
        return

    banned_fragments = (
        "secret_value",
        "token_value",
        "authorization_header",
        "private_key",
        "webhook_secret",
        "runner_token",
        "signoz_token",
        "sentry_dsn",
    )
    for key, child in value.items():
        key_text = str(key).lower()
        for fragment in banned_fragments:
            if fragment in key_text:
                raise ValueError(f"{label}: forbidden secret payload key at {path}.{key}")
        _require_no_plaintext_secret_payload_keys(child, label, f"{path}.{key}")
|
||||
|
||||
|
||||
def _count_by(items: list[dict[str, Any]], key: str) -> dict[str, int]:
    """Count how many items share each value of ``key``.

    Items lacking ``key`` are counted under ``None``. Result keys appear in
    first-seen order.
    """
    tally: dict[str, int] = {}
    for entry in items:
        bucket = entry.get(key)
        tally.setdefault(bucket, 0)
        tally[bucket] += 1
    return tally
|
||||
@@ -16,16 +16,16 @@ def test_ai_agent_automation_backlog_snapshot_endpoint_returns_committed_snapsho
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["schema_version"] == "ai_agent_automation_backlog_v1"
|
||||
assert data["program_status"]["overall_completion_percent"] == 78
|
||||
assert data["program_status"]["overall_completion_percent"] == 83
|
||||
assert data["program_status"]["read_only_mode"] is True
|
||||
assert data["program_status"]["current_task_id"] == "P1-002"
|
||||
assert data["program_status"]["next_task_id"] == "P1-003"
|
||||
assert data["program_status"]["current_task_id"] == "P1-003"
|
||||
assert data["program_status"]["next_task_id"] == "P1-004"
|
||||
assert data["rollups"]["total_items"] == len(data["backlog_items"]) == 23
|
||||
assert data["rollups"]["by_priority"]["P1"] == 21
|
||||
assert data["rollups"]["by_status"]["done"] == 18
|
||||
assert data["rollups"]["by_status"]["done"] == 19
|
||||
assert data["rollups"]["by_gate_status"]["read_only_allowed"] == 20
|
||||
assert data["progress_summary"]["overall_percent"] == 78
|
||||
assert data["progress_summary"]["done_items"] == 18
|
||||
assert data["progress_summary"]["overall_percent"] == 83
|
||||
assert data["progress_summary"]["done_items"] == 19
|
||||
assert data["progress_summary"]["total_items"] == 23
|
||||
assert data["item_approval_boundary_rollup"]["total_items"] == 23
|
||||
assert data["item_approval_boundary_rollup"]["items_requiring_explicit_approval"] == [
|
||||
@@ -51,6 +51,10 @@ def test_ai_agent_automation_backlog_snapshot_endpoint_returns_committed_snapsho
|
||||
assert p1_002["status"] == "done"
|
||||
assert p1_002["next_review"] == "P1-003"
|
||||
assert "gitea_workflow_runner_health_2026-06-05.json" in p1_002["evidence_refs"][0]
|
||||
p1_003 = next(item for item in data["backlog_items"] if item["item_id"] == "AUTO-P1-003")
|
||||
assert p1_003["status"] == "done"
|
||||
assert p1_003["next_review"] == "P1-004"
|
||||
assert "observability_contract_matrix_2026-06-05.json" in p1_003["evidence_refs"][0]
|
||||
p1_306 = next(item for item in data["backlog_items"] if item["item_id"] == "AUTO-P1-306")
|
||||
assert p1_306["approval_boundary"]["mode"] == "read_only_allowed"
|
||||
assert "runtime_execution" in p1_306["approval_boundary"]["blocked_actions"]
|
||||
|
||||
@@ -18,10 +18,10 @@ def test_ai_agent_automation_inventory_snapshot_endpoint_returns_committed_snaps
|
||||
assert data["schema_version"] == "ai_agent_automation_inventory_snapshot_v1"
|
||||
assert data["program_status"]["overall_completion_percent"] == 100
|
||||
assert data["program_status"]["read_only_mode"] is True
|
||||
assert data["program_status"]["current_task_id"] == "P1-002"
|
||||
assert data["program_status"]["next_task_id"] == "P1-003"
|
||||
assert data["task_approval_boundary_rollup"]["total_tasks"] == len(data["tasks"]) == 28
|
||||
assert data["task_approval_boundary_rollup"]["by_mode"]["read_only_allowed"] == 26
|
||||
assert data["program_status"]["current_task_id"] == "P1-003"
|
||||
assert data["program_status"]["next_task_id"] == "P1-004"
|
||||
assert data["task_approval_boundary_rollup"]["total_tasks"] == len(data["tasks"]) == 29
|
||||
assert data["task_approval_boundary_rollup"]["by_mode"]["read_only_allowed"] == 27
|
||||
assert data["task_approval_boundary_rollup"]["tasks_requiring_explicit_approval"] == [
|
||||
"P0-001",
|
||||
"P0-004",
|
||||
@@ -37,6 +37,10 @@ def test_ai_agent_automation_inventory_snapshot_endpoint_returns_committed_snaps
|
||||
assert p1_002["status"] == "done"
|
||||
assert p1_002["approval_boundary"]["mode"] == "read_only_allowed"
|
||||
assert "gitea_workflow_runner_health_2026-06-05.json" in p1_002["output"]
|
||||
p1_003 = next(task for task in data["tasks"] if task["task_id"] == "P1-003")
|
||||
assert p1_003["status"] == "done"
|
||||
assert p1_003["approval_boundary"]["mode"] == "read_only_allowed"
|
||||
assert "observability_contract_matrix_2026-06-05.json" in p1_003["output"]
|
||||
assert any(task["task_id"] == "P1-204" for task in data["tasks"])
|
||||
assert any(task["task_id"] == "P1-205" for task in data["tasks"])
|
||||
assert any(task["task_id"] == "P1-206" for task in data["tasks"])
|
||||
@@ -67,3 +71,4 @@ def test_ai_agent_automation_inventory_snapshot_endpoint_returns_committed_snaps
|
||||
assert any(evidence["evidence_id"] == "backlog_progress_summary_ui" for evidence in data["evidence"])
|
||||
assert any(evidence["evidence_id"] == "runtime_surface_inventory_api" for evidence in data["evidence"])
|
||||
assert any(evidence["evidence_id"] == "gitea_workflow_runner_health_api" for evidence in data["evidence"])
|
||||
assert any(evidence["evidence_id"] == "observability_contract_matrix_api" for evidence in data["evidence"])
|
||||
|
||||
293
apps/api/tests/test_observability_contract_matrix.py
Normal file
293
apps/api/tests/test_observability_contract_matrix.py
Normal file
@@ -0,0 +1,293 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from src.services.observability_contract_matrix import load_latest_observability_contract_matrix
|
||||
|
||||
|
||||
def test_load_latest_observability_contract_matrix_reads_newest_file(tmp_path):
    """With two dated snapshots on disk, the later-dated file is loaded."""
    fixtures = {
        "observability_contract_matrix_2026-06-04.json": _snapshot(
            generated_at="2026-06-04T00:00:00+08:00",
            completion=40,
        ),
        "observability_contract_matrix_2026-06-05.json": _snapshot(
            generated_at="2026-06-05T00:00:00+08:00",
            completion=100,
        ),
    }
    for filename, content in fixtures.items():
        (tmp_path / filename).write_text(json.dumps(content), encoding="utf-8")

    result = load_latest_observability_contract_matrix(tmp_path)

    assert result["generated_at"] == "2026-06-05T00:00:00+08:00"
    assert result["program_status"]["overall_completion_percent"] == 100
    assert result["rollups"]["total_surfaces"] == 2
    assert result["operation_boundaries"]["alertmanager_to_openclaw_allowed"] is False
|
||||
|
||||
|
||||
def test_observability_contract_matrix_requires_read_only_mode(tmp_path):
    """A snapshot with read_only_mode disabled is rejected."""
    tampered = _snapshot()
    tampered["program_status"]["read_only_mode"] = False
    target = tmp_path / "observability_contract_matrix_2026-06-05.json"
    target.write_text(json.dumps(tampered), encoding="utf-8")

    with pytest.raises(ValueError, match="read_only_mode"):
        load_latest_observability_contract_matrix(tmp_path)
|
||||
|
||||
|
||||
def test_observability_contract_matrix_blocks_route_and_rule_mutations(tmp_path):
    """Enabling rule-write or OpenClaw-route flags makes the snapshot invalid."""
    tampered = _snapshot()
    for flag in ("prometheus_rule_write_allowed", "alertmanager_to_openclaw_allowed"):
        tampered["operation_boundaries"][flag] = True
    target = tmp_path / "observability_contract_matrix_2026-06-05.json"
    target.write_text(json.dumps(tampered), encoding="utf-8")

    with pytest.raises(ValueError, match="operation boundaries"):
        load_latest_observability_contract_matrix(tmp_path)
|
||||
|
||||
|
||||
def test_observability_contract_matrix_requires_rollup_consistency(tmp_path):
    """A rollup id list that disagrees with the surfaces is rejected."""
    tampered = _snapshot()
    tampered["rollups"]["surface_ids_requiring_action"] = []
    target = tmp_path / "observability_contract_matrix_2026-06-05.json"
    target.write_text(json.dumps(tampered), encoding="utf-8")

    with pytest.raises(ValueError, match="surface_ids_requiring_action"):
        load_latest_observability_contract_matrix(tmp_path)
|
||||
|
||||
|
||||
def test_observability_contract_matrix_requires_noise_candidates_to_be_proposal_only(tmp_path):
    """An opportunity that is not proposal_only is rejected."""
    tampered = _snapshot()
    first_opportunity = tampered["noise_reduction_opportunities"][0]
    first_opportunity["proposal_only"] = False
    target = tmp_path / "observability_contract_matrix_2026-06-05.json"
    target.write_text(json.dumps(tampered), encoding="utf-8")

    with pytest.raises(ValueError, match="proposal_only"):
        load_latest_observability_contract_matrix(tmp_path)
|
||||
|
||||
|
||||
def test_observability_contract_matrix_requires_openclaw_receiver_denial(tmp_path):
    """Dropping the OpenClaw receiver denial from the contract is rejected."""
    tampered = _snapshot()
    denials = tampered["operator_contract"]["must_not_interpret_as"]
    denials.remove("Alertmanager 指向 OpenClaw receiver 批准")
    target = tmp_path / "observability_contract_matrix_2026-06-05.json"
    target.write_text(json.dumps(tampered), encoding="utf-8")

    with pytest.raises(ValueError, match="operator_contract"):
        load_latest_observability_contract_matrix(tmp_path)
|
||||
|
||||
|
||||
def test_observability_contract_matrix_rejects_secret_payload_keys(tmp_path):
    """A nested key named like a secret payload field is rejected."""
    tampered = _snapshot()
    first_observation = tampered["latest_observations"][0]
    first_observation["webhook_secret"] = "redacted"
    target = tmp_path / "observability_contract_matrix_2026-06-05.json"
    target.write_text(json.dumps(tampered), encoding="utf-8")

    with pytest.raises(ValueError, match="forbidden secret payload key"):
        load_latest_observability_contract_matrix(tmp_path)
|
||||
|
||||
|
||||
def test_observability_contract_matrix_fails_when_missing(tmp_path):
    """An empty directory yields FileNotFoundError."""
    empty_dir = tmp_path
    with pytest.raises(FileNotFoundError):
        load_latest_observability_contract_matrix(empty_dir)
|
||||
|
||||
|
||||
def _snapshot(
    *,
    generated_at: str = "2026-06-05T00:00:00+08:00",
    completion: int = 100,
) -> dict:
    """Build a fresh, valid snapshot fixture that callers may mutate.

    Args:
        generated_at: Value stored under ``generated_at``.
        completion: Value stored under
            ``program_status.overall_completion_percent``.
    """
    surfaces = [
        _surface(*row)
        for row in (
            (
                "prometheus_alert_rule_catalog",
                "Prometheus 告警規則合約",
                "prometheus_rules",
                "action_required",
                "proposal_only",
            ),
            (
                "alertmanager_awoooi_route",
                "Alertmanager → AWOOOI API 路由",
                "alertmanager_route",
                "verified",
                "proposal_only",
            ),
        )
    ]
    opportunities = [
        _opportunity(opportunity_id, status)
        for opportunity_id, status in (
            ("prometheus_noise_rule_tuning", "approval_required"),
            ("alertmanager_grouping_inhibit_tuning", "approval_required"),
            ("success_notification_quiet_policy", "preserved"),
        )
    ]
    gaps = [
        {
            "gap_id": "prometheus_alert_rule_catalog_seed",
            "display_name": "Alert rule catalog seed 未正式產品化",
            "status": "action_required",
            "severity": "high",
            "summary": "只讀矩陣已建立,尚未產生 catalog seed。",
            "evidence_refs": ["docs/adr/ADR-090-monitoring-blindspot-governance.md"],
            "next_action": "先產 proposal,不改 rule。",
        }
    ]

    program_status = {
        "overall_completion_percent": completion,
        "current_priority": "P1",
        "current_task_id": "P1-003",
        "next_task_id": "P1-004",
        "read_only_mode": True,
    }
    rollups = {
        "total_surfaces": len(surfaces),
        "by_kind": _count_by(surfaces, "kind"),
        "by_status": _count_by(surfaces, "status"),
        "by_evidence_status": _count_by(surfaces, "evidence_status"),
        "by_noise_policy_status": _count_by(surfaces, "noise_policy_status"),
        "surface_ids_requiring_action": ["prometheus_alert_rule_catalog"],
        "surface_ids_with_proposal_only_noise_policy": [
            "alertmanager_awoooi_route",
            "prometheus_alert_rule_catalog",
        ],
        "noise_reduction_opportunities_total": len(opportunities),
        "approval_required_opportunity_ids": [
            "alertmanager_grouping_inhibit_tuning",
            "prometheus_noise_rule_tuning",
        ],
        "classification_gap_ids": ["prometheus_alert_rule_catalog_seed"],
        "read_only_denials_total": 12,
    }
    latest_observations = [
        {
            "observation_id": "alertmanager_receiver_guard",
            "status": "verified",
            "summary": "Alertmanager 不得指向 OpenClaw。",
            "evidence_refs": ["docs/HARD_RULES.md#alertmanager-routing"],
        }
    ]
    operator_contract = {
        "display_mode": "read_only_observability_contract_matrix",
        "must_not_interpret_as": [
            "Prometheus alert rule 修改批准",
            "Alertmanager receiver / route 修改批准",
            "Alertmanager 指向 OpenClaw receiver 批准",
            "Silence 建立或維護窗口批准",
            "Grafana dashboard 寫入批准",
            "SigNoz / Sentry webhook 設定修改批准",
            "Secret 已讀取或可輸出",
            "Telegram 測試通知批准",
            "deploy / reload / workflow 觸發批准",
            "runtime execution 授權",
        ],
        "secret_display_policy": "只顯示 redacted metadata。",
        "alertmanager_route_policy": "OpenClaw 不接收 Alertmanager webhook;receiver 維持 AWOOOI API。",
        "noise_reduction_policy": "只產生 proposal。",
        "notification_policy": "成功不洗版。",
    }
    # Only the read-only API flag is enabled; every other flag starts False.
    operation_boundaries = {
        "read_only_api_allowed": True,
        **dict.fromkeys(
            (
                "prometheus_rule_write_allowed",
                "prometheus_reload_allowed",
                "alertmanager_route_write_allowed",
                "alertmanager_receiver_change_allowed",
                "alertmanager_to_openclaw_allowed",
                "silence_create_allowed",
                "grafana_dashboard_write_allowed",
                "grafana_api_write_allowed",
                "signoz_query_mutation_allowed",
                "signoz_webhook_change_allowed",
                "sentry_webhook_change_allowed",
                "otel_collector_deploy_allowed",
                "event_exporter_restart_allowed",
                "secret_read_allowed",
                "secret_plaintext_allowed",
                "notification_send_allowed",
                "external_api_call_allowed",
                "live_prometheus_query_allowed",
                "workflow_trigger_allowed",
                "deploy_trigger_allowed",
                "reload_trigger_allowed",
                "runtime_execution_allowed",
            ),
            False,
        ),
    }
    approval_boundaries = dict.fromkeys(
        (
            "prometheus_rule_change_authorized",
            "prometheus_reload_authorized",
            "alertmanager_route_change_authorized",
            "alertmanager_receiver_change_authorized",
            "alertmanager_to_openclaw_authorized",
            "silence_authorized",
            "grafana_write_authorized",
            "signoz_write_authorized",
            "sentry_write_authorized",
            "otel_deploy_authorized",
            "event_exporter_restart_authorized",
            "notification_send_authorized",
            "external_call_authorized",
            "secret_plaintext_allowed",
            "workflow_trigger_authorized",
            "deploy_reload_authorized",
            "runtime_execution_authorized",
        ),
        False,
    )

    return {
        "schema_version": "observability_contract_matrix_v1",
        "generated_at": generated_at,
        "program_status": program_status,
        "source_refs": ["docs/schemas/observability_contract_matrix_v1.schema.json"],
        "rollups": rollups,
        "observability_surfaces": surfaces,
        "noise_reduction_opportunities": opportunities,
        "classification_gaps": gaps,
        "latest_observations": latest_observations,
        "operator_contract": operator_contract,
        "operation_boundaries": operation_boundaries,
        "approval_boundaries": approval_boundaries,
    }
|
||||
|
||||
|
||||
def _surface(
    surface_id: str,
    display_name: str,
    kind: str,
    status: str,
    noise_policy_status: str,
) -> dict:
    """Build one observability surface row with fixed evidence fields."""
    return dict(
        surface_id=surface_id,
        display_name=display_name,
        kind=kind,
        status=status,
        risk_level="critical",
        evidence_status="committed_manifest",
        noise_policy_status=noise_policy_status,
        coverage_contract="只讀 committed evidence。",
        current_contract="不得改 live 設定。",
        evidence_refs=["docs/HARD_RULES.md"],
        next_action="只產 proposal。",
    )
|
||||
|
||||
|
||||
def _opportunity(opportunity_id: str, status: str) -> dict:
    """Build one proposal-only noise-reduction opportunity row.

    The id doubles as the display name.
    """
    return dict(
        opportunity_id=opportunity_id,
        display_name=opportunity_id,
        status=status,
        proposal_only=True,
        impact="降噪提案。",
        evidence_refs=["docs/HARD_RULES.md"],
        next_action="人工批准前不執行。",
    )
|
||||
|
||||
|
||||
def _count_by(items: list[dict], key: str) -> dict[str, int]:
    """Count items per value of ``key``; every item must contain ``key``."""
    tally: dict[str, int] = {}
    for entry in items:
        bucket = entry[key]
        tally.setdefault(bucket, 0)
        tally[bucket] += 1
    return tally
|
||||
53
apps/api/tests/test_observability_contract_matrix_api.py
Normal file
53
apps/api/tests/test_observability_contract_matrix_api.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from src.api.v1.agents import router
|
||||
|
||||
|
||||
def test_observability_contract_matrix_endpoint_returns_committed_snapshot():
    """The endpoint serves the committed snapshot with all guard rails intact."""
    app = FastAPI()
    app.include_router(router, prefix="/api/v1")
    response = TestClient(app).get("/api/v1/agents/observability-contract-matrix")

    assert response.status_code == 200
    body = response.json()
    assert body["schema_version"] == "observability_contract_matrix_v1"

    program_status = body["program_status"]
    assert program_status["overall_completion_percent"] == 100
    assert program_status["current_task_id"] == "P1-003"
    assert program_status["next_task_id"] == "P1-004"
    assert program_status["read_only_mode"] is True

    rollups = body["rollups"]
    assert rollups["total_surfaces"] == len(body["observability_surfaces"]) == 6
    assert rollups["noise_reduction_opportunities_total"] == 5
    assert rollups["surface_ids_requiring_action"] == [
        "grafana_dashboard_inventory",
        "prometheus_alert_rule_catalog",
    ]
    assert rollups["approval_required_opportunity_ids"] == [
        "alertmanager_grouping_inhibit_tuning",
        "prometheus_noise_rule_tuning",
    ]

    operation_boundaries = body["operation_boundaries"]
    assert operation_boundaries["read_only_api_allowed"] is True
    for flag in (
        "prometheus_rule_write_allowed",
        "alertmanager_route_write_allowed",
        "alertmanager_to_openclaw_allowed",
        "silence_create_allowed",
        "grafana_dashboard_write_allowed",
        "notification_send_allowed",
        "deploy_trigger_allowed",
    ):
        assert operation_boundaries[flag] is False

    approval_boundaries = body["approval_boundaries"]
    for flag in (
        "prometheus_rule_change_authorized",
        "alertmanager_to_openclaw_authorized",
        "deploy_reload_authorized",
    ):
        assert approval_boundaries[flag] is False

    alertmanager = next(
        surface
        for surface in body["observability_surfaces"]
        if surface["surface_id"] == "alertmanager_awoooi_route"
    )
    assert alertmanager["status"] == "verified"
    assert alertmanager["noise_policy_status"] == "proposal_only"
    assert "OpenClaw 只做 AI 分析" in alertmanager["coverage_contract"]

    operator_contract = body["operator_contract"]
    assert "Alertmanager 指向 OpenClaw receiver 批准" in operator_contract["must_not_interpret_as"]
    assert "不接收 Alertmanager webhook" in operator_contract["alertmanager_route_policy"]

    for opportunity in body["noise_reduction_opportunities"]:
        assert opportunity["proposal_only"] is True
|
||||
@@ -3259,6 +3259,54 @@
|
||||
"not_applicable": "不適用",
|
||||
"actionable_only_no_success_noise": "需處置才通知,成功不洗版"
|
||||
}
|
||||
},
|
||||
"observability": {
|
||||
"title": "監控合約與降噪機會",
|
||||
"source": "{generated} · {current} → {next}",
|
||||
"noiseTitle": "降噪 proposal",
|
||||
"classificationTitle": "分類缺口",
|
||||
"contractTitle": "不可誤讀合約",
|
||||
"metrics": {
|
||||
"surfaces": "監控面",
|
||||
"actions": "需處置",
|
||||
"proposals": "降噪提案",
|
||||
"classificationGaps": "分類缺口",
|
||||
"approvalRequired": "需批准"
|
||||
},
|
||||
"map": {
|
||||
"coverage": "合約覆蓋",
|
||||
"coverageDetail": "Prometheus / Alertmanager / Grafana / SigNoz / Sentry / taxonomy。",
|
||||
"noise": "降噪路徑",
|
||||
"noiseDetail": "只產生 proposal,不改 receiver 或 silence。",
|
||||
"classification": "批准邊界",
|
||||
"classificationDetail": "降噪候選先進批准包,不直接改規則、receiver 或分類器。",
|
||||
"safeBoundary": "安全邊界",
|
||||
"safeBoundaryDetail": "alert rule、silence、通知、dashboard、deploy 入口皆為 0。"
|
||||
},
|
||||
"labels": {
|
||||
"evidence": "證據",
|
||||
"noise": "降噪"
|
||||
},
|
||||
"values": {
|
||||
"prometheus_rules": "Prometheus 規則",
|
||||
"alertmanager_route": "Alertmanager 路由",
|
||||
"grafana_dashboard": "Grafana Dashboard",
|
||||
"signoz_clickhouse": "SigNoz / ClickHouse",
|
||||
"sentry_source_link": "Sentry Source Link",
|
||||
"otel_event_exporter": "OTEL / Event Exporter",
|
||||
"verified": "已驗證",
|
||||
"action_required": "需處置",
|
||||
"blocked": "阻擋",
|
||||
"committed_manifest": "已提交 manifest",
|
||||
"production_readback_recorded": "正式讀回已記錄",
|
||||
"proposal_only": "僅提案",
|
||||
"preserved": "已保留",
|
||||
"needs_proposal": "待提案",
|
||||
"approval_required": "需批准",
|
||||
"ready_for_proposal": "提案可審",
|
||||
"deferred": "延後",
|
||||
"proposal_required": "需提案"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -3259,6 +3259,54 @@
|
||||
"not_applicable": "不適用",
|
||||
"actionable_only_no_success_noise": "需處置才通知,成功不洗版"
|
||||
}
|
||||
},
|
||||
"observability": {
|
||||
"title": "監控合約與降噪機會",
|
||||
"source": "{generated} · {current} → {next}",
|
||||
"noiseTitle": "降噪 proposal",
|
||||
"classificationTitle": "分類缺口",
|
||||
"contractTitle": "不可誤讀合約",
|
||||
"metrics": {
|
||||
"surfaces": "監控面",
|
||||
"actions": "需處置",
|
||||
"proposals": "降噪提案",
|
||||
"classificationGaps": "分類缺口",
|
||||
"approvalRequired": "需批准"
|
||||
},
|
||||
"map": {
|
||||
"coverage": "合約覆蓋",
|
||||
"coverageDetail": "Prometheus / Alertmanager / Grafana / SigNoz / Sentry / taxonomy。",
|
||||
"noise": "降噪路徑",
|
||||
"noiseDetail": "只產生 proposal,不改 receiver 或 silence。",
|
||||
"classification": "批准邊界",
|
||||
"classificationDetail": "降噪候選先進批准包,不直接改規則、receiver 或分類器。",
|
||||
"safeBoundary": "安全邊界",
|
||||
"safeBoundaryDetail": "alert rule、silence、通知、dashboard、deploy 入口皆為 0。"
|
||||
},
|
||||
"labels": {
|
||||
"evidence": "證據",
|
||||
"noise": "降噪"
|
||||
},
|
||||
"values": {
|
||||
"prometheus_rules": "Prometheus 規則",
|
||||
"alertmanager_route": "Alertmanager 路由",
|
||||
"grafana_dashboard": "Grafana Dashboard",
|
||||
"signoz_clickhouse": "SigNoz / ClickHouse",
|
||||
"sentry_source_link": "Sentry Source Link",
|
||||
"otel_event_exporter": "OTEL / Event Exporter",
|
||||
"verified": "已驗證",
|
||||
"action_required": "需處置",
|
||||
"blocked": "阻擋",
|
||||
"committed_manifest": "已提交 manifest",
|
||||
"production_readback_recorded": "正式讀回已記錄",
|
||||
"proposal_only": "僅提案",
|
||||
"preserved": "已保留",
|
||||
"needs_proposal": "待提案",
|
||||
"approval_required": "需批准",
|
||||
"ready_for_proposal": "提案可審",
|
||||
"deferred": "延後",
|
||||
"proposal_required": "需提案"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -39,6 +39,7 @@ import {
|
||||
type BackupDrTargetInventorySnapshot,
|
||||
type BackupNotificationPolicySnapshot,
|
||||
type GiteaWorkflowRunnerHealthSnapshot,
|
||||
type ObservabilityContractMatrixSnapshot,
|
||||
type OffsiteEscrowReadinessStatusSnapshot,
|
||||
type RuntimeSurfaceInventorySnapshot,
|
||||
} from '@/lib/api-client'
|
||||
@@ -303,6 +304,7 @@ export function AutomationInventoryTab() {
|
||||
const [offsiteEscrow, setOffsiteEscrow] = useState<OffsiteEscrowReadinessStatusSnapshot | null>(null)
|
||||
const [runtimeSurface, setRuntimeSurface] = useState<RuntimeSurfaceInventorySnapshot | null>(null)
|
||||
const [giteaHealth, setGiteaHealth] = useState<GiteaWorkflowRunnerHealthSnapshot | null>(null)
|
||||
const [observabilityMatrix, setObservabilityMatrix] = useState<ObservabilityContractMatrixSnapshot | null>(null)
|
||||
const [loading, setLoading] = useState(true)
|
||||
const [error, setError] = useState(false)
|
||||
|
||||
@@ -317,6 +319,7 @@ export function AutomationInventoryTab() {
|
||||
apiClient.getOffsiteEscrowReadinessStatus(),
|
||||
apiClient.getRuntimeSurfaceInventory(),
|
||||
apiClient.getGiteaWorkflowRunnerHealth(),
|
||||
apiClient.getObservabilityContractMatrix(),
|
||||
] as const
|
||||
|
||||
Promise.allSettled(requests)
|
||||
@@ -330,6 +333,7 @@ export function AutomationInventoryTab() {
|
||||
offsiteEscrowResult,
|
||||
runtimeSurfaceResult,
|
||||
giteaHealthResult,
|
||||
observabilityMatrixResult,
|
||||
] = results
|
||||
|
||||
setSnapshot(inventoryResult.status === 'fulfilled' ? inventoryResult.value : null)
|
||||
@@ -340,6 +344,7 @@ export function AutomationInventoryTab() {
|
||||
setOffsiteEscrow(offsiteEscrowResult.status === 'fulfilled' ? offsiteEscrowResult.value : null)
|
||||
setRuntimeSurface(runtimeSurfaceResult.status === 'fulfilled' ? runtimeSurfaceResult.value : null)
|
||||
setGiteaHealth(giteaHealthResult.status === 'fulfilled' ? giteaHealthResult.value : null)
|
||||
setObservabilityMatrix(observabilityMatrixResult.status === 'fulfilled' ? observabilityMatrixResult.value : null)
|
||||
setError([
|
||||
inventoryResult,
|
||||
backlogResult,
|
||||
@@ -348,6 +353,7 @@ export function AutomationInventoryTab() {
|
||||
policyResult,
|
||||
offsiteEscrowResult,
|
||||
giteaHealthResult,
|
||||
observabilityMatrixResult,
|
||||
].some(result => result.status === 'rejected'))
|
||||
})
|
||||
.catch(() => setError(true))
|
||||
@@ -461,6 +467,28 @@ export function AutomationInventoryTab() {
|
||||
})
|
||||
}, [giteaHealth])
|
||||
|
||||
// Observability surfaces in display order: action_required first, then
// blocked, then verified, then any other status; ties are broken by
// surface_id so the order is stable between renders.
const visibleObservabilitySurfaces = useMemo(() => {
  if (!observabilityMatrix) return []
  const statusRank = { action_required: 0, blocked: 1, verified: 2 } as Record<string, number>
  const rankOf = (status: string) => statusRank[status] ?? 3
  // Sort a copy so the snapshot held in state is never mutated.
  return [...observabilityMatrix.observability_surfaces].sort(
    (first, second) =>
      rankOf(first.status) - rankOf(second.status) || first.surface_id.localeCompare(second.surface_id),
  )
}, [observabilityMatrix])
|
||||
|
||||
// Noise-reduction opportunities in display order: approval_required,
// ready_for_proposal, preserved, deferred, then any status not listed here;
// ties are broken by opportunity_id.
const visibleNoiseOpportunities = useMemo(() => {
  if (!observabilityMatrix) return []
  const statusRank: Record<string, number> = {
    approval_required: 0,
    ready_for_proposal: 1,
    preserved: 2,
    deferred: 3,
  }
  // Unknown statuses must rank strictly after every known one. The previous
  // fallback (3) tied with `deferred`, so unrecognised statuses were mixed in
  // with deferred items and could displace them from the truncated list
  // rendered below.
  const unknownRank = Object.keys(statusRank).length
  const rankOf = (status: string) => statusRank[status] ?? unknownRank
  // Sort a copy so the snapshot held in state is never mutated.
  return [...observabilityMatrix.noise_reduction_opportunities].sort((a, b) => {
    const rankDelta = rankOf(a.status) - rankOf(b.status)
    if (rankDelta !== 0) return rankDelta
    return a.opportunity_id.localeCompare(b.opportunity_id)
  })
}, [observabilityMatrix])
|
||||
|
||||
if (loading) {
|
||||
return (
|
||||
<div style={{ padding: 20, display: 'grid', gridTemplateColumns: 'repeat(4, minmax(0, 1fr))', gap: 12 }} className="automation-inventory-kpi-grid">
|
||||
@@ -474,7 +502,7 @@ export function AutomationInventoryTab() {
|
||||
)
|
||||
}
|
||||
|
||||
if (error || !snapshot || !backlog || !backupTargets || !backupReadiness || !backupPolicy || !offsiteEscrow || !giteaHealth) {
|
||||
if (error || !snapshot || !backlog || !backupTargets || !backupReadiness || !backupPolicy || !offsiteEscrow || !giteaHealth || !observabilityMatrix) {
|
||||
return (
|
||||
<div style={{ padding: 20 }}>
|
||||
<GlassCard variant="subtle" padding="lg">
|
||||
@@ -527,6 +555,10 @@ export function AutomationInventoryTab() {
|
||||
const runtimeBoundComponents = runtimeSurface?.rollups.source_components_with_runtime_binding ?? 0
|
||||
const giteaRunnerActions = giteaHealth.rollups.workflow_ids_requiring_runner_attestation.length
|
||||
const giteaQuietPolicies = giteaHealth.rollups.notification_contracts_quiet_success_count
|
||||
const observabilityActions = observabilityMatrix.rollups.surface_ids_requiring_action.length
|
||||
const observabilityProposalCount = observabilityMatrix.rollups.noise_reduction_opportunities_total
|
||||
const observabilityClassificationGaps = observabilityMatrix.rollups.classification_gap_ids.length
|
||||
const observabilityApprovalRequired = observabilityMatrix.rollups.approval_required_opportunity_ids.length
|
||||
const backlogProgressPercent = backlog.progress_summary.overall_percent
|
||||
const explicitApprovalItemCount = backlog.item_approval_boundary_rollup.items_requiring_explicit_approval.length
|
||||
const taskBoundaryCount = snapshot.task_approval_boundary_rollup.total_tasks
|
||||
@@ -643,6 +675,14 @@ export function AutomationInventoryTab() {
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve a snapshot enum value (status, kind, evidence status, ...) to its
// `observability.values.*` translation, returning the raw value when the
// lookup throws.
// NOTE(review): the fallback only triggers if `t` throws on a missing key —
// confirm the translator is configured that way, otherwise unknown values
// may render as the key path instead of the raw value.
const observabilityValueLabel = (value: string) => {
  const messageKey = `observability.values.${value}`
  try {
    // The key is built at runtime, so it cannot satisfy the typed key union.
    return t(messageKey as never)
  } catch {
    return value
  }
}
|
||||
|
||||
return (
|
||||
<div className="automation-inventory-tab-root" style={{ padding: 20, display: 'flex', flexDirection: 'column', gap: 16, minWidth: 0 }}>
|
||||
<GlassCard variant="subtle" padding="md">
|
||||
@@ -1450,6 +1490,169 @@ export function AutomationInventoryTab() {
|
||||
</div>
|
||||
</GlassCard>
|
||||
|
||||
<GlassCard variant="subtle" padding="md">
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 13, minWidth: 0 }}>
|
||||
<div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', gap: 12, flexWrap: 'wrap' }}>
|
||||
<div style={{ display: 'flex', alignItems: 'center', gap: 7, minWidth: 0 }}>
|
||||
<BellRing size={14} style={{ color: '#d97757' }} />
|
||||
<span style={{ fontFamily: 'Syne, sans-serif', fontSize: 13, fontWeight: 700, color: '#141413' }}>
|
||||
{t('observability.title')}
|
||||
</span>
|
||||
</div>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f' }}>
|
||||
{t('observability.source', {
|
||||
generated: formatDateTime(observabilityMatrix.generated_at),
|
||||
current: observabilityMatrix.program_status.current_task_id,
|
||||
next: observabilityMatrix.program_status.next_task_id,
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(5, minmax(0, 1fr))', gap: 12 }} className="automation-inventory-observability-kpi-grid">
|
||||
<MetricCard label={t('observability.metrics.surfaces')} value={observabilityMatrix.rollups.total_surfaces} icon={<Database size={16} />} />
|
||||
<MetricCard label={t('observability.metrics.actions')} value={observabilityActions} tone={observabilityActions > 0 ? 'warn' : 'ok'} icon={<AlertTriangle size={16} />} />
|
||||
<MetricCard label={t('observability.metrics.proposals')} value={observabilityProposalCount} tone="warn" icon={<BellOff size={16} />} />
|
||||
<MetricCard label={t('observability.metrics.classificationGaps')} value={observabilityClassificationGaps} tone="warn" icon={<Target size={16} />} />
|
||||
<MetricCard label={t('observability.metrics.approvalRequired')} value={observabilityApprovalRequired} tone={observabilityApprovalRequired > 0 ? 'warn' : 'ok'} icon={<ShieldCheck size={16} />} />
|
||||
</div>
|
||||
|
||||
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(4, minmax(0, 1fr))', gap: 10 }} className="automation-inventory-observability-map-grid">
|
||||
<SummaryTile
|
||||
label={t('observability.map.coverage')}
|
||||
value={`${observabilityMatrix.rollups.total_surfaces}`}
|
||||
detail={t('observability.map.coverageDetail')}
|
||||
tone="ok"
|
||||
icon={<Layers3 size={16} />}
|
||||
/>
|
||||
<SummaryTile
|
||||
label={t('observability.map.noise')}
|
||||
value={`${observabilityProposalCount}`}
|
||||
detail={t('observability.map.noiseDetail')}
|
||||
tone="warn"
|
||||
icon={<BellOff size={16} />}
|
||||
/>
|
||||
<SummaryTile
|
||||
label={t('observability.map.classification')}
|
||||
value={`${observabilityApprovalRequired}`}
|
||||
detail={t('observability.map.classificationDetail')}
|
||||
tone="warn"
|
||||
icon={<Target size={16} />}
|
||||
/>
|
||||
<SummaryTile
|
||||
label={t('observability.map.safeBoundary')}
|
||||
value="0"
|
||||
detail={t('observability.map.safeBoundaryDetail')}
|
||||
tone="ok"
|
||||
icon={<ShieldCheck size={16} />}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div style={{ display: 'grid', gridTemplateColumns: 'minmax(0, 1.35fr) minmax(0, 0.65fr)', gap: 12 }} className="automation-inventory-observability-grid">
|
||||
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(2, minmax(0, 1fr))', gap: 10 }} className="automation-inventory-observability-surface-grid">
|
||||
{visibleObservabilitySurfaces.map(surface => (
|
||||
<div key={surface.surface_id} style={{ padding: 11, border: '0.5px solid #e0ddd4', borderRadius: 7, background: '#fff', minWidth: 0 }}>
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 8, minWidth: 0 }}>
|
||||
<div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', gap: 8, minWidth: 0 }}>
|
||||
<span style={{
|
||||
fontFamily: 'Syne, sans-serif',
|
||||
fontSize: 12,
|
||||
fontWeight: 700,
|
||||
color: '#141413',
|
||||
overflow: 'hidden',
|
||||
textOverflow: 'ellipsis',
|
||||
whiteSpace: 'nowrap',
|
||||
}}>
|
||||
{surface.display_name}
|
||||
</span>
|
||||
<Chip value={observabilityValueLabel(surface.status)} muted={surface.status === 'verified'} />
|
||||
</div>
|
||||
<div style={{ display: 'flex', flexWrap: 'wrap', gap: 6, minWidth: 0 }}>
|
||||
<Chip value={observabilityValueLabel(surface.kind)} />
|
||||
<Chip value={`${t('observability.labels.evidence')}: ${observabilityValueLabel(surface.evidence_status)}`} muted={surface.evidence_status === 'committed_manifest' || surface.evidence_status === 'production_readback_recorded'} />
|
||||
<Chip value={`${t('observability.labels.noise')}: ${observabilityValueLabel(surface.noise_policy_status)}`} muted />
|
||||
</div>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45, overflowWrap: 'anywhere' }}>
|
||||
{surface.coverage_contract}
|
||||
</div>
|
||||
{surface.current_contract ? (
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45, overflowWrap: 'anywhere' }}>
|
||||
{surface.current_contract}
|
||||
</div>
|
||||
) : null}
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#141413', lineHeight: 1.45, overflowWrap: 'anywhere' }}>
|
||||
{surface.next_action}
|
||||
</div>
|
||||
<div style={{ display: 'flex', flexWrap: 'wrap', gap: 6, minWidth: 0 }}>
|
||||
<Chip value={surface.evidence_refs[0] ?? t('backupEvidence.noEvidence')} muted />
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 12, minWidth: 0 }}>
|
||||
<div style={{ padding: 12, border: '0.5px solid #e0ddd4', borderRadius: 7, background: '#fff', display: 'flex', flexDirection: 'column', gap: 10, minWidth: 0 }}>
|
||||
<span style={{ fontFamily: 'Syne, sans-serif', fontSize: 13, fontWeight: 700, color: '#141413' }}>
|
||||
{t('observability.noiseTitle')}
|
||||
</span>
|
||||
{visibleNoiseOpportunities.slice(0, 5).map(opportunity => (
|
||||
<div key={opportunity.opportunity_id} style={{ display: 'flex', flexDirection: 'column', gap: 6, minWidth: 0 }}>
|
||||
<div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', gap: 8 }}>
|
||||
<span style={{ fontFamily: "'DM Mono', monospace", fontSize: 11, fontWeight: 700, color: '#141413', overflow: 'hidden', textOverflow: 'ellipsis', whiteSpace: 'nowrap' }}>
|
||||
{opportunity.display_name}
|
||||
</span>
|
||||
<Chip value={observabilityValueLabel(opportunity.status)} muted />
|
||||
</div>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45, overflowWrap: 'anywhere' }}>
|
||||
{opportunity.impact}
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
|
||||
<div style={{ padding: 12, border: '0.5px solid #e0ddd4', borderRadius: 7, background: '#fff', display: 'flex', flexDirection: 'column', gap: 10, minWidth: 0 }}>
|
||||
<span style={{ fontFamily: 'Syne, sans-serif', fontSize: 13, fontWeight: 700, color: '#141413' }}>
|
||||
{t('observability.classificationTitle')}
|
||||
</span>
|
||||
{observabilityMatrix.classification_gaps.map(gap => (
|
||||
<div key={gap.gap_id} style={{ display: 'flex', flexDirection: 'column', gap: 6, minWidth: 0 }}>
|
||||
<div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', gap: 8 }}>
|
||||
<span style={{ fontFamily: "'DM Mono', monospace", fontSize: 11, fontWeight: 700, color: '#141413', overflow: 'hidden', textOverflow: 'ellipsis', whiteSpace: 'nowrap' }}>
|
||||
{gap.display_name}
|
||||
</span>
|
||||
<Chip value={observabilityValueLabel(gap.status)} />
|
||||
</div>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.45, overflowWrap: 'anywhere' }}>
|
||||
{gap.summary}
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
|
||||
<div style={{ padding: 12, border: '0.5px solid #e0ddd4', borderRadius: 7, background: '#fff', display: 'flex', flexDirection: 'column', gap: 10, minWidth: 0 }}>
|
||||
<span style={{ fontFamily: 'Syne, sans-serif', fontSize: 13, fontWeight: 700, color: '#141413' }}>
|
||||
{t('observability.contractTitle')}
|
||||
</span>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.5, overflowWrap: 'anywhere' }}>
|
||||
{observabilityMatrix.operator_contract.alertmanager_route_policy}
|
||||
</div>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.5, overflowWrap: 'anywhere' }}>
|
||||
{observabilityMatrix.operator_contract.noise_reduction_policy}
|
||||
</div>
|
||||
<div style={{ fontFamily: "'DM Mono', monospace", fontSize: 10, color: '#87867f', lineHeight: 1.5, overflowWrap: 'anywhere' }}>
|
||||
{observabilityMatrix.operator_contract.notification_policy}
|
||||
</div>
|
||||
<div style={{ display: 'flex', flexWrap: 'wrap', gap: 6, minWidth: 0 }}>
|
||||
{observabilityMatrix.operator_contract.must_not_interpret_as.slice(0, 6).map(item => (
|
||||
<Chip key={item} value={item} />
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</GlassCard>
|
||||
|
||||
<div style={{ display: 'grid', gridTemplateColumns: 'minmax(0, 1.2fr) minmax(0, 0.8fr)', gap: 12 }} className="automation-inventory-bottom-grid">
|
||||
<GlassCard variant="subtle" padding="md">
|
||||
<div style={{ display: 'flex', flexDirection: 'column', gap: 13, minWidth: 0 }}>
|
||||
@@ -1564,6 +1767,10 @@ export function AutomationInventoryTab() {
|
||||
.automation-inventory-gitea-map-grid,
|
||||
.automation-inventory-gitea-grid,
|
||||
.automation-inventory-gitea-workflow-grid,
|
||||
.automation-inventory-observability-kpi-grid,
|
||||
.automation-inventory-observability-map-grid,
|
||||
.automation-inventory-observability-grid,
|
||||
.automation-inventory-observability-surface-grid,
|
||||
.automation-inventory-bottom-grid,
|
||||
.automation-inventory-task-grid {
|
||||
grid-template-columns: 1fr !important;
|
||||
|
||||
@@ -272,6 +272,11 @@ export const apiClient = {
|
||||
return handleResponse<GiteaWorkflowRunnerHealthSnapshot>(res)
|
||||
},
|
||||
|
||||
// Fetch the observability contract matrix snapshot from the agents API and
// hand the raw response to the shared `handleResponse` helper.
async getObservabilityContractMatrix() {
  const endpoint = `${API_BASE_URL}/agents/observability-contract-matrix`
  return handleResponse<ObservabilityContractMatrixSnapshot>(await fetch(endpoint))
},
|
||||
|
||||
async getBackupDrTargetInventory() {
|
||||
const res = await fetch(`${API_BASE_URL}/agents/backup-dr-target-inventory`)
|
||||
return handleResponse<BackupDrTargetInventorySnapshot>(res)
|
||||
@@ -946,6 +951,80 @@ export interface GiteaWorkflowRunnerHealthSnapshot {
|
||||
approval_boundaries: Record<string, false>
|
||||
}
|
||||
|
||||
/**
 * Response shape of `apiClient.getObservabilityContractMatrix()`
 * (`/agents/observability-contract-matrix`).
 *
 * Several fields are typed as literals (`read_only_mode: true`,
 * `proposal_only: true`, `approval_boundaries: Record<string, false>`), so a
 * payload that claims write access or an approved change does not type-check.
 */
export interface ObservabilityContractMatrixSnapshot {
|
||||
schema_version: 'observability_contract_matrix_v1'
|
||||
generated_at: string
|
||||
// Progress pointers shown in the card header (current task -> next task).
program_status: {
|
||||
overall_completion_percent: number
|
||||
current_priority: 'P0' | 'P1' | 'P2' | 'P3'
|
||||
current_task_id: string
|
||||
next_task_id: string
|
||||
read_only_mode: true
|
||||
}
|
||||
source_refs: string[]
|
||||
// Aggregates over the arrays below; the UI reads its KPI counts from here.
rollups: {
|
||||
total_surfaces: number
|
||||
by_kind: Record<string, number>
|
||||
by_status: Record<string, number>
|
||||
by_evidence_status: Record<string, number>
|
||||
by_noise_policy_status: Record<string, number>
|
||||
surface_ids_requiring_action: string[]
|
||||
surface_ids_with_proposal_only_noise_policy: string[]
|
||||
noise_reduction_opportunities_total: number
|
||||
approval_required_opportunity_ids: string[]
|
||||
classification_gap_ids: string[]
|
||||
read_only_denials_total: number
|
||||
}
|
||||
// One entry per monitored surface.
observability_surfaces: Array<{
|
||||
surface_id: string
|
||||
display_name: string
|
||||
kind: string
|
||||
status: 'verified' | 'action_required' | 'blocked'
|
||||
risk_level: 'low' | 'medium' | 'high' | 'critical'
|
||||
evidence_status: string
|
||||
noise_policy_status: string
|
||||
coverage_contract: string
|
||||
current_contract?: string
|
||||
evidence_refs: string[]
|
||||
next_action: string
|
||||
}>
|
||||
// Noise-reduction candidates; `proposal_only` is the literal `true`.
noise_reduction_opportunities: Array<{
|
||||
opportunity_id: string
|
||||
display_name: string
|
||||
status: string
|
||||
proposal_only: true
|
||||
impact: string
|
||||
target_surface_ids?: string[]
|
||||
evidence_refs: string[]
|
||||
next_action: string
|
||||
}>
|
||||
classification_gaps: Array<{
|
||||
gap_id: string
|
||||
display_name: string
|
||||
status: string
|
||||
severity: 'low' | 'medium' | 'high' | 'critical'
|
||||
summary: string
|
||||
evidence_refs: string[]
|
||||
next_action: string
|
||||
}>
|
||||
latest_observations: Array<{
|
||||
observation_id: string
|
||||
status: string
|
||||
summary: string
|
||||
evidence_refs: string[]
|
||||
}>
|
||||
// Policy texts rendered verbatim in the contract panel.
operator_contract: {
|
||||
display_mode: 'read_only_observability_contract_matrix'
|
||||
must_not_interpret_as: string[]
|
||||
secret_display_policy: string
|
||||
alertmanager_route_policy: string
|
||||
noise_reduction_policy: string
|
||||
notification_policy: string
|
||||
}
|
||||
operation_boundaries: Record<string, boolean>
|
||||
// Every value is typed as the literal `false`.
approval_boundaries: Record<string, false>
|
||||
}
|
||||
|
||||
export interface BackupDrTargetInventorySnapshot {
|
||||
schema_version: 'backup_dr_target_inventory_v1'
|
||||
generated_at: string
|
||||
|
||||
@@ -1,3 +1,38 @@
|
||||
## 2026-06-05|P1-003 監控合約與降噪矩陣本地完成
|
||||
|
||||
**背景**:接續 P1-002 Gitea workflow / runner health contract 與決策摘要正式驗證,依工作清單推進 `P1-003`。本段只建立 Prometheus / Alertmanager / SigNoz / Grafana / Sentry / OTEL 的 committed observability matrix、只讀 API 與治理頁顯示,不修改 alert rules、不 reload Prometheus、不改 Alertmanager receiver / route、不建立 silence、不寫 Grafana、不改 SigNoz / Sentry webhook、不發 Telegram 測試通知。
|
||||
|
||||
**本輪完成**:
|
||||
- 新增 `observability_contract_matrix_v1` schema 與 `docs/evaluations/observability_contract_matrix_2026-06-05.json`。
|
||||
- 新增 `GET /api/v1/agents/observability-contract-matrix` 與 service guard,強制驗證 read-only mode、operation / approval boundaries、rollup consistency、降噪候選只能 proposal、Alertmanager 不得指向 OpenClaw、不得出現 secret payload key。
|
||||
- 治理頁 `/zh-TW/governance?tab=automation-inventory` 新增「監控合約與降噪機會」區塊,顯示監控面、需處置、降噪候選、需批准候選、分類缺口與不可誤讀合約。
|
||||
- 同步 automation backlog / inventory snapshot:current `P1-003`、next `P1-004`、backlog overall `83%`、P1 `90%`、done `19/23`、inventory tasks `29`。
|
||||
|
||||
**目前數字**:
|
||||
- Observability surfaces:`6`。
|
||||
- 需處置 surfaces:`2`(`grafana_dashboard_inventory`、`prometheus_alert_rule_catalog`)。
|
||||
- 降噪候選:`5`。
|
||||
- 需人工批准的降噪候選:`2`(Prometheus rule tuning、Alertmanager grouping / inhibit tuning)。
|
||||
- Classification gaps:`3`。
|
||||
- Read-only denials:`12`。
|
||||
|
||||
**本地驗證**:
|
||||
- JSON parse 通過:`observability_contract_matrix_2026-06-05.json`、`observability_contract_matrix_v1.schema.json`、automation backlog / inventory snapshots。
|
||||
- 目標測試通過:observability contract matrix service / API、automation inventory / backlog snapshot API、Gitea workflow runner health service / API 共 `19 passed`。
|
||||
- Python py_compile 通過:`apps/api/src/services/observability_contract_matrix.py`、`apps/api/src/api/v1/agents.py`。
|
||||
- zh-TW / en i18n key 差異 `0`;web typecheck 通過;Next production build 通過。
|
||||
- source-control-owner-response guard、security-mirror-progress guard、`git diff --check` 通過。
|
||||
- 本地 API readback 回 `observability_contract_matrix_v1`、current `P1-003`、next `P1-004`、surfaces `6`、noise opportunities `5`、approval-required opportunities `2`;backlog 回 overall `83%`、done `19/23`。
|
||||
|
||||
**邊界**:
|
||||
- Prometheus alert rule 修改、Prometheus reload、Alertmanager route / receiver 修改、Alertmanager 指向 OpenClaw、silence 建立、Grafana 寫入、SigNoz / Sentry webhook 修改、OTEL / Event Exporter deploy 或 restart、Secret payload read、Telegram 測試通知、external API / live query、workflow / deploy / reload / runtime execution 全部仍未批准。
|
||||
- 成功 smoke 不即時通知洗版;失敗、action-required 或人工作業才可進通知批准流程。
|
||||
|
||||
**下一步**:
|
||||
1. Commit 並推 `gitea main`。
|
||||
2. 等 deploy marker 後執行 production API / Browser smoke。
|
||||
3. P1-004:盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。
|
||||
|
||||
## 2026-06-05|AI Agent 自動化盤點決策摘要正式上線
|
||||
|
||||
**背景**:接續 P1-002 Gitea workflow / runner health contract 與文字換行正式驗證,治理頁 `/zh-TW/governance?tab=automation-inventory` 已能呈現完整資料,但首屏資訊密度偏高,使用者難以快速判讀「目前狀態、拖累因素、下一步」。本段只優化既有資訊呈現,不移除原明細、不新增 API、不改 workflow / runner / secret / runtime 行為。
|
||||
|
||||
@@ -10,10 +10,10 @@
|
||||
|---|---:|---|---|
|
||||
| Agent 市場治理 | 72% | 進行中 | `agent_market_governance_snapshot_v1`、API、UI 分頁、每週觀察流程 |
|
||||
| Nemotron 實際整合應用 | 30% | 完整回放前仍被關卡擋下 | `blocked_needs_evidence`,下一關是 `refresh_source_evidence_then_5_record_smoke_only` |
|
||||
| 工具 / 服務 / 套件 AI 自動化 | 78% | P0 已完成,P1 套件 / 供應鏈主線已完成;備份 / DR 主線已完成到異地 / escrow 準備度顯示;任務批准邊界、進度彙總、P1-001 執行面只讀矩陣與 P1-002 Gitea 工作流程 / runner 健康合約已完成,下一主線是 P1-003 監控合約與降噪機會 | 狀態分類、盤點 schema、權限矩陣、靜態盤點種子、只讀 API、UI 骨架、驗證、自動化待辦 schema / 快照 / API / 分組 UI、Backup / DR 目標盤點、準備度矩陣、備份通知政策、Backup / DR 證據 UI、復原演練批准包模板、異地 / escrow 準備度狀態、任務批准邊界、確定性進度彙總、Python 套件 / 供應鏈只讀基線、JS pnpm/npm 只讀基線、Docker build surface 只讀基線、CVE / license / drift 嚴重度政策、定期依賴漂移與外部資料來源檢查設計、依賴升級批准包模板、runtime_surface_inventory_v1 schema / snapshot / API / UI、gitea_workflow_runner_health_v1 schema / snapshot / API / UI 已完成 |
|
||||
| 工具 / 服務 / 套件 AI 自動化 | 83% | P0 已完成,P1 套件 / 供應鏈主線已完成;備份 / DR 主線已完成到異地 / escrow 準備度顯示;任務批准邊界、進度彙總、P1-001 執行面只讀矩陣、P1-002 Gitea 工作流程 / runner 健康合約與 P1-003 監控合約 / 降噪矩陣已完成,下一主線是 P1-004 AI Router / provider route 盤點 | 狀態分類、盤點 schema、權限矩陣、靜態盤點種子、只讀 API、UI 骨架、驗證、自動化待辦 schema / 快照 / API / 分組 UI、Backup / DR 目標盤點、準備度矩陣、備份通知政策、Backup / DR 證據 UI、復原演練批准包模板、異地 / escrow 準備度狀態、任務批准邊界、確定性進度彙總、Python 套件 / 供應鏈只讀基線、JS pnpm/npm 只讀基線、Docker build surface 只讀基線、CVE / license / drift 嚴重度政策、定期依賴漂移與外部資料來源檢查設計、依賴升級批准包模板、runtime_surface_inventory_v1 schema / snapshot / API / UI、gitea_workflow_runner_health_v1 schema / snapshot / API / UI、observability_contract_matrix_v1 schema / snapshot / API / UI 已完成 |
|
||||
| 本工作清單與分析報告 | 100% | 已完成 | 本 MD 文件 |
|
||||
|
||||
AI Agent 自動化工作包目前完成度:**78%**。本工作清單文件本身完成度:**100%**。
|
||||
AI Agent 自動化工作包目前完成度:**83%**。本工作清單文件本身完成度:**100%**。
|
||||
|
||||
完成度計算模型:
|
||||
|
||||
@@ -868,7 +868,7 @@ UI:
|
||||
|---|---|---:|---|---|---|---|
|
||||
| P1-001 | 完成 | 100 | OpenClaw | 盤點 API / Web / Worker / K8s runtime surface | `runtime_surface_inventory_v1` / `GET /api/v1/agents/runtime-surface-inventory` / 執行面只讀矩陣 | 只讀;不得查 Secret payload、不得 rollout / restart / scale / delete |
|
||||
| P1-002 | 完成 | 100 | Hermes | 盤點 Gitea 工作流程與 runner 健康合約 | `gitea_workflow_runner_health_v1` / `GET /api/v1/agents/gitea-workflow-runner-health` / Gitea 健康合約 UI | 只讀;不修改 workflow、不重啟 runner、不停止 container、不讀 Secret、不發通知 |
|
||||
| P1-003 | 待辦 | 0 | Hermes | 盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約 | 可觀測性矩陣 | 只讀 |
|
||||
| P1-003 | 完成 | 100 | Hermes | 盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約 | `observability_contract_matrix_v1` / `GET /api/v1/agents/observability-contract-matrix` / 監控合約與降噪 UI | 只讀;不修改 alert rules、不改 receiver/route、不建立 silence、不寫 Grafana、不發通知 |
|
||||
| P1-004 | 待辦 | 0 | OpenClaw | 盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑 | 推理路由矩陣 | 不切 provider |
|
||||
| P1-005 | 待辦 | 0 | OpenClaw | 偵測服務健康缺口與過期端點 | 需處置清單 | 不重啟 |
|
||||
| P1-006 | 待辦 | 0 | Hermes | 在 UI 顯示 service health 證據卡 | 狀態卡 | 瀏覽器驗證 |
|
||||
@@ -1084,10 +1084,25 @@ UI:
|
||||
下一步:P1-003 盤點監控合約與降噪機會。
|
||||
```
|
||||
|
||||
本次同步:
|
||||
|
||||
```text
|
||||
進度:83%。
|
||||
目前優先級:P1。
|
||||
目前任務:P1-003 盤點監控合約與降噪機會。
|
||||
狀態變更:待辦 -> 完成。
|
||||
證據:observability_contract_matrix_v1 schema / snapshot;GET /api/v1/agents/observability-contract-matrix;治理頁監控合約與降噪機會區塊;automation backlog 83%;inventory tasks 29。
|
||||
目前數字:observability surfaces 6;需處置 2;降噪候選 5;需人工批准的降噪候選 2;classification gaps 3;backlog done 19/23;overall 83%;P1 90%;WS3 監控自動化 75%。
|
||||
驗證:JSON parse 通過;observability contract matrix service / API、automation inventory / backlog snapshot API、Gitea workflow runner health service / API 目標測試 `19 passed`;Python py_compile 通過;zh-TW / en i18n key 差異 `0`;web typecheck 通過;Next production build 通過;source-control-owner-response guard、security-mirror-progress guard、git diff --check 通過;本地 API readback 回 `observability_contract_matrix_v1`、current `P1-003`、next `P1-004`、surfaces `6`、noise opportunities `5`、approval-required opportunities `2`。
|
||||
正式驗證:尚未推版;待本地驗證完成後推 `gitea main` 並補 production API / browser smoke。
|
||||
阻擋:Prometheus alert rule 修改、Prometheus reload、Alertmanager route / receiver 修改、Alertmanager 指向 OpenClaw、silence 建立、Grafana 寫入、SigNoz / Sentry webhook 修改、OTEL/Event Exporter deploy 或 restart、Secret payload read、Telegram 測試通知、external API/live query、workflow/deploy/reload/runtime execution 仍全部禁止。
|
||||
下一步:P1-004 盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。
|
||||
```
|
||||
|
||||
## 13. 立即執行順序
|
||||
|
||||
1. P1-003:盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約與降噪機會。
|
||||
2. P1-004:盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。
|
||||
1. P1-004:盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。
|
||||
2. P1-005:偵測服務健康缺口與過期端點。
|
||||
3. P2 / P3 必須等 P1 服務、監控與 provider runtime surface 可見且關卡穩定後再做。
|
||||
|
||||
## 14. 目前風險
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"schema_version": "ai_agent_automation_backlog_v1",
|
||||
"generated_at": "2026-06-05T10:56:16+08:00",
|
||||
"generated_at": "2026-06-05T12:34:00+08:00",
|
||||
"source_inventory_snapshot_ref": "docs/evaluations/ai_agent_automation_inventory_snapshot_2026-06-04_static_seed.json",
|
||||
"program_status": {
|
||||
"overall_completion_percent": 78,
|
||||
"overall_completion_percent": 83,
|
||||
"current_priority": "P1",
|
||||
"current_task_id": "P1-002",
|
||||
"next_task_id": "P1-003",
|
||||
"current_task_id": "P1-003",
|
||||
"next_task_id": "P1-004",
|
||||
"read_only_mode": true
|
||||
},
|
||||
"rollups": {
|
||||
@@ -17,8 +17,8 @@
|
||||
"P3": 1
|
||||
},
|
||||
"by_status": {
|
||||
"done": 18,
|
||||
"planned": 5
|
||||
"done": 19,
|
||||
"planned": 4
|
||||
},
|
||||
"by_gate_status": {
|
||||
"read_only_allowed": 20,
|
||||
@@ -318,26 +318,31 @@
|
||||
{
|
||||
"item_id": "AUTO-P1-003",
|
||||
"priority": "P1",
|
||||
"status": "planned",
|
||||
"status": "done",
|
||||
"workstream_id": "WS3",
|
||||
"source_asset_id": "prometheus_alertmanager",
|
||||
"source_signal_kind": "health_gap",
|
||||
"title": "盤點監控合約與降噪機會",
|
||||
"owner_agent": "hermes",
|
||||
"recommended_action": "建立 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse 的只讀 observability matrix。",
|
||||
"recommended_action": "已建立 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry 的只讀 observability matrix;降噪與分類缺口只產生 proposal,不修改 alert rules。",
|
||||
"action_class": "observe",
|
||||
"gate_status": "read_only_allowed",
|
||||
"risk_level": "high",
|
||||
"evidence_refs": [
|
||||
"k8s/monitoring/prometheus.yml",
|
||||
"ops/monitoring/"
|
||||
"docs/evaluations/observability_contract_matrix_2026-06-05.json",
|
||||
"GET /api/v1/agents/observability-contract-matrix",
|
||||
"k8s/monitoring/",
|
||||
"ops/alertmanager/alertmanager.yml",
|
||||
"ops/monitoring/",
|
||||
"apps/api/src/constants/alert_types.py"
|
||||
],
|
||||
"acceptance_criteria": [
|
||||
"不修改 alert rules",
|
||||
"降噪只產生 proposal",
|
||||
"標出 stale、缺 evidence、過度通知與 classification gap"
|
||||
"不修改 alert rules、不呼叫 silence API、不送測試通知",
|
||||
"列出 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry 的只讀合約",
|
||||
"降噪只產生 proposal,標出 stale、缺 evidence、過度通知與 classification gap",
|
||||
"API / UI 僅顯示 committed snapshot 與不可誤讀合約"
|
||||
],
|
||||
"next_review": "P1-003",
|
||||
"next_review": "P1-004",
|
||||
"approval_boundary": {
|
||||
"mode": "read_only_allowed",
|
||||
"display_summary": "只允許只讀盤點、顯示與批准包準備;不得直接執行寫入、部署、通知或外部呼叫。",
|
||||
@@ -1170,16 +1175,16 @@
|
||||
]
|
||||
},
|
||||
"progress_summary": {
|
||||
"overall_percent": 78,
|
||||
"done_items": 18,
|
||||
"planned_items": 5,
|
||||
"overall_percent": 83,
|
||||
"done_items": 19,
|
||||
"planned_items": 4,
|
||||
"total_items": 23,
|
||||
"formula": "round(done_items / total_items * 100),只有 status=done 計入完成;planned/in_progress/blocked/deferred/rejected 不計入。",
|
||||
"by_priority": [
|
||||
{
|
||||
"priority": "P1",
|
||||
"completion_percent": 86,
|
||||
"done_items": 18,
|
||||
"completion_percent": 90,
|
||||
"done_items": 19,
|
||||
"total_items": 21
|
||||
},
|
||||
{
|
||||
@@ -1207,10 +1212,10 @@
|
||||
{
|
||||
"workstream_id": "WS3",
|
||||
"display_name": "監控自動化",
|
||||
"completion_percent": 50,
|
||||
"done_items": 2,
|
||||
"completion_percent": 75,
|
||||
"done_items": 3,
|
||||
"total_items": 4,
|
||||
"next_task_id": "P1-003"
|
||||
"next_task_id": "P1-004"
|
||||
},
|
||||
{
|
||||
"workstream_id": "WS4",
|
||||
@@ -1250,7 +1255,7 @@
|
||||
"completion_percent": 100,
|
||||
"done_items": 2,
|
||||
"total_items": 2,
|
||||
"next_task_id": "P1-003"
|
||||
"next_task_id": "P1-004"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
{
|
||||
"schema_version": "ai_agent_automation_inventory_snapshot_v1",
|
||||
"generated_at": "2026-06-05T10:56:16+08:00",
|
||||
"generated_at": "2026-06-05T12:34:00+08:00",
|
||||
"program_status": {
|
||||
"overall_completion_percent": 100,
|
||||
"current_priority": "P1",
|
||||
"current_task_id": "P1-002",
|
||||
"next_task_id": "P1-003",
|
||||
"current_task_id": "P1-003",
|
||||
"next_task_id": "P1-004",
|
||||
"read_only_mode": true
|
||||
},
|
||||
"status_taxonomy": {
|
||||
@@ -287,44 +287,50 @@
|
||||
"domain_id": "observability",
|
||||
"display_name": "Prometheus / Alertmanager",
|
||||
"asset_type": "observability_tool",
|
||||
"status": "planned",
|
||||
"status": "done",
|
||||
"gate_status": "read_only_allowed",
|
||||
"owner_agent": "hermes",
|
||||
"risk_level": "high",
|
||||
"evidence_refs": [
|
||||
"k8s/monitoring/prometheus.yml",
|
||||
"docs/evaluations/observability_contract_matrix_2026-06-05.json",
|
||||
"GET /api/v1/agents/observability-contract-matrix",
|
||||
"k8s/monitoring/",
|
||||
"ops/alertmanager/alertmanager.yml",
|
||||
"ops/monitoring/"
|
||||
],
|
||||
"next_action": "P1-003 盤點告警合約與降噪機會。"
|
||||
"next_action": "P1-003 已完成只讀監控合約與降噪機會矩陣;P1-004 盤點 AI Router / provider route。"
|
||||
},
|
||||
{
|
||||
"asset_id": "signoz_clickhouse",
|
||||
"domain_id": "observability",
|
||||
"display_name": "SigNoz / ClickHouse",
|
||||
"asset_type": "observability_tool",
|
||||
"status": "planned",
|
||||
"status": "done",
|
||||
"gate_status": "read_only_allowed",
|
||||
"owner_agent": "hermes",
|
||||
"risk_level": "medium",
|
||||
"evidence_refs": [
|
||||
"docs/LOGBOOK.md"
|
||||
"docs/evaluations/observability_contract_matrix_2026-06-05.json",
|
||||
"apps/api/src/services/signoz_client.py",
|
||||
"ops/signoz"
|
||||
],
|
||||
"next_action": "P1-003 補 trace / metrics / log 可見性盤點。"
|
||||
"next_action": "P1-003 已補 trace / metrics / log 可見性只讀合約;live readback 仍需後續人工批准範圍。"
|
||||
},
|
||||
{
|
||||
"asset_id": "sentry",
|
||||
"domain_id": "tools",
|
||||
"display_name": "Sentry",
|
||||
"asset_type": "external_service",
|
||||
"status": "planned",
|
||||
"status": "done",
|
||||
"gate_status": "read_only_allowed",
|
||||
"owner_agent": "hermes",
|
||||
"risk_level": "medium",
|
||||
"evidence_refs": [
|
||||
"scripts/backup/backup-sentry.sh",
|
||||
"apps/web/src/instrumentation.ts"
|
||||
"docs/evaluations/observability_contract_matrix_2026-06-05.json",
|
||||
"apps/web/src/instrumentation.ts",
|
||||
"scripts/backup/backup-sentry.sh"
|
||||
],
|
||||
"next_action": "P1-003 盤點錯誤監控與備份狀態。"
|
||||
"next_action": "P1-003 已補 Sentry error monitoring 合約;不讀 DSN secret、不送事件。"
|
||||
},
|
||||
{
|
||||
"asset_id": "telegram_chain",
|
||||
@@ -472,9 +478,9 @@
|
||||
{
|
||||
"workstream_id": "WS3",
|
||||
"display_name": "監控自動化",
|
||||
"completion_percent": 50,
|
||||
"completion_percent": 75,
|
||||
"status": "in_progress",
|
||||
"next_task_id": "P1-003"
|
||||
"next_task_id": "P1-004"
|
||||
},
|
||||
{
|
||||
"workstream_id": "WS4",
|
||||
@@ -509,7 +515,7 @@
|
||||
"display_name": "產品 UI",
|
||||
"completion_percent": 94,
|
||||
"status": "in_progress",
|
||||
"next_task_id": "P1-003"
|
||||
"next_task_id": "P1-004"
|
||||
}
|
||||
],
|
||||
"tasks": [
|
||||
@@ -834,6 +840,38 @@
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"task_id": "P1-003",
|
||||
"priority": "P1",
|
||||
"status": "done",
|
||||
"completion_percent": 100,
|
||||
"owner_agent": "hermes",
|
||||
"title": "盤點監控合約與降噪機會",
|
||||
"output": "docs/evaluations/observability_contract_matrix_2026-06-05.json + GET /api/v1/agents/observability-contract-matrix",
|
||||
"gate_status": "read_only_allowed",
|
||||
"next_action": "完成 committed observability contract matrix;下一步 P1-004 盤點 AI Router / provider route。",
|
||||
"approval_boundary": {
|
||||
"mode": "read_only_allowed",
|
||||
"display_summary": "只允許只讀盤點、顯示與批准包準備;不得直接執行寫入、部署、通知或外部呼叫。",
|
||||
"allowed_actions": [
|
||||
"讀取 committed snapshot",
|
||||
"整理只讀證據",
|
||||
"顯示治理 UI"
|
||||
],
|
||||
"blocked_actions": [
|
||||
"production_write",
|
||||
"runtime_execution",
|
||||
"destructive_operation",
|
||||
"secret_plaintext_collection",
|
||||
"unapproved_deploy",
|
||||
"unapproved_external_call"
|
||||
],
|
||||
"requires_operator_approval_for": [
|
||||
"任何非只讀操作",
|
||||
"任何部署、排程、通知或外部呼叫變更"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"task_id": "P1-301",
|
||||
"priority": "P1",
|
||||
@@ -1729,6 +1767,13 @@
|
||||
"kind": "api",
|
||||
"ref": "GET /api/v1/agents/gitea-workflow-runner-health",
|
||||
"result": "只讀 API 回傳 gitea_workflow_runner_health_v1;不修改 workflow、不重啟 runner、不停止 container、不讀 Secret、不送通知。"
|
||||
},
|
||||
{
|
||||
"evidence_id": "observability_contract_matrix_api",
|
||||
"title": "監控合約與降噪機會只讀 API",
|
||||
"source_ref": "GET /api/v1/agents/observability-contract-matrix",
|
||||
"status": "done",
|
||||
"summary": "只讀呈現 Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry 合約、降噪 proposal 與分類缺口;不修改 alert rules、不送通知、不讀 Secret。"
|
||||
}
|
||||
],
|
||||
"approval_boundaries": {
|
||||
@@ -1739,10 +1784,10 @@
|
||||
"destructive_operation_allowed": false
|
||||
},
|
||||
"task_approval_boundary_rollup": {
|
||||
"total_tasks": 28,
|
||||
"total_tasks": 29,
|
||||
"by_mode": {
|
||||
"ready_for_operator_review": 1,
|
||||
"read_only_allowed": 26,
|
||||
"read_only_allowed": 27,
|
||||
"approval_required": 1
|
||||
},
|
||||
"tasks_requiring_explicit_approval": [
|
||||
@@ -1760,6 +1805,13 @@
|
||||
"P0-008",
|
||||
"P1-001",
|
||||
"P1-002",
|
||||
"P1-003",
|
||||
"P1-301",
|
||||
"P1-302",
|
||||
"P1-303",
|
||||
"P1-304",
|
||||
"P1-305",
|
||||
"P1-306",
|
||||
"P1-101",
|
||||
"P1-102",
|
||||
"P1-103",
|
||||
@@ -1771,13 +1823,7 @@
|
||||
"P1-203",
|
||||
"P1-204",
|
||||
"P1-205",
|
||||
"P1-206",
|
||||
"P1-301",
|
||||
"P1-302",
|
||||
"P1-303",
|
||||
"P1-304",
|
||||
"P1-305",
|
||||
"P1-306"
|
||||
"P1-206"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
391
docs/evaluations/observability_contract_matrix_2026-06-05.json
Normal file
391
docs/evaluations/observability_contract_matrix_2026-06-05.json
Normal file
@@ -0,0 +1,391 @@
|
||||
{
|
||||
"schema_version": "observability_contract_matrix_v1",
|
||||
"generated_at": "2026-06-05T12:24:00+08:00",
|
||||
"program_status": {
|
||||
"overall_completion_percent": 100,
|
||||
"current_priority": "P1",
|
||||
"current_task_id": "P1-003",
|
||||
"next_task_id": "P1-004",
|
||||
"read_only_mode": true
|
||||
},
|
||||
"source_refs": [
|
||||
"docs/schemas/observability_contract_matrix_v1.schema.json",
|
||||
"docs/HARD_RULES.md#alertmanager-routing",
|
||||
"ops/alertmanager/alertmanager.yml",
|
||||
"ops/monitoring/alerts.yml",
|
||||
"ops/monitoring/alerts-unified.yml",
|
||||
"k8s/monitoring/prometheus.yml",
|
||||
"k8s/monitoring/alert-chain-monitor.yaml",
|
||||
"ops/grafana/dashboards/ai-monitoring.json",
|
||||
"ops/grafana/dashboards/infra-monitoring.json",
|
||||
"ops/signoz/alerting/rules.yaml",
|
||||
"ops/signoz/alerting/log-rules.md",
|
||||
"ops/signoz/otel-collector-config-phase-o.yaml",
|
||||
"k8s/observability/otel-collector-daemonset.yaml",
|
||||
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md",
|
||||
"docs/adr/ADR-053-observability-signoz-unified-architecture.md",
|
||||
"docs/adr/ADR-090-monitoring-blindspot-governance.md",
|
||||
"docs/LOGBOOK.md"
|
||||
],
|
||||
"rollups": {
|
||||
"total_surfaces": 6,
|
||||
"by_kind": {
|
||||
"prometheus_rules": 1,
|
||||
"alertmanager_route": 1,
|
||||
"signoz_clickhouse": 1,
|
||||
"grafana_dashboard": 1,
|
||||
"sentry_source_link": 1,
|
||||
"otel_event_exporter": 1
|
||||
},
|
||||
"by_status": {
|
||||
"action_required": 2,
|
||||
"verified": 4
|
||||
},
|
||||
"by_evidence_status": {
|
||||
"committed_manifest": 4,
|
||||
"production_readback_recorded": 2
|
||||
},
|
||||
"by_noise_policy_status": {
|
||||
"proposal_only": 2,
|
||||
"preserved": 3,
|
||||
"needs_proposal": 1
|
||||
},
|
||||
"surface_ids_requiring_action": [
|
||||
"grafana_dashboard_inventory",
|
||||
"prometheus_alert_rule_catalog"
|
||||
],
|
||||
"surface_ids_with_proposal_only_noise_policy": [
|
||||
"alertmanager_awoooi_route",
|
||||
"prometheus_alert_rule_catalog"
|
||||
],
|
||||
"noise_reduction_opportunities_total": 5,
|
||||
"approval_required_opportunity_ids": [
|
||||
"alertmanager_grouping_inhibit_tuning",
|
||||
"prometheus_noise_rule_tuning"
|
||||
],
|
||||
"classification_gap_ids": [
|
||||
"grafana_dashboard_owner_status",
|
||||
"prometheus_alert_rule_catalog_seed",
|
||||
"signoz_provider_native_real_alert_gap"
|
||||
],
|
||||
"read_only_denials_total": 12,
|
||||
"surfaces_requiring_action": [
|
||||
"grafana_dashboard_inventory",
|
||||
"prometheus_alert_rule_catalog"
|
||||
],
|
||||
"proposal_only_count": 5
|
||||
},
|
||||
"observability_surfaces": [
|
||||
{
|
||||
"surface_id": "prometheus_alert_rule_catalog",
|
||||
"display_name": "Prometheus 告警規則合約",
|
||||
"kind": "prometheus_rules",
|
||||
"status": "action_required",
|
||||
"risk_level": "critical",
|
||||
"evidence_status": "committed_manifest",
|
||||
"noise_policy_status": "proposal_only",
|
||||
"coverage_contract": "已提交 ops/monitoring/alerts-unified.yml 與 k8s/monitoring/* 規則;本快照只盤點規則、label、runbook 與分類缺口,不 reload Prometheus、不修改 alert rules。",
|
||||
"current_contract": "committed ops/monitoring/alerts-unified.yml 目前含 118 條 alert;LOGBOOK 曾記錄 production Prometheus rule count 142,需以正式 smoke 讀回確認。",
|
||||
"evidence_refs": [
|
||||
"ops/monitoring/alerts-unified.yml",
|
||||
"ops/monitoring/alerts.yml",
|
||||
"k8s/monitoring/alert-chain-monitor.yaml",
|
||||
"docs/LOGBOOK.md"
|
||||
],
|
||||
"next_action": "建立 alert_rule_catalog seed 與噪音率觀察 proposal;任何 rule 調整放到 P2-003 人工批准。"
|
||||
},
|
||||
{
|
||||
"surface_id": "alertmanager_awoooi_route",
|
||||
"display_name": "Alertmanager → AWOOOI API 路由",
|
||||
"kind": "alertmanager_route",
|
||||
"status": "verified",
|
||||
"risk_level": "critical",
|
||||
"evidence_status": "committed_manifest",
|
||||
"noise_policy_status": "proposal_only",
|
||||
"coverage_contract": "Alertmanager receiver 必須指向 AWOOOI API;OpenClaw 只做 AI 分析,不得成為 Alertmanager receiver。",
|
||||
"current_contract": "ops/alertmanager/alertmanager.yml 以 awoooi-webhook 為主路徑,telegram-direct 僅限 alert-chain/API health 緊急旁路;group_by(team / alertname / severity)已存在。",
|
||||
"evidence_refs": [
|
||||
"docs/HARD_RULES.md#alertmanager-routing",
|
||||
"ops/alertmanager/alertmanager.yml"
|
||||
],
|
||||
"next_action": "只提出 group_by、inhibit、repeat interval 降噪 proposal;不得直接改 receiver、route 或 silence。"
|
||||
},
|
||||
{
|
||||
"surface_id": "signoz_clickhouse_ingestion",
|
||||
"display_name": "SigNoz / ClickHouse / Provider Webhook",
|
||||
"kind": "signoz_clickhouse",
|
||||
"status": "verified",
|
||||
"risk_level": "high",
|
||||
"evidence_status": "production_readback_recorded",
|
||||
"noise_policy_status": "preserved",
|
||||
"coverage_contract": "SigNoz webhook、ClickHouse TTL、OTEL prometheus receiver 與 source provider heartbeat 需分開標示;heartbeat 不是 provider-native 真實告警。",
|
||||
"current_contract": "ops/signoz/alerting/rules.yaml、log-rules.md 與 RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK 已描述 webhook / rules;LOGBOOK 記錄 SigNoz webhook 與 source provider heartbeat 多次通過。",
|
||||
"evidence_refs": [
|
||||
"ops/signoz/alerting/rules.yaml",
|
||||
"ops/signoz/alerting/log-rules.md",
|
||||
"ops/signoz/otel-collector-config-phase-o.yaml",
|
||||
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md",
|
||||
"docs/adr/ADR-053-observability-signoz-unified-architecture.md",
|
||||
"docs/LOGBOOK.md"
|
||||
],
|
||||
"next_action": "保留 provider heartbeat / upstream canary 低噪音;補 provider-native 真實告警與 incident correlation gap 的只讀看板。"
|
||||
},
|
||||
{
|
||||
"surface_id": "grafana_dashboard_inventory",
|
||||
"display_name": "Grafana Dashboard / Alert Chain 視覺化",
|
||||
"kind": "grafana_dashboard",
|
||||
"status": "action_required",
|
||||
"risk_level": "medium",
|
||||
"evidence_status": "committed_manifest",
|
||||
"noise_policy_status": "needs_proposal",
|
||||
"coverage_contract": "目前只確認 committed dashboard JSON;本快照不呼叫 Grafana API、不匯入 dashboard、不改 datasource。",
|
||||
"current_contract": "ai-monitoring dashboard 包含 Alert Chain 健康與最後成功時間;infra-monitoring dashboard 包含 Prometheus target up/down 與 API request rate。",
|
||||
"evidence_refs": [
|
||||
"ops/grafana/dashboards/ai-monitoring.json",
|
||||
"ops/grafana/dashboards/infra-monitoring.json"
|
||||
],
|
||||
"next_action": "補 dashboard owner、datasource parity、正式站可讀性與 alert-chain panel fresh readback;寫入或 import 需另案批准。"
|
||||
},
|
||||
{
|
||||
"surface_id": "sentry_source_link_canary",
|
||||
"display_name": "Sentry Webhook / Source Link Canary",
|
||||
"kind": "sentry_source_link",
|
||||
"status": "verified",
|
||||
"risk_level": "high",
|
||||
"evidence_status": "production_readback_recorded",
|
||||
"noise_policy_status": "preserved",
|
||||
"coverage_contract": "Sentry webhook 與 source-link canary 用來驗證來源鏈路,不能被誤讀成真實 provider alert 全部已關聯。",
|
||||
"current_contract": "LOGBOOK 記錄 Alertmanager / SigNoz / Sentry webhook 與 Source Link Canary 通過,且 source provider freshness / incident matching 必須分開判斷。",
|
||||
"evidence_refs": [
|
||||
"docs/adr/ADR-022-sentry-integration-architecture.md",
|
||||
"docs/LOGBOOK.md"
|
||||
],
|
||||
"next_action": "持續把 heartbeat、upstream canary、direct/candidate/applied source link 分開呈現;不修改 Sentry project webhook。"
|
||||
},
|
||||
{
|
||||
"surface_id": "otel_event_exporter_bridge",
|
||||
"display_name": "OTEL Collector / Event Exporter",
|
||||
"kind": "otel_event_exporter",
|
||||
"status": "verified",
|
||||
"risk_level": "medium",
|
||||
"evidence_status": "committed_manifest",
|
||||
"noise_policy_status": "preserved",
|
||||
"coverage_contract": "OTEL Collector DaemonSet 與 SigNoz prometheus receiver 只作為可觀測來源;本快照不部署 collector、不重啟 exporter。",
|
||||
"current_contract": "k8s/observability/otel-collector-daemonset.yaml 與 ops/signoz/otel-collector-config-phase-o.yaml 描述 log/metric/trace pipeline;LOGBOOK 記錄 OTEL Collector / Event Exporter post-deploy smoke 通過。",
|
||||
"evidence_refs": [
|
||||
"k8s/observability/otel-collector-daemonset.yaml",
|
||||
"ops/signoz/otel-collector-config-phase-o.yaml",
|
||||
"docs/LOGBOOK.md"
|
||||
],
|
||||
"next_action": "把 collector/exporter health 放入 observability readiness;任何 deploy / restart 仍需獨立批准。"
|
||||
}
|
||||
],
|
||||
"noise_reduction_opportunities": [
|
||||
{
|
||||
"opportunity_id": "prometheus_noise_rule_tuning",
|
||||
"display_name": "Prometheus 告警噪音調整提案",
|
||||
"status": "approval_required",
|
||||
"proposal_only": true,
|
||||
"impact": "降低 stale provider、低樣本 SLO、重複 resource alert 對 operator 的干擾;不得直接修改 alert rules。",
|
||||
"target_surface_ids": [
|
||||
"prometheus_alert_rule_catalog"
|
||||
],
|
||||
"evidence_refs": [
|
||||
"ops/monitoring/alerts-unified.yml",
|
||||
"docs/adr/ADR-090-monitoring-blindspot-governance.md"
|
||||
],
|
||||
"next_action": "進 P2-003 建立人工批准包,先收集 24h alert frequency / fingerprint evidence。"
|
||||
},
|
||||
{
|
||||
"opportunity_id": "alertmanager_grouping_inhibit_tuning",
|
||||
"display_name": "Alertmanager grouping / inhibit 降噪提案",
|
||||
"status": "approval_required",
|
||||
"proposal_only": true,
|
||||
"impact": "針對同 team / alertname / severity 的爆量與 Host/K8s 重複告警做提案,不變更 receiver。",
|
||||
"target_surface_ids": [
|
||||
"alertmanager_awoooi_route"
|
||||
],
|
||||
"evidence_refs": [
|
||||
"ops/alertmanager/alertmanager.yml",
|
||||
"docs/HARD_RULES.md#alertmanager-routing"
|
||||
],
|
||||
"next_action": "產生 diff proposal 與 rollback plan;未批准前不得 reload Alertmanager。"
|
||||
},
|
||||
{
|
||||
"opportunity_id": "provider_heartbeat_real_alert_split",
|
||||
"display_name": "Provider heartbeat 與真實告警分流",
|
||||
"status": "ready_for_proposal",
|
||||
"proposal_only": true,
|
||||
"impact": "避免把 Sentry / SigNoz heartbeat 誤當真實 provider alert,降低假綠與錯誤升級。",
|
||||
"target_surface_ids": [
|
||||
"signoz_clickhouse_ingestion",
|
||||
"sentry_source_link_canary"
|
||||
],
|
||||
"evidence_refs": [
|
||||
"docs/LOGBOOK.md",
|
||||
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md"
|
||||
],
|
||||
"next_action": "在 UI / API 上維持 heartbeat、upstream canary、direct source link、candidate source link 四種標籤。"
|
||||
},
|
||||
{
|
||||
"opportunity_id": "grafana_dashboard_owner_freshness",
|
||||
"display_name": "Grafana dashboard owner / freshness 標籤",
|
||||
"status": "ready_for_proposal",
|
||||
"proposal_only": true,
|
||||
"impact": "讓 dashboard 缺 datasource、缺 owner 或 stale panel 不被誤讀成監控缺失已修復。",
|
||||
"target_surface_ids": [
|
||||
"grafana_dashboard_inventory"
|
||||
],
|
||||
"evidence_refs": [
|
||||
"ops/grafana/dashboards/ai-monitoring.json",
|
||||
"ops/grafana/dashboards/infra-monitoring.json"
|
||||
],
|
||||
"next_action": "只讀補 owner/freshness matrix;不寫 Grafana。"
|
||||
},
|
||||
{
|
||||
"opportunity_id": "success_notification_quiet_policy",
|
||||
"display_name": "成功不洗版 / 失敗才升級",
|
||||
"status": "preserved",
|
||||
"proposal_only": true,
|
||||
"impact": "沿用備份與 Gitea 的 quiet-success 原則,讓 observability smoke 成功證據走 API/LOGBOOK,失敗才通知。",
|
||||
"target_surface_ids": [
|
||||
"otel_event_exporter_bridge",
|
||||
"signoz_clickhouse_ingestion"
|
||||
],
|
||||
"evidence_refs": [
|
||||
"docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md",
|
||||
"docs/LOGBOOK.md"
|
||||
],
|
||||
"next_action": "P1-003 僅記錄;未批准前不送 Telegram 測試通知。"
|
||||
}
|
||||
],
|
||||
"classification_gaps": [
|
||||
{
|
||||
"gap_id": "prometheus_alert_rule_catalog_seed",
|
||||
"display_name": "Alert rule catalog seed 未正式產品化",
|
||||
"status": "action_required",
|
||||
"severity": "high",
|
||||
"summary": "ADR-090 要求 alert_rule_catalog 能追蹤規則資產、noise_rate 與 superseded_by_rule_id;目前 P1-003 只完成只讀矩陣。",
|
||||
"evidence_refs": [
|
||||
"docs/adr/ADR-090-monitoring-blindspot-governance.md",
|
||||
"ops/monitoring/alerts-unified.yml"
|
||||
],
|
||||
"next_action": "P2-003 前先產生 seed proposal 與 migration/rollback 分離批准包。"
|
||||
},
|
||||
{
|
||||
"gap_id": "signoz_provider_native_real_alert_gap",
|
||||
"display_name": "SigNoz provider-native 真實告警證據缺口",
|
||||
"status": "action_required",
|
||||
"severity": "medium",
|
||||
"summary": "Heartbeat / upstream canary 能證明管道新鮮,但不等於每種 provider-native alert 都已接到 incident correlation。",
|
||||
"evidence_refs": [
|
||||
"docs/LOGBOOK.md",
|
||||
"docs/runbooks/RUNBOOK-PHASE-E-SIGNOZ-WEBHOOK.md"
|
||||
],
|
||||
"next_action": "只讀列出 provider-native alert coverage;需要 side effect 的 signed canary 另案批准。"
|
||||
},
|
||||
{
|
||||
"gap_id": "grafana_dashboard_owner_status",
|
||||
"display_name": "Grafana dashboard owner / datasource 狀態未連到治理頁",
|
||||
"status": "action_required",
|
||||
"severity": "medium",
|
||||
"summary": "Committed dashboard JSON 存在,但尚未顯示 datasource freshness、owner、last import 或 panel stale 狀態。",
|
||||
"evidence_refs": [
|
||||
"ops/grafana/dashboards/ai-monitoring.json",
|
||||
"ops/grafana/dashboards/infra-monitoring.json"
|
||||
],
|
||||
"next_action": "下一輪只讀補 dashboard readiness,不呼叫 Grafana write API。"
|
||||
}
|
||||
],
|
||||
"latest_observations": [
|
||||
{
|
||||
"observation_id": "alertmanager_receiver_guard",
|
||||
"status": "verified",
|
||||
"summary": "HARD_RULES 與 ops/alertmanager/alertmanager.yml 都保留 Alertmanager 指向 AWOOOI API 的邊界;OpenClaw 不得成為 receiver。",
|
||||
"evidence_refs": [
|
||||
"docs/HARD_RULES.md#alertmanager-routing",
|
||||
"ops/alertmanager/alertmanager.yml"
|
||||
]
|
||||
},
|
||||
{
|
||||
"observation_id": "prometheus_rule_source_split",
|
||||
"status": "action_required",
|
||||
"summary": "committed Prometheus 規則分散於 ops/monitoring 與 k8s/monitoring;P1-003 建立 matrix,尚未調整規則或 reload。",
|
||||
"evidence_refs": [
|
||||
"ops/monitoring/alerts-unified.yml",
|
||||
"k8s/monitoring/alert-chain-monitor.yaml"
|
||||
]
|
||||
},
|
||||
{
|
||||
"observation_id": "post_deploy_observability_smoke_history",
|
||||
"status": "verified",
|
||||
"summary": "LOGBOOK 已多次記錄 Alertmanager / SigNoz / Sentry webhook、SigNoz、OTEL Collector、Event Exporter post-deploy smoke 通過。",
|
||||
"evidence_refs": [
|
||||
"docs/LOGBOOK.md"
|
||||
]
|
||||
}
|
||||
],
|
||||
"operator_contract": {
|
||||
"display_mode": "read_only_observability_contract_matrix",
|
||||
"must_not_interpret_as": [
|
||||
"Prometheus alert rule 修改批准",
|
||||
"Alertmanager receiver / route 修改批准",
|
||||
"Alertmanager 指向 OpenClaw receiver 批准",
|
||||
"Silence 建立或維護窗口批准",
|
||||
"Grafana dashboard 寫入批准",
|
||||
"SigNoz / Sentry webhook 設定修改批准",
|
||||
"Secret 已讀取或可輸出",
|
||||
"Telegram 測試通知批准",
|
||||
"deploy / reload / workflow 觸發批准",
|
||||
"runtime execution 授權"
|
||||
],
|
||||
"secret_display_policy": "只允許顯示 committed file refs、endpoint role 與 redacted metadata;不得顯示 token、webhook secret 或 authorization header。",
|
||||
"alertmanager_route_policy": "Alertmanager webhook 必須指向 AWOOOI API;OpenClaw 不接收 Alertmanager webhook,只能在 API 持久化與分類後參與只讀分析。",
|
||||
"noise_reduction_policy": "P1-003 僅產生 proposal;P2-003 或任何 route/rule/silence 變更需人工批准。",
|
||||
"notification_policy": "成功 smoke 不即時通知洗版;失敗、action-required 或人工作業才可進通知批准流程。"
|
||||
},
|
||||
"operation_boundaries": {
|
||||
"read_only_api_allowed": true,
|
||||
"prometheus_rule_write_allowed": false,
|
||||
"prometheus_reload_allowed": false,
|
||||
"alertmanager_route_write_allowed": false,
|
||||
"alertmanager_receiver_change_allowed": false,
|
||||
"alertmanager_to_openclaw_allowed": false,
|
||||
"silence_create_allowed": false,
|
||||
"grafana_dashboard_write_allowed": false,
|
||||
"grafana_api_write_allowed": false,
|
||||
"signoz_query_mutation_allowed": false,
|
||||
"signoz_webhook_change_allowed": false,
|
||||
"sentry_webhook_change_allowed": false,
|
||||
"otel_collector_deploy_allowed": false,
|
||||
"event_exporter_restart_allowed": false,
|
||||
"secret_read_allowed": false,
|
||||
"secret_plaintext_allowed": false,
|
||||
"notification_send_allowed": false,
|
||||
"external_api_call_allowed": false,
|
||||
"live_prometheus_query_allowed": false,
|
||||
"workflow_trigger_allowed": false,
|
||||
"deploy_trigger_allowed": false,
|
||||
"reload_trigger_allowed": false,
|
||||
"runtime_execution_allowed": false
|
||||
},
|
||||
"approval_boundaries": {
|
||||
"prometheus_rule_change_authorized": false,
|
||||
"prometheus_reload_authorized": false,
|
||||
"alertmanager_route_change_authorized": false,
|
||||
"alertmanager_receiver_change_authorized": false,
|
||||
"alertmanager_to_openclaw_authorized": false,
|
||||
"silence_authorized": false,
|
||||
"grafana_write_authorized": false,
|
||||
"signoz_write_authorized": false,
|
||||
"sentry_write_authorized": false,
|
||||
"otel_deploy_authorized": false,
|
||||
"event_exporter_restart_authorized": false,
|
||||
"notification_send_authorized": false,
|
||||
"external_call_authorized": false,
|
||||
"secret_plaintext_allowed": false,
|
||||
"workflow_trigger_authorized": false,
|
||||
"deploy_reload_authorized": false,
|
||||
"runtime_execution_authorized": false
|
||||
}
|
||||
}
|
||||
159
docs/schemas/observability_contract_matrix_v1.schema.json
Normal file
159
docs/schemas/observability_contract_matrix_v1.schema.json
Normal file
@@ -0,0 +1,159 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "urn:awoooi:observability-contract-matrix-v1",
|
||||
"title": "AWOOOI 監控合約與降噪機會矩陣 v1",
|
||||
"description": "以 repo 內 committed Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry evidence 建立只讀 observability matrix;不修改 alert rules、不發通知、不打 silence API、不部署 exporter、不觸發 workflow。",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"schema_version",
|
||||
"generated_at",
|
||||
"program_status",
|
||||
"source_refs",
|
||||
"rollups",
|
||||
"observability_surfaces",
|
||||
"noise_reduction_opportunities",
|
||||
"classification_gaps",
|
||||
"latest_observations",
|
||||
"operator_contract",
|
||||
"operation_boundaries",
|
||||
"approval_boundaries"
|
||||
],
|
||||
"properties": {
|
||||
"schema_version": {
|
||||
"type": "string",
|
||||
"const": "observability_contract_matrix_v1"
|
||||
},
|
||||
"generated_at": {
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"program_status": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"overall_completion_percent",
|
||||
"current_priority",
|
||||
"current_task_id",
|
||||
"next_task_id",
|
||||
"read_only_mode"
|
||||
],
|
||||
"properties": {
|
||||
"overall_completion_percent": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"maximum": 100
|
||||
},
|
||||
"current_priority": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"P0",
|
||||
"P1",
|
||||
"P2",
|
||||
"P3"
|
||||
]
|
||||
},
|
||||
"current_task_id": {
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"next_task_id": {
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"read_only_mode": {
|
||||
"type": "boolean",
|
||||
"const": true
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
},
|
||||
"source_refs": {
|
||||
"type": "array",
|
||||
"minItems": 1,
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
}
|
||||
},
|
||||
"rollups": {
|
||||
"type": "object",
|
||||
"additionalProperties": true
|
||||
},
|
||||
"observability_surfaces": {
|
||||
"type": "array",
|
||||
"minItems": 1,
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"surface_id",
|
||||
"display_name",
|
||||
"kind",
|
||||
"status",
|
||||
"risk_level",
|
||||
"evidence_status",
|
||||
"noise_policy_status",
|
||||
"coverage_contract",
|
||||
"evidence_refs",
|
||||
"next_action"
|
||||
],
|
||||
"additionalProperties": true
|
||||
}
|
||||
},
|
||||
"noise_reduction_opportunities": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"opportunity_id",
|
||||
"display_name",
|
||||
"status",
|
||||
"proposal_only",
|
||||
"impact",
|
||||
"evidence_refs",
|
||||
"next_action"
|
||||
],
|
||||
"additionalProperties": true
|
||||
}
|
||||
},
|
||||
"classification_gaps": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"gap_id",
|
||||
"display_name",
|
||||
"status",
|
||||
"severity",
|
||||
"summary",
|
||||
"evidence_refs",
|
||||
"next_action"
|
||||
],
|
||||
"additionalProperties": true
|
||||
}
|
||||
},
|
||||
"latest_observations": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"additionalProperties": true
|
||||
}
|
||||
},
|
||||
"operator_contract": {
|
||||
"type": "object",
|
||||
"additionalProperties": true
|
||||
},
|
||||
"operation_boundaries": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"approval_boundaries": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "boolean",
|
||||
"const": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
}
|
||||
@@ -3516,3 +3516,21 @@ Phase 6 完成後
|
||||
1. P1-003:盤點 Prometheus / Alertmanager / SigNoz / Grafana 監控合約與降噪機會。
|
||||
|
||||
**裁決:** P1-002 只完成 read-only committed workflow / runner health contract。不得把 `ubuntu-latest` owner attestation 缺口、runner watchdog 草案、stale-job dry-run guard 或 notification contract 解讀成 workflow 修改、runner restart / stop、container stop、runner label change、runner registration、Secret payload collection、Telegram 測試通知、schedule enable、Gitea write、deploy / migration trigger 或任何 runtime execution 授權。
|
||||
|
||||
### 2026-06-05 下午 (台北) — P1-003 監控合約與降噪矩陣本地完成
|
||||
|
||||
**觸發:** 統帥批准繼續,要求依 `docs/ai/AI_AGENT_AUTOMATION_WORKLIST_2026-06-04.md` 的優先順序推進,並同步完成度、工作狀態與正式環境推版。
|
||||
|
||||
**已推進:**
|
||||
- P1-003:新增 `observability_contract_matrix_v1` schema 與 `docs/evaluations/observability_contract_matrix_2026-06-05.json`,以 committed Prometheus / Alertmanager / Grafana / SigNoz / ClickHouse / Sentry / OTEL evidence 建立只讀監控合約矩陣。
|
||||
- P1-003:新增 `GET /api/v1/agents/observability-contract-matrix` 只讀 API 與 service guard,強制拒絕把 snapshot 誤讀成 alert rule 修改、Prometheus reload、Alertmanager receiver / route 修改、Alertmanager 指向 OpenClaw、silence 建立、Grafana 寫入、SigNoz / Sentry webhook 修改、Secret payload、Telegram 測試通知、deploy / reload / workflow 觸發或 runtime execution 授權。
|
||||
- P1-003:治理頁 `/zh-TW/governance?tab=automation-inventory` 新增監控合約與降噪機會區塊;顯示監控面、需處置、降噪候選、需批准候選、分類缺口與不可誤讀合約,不新增任何執行按鈕。
|
||||
- 目前數字:observability surfaces `6`;需處置 surfaces `2`;降噪候選 `5`;需人工批准的降噪候選 `2`;classification gaps `3`;read-only denials `12`;automation backlog done `19/23`、overall `83%`、P1 `90%`、WS3 `75%`;inventory tasks `29`。
|
||||
- 本地驗證:JSON parse 通過;observability contract matrix service / API、automation inventory / backlog snapshot API、Gitea workflow runner health service / API 目標測試 `19 passed`;Python py_compile 通過;zh-TW / en i18n key 差異 `0`;web typecheck 通過;Next production build 通過;source-control-owner-response guard、security-mirror-progress guard、`git diff --check` 通過;本地 API readback 回 `observability_contract_matrix_v1`、current `P1-003`、next `P1-004`、surfaces `6`、noise opportunities `5`、approval-required opportunities `2`。
|
||||
|
||||
**下一步:**
|
||||
1. Commit 並推 `gitea main`。
|
||||
2. 等 deploy marker 後補 production API / Browser smoke。
|
||||
3. P1-004:盤點 AI Router / Ollama / Nemotron / Gemini provider 路徑。
|
||||
|
||||
**裁決:** P1-003 只完成 read-only observability contract matrix 與降噪候選顯示。不得把 Prometheus rule count、Alertmanager grouping、SigNoz / Sentry heartbeat、Grafana dashboard JSON、OTEL/Event Exporter evidence 或 classification gap 解讀成 alert rule 變更、receiver/route 變更、OpenClaw receiver 授權、silence、dashboard import、webhook 修改、secret 讀取、通知發送、deploy/reload/workflow 觸發或 runtime execution 批准。成功 smoke 不即時通知洗版;失敗與需處置才進批准流程。
|
||||
|
||||
Reference in New Issue
Block a user